# Stock Analysis - PCA and KMeans Unsupervised Methods

In [1]:
# import context manager: SparkSession
from pyspark.sql import SparkSession

# import data types
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType
import pyspark.sql.types as typ
import pyspark.sql.functions as F
import os

#from pyspark.mllib.evaluation import MulticlassMetrics

from pyspark.sql.types import *

spark = SparkSession.builder \
        .master("local[*]") \
        .appName("mllib_classifier") \
        .config("spark.executor.memory", '10g') \
        .config('spark.executor.cores', '10') \
        .config('spark.executor.instances', '1') \
        .config("spark.driver.memory",'2g') \
        .getOrCreate()

sc = spark.sparkContext

# import data manipulation methods
from pyspark.ml import Pipeline  
from pyspark.ml.feature import *  
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator


from pyspark.ml.linalg import DenseVector
#from pyspark.mllib.linalg import Vectors

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

#from pyspark.ml.clustering import KMeans
#from pyspark.ml.evaluation import ClusteringEvaluator

Import the file as an RDD. Since its CSV, the split is very easy.

In [2]:
all_data = sc.textFile('/../../project/ds5559/Alice_Ed_Michael_Sam_project/BigTrips.csv')
rdd = all_data.map(lambda x: x.split(","))

For development purposes, take a subsample.

In [3]:
#rdd = sc.parallelize(all_data.take(1000))

#rdd = all_data

#all_data.take(100)

Since it's an RDD, remove the header row. Convert to a dataframe.

In [4]:
header = rdd.take(1)[0]
rdd = rdd.filter(lambda x: x != header)
final_DF = rdd.toDF()

Extract out just the trip id and start timestamp. Using pyspark sql functions, convert to a proper timestamp data type. Repeat for end timestamp.

In [5]:
#https://stackoverflow.com/questions/53304688/spark-date-format-mmm-dd-yyyy-hhmmss-am-to-timestamp-in-df
#https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
start_times = rdd.map(lambda x: (x[0],x[1]))
start_times = start_times.toDF()
st = start_times.withColumn("Trip_Start_Timestamp",F.to_timestamp(F.col("_2"), "MM/dd/yyyy hh:mm:ss a"))

In [6]:
st.show(5)

+--------------------+--------------------+--------------------+
|                  _1|                  _2|Trip_Start_Timestamp|
+--------------------+--------------------+--------------------+
|bea79abbef050980e...|12/01/2019 12:15:...| 2019-12-01 00:15:00|
|00f26da5601bbcf98...|12/01/2019 12:15:...| 2019-12-01 00:15:00|
|02256ef89c5c4be82...|12/01/2019 12:15:...| 2019-12-01 00:15:00|
|072cb06b1a88042c4...|12/01/2019 12:15:...| 2019-12-01 00:15:00|
|099257be99c66c8b2...|12/01/2019 12:15:...| 2019-12-01 00:15:00|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [7]:
end_times = rdd.map(lambda x: (x[0],x[2]))
end_times = end_times.toDF()
et = end_times.withColumn("Trip_End_Timestamp",F.to_timestamp(F.col("_2"), "MM/dd/yyyy hh:mm:ss a"))

Put data back together with our timestamp fields. Cast all our fields to their proper data types and rename. The original timestamp fields we keep with suffix "_str"

In [8]:
final_DF = final_DF.join(st.select("_1","Trip_Start_Timestamp"),on="_1").join(et.select("_1","Trip_End_Timestamp"),on="_1")

In [9]:
final_DF.show(5)

+--------------------+--------------------+--------------------+----+----+-----------+-----------+---+---+----+---+----+-----+-----+---+-------------+--------------+--------------------+-------------+--------------+--------------------+--------------------+-------------------+
|                  _1|                  _2|                  _3|  _4|  _5|         _6|         _7| _8| _9| _10|_11| _12|  _13|  _14|_15|          _16|           _17|                 _18|          _19|           _20|                 _21|Trip_Start_Timestamp| Trip_End_Timestamp|
+--------------------+--------------------+--------------------+----+----+-----------+-----------+---+---+----+---+----+-----+-----+---+-------------+--------------+--------------------+-------------+--------------+--------------------+--------------------+-------------------+
|00002a2b4769d1834...|03/06/2020 12:45:...|03/06/2020 01:45:...|3117|22.9|           |           |   | 43|42.5|  0|3.54|46.04|false|  1|             |              | 

In [10]:
final_DF = final_DF.withColumn('Trip_Seconds',F.col('_4').cast("integer"))
final_DF = final_DF.withColumn('Trip_Miles',F.col('_5').cast("double"))
final_DF = final_DF.withColumn('Pickup_Community_Area',F.col('_8').cast("integer"))
final_DF = final_DF.withColumn('Dropoff_Community_Area',F.col('_9').cast("integer"))
final_DF = final_DF.withColumn('Fare',F.col('_10').cast("double"))
final_DF = final_DF.withColumn('Tip',F.col('_11').cast("double"))
final_DF = final_DF.withColumn('Additional_Charges',F.col('_12').cast("double"))
final_DF = final_DF.withColumn('Trip_Total',F.col('_13').cast("double"))
final_DF = final_DF.withColumn('Shared_Trip_Authorized',F.col('_14').cast("boolean"))
final_DF = final_DF.withColumn('Trips_Pooled',F.col('_15').cast("integer"))

In [11]:
final_DF = final_DF.withColumnRenamed("_1","trip_id") \
                    .withColumnRenamed("_2","Trip_Start_Timestamp_str") \
                    .withColumnRenamed("_3","Trip_End_Timestamp_str") \
                    .withColumnRenamed("_4","Trip_Seconds_str") \
                    .withColumnRenamed("_5","Trip_Miles_str") \
                    .withColumnRenamed("_6","Pickup_Census_Tract") \
                    .withColumnRenamed("_7","Dropoff_Census_Tract") \
                    .withColumnRenamed("_8","Pickup_Community_Area_str") \
                    .withColumnRenamed("_9","Dropoff_Community_Area_str") \
                    .withColumnRenamed("_10","Fare_str") \
                    .withColumnRenamed("_11","Tip_str") \
                    .withColumnRenamed("_12","Additional_Charges_str") \
                    .withColumnRenamed("_13","Trip_Total_str") \
                    .withColumnRenamed("_14","Shared_Trip_Authorized_str") \
                    .withColumnRenamed("_15","Trips_Pooled_str") \
                    .withColumnRenamed("_16","Pickup_Centroid_Latitude") \
                    .withColumnRenamed("_17","Pickup_Centroid_Longitude") \
                    .withColumnRenamed("_18","Pickup_Centroid_Location") \
                    .withColumnRenamed("_19","Dropoff_Centroid_Latitude") \
                    .withColumnRenamed("_20","Dropoff_Centroid_Longitude") \
                    .withColumnRenamed("_21","Dropoff_Centroid_Location")

In [12]:
final_DF.printSchema()

root
 |-- trip_id: string (nullable = true)
 |-- Trip_Start_Timestamp_str: string (nullable = true)
 |-- Trip_End_Timestamp_str: string (nullable = true)
 |-- Trip_Seconds_str: string (nullable = true)
 |-- Trip_Miles_str: string (nullable = true)
 |-- Pickup_Census_Tract: string (nullable = true)
 |-- Dropoff_Census_Tract: string (nullable = true)
 |-- Pickup_Community_Area_str: string (nullable = true)
 |-- Dropoff_Community_Area_str: string (nullable = true)
 |-- Fare_str: string (nullable = true)
 |-- Tip_str: string (nullable = true)
 |-- Additional_Charges_str: string (nullable = true)
 |-- Trip_Total_str: string (nullable = true)
 |-- Shared_Trip_Authorized_str: string (nullable = true)
 |-- Trips_Pooled_str: string (nullable = true)
 |-- Pickup_Centroid_Latitude: string (nullable = true)
 |-- Pickup_Centroid_Longitude: string (nullable = true)
 |-- Pickup_Centroid_Location: string (nullable = true)
 |-- Dropoff_Centroid_Latitude: string (nullable = true)
 |-- Dropoff_Centroid_L

In [13]:
final_DF.cache()

DataFrame[trip_id: string, Trip_Start_Timestamp_str: string, Trip_End_Timestamp_str: string, Trip_Seconds_str: string, Trip_Miles_str: string, Pickup_Census_Tract: string, Dropoff_Census_Tract: string, Pickup_Community_Area_str: string, Dropoff_Community_Area_str: string, Fare_str: string, Tip_str: string, Additional_Charges_str: string, Trip_Total_str: string, Shared_Trip_Authorized_str: string, Trips_Pooled_str: string, Pickup_Centroid_Latitude: string, Pickup_Centroid_Longitude: string, Pickup_Centroid_Location: string, Dropoff_Centroid_Latitude: string, Dropoff_Centroid_Longitude: string, Dropoff_Centroid_Location: string, Trip_Start_Timestamp: timestamp, Trip_End_Timestamp: timestamp, Trip_Seconds: int, Trip_Miles: double, Pickup_Community_Area: int, Dropoff_Community_Area: int, Fare: double, Tip: double, Additional_Charges: double, Trip_Total: double, Shared_Trip_Authorized: boolean, Trips_Pooled: int]

Add in columns with various date and time values from our trip start and end. Arbitarily choose start time to pull year, month, week number, day of week, and date from.

In [14]:
#https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#data-types
final_DF = final_DF.withColumn("Trip_Year", F.year(F.col("Trip_Start_Timestamp"))) \
                    .withColumn("Trip_Month", F.month(F.col("Trip_Start_Timestamp"))) \
                    .withColumn("Trip_WeekNumber", F.weekofyear(F.col("Trip_Start_Timestamp"))) \
                    .withColumn("Trip_DayofWeek", F.dayofweek(F.col("Trip_Start_Timestamp"))) \
                    .withColumn("Trip_Start_Hour", F.hour(F.col("Trip_Start_Timestamp"))) \
                    .withColumn("Trip_End_Hour", F.hour(F.col("Trip_End_Timestamp"))) \
                    .withColumn("Date", F.to_date(F.col("Trip_Start_Timestamp")))

In [15]:
final_DF.printSchema()

root
 |-- trip_id: string (nullable = true)
 |-- Trip_Start_Timestamp_str: string (nullable = true)
 |-- Trip_End_Timestamp_str: string (nullable = true)
 |-- Trip_Seconds_str: string (nullable = true)
 |-- Trip_Miles_str: string (nullable = true)
 |-- Pickup_Census_Tract: string (nullable = true)
 |-- Dropoff_Census_Tract: string (nullable = true)
 |-- Pickup_Community_Area_str: string (nullable = true)
 |-- Dropoff_Community_Area_str: string (nullable = true)
 |-- Fare_str: string (nullable = true)
 |-- Tip_str: string (nullable = true)
 |-- Additional_Charges_str: string (nullable = true)
 |-- Trip_Total_str: string (nullable = true)
 |-- Shared_Trip_Authorized_str: string (nullable = true)
 |-- Trips_Pooled_str: string (nullable = true)
 |-- Pickup_Centroid_Latitude: string (nullable = true)
 |-- Pickup_Centroid_Longitude: string (nullable = true)
 |-- Pickup_Centroid_Location: string (nullable = true)
 |-- Dropoff_Centroid_Latitude: string (nullable = true)
 |-- Dropoff_Centroid_L

In [16]:
# final_DF.count()

# result 49,108,003

In [17]:
#Doesn't update if you don't resave the variable'

df = final_DF.drop('Trip_Start_Timestamp_str',
                   'Trip_End_Timestamp_str',
                   'Trip_Seconds_str',
                   'Trip_Miles_str',
                   'Pickup_Community_Area_str',
                   'Dropoff_Community_Area_str',
                   'Fare_str',
                   'Tip_str',
                   'Additional_Charges_str',
                   'Trip_Total_str',
                   'Shared_Trip_Authorized_str',
                   'Trips_Pooled_str', 
                   'Pickup_Census_Tract',
                   'Dropoff_Census_Tract',
                   'Pickup_Centroid_Latitude',
                   'Pickup_Centroid_Longitude', 
                   'Pickup_Centroid_Location', 
                   'Dropoff_Centroid_Latitude', 
                   'Dropoff_Centroid_Longitude', 
                   'Dropoff_Centroid_Location')


In [18]:
df.printSchema()

root
 |-- trip_id: string (nullable = true)
 |-- Trip_Start_Timestamp: timestamp (nullable = true)
 |-- Trip_End_Timestamp: timestamp (nullable = true)
 |-- Trip_Seconds: integer (nullable = true)
 |-- Trip_Miles: double (nullable = true)
 |-- Pickup_Community_Area: integer (nullable = true)
 |-- Dropoff_Community_Area: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Tip: double (nullable = true)
 |-- Additional_Charges: double (nullable = true)
 |-- Trip_Total: double (nullable = true)
 |-- Shared_Trip_Authorized: boolean (nullable = true)
 |-- Trips_Pooled: integer (nullable = true)
 |-- Trip_Year: integer (nullable = true)
 |-- Trip_Month: integer (nullable = true)
 |-- Trip_WeekNumber: integer (nullable = true)
 |-- Trip_DayofWeek: integer (nullable = true)
 |-- Trip_Start_Hour: integer (nullable = true)
 |-- Trip_End_Hour: integer (nullable = true)
 |-- Date: date (nullable = true)



In [19]:
%whos

Variable                            Type            Data/Info
-------------------------------------------------------------
ArrayType                           type            <class 'pyspark.sql.types.ArrayType'>
Binarizer                           type            <class 'pyspark.ml.feature.Binarizer'>
BinaryClassificationEvaluator       type            <class 'pyspark.ml.evalua<...>ClassificationEvaluator'>
BinaryClassificationMetrics         type            <class 'pyspark.mllib.eva<...>ryClassificationMetrics'>
BinaryType                          type            <class 'pyspark.sql.types.BinaryType'>
BooleanType                         type            <class 'pyspark.sql.types.BooleanType'>
BucketedRandomProjectionLSH         type            <class 'pyspark.ml.featur<...>etedRandomProjectionLSH'>
BucketedRandomProjectionLSHModel    type            <class 'pyspark.ml.featur<...>andomProjectionLSHModel'>
Bucketizer                          type            <class 'pyspark.ml.feature.B

In [20]:
df.write.csv('restructured_data.csv')

In [21]:
del (all_data)
del (final_DF)
del (end_times)
del (et)
del (header)
del (st)
del (start_times)

In [22]:
df2 = df.sample(False, .0005, seed = 2021) #decreased our sample size
df2.cache()

DataFrame[trip_id: string, Trip_Start_Timestamp: timestamp, Trip_End_Timestamp: timestamp, Trip_Seconds: int, Trip_Miles: double, Pickup_Community_Area: int, Dropoff_Community_Area: int, Fare: double, Tip: double, Additional_Charges: double, Trip_Total: double, Shared_Trip_Authorized: boolean, Trips_Pooled: int, Trip_Year: int, Trip_Month: int, Trip_WeekNumber: int, Trip_DayofWeek: int, Trip_Start_Hour: int, Trip_End_Hour: int, Date: date]

In [23]:
#df2.count()

In [24]:
#delete the big df for now
del (df)

#hopefully that will make things faster 

In [25]:
df2.show(5)


+--------------------+--------------------+-------------------+------------+----------+---------------------+----------------------+----+---+------------------+----------+----------------------+------------+---------+----------+---------------+--------------+---------------+-------------+----------+
|             trip_id|Trip_Start_Timestamp| Trip_End_Timestamp|Trip_Seconds|Trip_Miles|Pickup_Community_Area|Dropoff_Community_Area|Fare|Tip|Additional_Charges|Trip_Total|Shared_Trip_Authorized|Trips_Pooled|Trip_Year|Trip_Month|Trip_WeekNumber|Trip_DayofWeek|Trip_Start_Hour|Trip_End_Hour|      Date|
+--------------------+--------------------+-------------------+------------+----------+---------------------+----------------------+----+---+------------------+----------+----------------------+------------+---------+----------+---------------+--------------+---------------+-------------+----------+
|03c8ea04dbf3be99f...| 2020-02-07 22:45:00|2020-02-07 23:00:00|         512|       1.2|          

In [26]:
#fill our NA community areas

df2 = df2.na.fill(value=78,subset=['Pickup_Community_Area', 'Dropoff_Community_Area'])

In [27]:
# make a binary tip/no tip indicator
# https://spark.apache.org/docs/2.2.0/ml-features.html#binarizer

#binarized tip seems to be causing problems.  Change its name to label as that is that the packages are expecting

binarizer = Binarizer(threshold=0, inputCol="Tip", outputCol="label")
df2 = binarizer.transform(df2)

In [28]:
df2.cache()

DataFrame[trip_id: string, Trip_Start_Timestamp: timestamp, Trip_End_Timestamp: timestamp, Trip_Seconds: int, Trip_Miles: double, Pickup_Community_Area: int, Dropoff_Community_Area: int, Fare: double, Tip: double, Additional_Charges: double, Trip_Total: double, Shared_Trip_Authorized: boolean, Trips_Pooled: int, Trip_Year: int, Trip_Month: int, Trip_WeekNumber: int, Trip_DayofWeek: int, Trip_Start_Hour: int, Trip_End_Hour: int, Date: date, label: double]

In [29]:
df2.printSchema()

root
 |-- trip_id: string (nullable = true)
 |-- Trip_Start_Timestamp: timestamp (nullable = true)
 |-- Trip_End_Timestamp: timestamp (nullable = true)
 |-- Trip_Seconds: integer (nullable = true)
 |-- Trip_Miles: double (nullable = true)
 |-- Pickup_Community_Area: integer (nullable = true)
 |-- Dropoff_Community_Area: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Tip: double (nullable = true)
 |-- Additional_Charges: double (nullable = true)
 |-- Trip_Total: double (nullable = true)
 |-- Shared_Trip_Authorized: boolean (nullable = true)
 |-- Trips_Pooled: integer (nullable = true)
 |-- Trip_Year: integer (nullable = true)
 |-- Trip_Month: integer (nullable = true)
 |-- Trip_WeekNumber: integer (nullable = true)
 |-- Trip_DayofWeek: integer (nullable = true)
 |-- Trip_Start_Hour: integer (nullable = true)
 |-- Trip_End_Hour: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- label: double (nullable = true)



In [30]:
# split the data

# our model didn't work on the standard test train split.  Prof. Tashman recomended upscalling the help with the imbalanced dataset.
#from https://spark.apache.org/docs/2.1.0/ml-tuning.html#train-validation-split

train_inital, test = df2.randomSplit([0.8, 0.2], seed=2021)

# cahce our test values for later speed
test.cache()

# oversampleing code sample
# https://stackoverflow.com/questions/53273133/how-to-perform-up-sampling-using-sample-functionpy-spark

df_a = train_inital.filter(train_inital['label'] == 0)
df_b = train_inital.filter(train_inital['label'] == 1)

org_a_count = df_a.count()
org_b_count = df_b.count() 


ratio = df_a.count() / df_b.count()
# print(ratio)

df_b_overampled = df_b.sample(withReplacement=True, fraction=ratio, seed=2021)

# cahce our train values for later speed
train = df_a.unionAll(df_b_overampled).cache()

df_af = train.filter(train_inital['label'] == 0)
df_bf = train.filter(train_inital['label'] == 1)
fin_a_count = df_af.count()
fin_b_count = df_bf.count() 

print("Original No Tip Count: ", org_a_count)
print("Original Tip Count   : ", org_b_count)
print("")
print("Final No Tip Count   : ", fin_a_count)
print("Final Tip Count      : ", fin_b_count)


Original No Tip Count:  16403
Original Tip Count   :  3268

Final No Tip Count   :  16403
Final Tip Count      :  16459


In [31]:
test.show(5)
train.show(5)

+--------------------+--------------------+-------------------+------------+----------+---------------------+----------------------+----+---+------------------+----------+----------------------+------------+---------+----------+---------------+--------------+---------------+-------------+----------+-----+
|             trip_id|Trip_Start_Timestamp| Trip_End_Timestamp|Trip_Seconds|Trip_Miles|Pickup_Community_Area|Dropoff_Community_Area|Fare|Tip|Additional_Charges|Trip_Total|Shared_Trip_Authorized|Trips_Pooled|Trip_Year|Trip_Month|Trip_WeekNumber|Trip_DayofWeek|Trip_Start_Hour|Trip_End_Hour|      Date|label|
+--------------------+--------------------+-------------------+------------+----------+---------------------+----------------------+----+---+------------------+----------+----------------------+------------+---------+----------+---------------+--------------+---------------+-------------+----------+-----+
|075e008fcc3dde8eb...| 2019-12-19 12:30:00|2019-12-19 12:45:00|        1189|   

Our training size has increased.  This is to be expected in upscaling.

Good reference: https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets

In [32]:
#just as a reminder what was the truth in our test data?

dft_a = test.filter(train_inital['label'] == 0)
dft_b = test.filter(train_inital['label'] == 1)
count_test_a = dft_a.count()
count_test_b = dft_b.count()
print(count_test_a)
print(count_test_b)


4188
827


In [33]:
del (df2)

# Pipeline for LR

In [34]:
def cmacc(pred):
    # make a confusion matrix and return the accuracy
    # select predictions and labels from prediction transform as rdd as there isn't a DF function for this
    pred_rdd= pred.select('prediction').rdd.flatMap(lambda x: x)
    label_rdd = pred.select('label').rdd.flatMap(lambda x: x)
    
    #zip them together
    predictionAndLabels =  pred_rdd.zip(label_rdd)
    
    #metrics transform
    metrics = MulticlassMetrics(predictionAndLabels)
    
    metrics2 = BinaryClassificationMetrics(predictionAndLabels)
    
    #make our confusion matrix
    cm = metrics.confusionMatrix().toArray()

    #calc accuracy from confusion matrix
    
    acc = (cm[0][0] + cm[1][1])/(cm[0][0] + cm[1][1] + cm[0][1] + cm[1][0])
    
    # McM accuracy
    
    acc2 = metrics.accuracy
    
    #calc area under curve
    auc = metrics2.areaUnderROC
    
    prc = metrics2.areaUnderPR
    
    print("Confusion Matrix")
    print(cm)
    print()
    print("Accuracy from Confusion Matrix: ", acc)
    print()
    print("Accuracy from MulticlassMetrics: ", acc2)
    print()     
    print("Area Under the ROC", auc)
    print()

In [35]:
def cmacc2(pred):
    # make a confusion matrix and return the accuracy
    # select predictions and labels from prediction transform as rdd as there isn't a DF function for this
    pred_rdd= pred.select('prediction').rdd.flatMap(lambda x: x)
    label_rdd = pred.select('label').rdd.flatMap(lambda x: x)
    
    #zip them together
    predictionAndLabels =  pred_rdd.zip(label_rdd)
    
    #metrics transform
    metrics = MulticlassMetrics(predictionAndLabels)
    
    metrics2 = BinaryClassificationMetrics(predictionAndLabels)
    
    #make our confusion matrix
    cm = metrics.confusionMatrix().toArray()

    #calc accuracy from confusion matrix
    
    acc = (cm[0][0] + cm[1][1])/(cm[0][0] + cm[1][1] + cm[0][1] + cm[1][0])
    
    #McM accuracy
    acc2 = metrics.accuracy
    
    #calc area under curve
    auc = metrics2.areaUnderROC
    
    prc = metrics2.areaUnderPR
    
    #precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
       
    print("Confusion Matrix")
    print(cm)
    print()
    print("Accuracy from Confusion Matrix: ", acc)
    print("Accuracy from MulticlassMetrics: ", acc2)
    print()
    print("Area Under the ROC", auc)
    print()
    print("Area Under the PR Curve", prc)
   
    print("Summary Stats")
    #print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    
    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

In [36]:
train.printSchema()

root
 |-- trip_id: string (nullable = true)
 |-- Trip_Start_Timestamp: timestamp (nullable = true)
 |-- Trip_End_Timestamp: timestamp (nullable = true)
 |-- Trip_Seconds: integer (nullable = true)
 |-- Trip_Miles: double (nullable = true)
 |-- Pickup_Community_Area: integer (nullable = true)
 |-- Dropoff_Community_Area: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Tip: double (nullable = true)
 |-- Additional_Charges: double (nullable = true)
 |-- Trip_Total: double (nullable = true)
 |-- Shared_Trip_Authorized: boolean (nullable = true)
 |-- Trips_Pooled: integer (nullable = true)
 |-- Trip_Year: integer (nullable = true)
 |-- Trip_Month: integer (nullable = true)
 |-- Trip_WeekNumber: integer (nullable = true)
 |-- Trip_DayofWeek: integer (nullable = true)
 |-- Trip_Start_Hour: integer (nullable = true)
 |-- Trip_End_Hour: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- label: double (nullable = true)



In [37]:
# pipeline steps for Logistic Regression:

#onehotencoder to pickup
ohe_pu = OneHotEncoder(inputCol="Pickup_Community_Area", outputCol="Pickup_Community_Area_vec")

#onehotencoder to dropoff
ohe_do = OneHotEncoder(inputCol="Dropoff_Community_Area", outputCol="Dropoff_Community_Area_vec")

#onehotencoder to weekNumber
ohe_twn = OneHotEncoder(inputCol="Trip_WeekNumber", outputCol="Trip_WeekNumber_vec")

#onehotencoder to dayOfWeek
ohe_dw = OneHotEncoder(inputCol="Trip_DayofWeek", outputCol="Trip_DayofWeek_vec")

#onehotencoder to startHour
ohe_sh = OneHotEncoder(inputCol="Trip_Start_Hour", outputCol="Trip_Start_Hour_vec")

#onehotencoder to endHour
ohe_eh = OneHotEncoder(inputCol="Trip_End_Hour", outputCol="Trip_End_Hour_vec")

#assemble the vector or LR

# our colulms for lr
predictor_col_for_lr = ['Trip_Seconds',
                        'Trip_Miles',
                        'Fare',
                        'Additional_Charges',
                        'Shared_Trip_Authorized',
                        'Trips_Pooled',
                        'Pickup_Community_Area_vec',
                        'Dropoff_Community_Area_vec',
                        'Trip_Year', 
                        'Trip_Month',
                        'Trip_WeekNumber_vec', 
                        'Trip_DayofWeek_vec', 
                        'Trip_Start_Hour_vec',
                        'Trip_End_Hour_vec'
                        ] # 'Date' not supported datatype

lr_va = VectorAssembler(inputCols=predictor_col_for_lr, outputCol="features") 

#scale our LR

lr_scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=True)

#what do we want to do if we are doing a parameter search? make the parameters as variables and just do a loop?
#we learned that this week.  May also need to add in cv step

lr = LogisticRegression(maxIter=10,
                        regParam=0.1, #org 0.1
                        elasticNetParam=0.3, #org 0.3
                        featuresCol="features",
                        labelCol="label")

# Build the pipeline
lr_pipeline = Pipeline(stages=[ohe_pu, ohe_do, ohe_twn, ohe_dw, ohe_sh, ohe_eh, lr_va, lr_scaler, lr])

# Fit the pipeline
lr_model = lr_pipeline.fit(train)

# Make a prediction
lr_prediction = lr_model.transform(test)

In [38]:
cmacc(lr_prediction)

Confusion Matrix
[[3165. 1023.]
 [ 505.  322.]]

Accuracy from Confusion Matrix:  0.6953140578265204

Accuracy from MulticlassMetrics:  0.6953140578265204

Area Under the ROC 0.5725448942045506



In [39]:
cmacc2(lr_prediction)

TypeError: recall() missing 1 required positional argument: 'label'

The above confirms the results of the first pass of the grid search from tuning below

# Pipeline for RF

In [81]:
# pipeline steps for Random Forest:

#onehotencoder to pickup
ohe_pu = OneHotEncoder(inputCol="Pickup_Community_Area", outputCol="Pickup_Community_Area_vec")

#onehotencoder to dropoff
ohe_do = OneHotEncoder(inputCol="Dropoff_Community_Area", outputCol="Dropoff_Community_Area_vec")

#onehotencoder to weekNumber
ohe_twn = OneHotEncoder(inputCol="Trip_WeekNumber", outputCol="Trip_WeekNumber_vec")

#onehotencoder to dayOfWeek
ohe_dw = OneHotEncoder(inputCol="Trip_DayofWeek", outputCol="Trip_DayofWeek_vec")

#onehotencoder to startHour
ohe_sh = OneHotEncoder(inputCol="Trip_Start_Hour", outputCol="Trip_Start_Hour_vec")

#onehotencoder to endHour
ohe_eh = OneHotEncoder(inputCol="Trip_End_Hour", outputCol="Trip_End_Hour_vec")

# our colulms for rf
predictor_col_for_rf =  ['Trip_Seconds',
                        'Trip_Miles',
                        'Fare',
                        'Additional_Charges',
                        'Shared_Trip_Authorized',
                        'Trips_Pooled',
                        'Pickup_Community_Area_vec',
                        'Dropoff_Community_Area_vec',
                        'Trip_Year', 
                        'Trip_Month',
                        'Trip_WeekNumber_vec', 
                        'Trip_DayofWeek_vec', 
                        'Trip_Start_Hour_vec',
                        'Trip_End_Hour_vec']

# assemble feature vector
rf_va = VectorAssembler(inputCols=predictor_col_for_rf, outputCol="features") 

# set classifier
rf = RandomForestClassifier(labelCol="label", 
                            featuresCol="features", 
                            numTrees=10)

# Build the pipeline
rf_pipeline = Pipeline(stages=[ohe_pu, ohe_do, ohe_twn, ohe_dw, ohe_sh, ohe_eh, rf_va, rf])

# Fit the pipeline
rf_model = rf_pipeline.fit(train)

# Make a prediction
rf_prediction = rf_model.transform(test)

# Select example rows to display.
#rf_prediction.select("prediction", "label","features").show(5) 

# Select (prediction, true label) and compute test error
rf_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
#metric options f1|accuracy|weightedPrecision|weightedRecall

rf_accuracy = rf_evaluator.evaluate(rf_prediction)

print("Test Error = %g" % (1.0 - rf_accuracy))
print("Accuracy: " , rf_accuracy)

rfModel2 = rf_model.stages[7]
print(rfModel2)  # summary only

Test Error = 0.380857
Accuracy:  0.6191425722831505
RandomForestClassificationModel: uid=RandomForestClassifier_628354a6ca93, numTrees=10, numClasses=2, numFeatures=269


In [82]:
cmacc(rf_prediction)

Confusion Matrix
[[2631. 1557.]
 [ 353.  474.]]

Accuracy from Confusion Matrix:  0.6191425722831505

Accuracy from MulticlassMetrics:  0.6191425722831505

Area Under the ROC 0.6006897405958639



# Pipeline for GBT

In [None]:
#try a GBT 

# https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-classifier

In [86]:
#onehotencoder to pickup
ohe_pu = OneHotEncoder(inputCol="Pickup_Community_Area", outputCol="Pickup_Community_Area_vec")

#onehotencoder to dropoff
ohe_do = OneHotEncoder(inputCol="Dropoff_Community_Area", outputCol="Dropoff_Community_Area_vec")

#onehotencoder to weekNumber
ohe_twn = OneHotEncoder(inputCol="Trip_WeekNumber", outputCol="Trip_WeekNumber_vec")

#onehotencoder to dayOfWeek
ohe_dw = OneHotEncoder(inputCol="Trip_DayofWeek", outputCol="Trip_DayofWeek_vec")

#onehotencoder to startHour
ohe_sh = OneHotEncoder(inputCol="Trip_Start_Hour", outputCol="Trip_Start_Hour_vec")

#onehotencoder to endHour
ohe_eh = OneHotEncoder(inputCol="Trip_End_Hour", outputCol="Trip_End_Hour_vec")

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
# labelIndexer = StringIndexer(inputCol="binarized_tip", outputCol="indexedLabel")#.fit(data)

#assemble the vector or GBT

# our colulms for gbt
predictor_col_for_gbt = ['Trip_Seconds',
                        'Trip_Miles',
                        'Fare',
                        'Additional_Charges',
                        'Shared_Trip_Authorized',
                        'Trips_Pooled',
                        'Pickup_Community_Area_vec',
                        'Dropoff_Community_Area_vec',
                        'Trip_Year', 
                        'Trip_Month',
                        'Trip_WeekNumber_vec', 
                        'Trip_DayofWeek_vec', 
                        'Trip_Start_Hour_vec',
                        'Trip_End_Hour_vec']

gbt_va = VectorAssembler(inputCols=predictor_col_for_gbt, outputCol="features") 

# Automatically identify categorical features, and index them.
# # Set maxCategories so features with > 4 distinct values are treated as continuous.
# featureIndexer =\
#     VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)#.fit(data)

# Train a GBT model.
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=5)

# Chain indexers and GBT in a Pipeline
gbt_pipeline = Pipeline(stages=[ohe_pu, ohe_do, ohe_twn, ohe_dw, ohe_sh, ohe_eh, gbt_va, gbt]) #labelIndexer, featureIndexer

# Train model.  This also runs the indexers.
gbt_model = gbt_pipeline.fit(train)

# Make predictions.
gbt_prediction = gbt_model.transform(test)

#gbt_prediction.show(3)

# Select example rows to display.
gbt_prediction.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
gbt_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
gbt_accuracy = gbt_evaluator.evaluate(gbt_prediction)
print("Test Error = %g" % (1.0 - gbt_accuracy))
print('gbt accuracy = ', gbt_accuracy)
gbtModel = gbt_model.stages[7]
print(gbtModel)  # summary only

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       1.0|  0.0|(269,[0,1,2,3,5,1...|
|       1.0|  0.0|(269,[0,1,2,3,5,6...|
|       0.0|  0.0|(269,[0,1,2,4,5,4...|
|       0.0|  0.0|(269,[0,1,2,3,5,4...|
|       0.0|  0.0|(269,[0,1,2,3,5,7...|
+----------+-----+--------------------+
only showing top 5 rows

Test Error = 0.358923
gbt accuracy =  0.6410767696909272
GBTClassificationModel: uid = GBTClassifier_ea71d901a054, numTrees=5, numClasses=2, numFeatures=269


In [87]:
cmacc(gbt_prediction)

Confusion Matrix
[[2791. 1397.]
 [ 403.  424.]]

Accuracy from Confusion Matrix:  0.6410767696909272

Accuracy from MulticlassMetrics:  0.6410767696909272

Area Under the ROC 0.5895621912783574



# Pipeline LR with Tuning


In [None]:
# pipeline steps for Logistic Regression:

#onehotencoder to pickup
ohe_pu = OneHotEncoder(inputCol="Pickup_Community_Area", outputCol="Pickup_Community_Area_vec")

#onehotencoder to dropoff
ohe_do = OneHotEncoder(inputCol="Dropoff_Community_Area", outputCol="Dropoff_Community_Area_vec")

#assemble the vector or LR

# our colulms for lr
predictor_col_for_lr = ['Trip_Seconds',
                        'Trip_Miles',
                        'Fare',
                        'Additional_Charges',
                        'Shared_Trip_Authorized',
                        'Trips_Pooled',
                        'Pickup_Community_Area_vec',
                        'Dropoff_Community_Area_vec']

lr_va = VectorAssembler(inputCols=predictor_col_for_lr, outputCol="features") 

#scale our LR

lr_scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

#what do we want to do if we are doing a parameter search? make the parameters as variables and just do a loop?
#we learned that this week.  May also need to add in cv step

lr = LogisticRegression(featuresCol="features",
                        labelCol="label") # regParam=0.1, elasticNetParam=0.3, maxIter=10,

# Build the pipeline
lr_pipeline = Pipeline(stages=[ohe_pu, ohe_do, lr_va, lr_scaler, lr])

# Set up the parameter grid
lr_paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.02, 0.03]) \
    .addGrid(lr.elasticNetParam, [0.1, 0.15]) \
    .addGrid(lr.maxIter, [5, 10]) \
    .build()

print('len(lr_paramGrid): {}'.format(len(lr_paramGrid)))
'''
25k set
best from tuning inital (accuracy):
addGrid(lr.regParam, [0.03, 0.05, 0.07]) \
.addGrid(lr.elasticNetParam, [0.15, 0.2, 0.25]) \
.addGrid(lr.maxIter, [8, 9, 10, 11, 12]) 

regParam= 0.03
elasticNetParam = 0.15
maxIter = 12
 
Confusion Matrix
[[1842. 2203.]
 [ 193.  595.]]

Accuracy from Confusion Matrix:  0.5042416718394372
Accuracy from MulticlassMetrics:  0.5042416718394372

Area Under the ROC 0.6052265753923187

Area Under the PR Curve 0.2065770273222743
Summary Stats
Precision = 0.5042416718394372
Recall = 0.5042416718394372
F1 Score = 0.5042416718394372
Weighted recall = 0.5042416718394371
Weighted precision = 0.7922492654683645
Weighted F(1) Score = 0.5612342974368173
Weighted F(0.5) Score = 0.6730989071456968
Weighted false positive rate = 0.2937885210547999

2nd round:

lr_paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.02, 0.03]) \
    .addGrid(lr.elasticNetParam, [0.05, 0.1, 0.15]) \
    .addGrid(lr.maxIter, [11, 12, 13, 15]) \

regParam= 0.03
elasticNetParam = 0.15
maxIter = 11

Confusion Matrix
[[1815. 2230.]
 [ 183.  605.]]

Accuracy from Confusion Matrix:  0.5007241878750258
Accuracy from MulticlassMetrics:  0.5007241878750258

Area Under the ROC 0.6082342994108162

Area Under the PR Curve 0.2075564549699384
Summary Stats
Precision = 0.5007241878750258
Recall = 0.5007241878750258
F1 Score = 0.5007241878750258
Weighted recall = 0.5007241878750258
Weighted precision = 0.7950908896146499
Weighted F(1) Score = 0.5572078454446675
Weighted F(0.5) Score = 0.6716684076809212
Weighted false positive rate = 0.28425558905339354

'''

In [None]:
# Treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
lr_crossval = CrossValidator(estimator=lr_pipeline,
                          estimatorParamMaps=lr_paramGrid,
                          #evaluator=BinaryClassificationEvaluator(metricName='areaUnderROC'), #we can pass in our own function if necessary
                          evaluator= MulticlassClassificationEvaluator(metricName='accuracy'),
                          numFolds=5)

# you can do a custom evaluator, but it seems to be a lot of work.  https://stackoverflow.com/questions/51404344/custom-evaluator-in-pyspark
# we can use either areaUnderROC or areaUnderPR as defaults for binary.
# f1|accuracy|weightedPrecision|weightedRecall for multiclass

In [None]:
#how to find all our items we can call
#dir(crossval.evaluator)

In [None]:
# Run cross-validation, and choose the best set of parameters. Print the training time.

t0 = time.time()
lr_cvModel = lr_crossval.setParallelism(6).fit(train) # train 6 models in parallel
print("train time:", time.time() - t0)

In [None]:
#not sure what this metric is... apparently it is RMSE https://projector-video-pdf-converter.datacamp.com/14989/chapter4.pdf
#lr_cvModel.avgMetrics

In [None]:
# magic code from https://stackoverflow.com/questions/36697304/how-to-extract-model-hyper-parameters-from-spark-ml-in-pyspark

# lr_cvModel.getEstimatorParamMaps()[ np.argmax(lr_cvModel.avgMetrics) ]

In [None]:
#https://dsharpc.github.io/SparkMLFlights/
#best?
#cvModel.getEstimatorParamMaps()[ np.argmin(cvModel.avgMetrics) ]

lr_cvModel.getEstimatorParamMaps()[ np.argmin(lr_cvModel.avgMetrics) ]

In [None]:
lr_cvModel.getEstimatorParamMaps()[ np.argmax(lr_cvModel.avgMetrics) ]

In [None]:
# Make predictions on test documents. cvModel uses the best model found (lrModel).
lr_prediction = lr_cvModel.transform(test)

In [None]:
#prediction.show(10)

In [None]:
#cmacc(lr_prediction)

In [None]:
cmacc2(lr_prediction)

# Pipeline RF with Tuning


In [None]:
# # pipeline steps for RF:

# # Index labels, adding metadata to the label column.
# # Fit on whole dataset to include all labels in index.
# labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")

# #onehotencoder to pickup
# ohe_pu = OneHotEncoder(inputCol="Pickup_Community_Area", outputCol="Pickup_Community_Area_vec")

# #onehotencoder to dropoff
# ohe_do = OneHotEncoder(inputCol="Dropoff_Community_Area", outputCol="Dropoff_Community_Area_vec")

# #assemble the vector or LR

# # our colulms for rf
# predictor_col_for_rf = ['Trip_Seconds',
#                         'Trip_Miles',
#                         'Fare',
#                         'Additional_Charges',
#                         'Shared_Trip_Authorized',
#                         'Trips_Pooled',
#                         'Pickup_Community_Area_vec',
#                         'Dropoff_Community_Area_vec']

# #assemble the vector or RF
                             
# rf_va = VectorAssembler(inputCols=predictor_col_for_rf, outputCol="features") 

# # Automatically identify categorical features, and index them.
# # Set maxCategories so features with > 4 distinct values are treated as continuous.
# featureIndexer =\
#     VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4) #what if 10 like in other basic version

# #what do we want to do if we are doing a parameter search? make the parameters as variables and just do a loop?
# #we learned that this week.  May also need to add in cv step

# rf = RandomForestClassifier(labelCol="indexedLabel", 
#                             featuresCol="indexedFeatures") #numTrees=10

# # Build the pipeline
# rf_pipeline = Pipeline(stages=[ohe_pu, ohe_do, labelIndexer, rf_va, featureIndexer, rf])

# pipeline steps for Random Forest:

#onehotencoder to pickup
ohe_pu = OneHotEncoder(inputCol="Pickup_Community_Area", outputCol="Pickup_Community_Area_vec")

#onehotencoder to dropoff
ohe_do = OneHotEncoder(inputCol="Dropoff_Community_Area", outputCol="Dropoff_Community_Area_vec")

# our colulms for rf
predictor_col_for_rf = ['Trip_Seconds',
                        'Trip_Miles',
                        'Fare',
                        'Additional_Charges',
                        'Shared_Trip_Authorized',
                        'Trips_Pooled',
                        'Pickup_Community_Area_vec',
                        'Dropoff_Community_Area_vec']

# assemble feature vector
rf_va = VectorAssembler(inputCols=predictor_col_for_rf, outputCol="features") 

# set classifier
rf = RandomForestClassifier(labelCol="label", 
                            featuresCol="features")

# Build the pipeline
rf_pipeline = Pipeline(stages=[ohe_pu, ohe_do, rf_va, rf])

# Set up the parameter grid
rf_paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [5, 10]) \
    .addGrid(rf.maxDepth, [5, 10]) \
    .addGrid(rf.impurity, ["gini"])\
    .build()
    #.addGrid(rf.featureSubsetStrategy, ['auto', 'sqrt'])\
    

print('len(rf_paramGrid): {}'.format(len(rf_paramGrid)))

#https://medium.com/rahasak/random-forest-classifier-with-apache-spark-c63b4a23a7cc
#maxDepth, maxBins, impurity, auto and seed 
#.addGrid(randomForestClassifier.impurity, Array("entropy", "gini"))
#name='featureSubsetStrategy', auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n]

'''
best from tuning inital (accuracy):
rf_paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [5, 10]) \
    .addGrid(rf.maxDepth, [5, 10]) \
    .addGrid(rf.impurity, ["entropy", "gini"])\
    .build()

numTrees = 10
maxDepth = 10
impurity = gini
 
Confusion Matrix
[[2901. 1144.]
 [ 456.  332.]]

Accuracy from Confusion Matrix:  0.6689426857024623
Accuracy from MulticlassMetrics:  0.6689426857024623

Area Under the ROC 0.5692507513819781

Area Under the PR Curve 0.2070259967551608
Summary Stats
Precision = 0.6689426857024623
Recall = 0.6689426857024623
F1 Score = 0.6689426857024623
Weighted recall = 0.6689426857024622
Weighted precision = 0.7599403563099746
Weighted F(1) Score = 0.7038591473392332
Weighted F(0.5) Score = 0.735232181263078
Weighted false positive rate = 0.530441182938506

2nd attempt:

rf_paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [5, 10, 15]) \
    .addGrid(rf.maxDepth, [5, 10, 15]) \
    .addGrid(rf.impurity, ["entropy", "gini"])\
    .addGrid(rf.featureSubsetStrategy, ['auto', 'sqrt'])\
    .build()


numTrees = 10
maxDepth = 15
impurity = gini
featureSubsetStrategy = auto

Confusion Matrix
[[2663. 1382.]
 [ 388.  400.]]

Accuracy from Confusion Matrix:  0.6337678460583489
Accuracy from MulticlassMetrics:  0.6337678460583489

Area Under the ROC 0.5829789236570811

Area Under the PR Curve 0.209345437091233
Summary Stats
Precision = 0.6337678460583489
Recall = 0.6337678460583489
F1 Score = 0.6337678460583489
Weighted recall = 0.6337678460583488
Weighted precision = 0.7671159775546591
Weighted F(1) Score = 0.6789410276493224
Weighted F(0.5) Score = 0.7270236282319229
Weighted false positive rate = 0.46780999874418644

'''



In [None]:
# Treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
rf_crossval = CrossValidator(estimator=rf_pipeline,
                          estimatorParamMaps=rf_paramGrid,
                          #evaluator=BinaryClassificationEvaluator(metricName='areaUnderROC'), #we can pass in our own function if necessary
                          evaluator= MulticlassClassificationEvaluator(metricName='accuracy'),
                          numFolds=5)

# you can do a custom evaluator, but it seems to be a lot of work.  https://stackoverflow.com/questions/51404344/custom-evaluator-in-pyspark
# we can use either areaUnderROC or areaUnderPR as defaults for binary.
# f1|accuracy|weightedPrecision|weightedRecall for multiclass

In [None]:
#how to find all our items we can call
#dir(crossval.evaluator)

In [None]:
# Run cross-validation, and choose the best set of parameters. Print the training time.

t0 = time.time()
cvModel_rf = rf_crossval.setParallelism(6).fit(train) # train 6 models in parallel
print("train time:", time.time() - t0)

In [None]:
#not sure what this metric is... rmse
cvModel_rf.avgMetrics

In [None]:
# magic code from https://stackoverflow.com/questions/36697304/how-to-extract-model-hyper-parameters-from-spark-ml-in-pyspark

cvModel_rf.getEstimatorParamMaps()[ np.argmax(cvModel_rf.avgMetrics) ]

In [None]:
# Make predictions on test documents. cvModel uses the best model found (lrModel).
prediction_rf = cvModel_rf.transform(test)

In [None]:
#prediction.show(10)

In [None]:
cmacc2(prediction_rf)

In [None]:
#https://dsharpc.github.io/SparkMLFlights/
#best?
#cvModel.getEstimatorParamMaps()[ np.argmin(cvModel.avgMetrics) ]

cvModel_rf.getEstimatorParamMaps()[ np.argmin(cvModel_rf.avgMetrics) ]

In [None]:
cvModel_rf.getEstimatorParamMaps()[ np.argmax(cvModel_rf.avgMetrics) ]

# Pipeline GBT with Tuning

In [None]:
# pipeline steps for RF:

# # Index labels, adding metadata to the label column.
# # Fit on whole dataset to include all labels in index.
# labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")

# #onehotencoder to pickup
# ohe_pu = OneHotEncoder(inputCol="Pickup_Community_Area", outputCol="Pickup_Community_Area_vec")

# #onehotencoder to dropoff
# ohe_do = OneHotEncoder(inputCol="Dropoff_Community_Area", outputCol="Dropoff_Community_Area_vec")

# #assemble the vector or LR

# # our colulms for rf
# predictor_col_for_gbt = ['Trip_Seconds',
#                         'Trip_Miles',
#                         'Fare',
#                         'Additional_Charges',
#                         'Shared_Trip_Authorized',
#                         'Trips_Pooled',
#                         'Pickup_Community_Area_vec',
#                         'Dropoff_Community_Area_vec']

# #assemble the vector or RF
                             
# gbt_va = VectorAssembler(inputCols=predictor_col_for_gbt, outputCol="features") 

# # Automatically identify categorical features, and index them.
# # Set maxCategories so features with > 4 distinct values are treated as continuous.
# featureIndexer =\
#     VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)

# #what do we want to do if we are doing a parameter search? make the parameters as variables and just do a loop?
# #we learned that this week.  May also need to add in cv step

# # Train a GBT model.
# gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")#maxIter=5

# # Build the pipeline
# gbt_pipeline = Pipeline(stages=[ohe_pu, ohe_do, labelIndexer, gbt_va, featureIndexer, gbt])

#onehotencoder to pickup
ohe_pu = OneHotEncoder(inputCol="Pickup_Community_Area", outputCol="Pickup_Community_Area_vec")

#onehotencoder to dropoff
ohe_do = OneHotEncoder(inputCol="Dropoff_Community_Area", outputCol="Dropoff_Community_Area_vec")

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
# labelIndexer = StringIndexer(inputCol="binarized_tip", outputCol="indexedLabel")#.fit(data)

#assemble the vector or GBT

# our colulms for gbt
predictor_col_for_gbt = ['Trip_Seconds',
                        'Trip_Miles',
                        'Fare',
                        'Additional_Charges',
                        'Shared_Trip_Authorized',
                        'Trips_Pooled',
                        'Pickup_Community_Area_vec',
                        'Dropoff_Community_Area_vec']

gbt_va = VectorAssembler(inputCols=predictor_col_for_rf, outputCol="features") 

# Train a GBT model.
gbt = GBTClassifier(labelCol="label", featuresCol="features") #, maxIter=5

# Chain indexers and GBT in a Pipeline
gbt_pipeline = Pipeline(stages=[ohe_pu, ohe_do, gbt_va, gbt]) #labelIndexer, featureIndexer

# Set up the parameter grid
gbt_paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [5, 10])\
    .addGrid(gbt.maxDepth, [5, 10])\
    .build()

print('len(gbt_paramGrid): {}'.format(len(gbt_paramGrid)))


'''
best from tuning inital (accuracy):
gbt_paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [5, 10, 20]) \
    .build()

maxIter = 20

Confusion Matrix
[[2798. 1247.]
 [ 423.  365.]]

Accuracy from Confusion Matrix:  0.654458928201945
Accuracy from MulticlassMetrics:  0.654458928201945

Area Under the ROC 0.5774580700620556

Area Under the PR Curve 0.2094152550126291
Summary Stats
Precision = 0.654458928201945
Recall = 0.654458928201945
F1 Score = 0.654458928201945
Weighted recall = 0.654458928201945
Weighted precision = 0.7639586098089827
Weighted F(1) Score = 0.6941837869282138
Weighted F(0.5) Score = 0.7327747544723259
Weighted false positive rate = 0.4995427880778336


maxIter = 40

Confusion Matrix
[[2571. 1474.]
 [ 382.  406.]]

Accuracy from Confusion Matrix:  0.6159735154148562
Accuracy from MulticlassMetrics:  0.6159735154148562

Area Under the ROC 0.5754139659791808

Area Under the PR Curve 0.20313239804234073
Summary Stats
Precision = 0.6159735154148562
Recall = 0.6159735154148562
F1 Score = 0.6159735154148562
Weighted recall = 0.6159735154148562
Weighted precision = 0.7638968296438198
Weighted F(1) Score = 0.6646010165217533
Weighted F(0.5) Score = 0.7183436330713325
Weighted false positive rate = 0.4651455834564943


gbt_paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [5, 10])\
    .addGrid(gbt.maxDepth, [5, 10])\
    .build()
    
maxIter = 5
maxDepth = 5

Confusion Matrix
[[2863. 1182.]
 [ 469.  319.]]

Accuracy from Confusion Matrix:  0.6583902338092282
Accuracy from MulticlassMetrics:  0.6583902338092282

Area Under the ROC 0.5563048634335804

Area Under the PR Curve 0.19780050930331394
Summary Stats
Precision = 0.6583902338092282
Recall = 0.6583902338092282
F1 Score = 0.6583902338092282
Weighted recall = 0.6583902338092282
Weighted precision = 0.7537989743798752
Weighted F(1) Score = 0.6950856095348379
Weighted F(0.5) Score = 0.7279222226014918
Weighted false positive rate = 0.5457805069420676
'''



In [None]:

# Treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
gbt_crossval = CrossValidator(estimator=gbt_pipeline,
                          estimatorParamMaps=gbt_paramGrid,
                          #evaluator=BinaryClassificationEvaluator(metricName='areaUnderROC'), #we can pass in our own function if necessary
                          evaluator= MulticlassClassificationEvaluator(metricName='accuracy'),
                          numFolds=5)

# you can do a custom evaluator, but it seems to be a lot of work.  https://stackoverflow.com/questions/51404344/custom-evaluator-in-pyspark
# we can use either areaUnderROC or areaUnderPR as defaults for binary.
# f1|accuracy|weightedPrecision|weightedRecall for multiclass

In [None]:
#how to find all our items we can call
#dir(crossval.evaluator)

In [None]:
# Run cross-validation, and choose the best set of parameters. Print the training time.

t0 = time.time()
cvModel_gbt = gbt_crossval.setParallelism(6).fit(train) # train 6 models in parallel
print("train time:", time.time() - t0)

In [None]:
#not sure what this metric is...
cvModel_gbt.avgMetrics

In [None]:
# magic code from https://stackoverflow.com/questions/36697304/how-to-extract-model-hyper-parameters-from-spark-ml-in-pyspark

#cvModel_gbt.getEstimatorParamMaps()[ np.argmax(cvModel_rf.avgMetrics) ]

cvModel_gbt.getEstimatorParamMaps()[np.argmax(cvModel_gbt.avgMetrics)]

In [None]:
cvModel_gbt.getEstimatorParamMaps()[np.argmin(cvModel_gbt.avgMetrics)]

In [None]:
# Make predictions on test documents. cvModel uses the best model found (lrModel).
prediction_gbt = cvModel_gbt.transform(test)

In [None]:
#prediction.show(10)

In [None]:
cmacc2(prediction_gbt)