# Stock Analysis - PCA and KMeans Unsupervised Methods

In [1]:
# import context manager: SparkSession
from pyspark.sql import SparkSession

# import data types
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType
import pyspark.sql.types as typ
import pyspark.sql.functions as F
import os

#from pyspark.mllib.evaluation import MulticlassMetrics

from pyspark.sql.types import *

spark = SparkSession.builder \
        .master("local[*]") \
        .appName("mllib_classifier") \
        .config("spark.executor.memory", '10g') \
        .config('spark.executor.cores', '10') \
        .config('spark.executor.instances', '1') \
        .config("spark.driver.memory",'2g') \
        .getOrCreate()

sc = spark.sparkContext

# import data manipulation methods
from pyspark.ml import Pipeline  
from pyspark.ml.feature import *  
from pyspark.ml.classification import LogisticRegression

from pyspark.ml.linalg import DenseVector
#from pyspark.mllib.linalg import Vectors

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

import from other file to ensure everying works... can be edited down later

In [3]:
# import context manager: SparkSession
from pyspark.sql import SparkSession

# import data types
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType
import pyspark.sql.types as typ
import pyspark.sql.functions as F
import os


from pyspark.sql.types import *

spark = SparkSession.builder \
        .master("local[*]") \
        .appName("mllib_classifier") \
        .config("spark.executor.memory", '21g') \
        .config('spark.executor.cores', '2') \
        .config('spark.executor.instances', '3') \
        .config("spark.driver.memory",'1g') \
        .getOrCreate()
sc = spark.sparkContext

# import data manipulation methods
from pyspark.ml.feature import Binarizer
from pyspark.ml import Pipeline  
from pyspark.ml.feature import *  
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

from pyspark.ml.linalg import DenseVector
from pyspark.ml.feature import VectorAssembler 
from pyspark.mllib.linalg import Vectors

from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import OneHotEncoder
#from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.evaluation import BinaryClassificationMetrics

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

import time

import numpy as np

Import the file as an RDD. Since its CSV, the split is very easy.

In [4]:
all_data = sc.textFile('/../../project/ds5559/Alice_Ed_Michael_Sam_project/BigTrips.csv')
all_data = all_data.map(lambda x: x.split(","))

For development purposes, take a subsample.

In [5]:
rdd = sc.parallelize(all_data.take(1000))

Since it's an RDD, remove the header row. Convert to a dataframe.

In [6]:
header = rdd.take(1)[0]
rdd = rdd.filter(lambda x: x != header)
final_DF = rdd.toDF()

Extract out just the trip id and start timestamp. Using pyspark sql functions, convert to a proper timestamp data type. Repeat for end timestamp.

In [7]:
#https://stackoverflow.com/questions/53304688/spark-date-format-mmm-dd-yyyy-hhmmss-am-to-timestamp-in-df
#https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
start_times = rdd.map(lambda x: (x[0],x[1]))
start_times = start_times.toDF()
st = start_times.withColumn("Trip_Start_Timestamp",F.to_timestamp(F.col("_2"), "MM/dd/yyyy hh:mm:ss a"))

In [8]:
end_times = rdd.map(lambda x: (x[0],x[2]))
end_times = end_times.toDF()
et = end_times.withColumn("Trip_End_Timestamp",F.to_timestamp(F.col("_2"), "MM/dd/yyyy hh:mm:ss a"))

Put data back together with our timestamp fields. Cast all our fields to their proper data types and rename. The original timestamp fields we keep with suffix "_str"

In [9]:
final_DF = final_DF.join(st.select("_1","Trip_Start_Timestamp"),on="_1").join(et.select("_1","Trip_End_Timestamp"),on="_1")

In [10]:
final_DF = final_DF.withColumn('Trip_Seconds',F.col('_4').cast("integer"))
final_DF = final_DF.withColumn('Trip_Miles',F.col('_5').cast("double"))
final_DF = final_DF.withColumn('Pickup_Community_Area',F.col('_8').cast("integer"))
final_DF = final_DF.withColumn('Dropoff_Community_Area',F.col('_9').cast("integer"))
final_DF = final_DF.withColumn('Fare',F.col('_10').cast("double"))
final_DF = final_DF.withColumn('Tip',F.col('_11').cast("double"))
final_DF = final_DF.withColumn('Additional_Charges',F.col('_12').cast("double"))
final_DF = final_DF.withColumn('Trip_Total',F.col('_13').cast("double"))
final_DF = final_DF.withColumn('Shared_Trip_Authorized',F.col('_14').cast("boolean"))
final_DF = final_DF.withColumn('Trips_Pooled',F.col('_15').cast("integer"))

In [11]:
final_DF = final_DF.withColumnRenamed("_1","trip_id") \
                    .withColumnRenamed("_2","Trip_Start_Timestamp_str") \
                    .withColumnRenamed("_3","Trip_End_Timestamp_str") \
                    .withColumnRenamed("_4","Trip_Seconds_str") \
                    .withColumnRenamed("_5","Trip_Miles_str") \
                    .withColumnRenamed("_6","Pickup_Census_Tract") \
                    .withColumnRenamed("_7","Dropoff_Census_Tract") \
                    .withColumnRenamed("_8","Pickup_Community_Area_str") \
                    .withColumnRenamed("_9","Dropoff_Community_Area_str") \
                    .withColumnRenamed("_10","Fare_str") \
                    .withColumnRenamed("_11","Tip_str") \
                    .withColumnRenamed("_12","Additional_Charges_str") \
                    .withColumnRenamed("_13","Trip_Total_str") \
                    .withColumnRenamed("_14","Shared_Trip_Authorized_str") \
                    .withColumnRenamed("_15","Trips_Pooled_str") \
                    .withColumnRenamed("_16","Pickup_Centroid_Latitude") \
                    .withColumnRenamed("_17","Pickup_Centroid_Longitude") \
                    .withColumnRenamed("_18","Pickup_Centroid_Location") \
                    .withColumnRenamed("_19","Dropoff_Centroid_Latitude") \
                    .withColumnRenamed("_20","Dropoff_Centroid_Longitude") \
                    .withColumnRenamed("_21","Dropoff_Centroid_Location")

In [12]:
final_DF.printSchema()

root
 |-- trip_id: string (nullable = true)
 |-- Trip_Start_Timestamp_str: string (nullable = true)
 |-- Trip_End_Timestamp_str: string (nullable = true)
 |-- Trip_Seconds_str: string (nullable = true)
 |-- Trip_Miles_str: string (nullable = true)
 |-- Pickup_Census_Tract: string (nullable = true)
 |-- Dropoff_Census_Tract: string (nullable = true)
 |-- Pickup_Community_Area_str: string (nullable = true)
 |-- Dropoff_Community_Area_str: string (nullable = true)
 |-- Fare_str: string (nullable = true)
 |-- Tip_str: string (nullable = true)
 |-- Additional_Charges_str: string (nullable = true)
 |-- Trip_Total_str: string (nullable = true)
 |-- Shared_Trip_Authorized_str: string (nullable = true)
 |-- Trips_Pooled_str: string (nullable = true)
 |-- Pickup_Centroid_Latitude: string (nullable = true)
 |-- Pickup_Centroid_Longitude: string (nullable = true)
 |-- Pickup_Centroid_Location: string (nullable = true)
 |-- Dropoff_Centroid_Latitude: string (nullable = true)
 |-- Dropoff_Centroid_L

In [13]:
final_DF.cache()

DataFrame[trip_id: string, Trip_Start_Timestamp_str: string, Trip_End_Timestamp_str: string, Trip_Seconds_str: string, Trip_Miles_str: string, Pickup_Census_Tract: string, Dropoff_Census_Tract: string, Pickup_Community_Area_str: string, Dropoff_Community_Area_str: string, Fare_str: string, Tip_str: string, Additional_Charges_str: string, Trip_Total_str: string, Shared_Trip_Authorized_str: string, Trips_Pooled_str: string, Pickup_Centroid_Latitude: string, Pickup_Centroid_Longitude: string, Pickup_Centroid_Location: string, Dropoff_Centroid_Latitude: string, Dropoff_Centroid_Longitude: string, Dropoff_Centroid_Location: string, Trip_Start_Timestamp: timestamp, Trip_End_Timestamp: timestamp, Trip_Seconds: int, Trip_Miles: double, Pickup_Community_Area: int, Dropoff_Community_Area: int, Fare: double, Tip: double, Additional_Charges: double, Trip_Total: double, Shared_Trip_Authorized: boolean, Trips_Pooled: int]

Add in columns with various date and time values from our trip start and end. Arbitarily choose start time to pull year, month, week number, day of week, and date from.

In [14]:
#https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#data-types
final_DF = final_DF.withColumn("Trip_Year", F.year(F.col("Trip_Start_Timestamp"))) \
                    .withColumn("Trip_Month", F.month(F.col("Trip_Start_Timestamp"))) \
                    .withColumn("Trip_WeekNumber", F.weekofyear(F.col("Trip_Start_Timestamp"))) \
                    .withColumn("Trip_DayofWeek", F.dayofweek(F.col("Trip_Start_Timestamp"))) \
                    .withColumn("Trip_Start_Hour", F.hour(F.col("Trip_Start_Timestamp"))) \
                    .withColumn("Trip_End_Hour", F.hour(F.col("Trip_End_Timestamp"))) \
                    .withColumn("Date", F.to_date(F.col("Trip_Start_Timestamp")))

In [15]:
final_DF.printSchema()

root
 |-- trip_id: string (nullable = true)
 |-- Trip_Start_Timestamp_str: string (nullable = true)
 |-- Trip_End_Timestamp_str: string (nullable = true)
 |-- Trip_Seconds_str: string (nullable = true)
 |-- Trip_Miles_str: string (nullable = true)
 |-- Pickup_Census_Tract: string (nullable = true)
 |-- Dropoff_Census_Tract: string (nullable = true)
 |-- Pickup_Community_Area_str: string (nullable = true)
 |-- Dropoff_Community_Area_str: string (nullable = true)
 |-- Fare_str: string (nullable = true)
 |-- Tip_str: string (nullable = true)
 |-- Additional_Charges_str: string (nullable = true)
 |-- Trip_Total_str: string (nullable = true)
 |-- Shared_Trip_Authorized_str: string (nullable = true)
 |-- Trips_Pooled_str: string (nullable = true)
 |-- Pickup_Centroid_Latitude: string (nullable = true)
 |-- Pickup_Centroid_Longitude: string (nullable = true)
 |-- Pickup_Centroid_Location: string (nullable = true)
 |-- Dropoff_Centroid_Latitude: string (nullable = true)
 |-- Dropoff_Centroid_L

In [16]:
final_DF.show(5)

+--------------------+------------------------+----------------------+----------------+--------------+-------------------+--------------------+-------------------------+--------------------------+--------+-------+----------------------+--------------+--------------------------+----------------+------------------------+-------------------------+------------------------+-------------------------+--------------------------+-------------------------+--------------------+-------------------+------------+----------+---------------------+----------------------+----+---+------------------+----------+----------------------+------------+---------+----------+---------------+--------------+---------------+-------------+----------+
|             trip_id|Trip_Start_Timestamp_str|Trip_End_Timestamp_str|Trip_Seconds_str|Trip_Miles_str|Pickup_Census_Tract|Dropoff_Census_Tract|Pickup_Community_Area_str|Dropoff_Community_Area_str|Fare_str|Tip_str|Additional_Charges_str|Trip_Total_str|Shared_Trip_Authoriz

For the first few hundred thousand records, the date will be the same. Distinct dates in our data can be checked here.

In [17]:
final_DF.select("Date").distinct().show(truncate=False)

+----------+
|Date      |
+----------+
|2019-12-01|
+----------+



Can you create the same dataset without a trip through RDD land?

In [38]:
# create a custom schema.  

customSchema = StructType([
    StructField('Trip_ID', StringType(), True),        
    StructField('Trip_Start_Timestamp', StringType(), True),
    StructField('Trip_End_Timestamp', StringType(), True),
    StructField('Trip_Seconds', DoubleType(), True),
    StructField('Trip_Miles', DoubleType(), True),
    StructField('Pickup_Census_Tract', StringType(), True),
    StructField('Dropoff_Census_Tract', StringType(), True),
    StructField('Pickup_Community_Area', DoubleType(), True),
    StructField('Dropoff_Community_Area', DoubleType(), True),
    StructField("Fare", DoubleType(), True),
    StructField("Tip", DoubleType(), True),
    StructField("Additional_Charges", DoubleType(), True),
    StructField("Trip_Total", StringType(), True),
    StructField("Shared_Trip_Authorized", BooleanType(), True),
    StructField("Trips_Pooled", DoubleType(), True),
    StructField('Pickup_Centroid_Latitude', StringType(), True),
    StructField('Pickup_Centroid_Longitude', StringType(), True),
    StructField('Pickup_Centroid_Location', StringType(), True),
    StructField('Dropoff_Centroid_Latitude', StringType(), True),
    StructField('Dropoff_Centroid_Longitude', StringType(), True),
    StructField('Dropoff_Centroid_Location', StringType(), True)])

#read in the data to a dataframe
ed_df = spark.read.csv('/../../project/ds5559/Alice_Ed_Michael_Sam_project/BigTrips.csv', header = True, schema=customSchema)
ed_df.show(5)

+--------------------+--------------------+--------------------+------------+----------+-------------------+--------------------+---------------------+----------------------+----+---+------------------+----------+----------------------+------------+------------------------+-------------------------+------------------------+-------------------------+--------------------------+-------------------------+
|             Trip_ID|Trip_Start_Timestamp|  Trip_End_Timestamp|Trip_Seconds|Trip_Miles|Pickup_Census_Tract|Dropoff_Census_Tract|Pickup_Community_Area|Dropoff_Community_Area|Fare|Tip|Additional_Charges|Trip_Total|Shared_Trip_Authorized|Trips_Pooled|Pickup_Centroid_Latitude|Pickup_Centroid_Longitude|Pickup_Centroid_Location|Dropoff_Centroid_Latitude|Dropoff_Centroid_Longitude|Dropoff_Centroid_Location|
+--------------------+--------------------+--------------------+------------+----------+-------------------+--------------------+---------------------+----------------------+----+---+-------

In [39]:
ed_df.printSchema()

root
 |-- Trip_ID: string (nullable = true)
 |-- Trip_Start_Timestamp: string (nullable = true)
 |-- Trip_End_Timestamp: string (nullable = true)
 |-- Trip_Seconds: double (nullable = true)
 |-- Trip_Miles: double (nullable = true)
 |-- Pickup_Census_Tract: string (nullable = true)
 |-- Dropoff_Census_Tract: string (nullable = true)
 |-- Pickup_Community_Area: double (nullable = true)
 |-- Dropoff_Community_Area: double (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Tip: double (nullable = true)
 |-- Additional_Charges: double (nullable = true)
 |-- Trip_Total: string (nullable = true)
 |-- Shared_Trip_Authorized: boolean (nullable = true)
 |-- Trips_Pooled: double (nullable = true)
 |-- Pickup_Centroid_Latitude: string (nullable = true)
 |-- Pickup_Centroid_Longitude: string (nullable = true)
 |-- Pickup_Centroid_Location: string (nullable = true)
 |-- Dropoff_Centroid_Latitude: string (nullable = true)
 |-- Dropoff_Centroid_Longitude: string (nullable = true)
 |-- Dropoff

In [40]:
#fill our NA community areas

ed_df = ed_df.na.fill(value=78,subset=['Pickup_Community_Area', 'Dropoff_Community_Area'])

In [41]:
# make a binary tip/no tip indicator
# https://spark.apache.org/docs/2.2.0/ml-features.html#binarizer

#binarized tip seems to be causing problems.  Change its name to label as that is that the packages are expecting

binarizer = Binarizer(threshold=0, inputCol="Tip", outputCol="label")
ed_df = binarizer.transform(ed_df)

In [42]:
ed_df.printSchema()

root
 |-- Trip_ID: string (nullable = true)
 |-- Trip_Start_Timestamp: string (nullable = true)
 |-- Trip_End_Timestamp: string (nullable = true)
 |-- Trip_Seconds: double (nullable = true)
 |-- Trip_Miles: double (nullable = true)
 |-- Pickup_Census_Tract: string (nullable = true)
 |-- Dropoff_Census_Tract: string (nullable = true)
 |-- Pickup_Community_Area: double (nullable = false)
 |-- Dropoff_Community_Area: double (nullable = false)
 |-- Fare: double (nullable = true)
 |-- Tip: double (nullable = true)
 |-- Additional_Charges: double (nullable = true)
 |-- Trip_Total: string (nullable = true)
 |-- Shared_Trip_Authorized: boolean (nullable = true)
 |-- Trips_Pooled: double (nullable = true)
 |-- Pickup_Centroid_Latitude: string (nullable = true)
 |-- Pickup_Centroid_Longitude: string (nullable = true)
 |-- Pickup_Centroid_Location: string (nullable = true)
 |-- Dropoff_Centroid_Latitude: string (nullable = true)
 |-- Dropoff_Centroid_Longitude: string (nullable = true)
 |-- Dropo

In [43]:
ed_df = ed_df.withColumn("Trip_Start_TS", F.to_timestamp(F.col("Trip_Start_Timestamp"), "MM/dd/yyyy hh:mm:ss a"))

In [44]:
ed_df = ed_df.withColumn("Trip_End_TS", F.to_timestamp(F.col("Trip_End_Timestamp"), "MM/dd/yyyy hh:mm:ss a"))

In [45]:
ed_df.printSchema()

root
 |-- Trip_ID: string (nullable = true)
 |-- Trip_Start_Timestamp: string (nullable = true)
 |-- Trip_End_Timestamp: string (nullable = true)
 |-- Trip_Seconds: double (nullable = true)
 |-- Trip_Miles: double (nullable = true)
 |-- Pickup_Census_Tract: string (nullable = true)
 |-- Dropoff_Census_Tract: string (nullable = true)
 |-- Pickup_Community_Area: double (nullable = false)
 |-- Dropoff_Community_Area: double (nullable = false)
 |-- Fare: double (nullable = true)
 |-- Tip: double (nullable = true)
 |-- Additional_Charges: double (nullable = true)
 |-- Trip_Total: string (nullable = true)
 |-- Shared_Trip_Authorized: boolean (nullable = true)
 |-- Trips_Pooled: double (nullable = true)
 |-- Pickup_Centroid_Latitude: string (nullable = true)
 |-- Pickup_Centroid_Longitude: string (nullable = true)
 |-- Pickup_Centroid_Location: string (nullable = true)
 |-- Dropoff_Centroid_Latitude: string (nullable = true)
 |-- Dropoff_Centroid_Longitude: string (nullable = true)
 |-- Dropo

In [46]:
ed_df = ed_df.withColumn('Trip_Year',F.year(F.to_timestamp('Trip_Start_TS'))) \
         .withColumn('Trip_Month',F.month(F.to_timestamp('Trip_Start_TS'))) \
         .withColumn('Trip_WeekNumber',F.weekofyear(F.to_timestamp('Trip_Start_TS'))) \
         .withColumn('Trip_DayofWeek', F.dayofweek(F.col('Trip_Start_TS'))) \
         .withColumn('Trip_Start_Hour', F.hour(F.col('Trip_Start_TS'))) \
         .withColumn('Trip_Start_Minute', F.minute(F.col('Trip_Start_TS'))) \
         .withColumn('Date', F.to_date(F.col('Trip_Start_TS')))
         
ed_df.show(5, False)

+----------------------------------------+----------------------+----------------------+------------+----------+-------------------+--------------------+---------------------+----------------------+----+---+------------------+----------+----------------------+------------+------------------------+-------------------------+------------------------------------+-------------------------+--------------------------+------------------------------------+-----+-------------------+-------------------+---------+----------+---------------+--------------+---------------+-----------------+----------+
|Trip_ID                                 |Trip_Start_Timestamp  |Trip_End_Timestamp    |Trip_Seconds|Trip_Miles|Pickup_Census_Tract|Dropoff_Census_Tract|Pickup_Community_Area|Dropoff_Community_Area|Fare|Tip|Additional_Charges|Trip_Total|Shared_Trip_Authorized|Trips_Pooled|Pickup_Centroid_Latitude|Pickup_Centroid_Longitude|Pickup_Centroid_Location            |Dropoff_Centroid_Latitude|Dropoff_Centroid_L

In [47]:
ed_df = ed_df.withColumn('Trip_End_Hour', F.hour(F.col('Trip_End_TS'))) \
         .withColumn('Trip_End_Minute', F.minute(F.col('Trip_End_TS'))) 
         
ed_df.show(5, False)

+----------------------------------------+----------------------+----------------------+------------+----------+-------------------+--------------------+---------------------+----------------------+----+---+------------------+----------+----------------------+------------+------------------------+-------------------------+------------------------------------+-------------------------+--------------------------+------------------------------------+-----+-------------------+-------------------+---------+----------+---------------+--------------+---------------+-----------------+----------+-------------+---------------+
|Trip_ID                                 |Trip_Start_Timestamp  |Trip_End_Timestamp    |Trip_Seconds|Trip_Miles|Pickup_Census_Tract|Dropoff_Census_Tract|Pickup_Community_Area|Dropoff_Community_Area|Fare|Tip|Additional_Charges|Trip_Total|Shared_Trip_Authorized|Trips_Pooled|Pickup_Centroid_Latitude|Pickup_Centroid_Longitude|Pickup_Centroid_Location            |Dropoff_Centro

In [48]:
ed_df.printSchema()

root
 |-- Trip_ID: string (nullable = true)
 |-- Trip_Start_Timestamp: string (nullable = true)
 |-- Trip_End_Timestamp: string (nullable = true)
 |-- Trip_Seconds: double (nullable = true)
 |-- Trip_Miles: double (nullable = true)
 |-- Pickup_Census_Tract: string (nullable = true)
 |-- Dropoff_Census_Tract: string (nullable = true)
 |-- Pickup_Community_Area: double (nullable = false)
 |-- Dropoff_Community_Area: double (nullable = false)
 |-- Fare: double (nullable = true)
 |-- Tip: double (nullable = true)
 |-- Additional_Charges: double (nullable = true)
 |-- Trip_Total: string (nullable = true)
 |-- Shared_Trip_Authorized: boolean (nullable = true)
 |-- Trips_Pooled: double (nullable = true)
 |-- Pickup_Centroid_Latitude: string (nullable = true)
 |-- Pickup_Centroid_Longitude: string (nullable = true)
 |-- Pickup_Centroid_Location: string (nullable = true)
 |-- Dropoff_Centroid_Latitude: string (nullable = true)
 |-- Dropoff_Centroid_Longitude: string (nullable = true)
 |-- Dropo

Pull in Uber and Lyft stock price information. One record per day. Rename fields with their company as prefixes so names will be unique in final dataframe.

In [15]:
#pulled from Yahoo Finance
UBER_stock = sqlContext.read.csv("UBER_stock.csv",header=True,inferSchema=True)
LYFT_stock = sqlContext.read.csv("LYFT_stock.csv",header=True,inferSchema=True)

In [16]:
UBER_stock = UBER_stock.toDF(*["UBER_" + x if x != "Date" else x for x in UBER_stock.columns])
LYFT_stock = LYFT_stock.toDF(*["LYFT_" + x if x != "Date" else x for x in LYFT_stock.columns])

In [49]:
UBER_stock.show(5)

+----------+---------+---------+---------+----------+--------------+-----------+
|      Date|UBER_Open|UBER_High| UBER_Low|UBER_Close|UBER_Adj Close|UBER_Volume|
+----------+---------+---------+---------+----------+--------------+-----------+
|2019-05-10|     42.0|     45.0|41.060001|     41.57|         41.57|  186322500|
|2019-05-13|38.790001|39.240002|36.080002| 37.099998|     37.099998|   79442400|
|2019-05-14|38.310001|39.959999|36.849998| 39.959999|     39.959999|   46661100|
|2019-05-15|39.369999|41.880001|38.950001| 41.290001|     41.290001|   36086100|
|2019-05-16|    41.48|44.060001|    41.25|      43.0|          43.0|   38115500|
+----------+---------+---------+---------+----------+--------------+-----------+
only showing top 5 rows



In [50]:
LYFT_stock.show(5)

+----------+---------+---------+---------+----------+--------------+-----------+
|      Date|LYFT_Open|LYFT_High| LYFT_Low|LYFT_Close|LYFT_Adj Close|LYFT_Volume|
+----------+---------+---------+---------+----------+--------------+-----------+
|2019-03-29|87.330002|88.599998|78.019997| 78.290001|     78.290001|   71485200|
|2019-04-01|74.900002|     75.0|67.779999| 69.010002|     69.010002|   41799300|
|2019-04-02|66.900002|70.199997|66.099998| 68.970001|     68.970001|   22483300|
|2019-04-03|70.059998|     72.0|69.120003|      70.0|          70.0|   15648300|
|2019-04-04|70.480003|72.889999|70.220001|      72.0|          72.0|    9229300|
+----------+---------+---------+---------+----------+--------------+-----------+
only showing top 5 rows



Join the stock data to our full dataset using the date as our joiner.

In [22]:
final_DF = final_DF.join(UBER_stock,on="Date",how="left").join(LYFT_stock,on="Date",how="left")

In [59]:
ed_final_DF = ed_df.join(UBER_stock,on="Date",how="left").join(LYFT_stock,on="Date",how="left")

In [None]:
final_DF.show(5)

In [60]:
ed_df.show(5)

+--------------------+--------------------+--------------------+------------+----------+-------------------+--------------------+---------------------+----------------------+----+---+------------------+----------+----------------------+------------+------------------------+-------------------------+------------------------+-------------------------+--------------------------+-------------------------+-----+-------------------+-------------------+---------+----------+---------------+--------------+---------------+-----------------+----------+-------------+---------------+
|             Trip_ID|Trip_Start_Timestamp|  Trip_End_Timestamp|Trip_Seconds|Trip_Miles|Pickup_Census_Tract|Dropoff_Census_Tract|Pickup_Community_Area|Dropoff_Community_Area|Fare|Tip|Additional_Charges|Trip_Total|Shared_Trip_Authorized|Trips_Pooled|Pickup_Centroid_Latitude|Pickup_Centroid_Longitude|Pickup_Centroid_Location|Dropoff_Centroid_Latitude|Dropoff_Centroid_Longitude|Dropoff_Centroid_Location|label|      Trip_S

Code for heatmap visuals, template for any two variables with a third aggregation e.g. year x month with total fare, trip hour x pickup area with count of trips, etc.

In [22]:
#####multiple aggs at once - aggregate one time and pivot multiple times
#heatmap_data = final_DF.groupBy('Trip_Year','Trip_Month').agg(F.sum('Fare').alias("Total_Fare"), F.count("trip_id").alias("Trip Count"))

######aggregation for year x month - only useful for millions of records
#heatmap_data = final_DF.groupby('Trip_Year','Trip_Month').sum('Fare').sort('Trip_Year','Trip_Month') \
#                        .groupby('Trip_Year').pivot('Trip_Month').sum("sum(Fare)").sort('Trip_Year')

In [23]:
heatmap_data = final_DF.groupby('Trip_Start_Hour','Pickup_Community_Area').sum('Tip').sort('Trip_Start_Hour','Pickup_Community_Area') \
                        .groupby('Trip_Start_Hour').pivot('Pickup_Community_Area').sum("sum(Tip)").sort('Trip_Start_Hour')

In [61]:
ed_heatmap_data = ed_df.groupby('Trip_Start_Hour','Pickup_Community_Area').sum('Tip').sort('Trip_Start_Hour','Pickup_Community_Area') \
                        .groupby('Trip_Start_Hour').pivot('Pickup_Community_Area').sum("sum(Tip)").sort('Trip_Start_Hour')

In [55]:
ed_heatmap_data.show(5)

+---------------+------+------+-------+------+-------+-------+-------+--------+------+------+------+------+------+------+------+------+------+-----+------+------+-------+-------+------+-------+------+-----+------+-------+------+------+------+-------+-------+------+------+-----+-----+------+------+-----+------+------+------+------+-----+------+----+-----+------+-----+-----+-----+-----+-----+-----+-------+-----+------+------+------+------+-----+-----+-----+-----+------+-----+------+------+------+------+------+-----+------+------+-------+-------+--------+
|Trip_Start_Hour|   1.0|   2.0|    3.0|   4.0|    5.0|    6.0|    7.0|     8.0|   9.0|  10.0|  11.0|  12.0|  13.0|  14.0|  15.0|  16.0|  17.0| 18.0|  19.0|  20.0|   21.0|   22.0|  23.0|   24.0|  25.0| 26.0|  27.0|   28.0|  29.0|  30.0|  31.0|   32.0|   33.0|  34.0|  35.0| 36.0| 37.0|  38.0|  39.0| 40.0|  41.0|  42.0|  43.0|  44.0| 45.0|  46.0|47.0| 48.0|  49.0| 50.0| 51.0| 52.0| 53.0| 54.0| 55.0|   56.0| 57.0|  58.0|  59.0|  60.0|  61.0|

Code for PCA and KMeans clustering. Fill in all null values.

In [24]:
PCA_DF = final_DF

In [56]:
ed_PCA_DF = ed_df

In [25]:
PCA_DF = PCA_DF.fillna("100",subset=["Pickup_Community_Area","Dropoff_Community_Area"])

In [None]:
# ed_df already filled na as 78

In [26]:
PCA_DF = PCA_DF.na.fill(0)

In [62]:
ed_PCA_DF = ed_PCA_DF.na.fill(0)

In [27]:
PCA_DF = PCA_DF.na.fill(0,subset=["Trip_Seconds", "Trip_Miles", "Fare", "Shared_Trip_Authorized", "Trips_Pooled", \
                   "Trip_WeekNumber","Trip_DayofWeek","Trip_Start_Hour","Trip_End_Hour","UBER_Open","UBER_High","UBER_Low","UBER_Close","UBER_Adj Close","UBER_Volume", \
                   "LYFT_Open","LYFT_High","LYFT_Low","LYFT_Close","LYFT_Adj Close","LYFT_Volume"])

In [63]:
ed_PCA_DF = ed_PCA_DF.na.fill(0,subset=["Trip_Seconds", "Trip_Miles", "Fare", "Shared_Trip_Authorized", "Trips_Pooled", \
                   "Trip_WeekNumber","Trip_DayofWeek","Trip_Start_Hour","Trip_End_Hour","UBER_Open","UBER_High","UBER_Low","UBER_Close","UBER_Adj Close","UBER_Volume", \
                   "LYFT_Open","LYFT_High","LYFT_Low","LYFT_Close","LYFT_Adj Close","LYFT_Volume"])

AnalysisException: Cannot resolve column name "UBER_Open" among (Trip_ID, Trip_Start_Timestamp, Trip_End_Timestamp, Trip_Seconds, Trip_Miles, Pickup_Census_Tract, Dropoff_Census_Tract, Pickup_Community_Area, Dropoff_Community_Area, Fare, Tip, Additional_Charges, Trip_Total, Shared_Trip_Authorized, Trips_Pooled, Pickup_Centroid_Latitude, Pickup_Centroid_Longitude, Pickup_Centroid_Location, Dropoff_Centroid_Latitude, Dropoff_Centroid_Longitude, Dropoff_Centroid_Location, label, Trip_Start_TS, Trip_End_TS, Trip_Year, Trip_Month, Trip_WeekNumber, Trip_DayofWeek, Trip_Start_Hour, Trip_Start_Minute, Date, Trip_End_Hour, Trip_End_Minute);

Encode our categorical variables.

In [28]:
pickup_indexer = StringIndexer(inputCol="Pickup_Community_Area", outputCol="Pickup_Community_Area_idx")
dropoff_indexer = StringIndexer(inputCol="Dropoff_Community_Area", outputCol="Dropoff_Community_Area_idx")

PCA_DF = pickup_indexer.fit(PCA_DF).transform(PCA_DF)
PCA_DF = dropoff_indexer.fit(PCA_DF).transform(PCA_DF)

In [29]:
PCA_DF.select([F.count(F.when(F.col(c).isNull(),c)) for c in ["UBER_Open","UBER_High","UBER_Low","UBER_Close","UBER_Adj Close","UBER_Volume", \
                   "LYFT_Open","LYFT_High","LYFT_Low","LYFT_Close","LYFT_Adj Close","LYFT_Volume"]]).show()

+-------------------------------------------------------+-------------------------------------------------------+-----------------------------------------------------+---------------------------------------------------------+-----------------------------------------------------------------+-----------------------------------------------------------+-------------------------------------------------------+-------------------------------------------------------+-----------------------------------------------------+---------------------------------------------------------+-----------------------------------------------------------------+-----------------------------------------------------------+
|count(CASE WHEN (UBER_Open IS NULL) THEN UBER_Open END)|count(CASE WHEN (UBER_High IS NULL) THEN UBER_High END)|count(CASE WHEN (UBER_Low IS NULL) THEN UBER_Low END)|count(CASE WHEN (UBER_Close IS NULL) THEN UBER_Close END)|count(CASE WHEN (UBER_Adj Close IS NULL) THEN UBER_Adj Close END)|count(C

In [30]:
PCA_DF.printSchema()

root
 |-- Date: date (nullable = true)
 |-- trip_id: string (nullable = true)
 |-- Trip_Start_Timestamp_str: string (nullable = true)
 |-- Trip_End_Timestamp_str: string (nullable = true)
 |-- Trip_Seconds_str: string (nullable = true)
 |-- Trip_Miles_str: string (nullable = true)
 |-- Pickup_Census_Tract: string (nullable = true)
 |-- Dropoff_Census_Tract: string (nullable = true)
 |-- Pickup_Community_Area_str: string (nullable = true)
 |-- Dropoff_Community_Area_str: string (nullable = true)
 |-- Fare_str: string (nullable = true)
 |-- Tip_str: string (nullable = true)
 |-- Additional_Charges_str: string (nullable = true)
 |-- Trip_Total_str: string (nullable = true)
 |-- Shared_Trip_Authorized_str: string (nullable = true)
 |-- Trips_Pooled_str: string (nullable = true)
 |-- Pickup_Centroid_Latitude: string (nullable = true)
 |-- Pickup_Centroid_Longitude: string (nullable = true)
 |-- Pickup_Centroid_Location: string (nullable = true)
 |-- Dropoff_Centroid_Latitude: string (nullab

In [31]:
encoder = OneHotEncoder(inputCols=["Pickup_Community_Area_idx","Dropoff_Community_Area_idx"],outputCols=["Pickup_Community_Area_ctgy","Dropoff_Community_Area_ctgy"])
PCA_DF = encoder.fit(PCA_DF).transform(PCA_DF)

In [32]:
PCA_DF.columns

['Date',
 'trip_id',
 'Trip_Start_Timestamp_str',
 'Trip_End_Timestamp_str',
 'Trip_Seconds_str',
 'Trip_Miles_str',
 'Pickup_Census_Tract',
 'Dropoff_Census_Tract',
 'Pickup_Community_Area_str',
 'Dropoff_Community_Area_str',
 'Fare_str',
 'Tip_str',
 'Additional_Charges_str',
 'Trip_Total_str',
 'Shared_Trip_Authorized_str',
 'Trips_Pooled_str',
 'Pickup_Centroid_Latitude',
 'Pickup_Centroid_Longitude',
 'Pickup_Centroid_Location',
 'Dropoff_Centroid_Latitude',
 'Dropoff_Centroid_Longitude',
 'Dropoff_Centroid_Location',
 'Trip_Start_Timestamp',
 'Trip_End_Timestamp',
 'Trip_Seconds',
 'Trip_Miles',
 'Pickup_Community_Area',
 'Dropoff_Community_Area',
 'Fare',
 'Tip',
 'Additional_Charges',
 'Trip_Total',
 'Shared_Trip_Authorized',
 'Trips_Pooled',
 'Trip_Year',
 'Trip_Month',
 'Trip_WeekNumber',
 'Trip_DayofWeek',
 'Trip_Start_Hour',
 'Trip_End_Hour',
 'UBER_Open',
 'UBER_High',
 'UBER_Low',
 'UBER_Close',
 'UBER_Adj Close',
 'UBER_Volume',
 'LYFT_Open',
 'LYFT_High',
 'LYFT_Low',
 

Make a list of all final variables appliciable to the algorithms.

In [33]:
regressors_list = ["Trip_Seconds", "Trip_Miles", "Pickup_Community_Area_ctgy", "Dropoff_Community_Area_ctgy", "Fare", "Shared_Trip_Authorized", "Trips_Pooled", \
                   "Trip_WeekNumber","Trip_DayofWeek","Trip_Start_Hour","Trip_End_Hour","UBER_Open","UBER_High","UBER_Low","UBER_Close","UBER_Adj Close","UBER_Volume", \
                   "LYFT_Open","LYFT_High","LYFT_Low","LYFT_Close","LYFT_Adj Close","LYFT_Volume"]

In [34]:
assembler = VectorAssembler(inputCols=regressors_list,outputCol="features")
PCA_DF = assembler.transform(PCA_DF)

Create PCA and KMeans models. Pull out important information for both.

In [35]:
pca = PCA(k=2, inputCol="features",outputCol="pcaFeature")
pca_model = pca.fit(PCA_DF)

In [36]:
PCA = pca_model.transform(PCA_DF).select("pcaFeature")

The original data points will need to be rejoined to the PCs to see the 

In [44]:
import matplotlib.pyplot as plt
import numpy as np

In [59]:
np.array(PCA.rdd.map(lambda x: x[0]).collect())

In [99]:
pca_model.explainedVariance

DenseVector([0.9999, 0.0001])

In [105]:
pca_model.pc

DenseMatrix(157, 2, [-0.9999, -0.0105, 0.0001, 0.0, 0.0, 0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 0)

In [106]:
kmeans =KMeans().setK(2)
kmeans_model = kmeans.fit(PCA_DF)
kmeans_summary = kmeans_model.summary

In [107]:
kmeans_summary.clusterSizes

[238, 761]

In [115]:
kmeans_summary.cluster.show(20)

+----------+
|prediction|
+----------+
|         1|
|         1|
|         0|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         0|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         0|
|         1|
+----------+
only showing top 20 rows

