# Sparkify Dev Build Workspace

This workspace contains a tiny subset (128MB) of the full dataset available (12GB). It will be used for EDA to garner insights on what features will be useful to look into.

In [15]:
import pandas as pd
import seaborn as sns
import json
import matplotlib.pyplot as plt
import datetime
import time
%matplotlib inline

from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import IntegerType, TimestampType
from pyspark.sql.functions import udf
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql.functions import desc, asc, sum as Fsum
from pyspark.sql.functions import month, dayofmonth, dayofweek, hour

from pyspark.ml.feature import VectorAssembler, Normalizer, StandardScaler
from pyspark.ml.feature import StringIndexer, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [2]:
# create a Spark session
spark = SparkSession \
    .builder \
    .appName("Our first Python Spark SQL example") \
    .getOrCreate()

In [3]:
spark.sparkContext.getConf().getAll()

[('spark.driver.host', '192.168.0.12'),
 ('spark.app.name', 'Our first Python Spark SQL example'),
 ('spark.driver.port', '58439'),
 ('spark.app.id', 'local-1593636902710'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true')]

In [4]:
spark

# Load and Clean

In [5]:
print('Loading the data ...')
path = "mini_sparkify_event_data.json"
event_log = spark.read.json(path)
print('Data loaded!\n')

print('Cleaning the data ...')

# removes the empty user ID
event_log = event_log.filter(event_log.userId != "")

# Flags if a user cancelled at any point
flag_cancel_event = udf(lambda x:
                        1 if x == 'Cancellation Confirmation' else 0
                        , IntegerType())
event_log = event_log.withColumn('cancelled'
                                 , flag_cancel_event(event_log.page))

# all rows after cancel event set churned to 1 else 0
windowval = Window.partitionBy('userId')\
                    .orderBy(desc('ts'))\
                    .rangeBetween(Window.unboundedPreceding, 0)
event_log = event_log.withColumn('churned', Fsum('cancelled').over(windowval))
print('Data cleaned!\n')


print('Extracting useable features from datetime timestamp ...')
get_timestamp = udf(lambda x:
                    datetime.datetime.fromtimestamp(int(int(x)/1000))
                    , TimestampType())
event_log = event_log.withColumn('ts', get_timestamp(event_log.ts)) \
                    .withColumn('hour', hour('ts')) \
                    .withColumn('day', dayofmonth('ts')) \
                    .withColumn('month', month('ts')) \
                    .withColumn('weekday', dayofweek('ts'))
print('Features extracted and added to table!\n')

Loading the data ...
Data loaded!

Cleaning the data ...
Data cleaned!

Extracting useable features from datetime timestamp ...
Features extracted and added to table!



# Feature Engineering

In [6]:
avg_daily_use = event_log.groupby(['userId','month','day']).count() \
                        .sort('userId','month','day') \
                        .groupby('userId').avg('count') \
                        .withColumnRenamed('avg(count)','avg_daily_songplays')
avg_daily_use.createOrReplaceTempView('avg_daily_use_table')

In [7]:
event_log.createOrReplaceTempView('event_log_table')

In [8]:
build_user_table = spark.sql('''
    SELECT
        distinct e.userId
        , sum(CASE WHEN page="NextSong" THEN 1 ELSE 0 END) as num_songplays
        , sum(CASE WHEN page="Thumbs Up" THEN 1 ELSE 0 END) as num_thumbs_up
        , sum(CASE WHEN page="Thumbs Down" THEN 1 ELSE 0 END) as num_thumbs_down
        , sum(CASE WHEN page="Add to Playlist" THEN 1 ELSE 0 END) as num_plylst_add
        , sum(CASE WHEN page="Add Friend" THEN 1 ELSE 0 END) as num_frnd_add
        , sum(CASE WHEN page="Save Settings" THEN 1 ELSE 0 END) as num_sav_set
        , sum(CASE WHEN page="Roll Advert" THEN 1 ELSE 0 END) as num_ad_rolls
        , count(distinct sessionId) as num_sessions
        , max(CASE WHEN level="paid" THEN 1 else 0 END) as had_paid
        , max(adu.avg_daily_songplays) as avg_daily_songplays
        , max(churned) as churned
    FROM
        event_log_table e
    LEFT JOIN avg_daily_use_table adu
        ON e.userId = adu.userId
    GROUP BY
        e.userId
''')
build_user_table.show(n=5)

+------+-------------+-------------+---------------+--------------+------------+-----------+------------+------------+--------+-------------------+-------+
|userId|num_songplays|num_thumbs_up|num_thumbs_down|num_plylst_add|num_frnd_add|num_sav_set|num_ad_rolls|num_sessions|had_paid|avg_daily_songplays|churned|
+------+-------------+-------------+---------------+--------------+------------+-----------+------------+------------+--------+-------------------+-------+
|100010|          275|           17|              5|             7|           4|          0|          52|           7|       0|  54.42857142857143|      0|
|200002|          387|           21|              6|             8|           4|          0|           7|           6|       1|  67.71428571428571|      0|
|   125|            8|            0|              0|             0|           0|          0|           1|           1|       0|               11.0|      1|
|   124|         4079|          171|             41|           1

# Modelling

In [9]:
assembler = VectorAssembler(inputCols=['num_songplays','num_thumbs_up',
                                       'num_thumbs_down','num_plylst_add',
                                       'num_frnd_add','num_sav_set',
                                       'num_ad_rolls','num_sessions',
                                       'avg_daily_songplays']
                           , outputCol='NumFeatures')
build_user_table = assembler.transform(build_user_table)

In [10]:
scaler = Normalizer(inputCol='NumFeatures', outputCol='ScaledNumFeatures')
build_user_table = scaler.transform(build_user_table)

In [11]:
data = build_user_table.select(col('churned').alias('label')
                               , col('ScaledNumFeatures').alias('features'))
data.head(n=3)

[Row(label=0, features=DenseVector([0.962, 0.0595, 0.0175, 0.0245, 0.014, 0.0, 0.1819, 0.0245, 0.1904])),
 Row(label=0, features=DenseVector([0.983, 0.0533, 0.0152, 0.0203, 0.0102, 0.0, 0.0178, 0.0152, 0.172])),
 Row(label=1, features=SparseVector(9, {0: 0.585, 6: 0.0731, 7: 0.0731, 8: 0.8044}))]

In [12]:
data.show(n=5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[0.96198859849549...|
|    0|[0.98299295498964...|
|    1|(9,[0,6,7,8],[0.5...|
|    0|[0.99786259670431...|
|    1|[0.99498824871416...|
+-----+--------------------+
only showing top 5 rows



In [13]:
train, test = data.randomSplit([0.8,0.2], seed=42)

In [16]:
lr = LogisticRegression(maxIter=10, regParam=0.0)
dt = DecisionTreeClassifier(seed=7)
rf = RandomForestClassifier(seed=7)
SVM = LinearSVC()

In [17]:
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction')

In [18]:
all_results = {}
for model in [lr, dt, rf, SVM]:
    model_results = {}
    # get the classifier name
    model_name = model.__class__.__name__
    
    
    # fit the dataset
    print(f'{model_name} is training...')
    start = time.time() 
    model = model.fit(train)
    end = time.time() 
    model_results['train_time'] = round(end-start,6)
    
    # predict
    print(f'{model_name} is predicting...')
    start = time.time() 
    pred_test = model.transform(test)
    end = time.time()
    model_results['pred_time'] = round(end-start,6)
    
    #metrics
    print(f'{model_name} is evaluating...')    
    model_results['f1_test'] = evaluator.evaluate(pred_test.select('label'
                                                                ,'prediction')
                                                  ,{evaluator.metricName: 'f1'})
    print('Test F1-score: {}\n'.format(model_results['f1_test']))
    all_results[model_name] = model_results
    
all_results_df = pd.DataFrame(all_results)

LogisticRegression is training...
LogisticRegression is predicting...
LogisticRegression is evaluating...
Test F1-score: 0.6223938223938223

DecisionTreeClassifier is training...
DecisionTreeClassifier is predicting...
DecisionTreeClassifier is evaluating...
Test F1-score: 0.6425966636492954

RandomForestClassifier is training...
RandomForestClassifier is predicting...
RandomForestClassifier is evaluating...
Test F1-score: 0.6023166023166023

LinearSVC is training...
LinearSVC is predicting...
LinearSVC is evaluating...
Test F1-score: 0.6157094594594593



In [19]:
all_results_df

Unnamed: 0,LogisticRegression,DecisionTreeClassifier,RandomForestClassifier,LinearSVC
train_time,150.231972,288.026072,237.628994,422.983274
pred_time,0.056735,0.062789,0.044093,0.054641
f1_test,0.622394,0.642597,0.602317,0.615709


### Thoughts

- The LinearSVC is the most expensive to run and ranks 3rd out of 4 models for F1 score and so shall be discarded.
- Logistic Regression and the Decision Tree shall be taken forward for tuning given they performed the best and are not too expensive.

# Tuning the model

The model will be tuned here with **CrossValidator** rather than **TrainValidationSplit** as the data set is smaller and it is a well-established method for choosing parameters and is more statistically sound.

## Tune the Linear Regression model

In [20]:
paramGrid = ParamGridBuilder(). \
            addGrid(lr.elasticNetParam,[0.1,0.5,1]). \
            addGrid(lr.regParam,[0.01,0.05,0.1]). \
            build()

crossval = CrossValidator(estimator=lr,
                 estimatorParamMaps=paramGrid,
                 evaluator=MulticlassClassificationEvaluator(metricName='f1'),
                 numFolds=3)

In [21]:
start = time.time()
cv_lr = crossval.fit(train)
end = time.time()
print('Model tuning complete - took {}s'.format(end-start))

Model tuning complete - took 1271.9184019565582s


In [22]:
cv_lr.avgMetrics

[0.7136412830477706,
 0.7087677756686499,
 0.7113642778900438,
 0.7136412830477706,
 0.7049906457351616,
 0.6848707987840734,
 0.7136412830477706,
 0.6807680998501884,
 0.6795832082043632]

In [23]:
cv_lr.bestModel

LogisticRegressionModel: uid=LogisticRegression_a7affad617e6, numClasses=2, numFeatures=9

In [24]:
cv_lr_results = cv_lr.transform(test)

In [25]:
print('Accuracy: {}'.format(evaluator.evaluate(cv_lr_results.select('label','prediction'), {evaluator.metricName: "accuracy"})))
print('F-1 Score:{}'.format(evaluator.evaluate(cv_lr_results.select('label','prediction'), {evaluator.metricName: "f1"})))

Accuracy: 0.6756756756756757
F-1 Score:0.6223938223938223


## Tune the Decision Tree model

In [26]:
paramGrid = ParamGridBuilder(). \
            addGrid(dt.impurity, ['entropy','gini']). \
            addGrid(dt.maxDepth, [x for x in range(5,25,5)]). \
            build()

crossval = CrossValidator(estimator=lr,
                 estimatorParamMaps=paramGrid,
                 evaluator=MulticlassClassificationEvaluator(metricName='f1'),
                 numFolds=3)

In [27]:
start = time.time()
cv_dt = crossval.fit(train)
end = time.time()
print('Model tuning complete - took {}s'.format(end-start))

Model tuning complete - took 1182.3541371822357s


In [28]:
cv_dt.avgMetrics

[0.712487944706165,
 0.712487944706165,
 0.712487944706165,
 0.712487944706165,
 0.712487944706165,
 0.712487944706165,
 0.712487944706165,
 0.712487944706165]

In [29]:
cv_dt_results = cv_dt.transform(test)

In [30]:
print('Accuracy: {}'.format(evaluator\
                            .evaluate(cv_dt_results.select('label'
                                                           ,'prediction')
                                      , {evaluator.metricName: "accuracy"})))
print('F-1 Score:{}'.format(evaluator\
                            .evaluate(cv_dt_results.select('label'
                                                           ,'prediction')
                                      , {evaluator.metricName: "f1"})))

Accuracy: 0.6756756756756757
F-1 Score:0.6223938223938223


# Conclusions

The **Logistic Regression** model looks to perform best under the tuning with an F1 score of $0.675$.

Both of the logistic regression and decision tree classifier models will be taken forward and explored on the AWS EMR cluster to model the full 12GB dataset.