# <font color='darkblue'> Sparkify Stacking Models</font>

<div class="alert alert-block alert-info">

This notebook is dedicated to the task of creating a stacking model based on individual classifiers available in PySpark. 
    <li> split the data in train, validation and test sets</li>
    <li> train each of the classifiers on the train set</li>
    <li> have each classifier make predictions on the validation set</li>
    <li> create a meta-features dataset from the individual predictions</li>
    <li> train a linear regression classifier on the meta-features dataset</li>
    <li> evaluate the full model on the test set</li>
</div>

## <font color='blue'>Set Environment</font>

In [1]:
# import PySpark libraries and packages

import pyspark
from pyspark import SparkContext
from pyspark import SparkConf

from pyspark.sql import SparkSession
from pyspark.sql.window import Window as W

from pyspark.sql.types import (
    StringType,
    IntegerType, 
    DateType, 
    TimestampType,
    )

from pyspark.sql.functions import (
    min as Fmin, max as Fmax, 
    sum as Fsum, round as Fround, 
    
    col, lit, 
    first, last, 
    desc, asc,
    avg, count, countDistinct, 
    when, isnull, isnan,
    from_unixtime, 
    datediff,
    )

# libraries and packages for modeling

from pyspark.ml import Pipeline

from pyspark.ml.feature import (
    StringIndexer, 
    OneHotEncoder, 
    VectorAssembler, 
    StandardScaler
)
from pyspark.ml.feature import (
    OneHotEncoder, 
    OneHotEncoderModel
)

from pyspark.ml.classification import (
    LogisticRegression,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GBTClassifier,
    MultilayerPerceptronClassifier,
    LinearSVC,
    NaiveBayes
)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [2]:
# build a Spark session using the SparkSession APIs

spark = (SparkSession
        .builder
        .appName("Sparkify")
        .getOrCreate())

spark.sparkContext.setLogLevel("ERROR")

22/01/30 22:11:28 WARN Utils: Your hostname, greg resolves to a loopback address: 127.0.1.1; using 192.168.0.21 instead (on interface wlp82s0)
22/01/30 22:11:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/01/30 22:11:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# import python libraries

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# import library for enhanced plotting

import seaborn as sns
sns.set_style("darkgrid")
colors = sns.color_palette('PuBuGn_r')

## <font color='blue'>Load Train and Test Datasets</font>

In [5]:
def load_data(file_path):
    """
    Loads the raw dataset in Spark.
    
    INPUT:
            (str) - path for datafile
    OUTPUT:
            (PySpark dataframe) - dataframe of raw data
    
    """
    
    print("Loading the dataset ...")
    df = spark.read.json(file_path)
    print("Dataset is loaded...")
    
    return df


def save_data(df, data_path):
    """
    Saves the PySpark dataframe to a file.
    
    INPUT:
            df (PySpark dataframe) - data to be saved
            data_path (str) - path for datafile
    OUTPUT:
            none
    
    """
    
    df.write.json(data_path)
    
    
def clean_data(df):
    """
    Performs basic cleaning operations on the raw data:
        - removes entries with missing userId
        - rescale timestamp columns to seconds
        - drop unnecesary columns
            - personal information columns
            - song information columns
            - web and browser information
            - timestamp columns in miliseconds

    INPUT:
        (PySpark dataframe) - dataframe of raw data
    OUTPUT:
        (PySpark dataframe) - dataframe of cleaned data
    """

    # print message to indicate the start of the process
    print("Cleaning the data ...")

    # print a count of rows before cleaning
    initial_records = df.count()
    print("Dataset has {} rows initially.".format(initial_records))

    # remove all the records without userId
    df = df.where(df.userId != "")

    # rescale the timestamp to seconds (initially in miliseconds)
    df = df.withColumn("log_ts", df.ts/1000.0)
    df = df.withColumn("reg_ts", df.registration/1000.0)

    # drop several unnecessary columns
    cols_to_drop = ("firstName", "lastName", "location",
                    "artist", "song", "length",
                    "userAgent", "method", "status",
                    "ts", "registration"
                   )
    df = df.drop(*cols_to_drop)


    # print end of process message
    print("Finished cleaning the data ...")

    # print a count of rows after cleaning
    removed_rows = initial_records - df.count()

    print("Cleaned dataset has {} rows, {} rows were removed". \
        format(df.count(), initial_records - df.count()))

    return df


def preprocess_data(df):

    """
    Prepare the data for modeling via creating several features.

        - reg_date (date) - month-year of the registration

        - create windows grouped on userId and sessionId

         - firstevent_ts (timestamp) - first time an user is active
         - lastevent_ts (timestamp) - last time an user is active

         - init_days_interv (float) - days between registration and first activity
         - tenure_days_interv (float) - days between registration and last activity
         - active_days (float) - days the user has some activity on the platform
         - session_h (float) - session's duration in hours

     INPUT:
         df (PySpark dataframe) - cleaned dataframe
     OUTPUT:
         df (PySpark dataframe) - dataframe with the listed features added
    """

    # extract registration month and year from timestamp
    df = df.withColumn("reg_date", from_unixtime(col("reg_ts"), "MM-yyyy"))

    # create window: data grouped by userId, time ordered
    win_user = (W.partitionBy("userId")
            .orderBy("log_ts")
            .rangeBetween(W.unboundedPreceding,
                          W.unboundedFollowing))

    # create window: data grouped by sessionId and userId, time ordered
    win_user_session = (W.partitionBy("sessionId", "userId")
                        .orderBy("log_ts")
                        .rangeBetween(W.unboundedPreceding,
                                      W.unboundedFollowing))

    # record the first time an user is active
    df = df.withColumn("firstevent_ts", first(col("log_ts")).over(win_user))
    # record the last time an user is active
    df = df.withColumn("lastevent_ts", last(col("log_ts")).over(win_user))

    # warmup time = registration time to first event in days
    df = df.withColumn("init_days_interv",
                       (col("firstevent_ts").cast("long")-col("reg_ts").cast("long"))/(24*3600))

    # tenure time = registration time to last event in days
    df = df.withColumn("tenure_days_interv",
                       (col("lastevent_ts").cast("long")-col("reg_ts").cast("long"))/(24*3600))

    # active time =  days between the first event and the last event in days
    df = df.withColumn("active_days",
                       (col("lastevent_ts").cast("long")-col("firstevent_ts").cast("long"))/(24*3600))

    # create column that records the individual session's duration in hours
    df = df.withColumn("session_h",
                    (last(df.log_ts).over(win_user_session) \
                     - first(df.log_ts).over(win_user_session))/3600)

    # drop columns
    df = df.drop("reg_ts", "log_ts")

    return df


def build_features(df):

    """
    Features engineered to be used in modelling.

        - nr_songs (int) - total number of songs user listened to
        - nr_playlist (int) - number of songs added to the playlist

        - nr_friends (int) - number of friends added through "Add Friend"

        - nr_likes (int) - total number of "Thumbs Up" of the user
        - nr_dislikes (int) - total number of "Thumbs Down" of the user

        - nr_downgrades (int) - total number of visits to "Downgrade" page by the user
        - nr_upgrades (int) - total number of visits to "Upgrade" page by the user

        - nr_home (int) - total number of visits to "Home" page by the user
        - nr_settings (int) - total number of visits to "Settings" page by the user

        - nr_error (int) - total number of errors encountered by the user

        - nr_ads (int) - total number of ads the user got
        - nr_sessions (int) - number of sessions of the user
        - n_acts (int) - total number of actions taken by the user

        - avg_sess_h (float) - average session length in hours
        - acts_per_session (float) - average number of actions per session for the user
        - songs_per_session (float) - average numer of songs listened per session by the user
        - ads_per_session (float) - average number of ads per session, received by user

        - init_days_interv (int) - time interval in days from registration to the first action of the user
        - tenure_days_interv (int) - time interval in days from registration to the last action of the user
        - active_days (int) - number of days the user was active on the platform

        - gender (binary) - 1 for F (female), 0 for M (male)
        - level (binary) - 1 for paid, 0 for free

        - churn (binary) - 1 for "Cancellation Confirmation" page visit, 0 otherwise

    INPUT:
        df (PySpark dataframe) - preprocessed dataframe
    OUTPUT:
        df_feats (PySpark dataframe) - dataframe that contains engineered features
    """

    df_feats = df.groupBy("userId") \
        .agg(

            # count user's individual actions using all page visits

            count(when(col("page") == "NextSong", True)).alias("nr_songs"),
            count(when(col("page") == "Add to Playlist", True)).alias("nr_playlist"),

            count(when(col("page") == "Add Friend", True)).alias("nr_friends"),

            count(when(col("page") == "Thumbs Up", True)).alias("nr_likes"),                count(when(col("page") == "Thumbs Down", True)).alias("nr_dislikes"),

            count(when(col("page") == "Downgrade", True)).alias("nr_downgrades"),
            count(when(col("page") == "Upgrade", True)).alias("nr_upgrades"),

            count(when(col("page") == "Home", True)).alias("nr_home"),
            count(when(col("page") == "Settings", True)).alias("nr_settings"),

            count(when(col("page") == "Error", True)).alias("nr_error"),

            count(when(col("page") == "Roll Advert", True)).alias("nr_ads"),

            # compute the number of sessions a user is in
            countDistinct("sessionId").alias("nr_sessions"),

            # find the total number of actions a user took
            countDistinct("itemInSession").alias("n_acts"),

            # compute the average session length in hours
            avg(col("session_h")).alias("avg_sess_h"),

            # compute the average number of page actions per sesssion - i.e. items in session
            (countDistinct("itemInSession") /countDistinct("sessionId")).alias("acts_per_session"),

            # compute the average number of songs per session
            (count(when(col("page") == "NextSong", True)) /countDistinct("sessionId")).alias("songs_per_session"),

            # compute the average number of ads per session
             (count(when(col("page") == "Roll Advert", True)) /countDistinct("sessionId")).alias("ads_per_session"),

            # days between registration and first activity
            first(col("init_days_interv")).alias("init_days_interv"),
            # the tenure time on the platform: from registration to last event in days
            first(col("tenure_days_interv")).alias("tenure_days_interv"),
            # number of days user visited the platform, is active on the platform
            first(col("active_days")).alias("active_days"),

            # encode the gender 1 for F and 0 for M
            first(when(col("gender") == "F", 1).otherwise(0)).alias("gender"),

            # encode the level (paid/free) according to the last record
            last(when(col("level") == "paid", 1).otherwise(0)).alias("level"),

            # flag those users that downgraded
            #last(when(col("page") == "Downgrade", 1).otherwise(0)).alias("downgrade"),

            # create the churn column that records if the user cancelled
            last(when(col("page") == "Cancellation Confirmation", 1).otherwise(0)).alias("churn"),
            )

    # columns to drop
    drop_cols = ("userId", "gender", "avg_sess_h",
                 "nr_playlist", "nr_home")
    # drop the columns
    #df_feats = df_feats.drop("userId")
    df_feats = df_feats.drop(*drop_cols)

    # drop the null values
    df_feats=df_feats.na.drop()

    return df_feats


def split_data (df):

    """
    Split the dataset into training, validation set and test set.
    Use a stratified sampling method.

    INPUT:
        df (PySpark dataframe) - dataframe
    OUTPUT:
        train_set, validation_set, test_set (PySpark dataframes) - 
                          percentage split based on the provided values
    """

    # split dataframes between 0s and 1s
    zeros = df.filter(df["churn"]==0)
    ones = df.filter(df["churn"]==1)

    # split dataframes into training and testing
    train0, validation0, test0 = zeros.randomSplit(SPLIT_VALS, seed=1234)
    train1, validation1, test1 = ones.randomSplit(SPLIT_VALS, seed=1234)

    # stack datasets back together
    train_set = train0.union(train1)
    validation_set = validation0.union(validation1)
    test_set = test0.union(test1)

    return train_set, validation_set, test_set

def prepare_data(dataset_filepath):
    
    """
    Function that combines all the data preparation steps.
    
    INPUT:
        dataset_filepath (str) - filepath for raw data json file
    OUTPUT:
       train_set, validation_set, test_set (PySpark dataframes) - 
            subsets of the features dataset
    """
    
    print('Load data...')
    df = load_data(dataset_filepath)
    
    print('Clean data...')
    df_clean = clean_data(df)
    
    print('Preprocess data...')
    df_proc = preprocess_data(df_clean)
    
    print('Create features dataset...')
    df_feats = build_features(df_proc)
    
    print('Split the features dataset...')
    train_set, validation_set, test_set = split_data(df_feats)
    
    return train_set, validation_set, test_set

In [6]:
# split the features and the label
CAT_FEATURES = ["level"]
CONT_FEATURES = ["nr_songs", "nr_likes", "nr_dislikes", "nr_friends", "nr_downgrades",
                "nr_upgrades", "nr_error", "nr_settings", "nr_ads", "nr_sessions",
                "n_acts", "acts_per_session", "songs_per_session", "ads_per_session",
                "init_days_interv", "tenure_days_interv", "active_days"]
CHURN_LABEL = "churn"

def build_data_pipeline():
    """
    Combines all the stages of the data processing.
    """
    # stages in the pipeline
    stages = [] 
    
    # encode the labels
    label_indexer =  StringIndexer(inputCol=CHURN_LABEL, outputCol="label")
    stages += [label_indexer]
    
    # encode the binary features
    bin_assembler = VectorAssembler(inputCols=CAT_FEATURES, outputCol="bin_features")
    stages += [bin_assembler]
    
    # encode the continuous features
    cont_assembler = VectorAssembler(inputCols = CONT_FEATURES, outputCol="cont_features")
    stages += [cont_assembler]
    # normalize the continuous features
    cont_scaler = StandardScaler(inputCol="cont_features", outputCol="cont_scaler", 
                                 withStd=True , withMean=True)
    stages += [cont_scaler]
    
    # pass all to the vector assembler to create a single sparse vector
    all_assembler = VectorAssembler(inputCols=["bin_features", "cont_scaler"],  
                            outputCol="features")
    stages += [all_assembler]
    
    # add the models to the pipeline
    #stages += [models]
    
    # create a pipeline
    pipeline = Pipeline(stages=stages)
    
    return pipeline

In [7]:
# define the base models

lr = LogisticRegression(featuresCol='features', labelCol='label', 
                        predictionCol='pred_lr', probabilityCol='prob_lr',
                        rawPredictionCol='rawPred_lr')

rf = RandomForestClassifier(featuresCol='features', labelCol='label', 
                  predictionCol='pred_rf', probabilityCol='prob_rf', 
                  rawPredictionCol='rawPred_rf')

gbt = GBTClassifier(featuresCol='features', labelCol='label', 
                            predictionCol='pred_gbt')

layers=[18,8,4,2]
mlpc= MultilayerPerceptronClassifier(featuresCol='features', labelCol='label',
                                     predictionCol='pred_mlpc', probabilityCol='prob_mlpc',
                                     rawPredictionCol='rawPred_mlpc', layers=layers)
models = [lr, rf, gbt, mlpc]

In [10]:
SPLIT_VALS = [.4, .3, .3]   

# path for the test set file file
#path_dataset = "data/mini_sparkify_event_data.json"
path_dataset = "data/full_sparkify_event_data.json"

In [11]:
# clean, process and split the data
train_df, validation_df, test_df = prepare_data(path_dataset)

Load data...
Loading the dataset ...


                                                                                

Dataset is loaded...
Clean data...
Cleaning the data ...


                                                                                

Dataset has 26259199 rows initially.
Finished cleaning the data ...


                                                                                

Cleaned dataset has 26259199 rows, 0 rows were removed
Preprocess data...
Create features dataset...
Split the features dataset...


In [12]:
# save the data in the memory and disk
train_cached = train_df.persist()
validation_cached = validation_df.persist()
test_cached = test_df.persist()

In [13]:
# instantiate the data pipeline
data_pipeline = build_data_pipeline()

# prepare the datasets for modeling

data_train_pipeline_model = data_pipeline.fit(train_cached)
train_transf = data_train_pipeline_model.transform(train_cached)

data_validation_pipeline_model = data_pipeline.fit(validation_cached)
validation_transf = data_validation_pipeline_model.transform(validation_cached)

data_test_pipeline_model = data_pipeline.fit(test_cached)
test_transf = data_test_pipeline_model.transform(test_cached)

                                                                                

In [17]:
# remove the datasets from memory
train_cached.unpersist()
test_cached.unpersist()
validation_cached.unpersist()

DataFrame[nr_songs: bigint, nr_friends: bigint, nr_likes: bigint, nr_dislikes: bigint, nr_downgrades: bigint, nr_upgrades: bigint, nr_settings: bigint, nr_error: bigint, nr_ads: bigint, nr_sessions: bigint, n_acts: bigint, acts_per_session: double, songs_per_session: double, ads_per_session: double, init_days_interv: double, tenure_days_interv: double, active_days: double, level: int, churn: int]

In [18]:
# save the transformed set in the memory
train_transf_cached = train_transf.persist()
validation_transf_cached = validation_transf.persist()
test_transf_cached = test_transf.persist()

In [16]:
# build pipeline to generate predictions from base classifiers
base_pipeline = Pipeline(stages=models)

# start timer
start_time = time.time()

# fit the pipeline on the transformed set
base_pipeline_model = base_pipeline.fit(train_transf_cached)

# stop timer
end_time = time.time()

# evaluate the trainining time in minutes 
train_time = (end_time - start_time)/60

# print the training time 
print("")
print("Training time.........%6.3f min" % train_time)

                                                                                


Training time......... 2.809 min


In [19]:
# make predictions on the validation set
base_pred = base_pipeline_model.transform(validation_transf_cached)

In [20]:
# clear data from memory
train_transf_cached.unpersist()
validation_transf_cached.unpersist()

DataFrame[nr_songs: bigint, nr_friends: bigint, nr_likes: bigint, nr_dislikes: bigint, nr_downgrades: bigint, nr_upgrades: bigint, nr_settings: bigint, nr_error: bigint, nr_ads: bigint, nr_sessions: bigint, n_acts: bigint, acts_per_session: double, songs_per_session: double, ads_per_session: double, init_days_interv: double, tenure_days_interv: double, active_days: double, level: int, churn: int, label: double, bin_features: vector, cont_features: vector, cont_scaler: vector, features: vector]

In [21]:
# create the meta features dataset
meta_features_df = base_pred.select("pred_lr", "pred_rf", "pred_gbt", "pred_mlpc","label",
                                   "prob_lr", "prob_rf", "prob_mlpc")

In [22]:
# prepare the meta_features for modeling
# split the features and the label
META_FEATURES = ["pred_lr", "pred_rf","pred_gbt", "pred_mlpc"]
META_CONT_FEATURES = ["prob_lr", "prob_rf", "prob_mlpc"]
META_LABEL = "label"

def build_meta_pipeline():
    """
    Combines all the stages of the meta features processing.
    """
    # stages in the pipeline
    stages = [] 
    
    # encode the labels
    label_indexer =  StringIndexer(inputCol=META_LABEL, outputCol="meta_label")
    stages += [label_indexer]
    
    # encode the binary features
    bin_assembler = VectorAssembler(inputCols=META_FEATURES, outputCol="bin_features")
    stages += [bin_assembler]
    
    # encode the continuous features
    cont_assembler = VectorAssembler(inputCols = META_CONT_FEATURES, outputCol="cont_features")
    stages += [cont_assembler]
    # normalize the continuous features
    cont_scaler = StandardScaler(inputCol="cont_features", outputCol="cont_scaler", 
                                 withStd=True , withMean=True)
    stages += [cont_scaler]
    
    # pass all to the vector assembler to create a single sparse vector
    all_assembler = VectorAssembler(inputCols=["bin_features"],  
                                    outputCol="meta_features")
    stages += [all_assembler]
    
    # add the models to the pipeline
    #stages += [models]
    
    # create a pipeline
    pipeline = Pipeline(stages=stages)
    
    return pipeline

In [23]:
# instantiate the data pipeline
meta_pipeline = build_meta_pipeline()

# prepare the datasets for modeling

meta_pipeline_model = meta_pipeline.fit(meta_features_df)
meta_transf = meta_pipeline_model.transform(meta_features_df)

                                                                                

In [24]:
# train the meta clasifier
meta_lr = LogisticRegression(featuresCol='meta_features', 
                             labelCol='label', 
                             predictionCol='final_pred')
meta_classifier = meta_lr.fit(meta_transf)

                                                                                

In [62]:
meta_dt = LogisticRegression(featuresCol='meta_features', 
                             labelCol='label', 
                             predictionCol='final_pred',
                                maxIter=400, elasticNetParam=0.1, regParam=0)
meta_classifier_dt = meta_dt.fit(meta_transf.persist())

                                                                                

In [56]:
meta_dt = RandomForestClassifier(featuresCol='meta_features', 
                             labelCol='label', 
                             predictionCol='final_pred')
meta_classifier_dt = meta_dt.fit(meta_transf.persist())

                                                                                

In [25]:
# base classifiers make predictions on the test set
test_pred = base_pipeline_model.transform(test_transf_cached)

In [26]:
# create the meta features test dataset
meta_features_test_df = test_pred.select("pred_lr", "pred_rf", "pred_gbt", "pred_mlpc","label",
                                        "prob_lr", "prob_rf", "prob_mlpc")

In [27]:
# prepare the datasets for modeling
meta_pipeline_test = meta_pipeline.fit(meta_features_test_df)
meta_test_transf = meta_pipeline_test.transform(meta_features_test_df)

                                                                                

In [28]:
# make predictions on the test set
test_pred = meta_classifier.transform(meta_test_transf)

In [63]:
# make predictions on the test set
test_pred_dt = meta_classifier_dt.transform(meta_test_transf)

In [36]:
test_pred_dt.printSchema()

root
 |-- pred_lr: double (nullable = false)
 |-- pred_rf: double (nullable = false)
 |-- pred_gbt: double (nullable = false)
 |-- pred_mlpc: double (nullable = false)
 |-- label: double (nullable = false)
 |-- prob_lr: vector (nullable = true)
 |-- prob_rf: vector (nullable = true)
 |-- prob_mlpc: vector (nullable = true)
 |-- meta_label: double (nullable = false)
 |-- bin_features: vector (nullable = true)
 |-- cont_features: vector (nullable = true)
 |-- cont_scaler: vector (nullable = true)
 |-- meta_features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- final_pred: double (nullable = false)



In [30]:
evaluator_meta_classifier = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", 
                                          metricName= "areaUnderPR")

In [31]:
print(evaluator_meta_classifier.evaluate(test_pred))

                                                                                

0.8127750816797253


In [64]:
print(evaluator_meta_classifier.evaluate(test_pred_dt))



0.8127750816797253


                                                                                

In [47]:
evaluator1_meta_classifier = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", 
                                          metricName= "areaUnderROC")

In [43]:
print(evaluator1_meta_classifier.evaluate(test_pred))

                                                                                

0.891974796037296


In [65]:
print(evaluator1_meta_classifier.evaluate(test_pred_dt))



0.891974796037296


                                                                                

## <font color='blue'>Modeling</font>

<div class="alert alert-block alert-info">

Split the full dataset into train, test, and validation sets. Test out several of the machine learning methods you learned. Evaluate the accuracy of the various models, tuning parameters as necessary. Determine your winning model based on test accuracy and report results on the validation set. Since the churned users are a fairly small subset, I suggest using F1 score as the metric to optimize.

</div>

### <font color='blue'>Baseline Model</font>

In [None]:
# count the number of churn users in each set
fn_train = train_cached.where(train_cached.churn==1).count()
fn_test = test_cached.where(test_cached.churn==1).count()

# count the number of not churn users in each set
tn_train = train_cached.where(train_cached.churn==0).count()
tn_test = test_cached.where(test_cached.churn==0).count()

In [None]:
# accuracy of ZeroR model on train set
accuracy_train = tn_train/(fn_train+tn_train)
print("With fp = tp = 0, fn = {} and tn = {}, the accuracy of the ZeroR model on the train set is {}%."
      .format(fn_train, tn_train, round(accuracy_train,4)*100))

# accuracy of ZeroR model on test set
accuracy_test = tn_test/(fn_test+tn_test)
print("With fp = tp = 0, fn = {} and tn = {}, the accuracy of the ZeroR model on the test set is {}%."
      .format(fn_test, tn_test, round(accuracy_test,4)*100))

<div class="alert alert-block alert-info">
    
Build a ZeroR baseline model. This is a simple model that always predicts the most numerous class. The accuracy of this model on the test set is:

$${\rm accuracy\;testset} = \frac{\rm not\; churn}{\rm all\; users} = \frac{46}{58} = 0.79$$
    
Any classifier we build must have better accuracy on the test set than the accuracy of this dummy classifier.
    
</div>

### <font color='blue'>Build Models Evaluators</font>

In [36]:
labelCol="label"
predCol="predictions"
featuresCol="features"

In [37]:
# function to compute relevant metrics for binary classification
def conf_metrics(dataset):
    
    """
        Calculates the metrics associated to the confusion matrix.

        INPUT:
            dataset (pyspark.sql.DataFrame) - a dataset that contains
                                labels and predictions
        OUTPUT:
            accuracy (float) - metric
            precision (float) - metric
            recall (float) - metric
            F1 (float) - metric
    """
   

    # calculate the elements of the confusion matrix
    tn = dataset.where((dataset[labelCol]==0) & (dataset[predCol]==0)).count()
    tp = dataset.where((dataset[labelCol]==1) & (dataset[predCol]==1)).count()                   
    fn = dataset.where((dataset[labelCol]==1) & (dataset[predCol]==0)).count()                   
    fp = dataset.where((dataset[labelCol]==0) & (dataset[predCol]==1)).count()
    
    # calculate accuracy, precision, recall, and F1-score
    accuracy = (tn + tp) / (tn + tp + fn + fp)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 =  2 * (precision*recall) / (precision + recall)
    
    return accuracy, precision, recall, f1

In [22]:
# function to display the metrics of interest
def display_metrics(dataset, evaluator):
    
    """
    Prints evaluation metrics for the model. 
    
    INPUT:
         dataset (pyspark.sql.DataFrame) - a dataset that contains
                                labels and predictions
    
    """
    
    accuracy = conf_metrics(dataset)[0]
    precision = conf_metrics(dataset)[1]
    recall = conf_metrics(dataset)[2]
    f1 = conf_metrics(dataset)[3]

    # calculate auc metrics
    roc_cl = evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderROC"})
    pr_cl = evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderPR"})
    
    print("")
    print("Confusion Matrix")
    dataset.groupBy(dataset[labelCol], dataset[predCol]).count().show()
    print("")
    print("accuracy...............%6.3f" % accuracy)
    print("precision..............%6.3f" % precision)
    print("recall.................%6.3f" % recall)
    print("F1.....................%6.3f" % f1)
    print("auc_roc................%6.3f" % roc_cl)
    print("auc_pr.................%6.3f" % pr_cl)


In [23]:
# source: https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python

def plot_roc_pr_curves(predictions, model_name):
    
    """
    Calculates ROC-AUC and PR-AUC scores and plots the ROC and PR curves.
    
    INPUT:
        predictions (PySpark dataframe) - contains probability predictions, label column
        model_name (str) - classifier name
        
    OUTPUT:
        none - two plots are displayed
        
    """
    
    # transform predictions PySpark dataframe into Pandas dataframe
    pred_pandas = predictions.select(predictions.label, predictions.probability).toPandas()
    
    # calculate roc_auc score
    roc_auc = roc_auc_score(pred_pandas.label, pred_pandas.probability.str[1])
    # generate a no skill prediction (majority class)
    ns_probs = [0 for _ in range(len(pred_pandas.label))]
    # calculate roc curves
    fpr, tpr, _ = roc_curve(pred_pandas.label, pred_pandas.probability.str[1])
    ns_fpr, ns_tpr, _ = roc_curve(pred_pandas.label, ns_probs)
    
    # calculate precision, recall for each threshold
    precision, recall, _ = precision_recall_curve(pred_pandas.label, pred_pandas.probability.str[1])
    # calculate pr auc score
    pr_auc = auc(recall, precision)
    

    # create figure which contains two subplots
    plt.figure(figsize=[12,6])
    
    plt.subplot(121)
    
    # plot the roc curve for the model
    plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
    plt.plot(fpr, tpr, marker='.', color='firebrick', label='ROC AUC = %.3f' % (roc_auc))
    
    # axis labels
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # show the legend
    plt.legend()
    # figure title
    plt.title("ROC Curve:" + model_name)
    
    plt.subplot(122)
    
    # plot the precision-recall curves
    
    ns_line = len(pred_pandas[pred_pandas.label==1]) / len(pred_pandas.label)
    plt.plot([0, 1], [ns_line, ns_line], linestyle='--', label='No Skill')
    plt.plot(recall, precision, marker='.', color='firebrick', label='PR AUC = %.3f' % (pr_auc))
    
    # axis labels
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    # show the legend
    plt.legend()
    # figure title
    plt.title("Precision-Recall Curve:" + model_name)

    # show the plot
    plt.show()

### <font color='blue'>Build Pipelines</font>

In [24]:
# implement K-fold cross validation and grid search 

def grid_search_model(pipeline, param):
    """
    Creates a cross validation object and performs grid search
    over a set of parameters.
    
    INPUT:
        param = grid of parameters
        pipeline = model pipeline 
    
    OUTPUT:
        cv = cross validation object
    """
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=param,
                    evaluator=evaluator,
                    numFolds=5,
                    parallelism=2)
    return cv

## <font color='blue'>Evaluate PySpark Classifiers</font>

<div class="alert alert-block alert-info">

Evaluate binary classifiers implemented in PySpark on the train set, using default parameters to select the best performing classifiers to be tuned in the next stage.
   
</div>

### <font color='blue'>Logistic Regression Classifier</font>

In [None]:
print("")
print(f"Training LOGISTIC REGRESSION CLASSIFIER")
print("")

labelCol="label"
predCol="prediction"

# instantiate the classifier
lr_classifier = LogisticRegression(labelCol = "label",
                                       featuresCol = "features")

# build specific pipeline
lr_pipeline = build_full_pipeline(lr_classifier)

# choose an evaluator
evaluator = BinaryClassificationEvaluator()
evaluator.setLabelCol("label")

# start timer
start_time = time.time()

# train the model
model_lr = lr_pipeline.fit(train_cached)

# stop timer
end_time = time.time()

# evaluate the trainining time in minutes 
train_time = (end_time - start_time)/60

# print the training time
print("")
print("Training time.........%6.3f min" % train_time)

# create the predictions dataset
predictions_lr = model_lr.transform(train_cached)

# calculate auc metrics
#roc_lr = evaluator.evaluate(predictions_lr, {evaluator.metricName: "areaUnderROC"})
#pr_lr = evaluator.evaluate(predictions_lr, {evaluator.metricName: "areaUnderPR"})

# record the confusion matrix metrics
acc_lr, prec_lr, rec_lr, f1_lr = conf_metrics(predictions_lr)

# print all evaluation metrics
print("")
print("Metrics on the train set")
display_metrics(predictions_lr, evaluator)
print("")

# plot the ROC and PR curves
plot_roc_pr_curves(predictions_lr, "LR")

### <font color='blue'>Decision Trees Classifier</font>

In [None]:
print("")
print(f"Training DECISION TREES CLASSIFIER")
print("")

# build specific pipeline
dt_classifier = DecisionTreeClassifier(labelCol = "label",
                                           featuresCol = "features",
                                           seed=1234)
dt_pipeline = build_full_pipeline(dt_classifier)


# choose an evaluator
evaluator = BinaryClassificationEvaluator()
evaluator.setLabelCol("label")

# start timer
start_time = time.time()

# train the model
model_dt = dt_pipeline.fit(train_cached)

# stop timer
end_time = time.time()

# evaluate the trainining time in minutes 
train_time = (end_time - start_time)/60

# print the training time
print("")
print("Training time.........%6.3f min" % train_time)

# create the predictions dataset
predictions_dt = model_dt.transform(train_cached)

# calculate auc metrics
roc_dt = evaluator.evaluate(predictions_dt, {evaluator.metricName: "areaUnderROC"})
pr_dt = evaluator.evaluate(predictions_dt, {evaluator.metricName: "areaUnderPR"})

# record the confusion matrix metrics
acc_dt, prec_dt, rec_dt, f1_dt = conf_metrics(predictions_dt)

# print all evaluation metrics
print("")
print("Metrics on the train set")
display_metrics(predictions_dt, roc_dt, pr_dt)
print("")

# plot the ROC and PR curves
plot_roc_pr_curves(predictions_dt, "DT")

### <font color='blue'>Random Forest Classifier</font>

In [35]:
print("")
print(f"Training RANDOM FOREST CLASSIFIER")
print("")

# instantiate the classifier
rf_classifier = RandomForestClassifier(labelCol = "label",
                                           featuresCol = "features", 
                                           seed=1234)
# build the specific pipeline
rf_pipeline = build_data_pipeline(rf_classifier)

# choose an evaluator
evaluator = BinaryClassificationEvaluator()
evaluator.setLabelCol("label")

# start timer
start_time = time.time()

# train the model
model_rf = rf_pipeline.fit(df_train)

# stop timer
end_time = time.time()

# evaluate the trainining time in minutes 
train_time = (end_time - start_time)/60

# print the training time
print("")
print("Training time.........%6.3f min" % train_time)

# create the predictions dataset
predictions_rf = model_rf.transform(df_train)

# calculate auc metrics
roc_rf = evaluator.evaluate(predictions_rf, {evaluator.metricName: "areaUnderROC"})
pr_rf = evaluator.evaluate(predictions_rf, {evaluator.metricName: "areaUnderPR"})

# record the confusion matrix metrics
acc_rf, prec_rf, rec_rf, f1_rf = conf_metrics(predictions_rf)

# print all evaluation metrics
print("")
print("Metrics on the train set")
display_metrics(predictions_rf, roc_rf, pr_rf)
print("")

# plot the ROC and PR curves
plot_roc_pr_curves(predictions_rf, "RF")


Training RANDOM FOREST CLASSIFIER


Training time......... 0.044 min


NameError: name 'labelCol' is not defined

### <font color='blue'>Gradient Boosted Trees</font>

In [40]:
print("")
print(f"Training GRADIENT BOOSTED TREES")
print("")

# instantiate the classifier
gbt_classifier = GBTClassifier(labelCol = "label",
                                featuresCol = "features",
                                seed=1234)
# build specific pipeline
gbt_pipeline = build_data_pipeline(gbt_classifier)

# choose an evaluator
evaluator = BinaryClassificationEvaluator()
evaluator.setLabelCol("label")

# start timer
start_time = time.time()

# train the model
model_gbt = gbt_pipeline.fit(df_train)

# stop timer
end_time = time.time()

# evaluate the trainining time in minutes 
train_time = (end_time - start_time)/60

# print the training time
print("")
print("Training time.........%6.3f min" % train_time)

# create the predictions dataset
predictions_gbt = model_gbt.transform(df_train)

# calculate auc metrics
roc_gbt = evaluator.evaluate(predictions_gbt, {evaluator.metricName: "areaUnderROC"})
pr_gbt = evaluator.evaluate(predictions_gbt, {evaluator.metricName: "areaUnderPR"})

# record the confusion matrix metrics
acc_gbt, prec_gbt, rec_gbt, f1_gbt = conf_metrics(predictions_gbt)

# print all evaluation metrics
print("")
print("Metrics on the train set")
display_metrics(predictions_gbt, roc_gbt, pr_gbt)
print("")

# plot the ROC and PR curves
plot_roc_pr_curves(predictions_gbt, "GBT")


Training GRADIENT BOOSTED TREES


Training time......... 0.073 min


AnalysisException: Cannot resolve column name "predictions" among (active_days, acts_per_session, ads_per_session, churn, init_days_interv, level, n_acts, nr_ads, nr_dislikes, nr_downgrades, nr_error, nr_friends, nr_likes, nr_sessions, nr_settings, nr_songs, nr_upgrades, songs_per_session, tenure_days_interv, label, bin_features, cont_features, cont_scaler, features, rawPrediction, probability, prediction)

### <font color='blue'>Multilayer Perceptron Classifier</font>

In [None]:
print("")
print(f"Training MULTILAYER PERCEPTRON CLASSIFIER")
print("")

# specify layers: 19 (features), two intermediate (8, 4), output 2 (classes)
layers=[18, 8, 4, 2]
# create the trainer and set its parameters
mlpc_classifier = MultilayerPerceptronClassifier(labelCol = "label",
                                                featuresCol = "features",
                                                layers=layers,
                                                seed=1234)
# build specific pipeline
mlpc_pipeline = build_full_pipeline(mlpc_classifier)


# choose an evaluator
evaluator = BinaryClassificationEvaluator()
evaluator.setLabelCol("label")

# start timer
start_time = time.time()

# train the model
model_mlpc = mlpc_pipeline.fit(train_cached)

# stop timer
end_time = time.time()

# evaluate the trainining time in minutes 
train_time = (end_time - start_time)/60

# print the training time
print("")
print("Training time.........%6.3f min" % train_time)

# create the predictions dataset
predictions_mlpc = model_mlpc.transform(train_cached)

# calculate auc metrics
roc_mlpc = evaluator.evaluate(predictions_mlpc, {evaluator.metricName: "areaUnderROC"})
pr_mlpc = evaluator.evaluate(predictions_mlpc, {evaluator.metricName: "areaUnderPR"})

# record the confusion matrix metrics
acc_mlpc, prec_mlpc, rec_mlpc, f1_mlpc = conf_metrics(predictions_mlpc)

# print all evaluation metrics
print("")
print("Metrics on the train set")
display_metrics(predictions_mlpc, roc_mlpc, pr_mlpc)
print("")

# plot the ROC and PR curves
plot_roc_pr_curves(predictions_mlpc, "MLPC")

### <font color='blue'>Linear Support Vector Machine</font>

In [None]:
print("")
print(f"Training LINEAR SUPPORT VECTOR MACHINE")
print("")

# instantiate the classifier
lsvc_classifier = LinearSVC(labelCol = "label",
                            featuresCol = "features")
# build specific pipeline
lsvc_pipeline = build_full_pipeline(lsvc_classifier)

# choose an evaluator
evaluator = BinaryClassificationEvaluator()
evaluator.setLabelCol("label")

# start timer
start_time = time.time()

# train the model
model_lsvc = lsvc.fit(train_cached)

# stop timer
end_time = time.time()

# evaluate the trainining time in minutes 
train_time = (end_time - start_time)/60

# print the training time
print("")
print("Training time.........%6.3f min" % train_time)

# create the predictions dataset
predictions_lsvc = model_lsvc.transform(train_cached)

# calculate auc metrics
roc_lsvc = evaluator.evaluate(predictions_lsvc, {evaluator.metricName: "areaUnderROC"})
pr_lsvc = evaluator.evaluate(predictions_lsvc, {evaluator.metricName: "areaUnderPR"})

# record the confusion matrix metrics
acc_lsvc, prec_lsvc, rec_lsvc, f1_lsvc = conf_metrics(predictions_lsvc)

# print all evaluation metrics
print("")
print("Metrics on the train set")
display_metrics(predictions_lsvc, roc_lsvc, pr_lsvc)
print("")

### <font color='blue'>Choose the best classifiers</font>

In [None]:
# create Pandas dataframe with metrics
dict_metrics = {"LinReg": [acc_lr, prec_lr, rec_lr, f1_lr, roc_lr, pr_lr],
                "DecTrees": [acc_dt, prec_dt, rec_dt, f1_dt, roc_dt, pr_dt], 
                "RandForest": [acc_rf, prec_rf, rec_rf, f1_rf, roc_rf, pr_rf],
                "GradBoost": [acc_gbt, prec_gbt, rec_gbt, f1_gbt, roc_gbt, pr_gbt],
                "MultiLPerceptron": [acc_mlpc, prec_mlpc, rec_mlpc, f1_mlpc, roc_mlpc, pr_mlpc],
                "LinearSVM": [acc_lsvc, prec_lsvc, rec_lsvc, f1_lsvc, roc_lsvc, pr_lsvc],
                "list_metrics" : ["accuracy", "precision", "recall", "f1_score", "auc_roc", "auc_pr"]
               }
df_mets = pd.DataFrame.from_dict(dict_metrics).set_index("list_metrics")
df_mets.round(3)

In [None]:
# adjust figure size and font size
sns.set(rc = {"figure.figsize":(16,2)})

sns.set(font_scale=1)
ax = df_mets.plot.bar(y=["LinReg", "DecTrees", "RandForest", "GradBoost", "MultiLPerceptron", "LinearSVM"], rot=0)

# create title and labels
plt.title("Confusion Matrix Metrics For Sparkify Small Train Dataset")
ax.set_xlabel("");
ax.set_ylabel("");

<div class="alert alert-block alert-info">

It is clear from the table of performance metrics that Random Forest, Gradient Boosted Trees and MultiLayer Perceptron perform the best. We will use grid search with cross validation to tune Random Forest and Gradient Boosted Trees. This dataset is too small for a neural network to perform optimally, so we will skip the MultiLayer Perceptron in this case.
   
</div>

## <font color='blue'>Tune Hyperparameters</font>

### <font color='blue'>Random Forest Classifier</font>

In [None]:
# model_name = "Random Forest"
def rf_grid_search(pipeline):
    
    model = pipeline.getStages()[-1]

    # create a list of parameters for Random Forest
    param_rf = ParamGridBuilder()
    param_rf = param_rf.addGrid(model.maxDepth, [5, 10, 15, 20, 25]) 
    param_rf = param_rf.addGrid(model.maxBins, [8, 16, 24, 32, 48])
    param_rf = param_rf.addGrid(model.numTrees, [10, 20, 40, 60, 80]) 
    param_rf = param_rf.build()
    
    print(f"Models trained: {len(param_rf)}")
    
    return grid_search_model(pipeline, param_rf)

In [None]:
print("")
print(f"Training RANDOM FOREST CLASSIFIER")
print("")

predCol="prediction"
labelCol="label"

# instantiate the classifier
rf_classifier = RandomForestClassifier(labelCol = "label",
                                           featuresCol = "features", 
                                           seed=1234)
# build the specific pipeline
rf_pipeline = build_full_pipeline(rf_classifier)

# choose an evaluator
evaluator = BinaryClassificationEvaluator()
evaluator.setLabelCol(labelCol)

# build the grid search pipeline
rf = rf_grid_search(rf_pipeline)

# start timer
start_time = time.time()

# train the model
model_rf = rf.fit(train_cached)

# stop timer
end_time = time.time()

# evaluate the trainining time in minutes 
train_time = (end_time - start_time)/60

# print the training time
print("")
print("Training time.........%6.3f min" % train_time)

# create the predictions dataset
predictions_rf = model_rf.bestModel.transform(test_cached)

# calculate auc metrics
roc_rf = evaluator.evaluate(predictions_rf, {evaluator.metricName: "areaUnderROC"})
pr_rf = evaluator.evaluate(predictions_rf, {evaluator.metricName: "areaUnderPR"})

# record the confusion matrix metrics
acc_rf, prec_rf, rec_rf, f1_rf = conf_metrics(predictions_rf)

# print all evaluation metrics
print("")
print("Evaluation metrics on the test set")
display_metrics(predictions_rf, roc_rf, pr_rf)
print("")

best_param_rf = list(model_rf.getEstimatorParamMaps()[np.argmax(model_rf.avgMetrics)].values())
print("The best hyperparameter values from the grid:")
print("maxDepth:..........", best_param_rf[0])
print("maxBins:...........", best_param_rf[1])
print("numTrees:..........", best_param_rf[2])

# plot the ROC and PR curves
plot_roc_pr_curves(predictions_rf, "RF")

In [None]:
### <font color='blue'>Gradient Boosted Trees</font>

In [None]:
# model_name = "Gradient Boosted Trees"
def gbt_grid_search(pipeline):
    
    model = pipeline.getStages()[-1]

    # create a list of parameters for Gradient Boosted Trees
    param_gbt = ParamGridBuilder()
    param_gbt = param_gbt.addGrid(model.maxDepth, [5, 10, 15, 20])
    param_gbt = param_gbt.addGrid(model.maxIter, [20, 40, 60, 80, 100])
    param_gbt = param_gbt.addGrid(model.stepSize, [.05, .1, .15, .2])
    param_gbt = param_gbt.build()
    
    print(f"Models trained: {len(param_gbt)}")
    
    return grid_search_model(pipeline, param_gbt)

In [None]:
print("")
print(f"Training GRADIENT BOOSTED TREES")
print("")

predCol="prediction"
labelCol="label"

# instantiate the classifier
gbt_classifier = GBTClassifier(labelCol = "label",
                                featuresCol = "features",
                                seed=1234)
# build specific pipeline
gbt_pipeline = build_full_pipeline(gbt_classifier)

# choose an evaluator
evaluator = BinaryClassificationEvaluator()
evaluator.setLabelCol(labelCol)

# build the grid search pipeline
gbt = gbt_grid_search(gbt_pipeline)

# start timer
start_time = time.time()

# train the model
model_gbt = gbt.fit(train_cached)

# stop timer
end_time = time.time()

# evaluate the trainining time in minutes 
train_time = (end_time - start_time)/60

# print the training time
print("")
print("Training time.........%6.3f min" % train_time)

# create the predictions dataset
predictions_gbt = model_gbt.bestModel.transform(test_cached)

# calculate auc metrics
roc_gbt = evaluator.evaluate(predictions_gbt, {evaluator.metricName: "areaUnderROC"})
pr_gbt = evaluator.evaluate(predictions_gbt, {evaluator.metricName: "areaUnderPR"})

# record the confusion matrix metrics
acc_gbt, prec_gbt, rec_gbt, f1_gbt = conf_metrics(predictions_gbt)

# print all evaluation metrics
print("")
print("Evaluation metrics on the test set")
display_metrics(predictions_gbt, roc_gbt, pr_gbt)
print("")

# print the best parameters from the grid
best_model_gbt = model_gbt.bestModel.stages[-1]
param_gbt1 = best_model_gbt.getMaxDepth()
param_gbt2 = best_model_gbt.getMaxIter()
param_gbt3 = best_model_gbt.getStepSize()
print("The best hyperparameter values from the grid:")
print("maxDepth:..........", param_gbt1)
print("maxIter:...........", param_gbt2)
print("stepSize:..........", param_gbt3)

# plot the ROC and PR curves
plot_roc_pr_curves(predictions_gbt, "GBT")

In [None]:
def plot_roc_pr_two_curves(predictions_model1, predictions_model2):
    
    """
    Plots the ROC and PR curves for two models on the same graphs.
    
    INPUT:
        predictions_model1 (PySpark dataframe) - contains probability predictions for the first model
        predictions_model2 (PySpark dataframe) - contains probability predictions for the second model
        
    OUTPUT:
        none - two plots are displayed
        
    """
    
    # transform predictions PySpark dataframe into Pandas dataframe
    pred1_pandas = predictions_model1.select(predictions_model1.label, 
                                             predictions_model1.probability).toPandas()
    pred2_pandas = predictions_model2.select(predictions_model2.label, 
                                             predictions_model2.probability).toPandas()
    
    # calculate roc_auc scores for first model
    roc_auc1 = roc_auc_score(pred1_pandas.label, pred1_pandas.probability.str[1])
    # calculate roc_auc scores for second model
    roc_auc2 = roc_auc_score(pred2_pandas.label, pred2_pandas.probability.str[1])
    
    # calculate roc curves for model 1
    fpr1, tpr1, _ = roc_curve(pred1_pandas.label, pred1_pandas.probability.str[1])
    # calculate roc curves for model 2
    fpr2, tpr2, _ = roc_curve(pred2_pandas.label, pred2_pandas.probability.str[1])
    
    # calculate precision, recall for each threshold for the first model
    precision1, recall1, _ = precision_recall_curve(pred1_pandas.label, pred1_pandas.probability.str[1])
    # calculate pr auc score
    pr_auc1 = auc(recall1, precision1)
    
    # calculate precision, recall for each threshold for the second model
    precision2, recall2, _ = precision_recall_curve(pred2_pandas.label, pred2_pandas.probability.str[1])
    # calculate pr auc score
    pr_auc2 = auc(recall2, precision2)

    # create figure which contains two subplots
    plt.figure(figsize=[12,6])
    
    plt.subplot(121)
    
    # plot the roc curve for the model1
    plt.plot(fpr1, tpr1, marker='.', color='firebrick', label='RF: ROC-AUC = %.3f' % (roc_auc1))
    plt.plot(fpr2, tpr2, marker='.', color='green', label='GBT: ROC-AUC = %.3f' % (roc_auc2))
    
    # axis labels
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # show the legend
    plt.legend()
    # figure title
    plt.title("ROC Curves")
    
    plt.subplot(122)
    
    # plot the precision-recall curves
    plt.plot(recall1, precision1, marker='.', color="firebrick", label='RF: PR-AUC = %.3f' % (pr_auc1))
    plt.plot(recall2, precision2, marker='.', color="green", label='GBT: PR-AUC = %.3f' % (pr_auc2))
    
    # axis labels
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    # show the legend
    plt.legend()
    # figure title
    plt.title("Precision-Recall Curves")

    # show the plot
    plt.show()

In [None]:
plot_roc_pr_two_curves(predictions_rf, predictions_gbt)

In [None]:
# create Pandas dataframe with metrics
dict_metrics = {"RandForest": [acc_rf, prec_rf, rec_rf, f1_rf, roc_rf, pr_rf],
                "GradBoost": [acc_gbt, prec_gbt, rec_gbt, f1_gbt, roc_gbt, pr_gbt],
                "list_metrics" : ["accuracy", "precision", "recall", "f1_score", "auc_roc", "auc_pr"]
               }
df_mets = pd.DataFrame.from_dict(dict_metrics).set_index("list_metrics")
df_mets.round(3)

In [None]:
# adjust figure size and font size
sns.set(rc = {"figure.figsize":(16,2)})

sns.set(font_scale=1)
ax = df_mets.plot.bar(y=["RandForest", "GradBoost"], rot=0)

# create title and labels
plt.title("Confusion Matrix Metrics For Sparkify Small Train Dataset")
ax.set_xlabel("");
ax.set_ylabel("");