In [None]:
#preparing kaggle for pyspark
!pip install pyspark

# importing

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style('darkgrid')
import matplotlib.pyplot as plt


from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import col

from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

print("Import completed")

# create spark session

In [None]:
# Start (or reuse) a Spark session running against the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("soddisfazione-passeggeri")
    .getOrCreate()
)

# Convenience handles derived from the session; both wrap the same SparkContext.
sc = spark.sparkContext
sqlContext = SQLContext(sc)

print(spark, sc, sqlContext)

# IMPORTING DATASET

In [None]:
# Paths of the two Kaggle CSV files; nothing is read from disk in this cell.
airline_test_dataset, airline_train_dataset = (
    f"../input/airline-passenger-satisfaction/{split_name}.csv"
    for split_name in ("test", "train")
)

print("Datasets imported")

# Merge test and train csv into a single dataframe

In [None]:
# Read both CSV files the same way (first row is the header, column types are
# inferred) and cache the resulting DataFrames.
original_test_df = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv(airline_test_dataset)
    .cache()
)
original_train_df = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv(airline_train_dataset)
    .cache()
)

# Stack test and train rows into one DataFrame.
# union() pairs columns by position, so both files must share the same column order.
full_df = original_test_df.union(original_train_df)

# Print count / mean / stddev / min / quartiles / max for the merged data.
full_df.summary().show()

# Data cleaning

In [None]:
# Remove the "_c0" and "id" columns, which are useless as features
# (presumably row identifiers -- verify against the CSV).
full_df = full_df.drop(*["_c0", "id"])

print("Cleaned")

As shown in the summary, every feature has 129880 values except **"Arrival Delay in Minutes"**, which has only 129487 (393 values are missing). Its mean value is 15.09112883918849.

> 

In [None]:
# Number of rows whose "Arrival Delay in Minutes" is missing.
full_df.where(F.col("Arrival Delay in Minutes").isNull()).count()

In [None]:
# Fill missing arrival delays with the column mean.
# The mean is computed from the data (avg() ignores nulls) instead of being
# hard-coded as the string '15.1', so the fill value is numeric and stays
# correct if the input data changes.
arrival_delay_mean = full_df.select(F.avg("Arrival Delay in Minutes")).first()[0]
full_df = full_df.fillna({"Arrival Delay in Minutes": arrival_delay_mean})

print("Filled, now null values count is: ",full_df.filter(col("Arrival Delay in Minutes").isNull()).count())

In [None]:
# Mapping from column names containing blanks to underscore-separated names.
# NOTE: this mapping is only computed here, it is never applied to full_df;
# the following cells keep addressing columns by their original names
# (e.g. "Arrival Delay in Minutes", "Customer Type").
replacements = {c:c.replace(' ','_') for c in full_df.columns if ' ' in c}

# Encode the target as an integer column in a single pass:
#   "satisfied" -> 1, "neutral or dissatisfied" -> 0,
#   any other value -> itself cast to integer (null when not numeric).
# The fall-through cast keeps the cell safe to re-run on an already encoded column.
full_df = full_df.withColumn(
    "satisfaction",
    F.when(F.col("satisfaction") == "satisfied", 1)
     .when(F.col("satisfaction") == "neutral or dissatisfied", 0)
     .otherwise(F.col("satisfaction").cast(IntegerType())),
)

# The previous message ("renamed") was misleading: no column is renamed here.
print("satisfaction encoded as 0/1")

full_df.dtypes

# Heatmap analysis 

In [None]:
# Collect the Spark DataFrame into pandas for plotting.
pandas_full_df = full_df.toPandas()

# Heatmap of the absolute pairwise correlation between numeric columns.
# The frame still contains string-typed categorical columns, which
# DataFrame.corr() rejects in pandas >= 2.0, so they are filtered out explicitly.
plt.figure(figsize=(13,13))
sns.heatmap(
    pandas_full_df.select_dtypes(include="number").corr().abs(),
    cmap = 'Blues', annot=True, fmt=".2f",
)

In [None]:
# Departure delay against arrival delay; the low alpha makes dense regions visible.
fig = plt.figure(figsize=(10, 7))
plt.scatter(
    x=pandas_full_df['Departure Delay in Minutes'],
    y=pandas_full_df['Arrival Delay in Minutes'],
    alpha=0.1,
)

plt.xlabel("Departure Delay in Minutes")
plt.ylabel("Arrival Delay in Minutes")

"Arrival Delay in Minutes" and "Departure Delay in Minutes" are highly correlated, so one of them can be dropped.

In [None]:
# "Arrival Delay in Minutes" is highly correlated with
# "Departure Delay in Minutes", so only the latter is kept.
full_df = full_df.drop(*["Arrival Delay in Minutes"])

print("dropped Arrival Delay in Minutes")

# DATA Visualization

What influences satisfaction?

In [None]:
# Absolute correlation of every numeric feature with the target, as a sorted
# horizontal bar chart. Only numeric columns are passed to corr(): pandas >= 2.0
# raises on the string-typed categorical columns still present in the frame.
(
    pandas_full_df.select_dtypes(include="number")
    .corr()['satisfaction']
    .abs()
    .sort_values()
    .drop('satisfaction')
    .plot(kind='barh')
)

In [None]:
import plotly.express as px

# Nested breakdown: satisfaction -> type of travel -> class -> customer type.
# NOTE(review): no `color` argument is given, so the continuous colour scale
# probably has no visible effect -- confirm.
fig = px.sunburst(
    pandas_full_df,
    path=["satisfaction", 'Type of Travel', 'Class', 'Customer Type'],
    color_continuous_scale='RdBu',
)
fig.show()

Age & Satisfaction

In [None]:
# Age distribution, one panel per satisfaction value.
# sns.distplot is deprecated (scheduled for removal from seaborn); histplot with
# kde=True and stat="density" draws the same kind of normalised histogram + KDE.
g = sns.FacetGrid(pandas_full_df, col="satisfaction")
g.map(sns.histplot, "Age", bins=25, kde=True, stat="density")
plt.show()
# 0=neutral or dissatisfied, 1=satisfied 

In [None]:
# One histogram per column; the categorical columns are missing from this grid.
pandas_full_df.hist(figsize=(20, 15), bins=50)

Customer Type & Satisfaction

In [None]:
# Mean of the 0/1 satisfaction column per customer type, drawn as bars.
g = sns.catplot(
    data=pandas_full_df,
    x="Customer Type",
    y="satisfaction",
    kind="bar",
    height=6,
    palette="Blues",
)
g.set_ylabels("Satisfaction Probability")
plt.show()

Class & Satisfaction

In [None]:
# Mean of the 0/1 satisfaction column per travel class, drawn as bars.
g = sns.catplot(
    data=pandas_full_df,
    x="Class",
    y="satisfaction",
    kind="bar",
    height=6,
    palette="Blues",
)
# Axis label typo fixed ("Satisfation" -> "Satisfaction").
g.set_ylabels("Satisfaction Probability")
plt.show()

# Data types manipulation

In [None]:
# Rename the target column so that later stages can refer to it as "label".
full_df = full_df.withColumnRenamed(existing="satisfaction", new="label")

# Randomly split the merged data into 70% train / 30% test (fixed seed).
train_df, test_df = full_df.randomSplit(weights=[0.7, 0.3], seed=30)

print("Train set length: {} entries".format(train_df.count()))
print("Test set length: {} entries".format(test_df.count()))

In [None]:
# Partition the feature columns by Spark dtype, leaving out the target "label":
# string-typed columns are categorical, everything else is numeric.
catCols = [name for name, dtype in full_df.dtypes if dtype == "string" and name != "label"]
numCols = [name for name, dtype in full_df.dtypes if dtype != "string" and name != "label"]

print(catCols)
print(numCols)

# TRAIN TEST SPLIT

In [None]:
# Recap of the (column name, data type) pairs of the training split.
train_df.dtypes

# Transform categorical variables

Label encoding assigns each unique value to a different integer.

Both StringIndexer and OneHotEncoder are used. One-hot encoding is applied on top of the integer indices because I don't want the model to see an ordering between categories that does not actually exist.

In [None]:
# One StringIndexer per categorical column, writing "<column>_StringIndexer".
# handleInvalid="skip" is set on every indexer.
string_indexer = [
    StringIndexer(
        inputCol=categorical,
        outputCol=f"{categorical}_StringIndexer",
        handleInvalid="skip",
    )
    for categorical in catCols
]
string_indexer

In [None]:
# A single OneHotEncoder that maps every "<column>_StringIndexer" column to
# "<column>_OneHotEncoder". It is wrapped in a list so it can be appended to
# the pipeline stages.
one_hot_encoder = [
    OneHotEncoder(
        inputCols=[categorical + "_StringIndexer" for categorical in catCols],
        outputCols=[categorical + "_OneHotEncoder" for categorical in catCols],
    )
]

one_hot_encoder

A VectorAssembler is used to feed the machine-learning model: it transforms all the features into a single vector column.


In [None]:
# Assembler inputs: the numeric columns first, then the one-hot encoded
# version of each categorical column.
assemblerInput = [
    *numCols,
    *(f"{categorical}_OneHotEncoder" for categorical in catCols),
]

assemblerInput

In [None]:
# Combine all the input columns into the single vector column "features".
vector_assembler = VectorAssembler(inputCols=assemblerInput, outputCol="features")

vector_assembler

# Logistic regression

In [None]:
# Logistic regression estimator (class imported in the imports cell).
# The column names are the estimator's defaults, spelled out for readability.
lr = LogisticRegression(featuresCol="features", labelCol="label")

# Pipeline creation

In [None]:
# Pipeline order: index categoricals -> one-hot encode -> assemble -> classify.
stages = [*string_indexer, *one_hot_encoder, vector_assembler, lr]

stages

In [None]:
# Fit the whole pipeline on the training split, then score the test split.
pipeline = Pipeline(stages=stages)
model_pp_lr = pipeline.fit(train_df)
predictions_pp_lr = model_pp_lr.transform(test_df)

print("pipeline completed")

In [None]:
# Peek at the assembled features, the model outputs and the true label.
predictions_pp_lr.select(
    ["features", "rawPrediction", "probability", "prediction", "label"]
).show()

# AUC

In [None]:
# Show summary statistics of the scored test set.
# summary() only builds a DataFrame; without .show() the cell displayed just
# its schema repr instead of the statistics.
predictions_pp_lr.summary().show()

ROC evaluation with PySpark.

In [None]:
# Area under the ROC and PR curves on the test predictions.
# areaUnderROC is the evaluator's default metric; it is named explicitly here.
evaluator = BinaryClassificationEvaluator()
print(
    'PySpark Area Under ROC',
    evaluator.evaluate(predictions_pp_lr, {evaluator.metricName: "areaUnderROC"}),
)
print(
    "Area Under PR: {}".format(
        evaluator.evaluate(predictions_pp_lr, {evaluator.metricName: "areaUnderPR"})
    )
)

In [None]:
# Print the precision/recall table from the summary of the last pipeline stage
# (the fitted logistic regression model).
model_pp_lr.stages[-1].summary.pr.show()

In [None]:
# Accuracy, weighted precision/recall and F1 of the logistic regression
# predictions on the test split.
for caption, metric in (
    ('Accuracy: ', 'accuracy'),
    ('Precision: ', 'weightedPrecision'),
    ('Recall: ', 'weightedRecall'),
    ('f1: ', 'f1'),
):
    print(caption, MulticlassClassificationEvaluator(labelCol='label', metricName=metric).evaluate(predictions_pp_lr))

In [None]:
# ROC curve taken from the summary attached to the fitted logistic regression
# (last pipeline stage).
trainingSummary = model_pp_lr.stages[-1].summary
roc = trainingSummary.roc.toPandas()
plt.plot(roc['FPR'],roc['TPR'])
# FPR is plotted on the x axis and TPR on the y axis; the labels were swapped.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
print('AreaUnderROC: ' + str(trainingSummary.areaUnderROC))

In [None]:
# Precision/recall curve from the same model summary (recall on x, precision on y).
pr = trainingSummary.pr.toPandas()
plt.plot(pr['recall'], pr['precision'])
plt.title('PR Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()

In [None]:
# Root-mean-square error between the predicted class and the true label.
# The previous code printed BinaryClassificationEvaluator's default metric
# (areaUnderROC) under the "RMSE" caption.
rmse = predictions_pp_lr.select(
    F.sqrt(F.avg(F.pow(F.col("prediction") - F.col("label"), 2)))
).first()[0]
print("RMSE: %g" % rmse)

# TREES 🌳

# 🌳 Decision tree classifier

In [None]:
# Decision tree limited to depth 3 (class imported in the imports cell).
# The column names are the estimator's defaults, spelled out for readability.
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label", maxDepth=3)


In [None]:
# Same preprocessing stages as before, ending with the decision tree.
stages = [*string_indexer, *one_hot_encoder, vector_assembler, dt]

stages

In [None]:
# Fit the decision-tree pipeline on the training split and score the test split.
pipeline = Pipeline(stages=stages)
model_pp_dt = pipeline.fit(train_df)
predictions_pp_dt = model_pp_dt.transform(test_df)

print("pipeline completed")

In [None]:
# Area under the ROC and PR curves for the decision-tree predictions.
evaluator = BinaryClassificationEvaluator()

print(
    "Area Under ROC: {}".format(
        evaluator.evaluate(predictions_pp_dt, {evaluator.metricName: "areaUnderROC"})
    )
)
print(
    "Area Under PR: {}".format(
        evaluator.evaluate(predictions_pp_dt, {evaluator.metricName: "areaUnderPR"})
    )
)

In [None]:
# Accuracy, weighted precision/recall and F1 of the decision-tree predictions.
for caption, metric in (
    ('Accuracy: ', 'accuracy'),
    ('Precision: ', 'weightedPrecision'),
    ('Recall: ', 'weightedRecall'),
    ('f1: ', 'f1'),
):
    print(caption, MulticlassClassificationEvaluator(labelCol='label', metricName=metric).evaluate(predictions_pp_dt))

In [None]:
# Root-mean-square error between the predicted class and the true label.
# The previous code printed BinaryClassificationEvaluator's default metric
# (areaUnderROC) under the "RMSE" caption.
rmse = predictions_pp_dt.select(
    F.sqrt(F.avg(F.pow(F.col("prediction") - F.col("label"), 2)))
).first()[0]
print("RMSE: %g" % rmse)

# 🌲🌴🌳 Random forest 

In [None]:
# Random forest estimator (class imported in the imports cell), reading the
# assembled "features" vector and the "label" target.
rf = RandomForestClassifier(
    featuresCol='features',
    labelCol='label',
)

In [None]:
# Same preprocessing stages as before, ending with the random forest.
stages = [*string_indexer, *one_hot_encoder, vector_assembler, rf]

stages

In [None]:
# Fit the random-forest pipeline on the training split and score the test split.
pipeline = Pipeline(stages=stages)
model_pp_rf = pipeline.fit(train_df)
predictions_pp_rf = model_pp_rf.transform(test_df)

print("pipeline completed")

In [None]:
# Area under the ROC and PR curves for the random-forest predictions.
evaluator = BinaryClassificationEvaluator()

print(
    "Area Under ROC: {}".format(
        evaluator.evaluate(predictions_pp_rf, {evaluator.metricName: "areaUnderROC"})
    )
)
print(
    "Area Under PR: {}".format(
        evaluator.evaluate(predictions_pp_rf, {evaluator.metricName: "areaUnderPR"})
    )
)

In [None]:
# Accuracy, weighted precision/recall and F1 of the random-forest predictions.
for caption, metric in (
    ('Accuracy: ', 'accuracy'),
    ('Precision: ', 'weightedPrecision'),
    ('Recall: ', 'weightedRecall'),
    ('f1: ', 'f1'),
):
    print(caption, MulticlassClassificationEvaluator(labelCol='label', metricName=metric).evaluate(predictions_pp_rf))

In [None]:
# ROC curve taken from the summary attached to the fitted random forest
# (last pipeline stage).
trainingSummary = model_pp_rf.stages[-1].summary
roc = trainingSummary.roc.toPandas()
plt.plot(roc['FPR'],roc['TPR'])
# FPR is plotted on the x axis and TPR on the y axis; the labels were swapped.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

print('AreaUnderROC: ' + str(trainingSummary.areaUnderROC))

In [None]:
# Precision/recall curve from the same model summary (recall on x, precision on y).
pr = trainingSummary.pr.toPandas()
plt.plot(pr['recall'], pr['precision'])
plt.title('PR Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')

plt.show()

In [None]:
# Root-mean-square error between the predicted class and the true label.
# The previous code printed BinaryClassificationEvaluator's default metric
# (areaUnderROC) under the "RMSE" caption.
rmse = predictions_pp_rf.select(
    F.sqrt(F.avg(F.pow(F.col("prediction") - F.col("label"), 2)))
).first()[0]
print("RMSE: %g" % rmse)

# 🎄 Gradient-boosted tree classifier

In [None]:
# Gradient-boosted trees with 10 boosting iterations (class imported in the imports cell).
# The column names are the estimator's defaults, spelled out for readability.
gbt = GBTClassifier(featuresCol="features", labelCol="label", maxIter=10)

In [None]:
# Same preprocessing stages as before, ending with the gradient-boosted trees.
stages = [*string_indexer, *one_hot_encoder, vector_assembler, gbt]

stages

In [None]:
# Fit the GBT pipeline on the training split and score the test split.
pipeline = Pipeline(stages=stages)
model_pp_gbt = pipeline.fit(train_df)
predictions_pp_gbt = model_pp_gbt.transform(test_df)

print("pipeline completed")

In [None]:
# Area under the ROC and PR curves for the GBT predictions.
evaluator = BinaryClassificationEvaluator()

print(
    "Area Under ROC: {}".format(
        evaluator.evaluate(predictions_pp_gbt, {evaluator.metricName: "areaUnderROC"})
    )
)
print(
    "Area Under PR: {}".format(
        evaluator.evaluate(predictions_pp_gbt, {evaluator.metricName: "areaUnderPR"})
    )
)

In [None]:
# Accuracy, weighted precision/recall and F1 of the GBT predictions.
for caption, metric in (
    ('Accuracy: ', 'accuracy'),
    ('Precision: ', 'weightedPrecision'),
    ('Recall: ', 'weightedRecall'),
    ('f1: ', 'f1'),
):
    print(caption, MulticlassClassificationEvaluator(labelCol='label', metricName=metric).evaluate(predictions_pp_gbt))

In [None]:
# Root-mean-square error between the predicted class and the true label.
# The previous code printed BinaryClassificationEvaluator's default metric
# (areaUnderROC) under the "RMSE" caption.
rmse = predictions_pp_gbt.select(
    F.sqrt(F.avg(F.pow(F.col("prediction") - F.col("label"), 2)))
).first()[0]
print("RMSE: %g" % rmse)