In [1]:
import pandas as pd
import numpy as np 

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean,col,split, col, regexp_extract, when, lit
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import QuantileDiscretizer

In [3]:
# Bind `spark` explicitly instead of relying on a global injected by the
# runtime; getOrCreate() hands back the active session when one already exists.
spark = SparkSession.builder.getOrCreate()
spark

In [4]:
# Read the `default.titanic_csv` table into a Spark DataFrame.
source_table = 'default.titanic_csv'
data = spark.table(source_table)

In [5]:
# Preview the rows of the freshly loaded table.
display(data)

Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,Mrs. John Bradley (Florence Briggs Thayer) Cumings,female,38.0,1,0,71.2833
1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
0,3,Mr. William Henry Allen,male,35.0,0,0,8.05
0,3,Mr. James Moran,male,27.0,0,0,8.4583
0,1,Mr. Timothy J McCarthy,male,54.0,0,0,51.8625
0,3,Master. Gosta Leonard Palsson,male,2.0,3,1,21.075
1,3,Mrs. Oscar W (Elisabeth Vilhelmina Berg) Johnson,female,27.0,0,2,11.1333
1,2,Mrs. Nicholas (Adele Achem) Nasser,female,14.0,1,0,30.0708


### Average Fare per Gender and Survival

In [7]:
# NOTE(review): same `display(data)` call as the earlier preview cell; no
# visible cell reassigns `data` in between, so this looks redundant — confirm
# before removing.
display(data)

Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,Mrs. John Bradley (Florence Briggs Thayer) Cumings,female,38.0,1,0,71.2833
1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
0,3,Mr. William Henry Allen,male,35.0,0,0,8.05
0,3,Mr. James Moran,male,27.0,0,0,8.4583
0,1,Mr. Timothy J McCarthy,male,54.0,0,0,51.8625
0,3,Master. Gosta Leonard Palsson,male,2.0,3,1,21.075
1,3,Mrs. Oscar W (Elisabeth Vilhelmina Berg) Johnson,female,27.0,0,2,11.1333
1,2,Mrs. Nicholas (Adele Achem) Nasser,female,14.0,1,0,30.0708


In [8]:
# Mean fare for every (Sex, Survived) combination.
fare_by_group = data.groupBy('Sex', 'Survived').agg({'Fare': 'avg'})
display(fare_by_group)

Sex,Survived,avg(Fare)
male,0,22.066170474137923
female,1,51.93857339055791
female,0,23.02438518518519
male,1,40.82148440366974


In [9]:
# Pull the title (the run of letters directly before the first ".") out of Name.
# The pattern is a raw string: "\." in a normal Python literal is an invalid
# escape sequence and triggers a warning on current interpreters.
data = data.withColumn("Initial", regexp_extract(col("Name"), r"([A-Za-z]+)\.", 1))

In [10]:
# Summary statistics (count, mean, stddev, min, max) for the columns of `data`.
summary_stats = data.describe()
display(summary_stats)

summary,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,Initial
count,887.0,887.0,887,887,887.0,887.0,887.0,887.0,887
mean,0.3855693348365276,2.305524239007892,,,29.471443066516347,0.5253664036076663,0.3833145434047351,32.30542018038328,
stddev,0.4870041177510126,0.8366620036697728,,,14.121908405462552,1.104668553867569,0.8074659070316833,49.78204040017391,
min,0.0,1.0,Capt. Edward Gifford Crosby,female,0.42,0.0,0.0,0.0,Capt
max,1.0,3.0,the Countess. of (Lucy Noel Martha Dyer-Edwards) Rothes,male,80.0,8.0,6.0,512.3292,Sir


In [11]:
# Number of passengers per extracted title.
title_counts = data.groupBy('Initial').count()
display(title_counts)

Initial,count
Don,1
Miss,182
Countess,1
Col,2
Rev,6
Lady,1
Master,40
Mme,1
Capt,1
Mr,513


In [12]:
# Fold rare titles into Mr / Mrs / Miss / Other.
# `subset` restricts the substitution to the Initial column; without it,
# DataFrame.replace rewrites matching values in every string column.
title_map = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
}
data = data.replace(list(title_map.keys()), list(title_map.values()), subset=['Initial'])

In [13]:
# Title distribution as it stands at this point in the notebook.
merged_title_counts = data.groupBy('Initial').count()
display(merged_title_counts)

Initial,count
Miss,186
Other,9
Master,40
Mr,525
Mrs,127


In [14]:
# Count the rows whose Age is missing.
data.where('AGE IS NULL').count()

In [15]:
# FamilySize = siblings/spouses aboard + parents/children aboard
# (the passenger is not counted).
siblings_spouses = col('Siblings/Spouses Aboard')
parents_children = col('Parents/Children Aboard')
data = data.withColumn('FamilySize', siblings_spouses + parents_children)

In [16]:
# Inspect `data` with the derived Initial and FamilySize columns included.
display(data)

Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,Initial,FamilySize
0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25,Mr,1
1,1,Mrs. John Bradley (Florence Briggs Thayer) Cumings,female,38.0,1,0,71.2833,Mrs,1
1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925,Miss,0
1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1,Mrs,1
0,3,Mr. William Henry Allen,male,35.0,0,0,8.05,Mr,0
0,3,Mr. James Moran,male,27.0,0,0,8.4583,Mr,0
0,1,Mr. Timothy J McCarthy,male,54.0,0,0,51.8625,Mr,0
0,3,Master. Gosta Leonard Palsson,male,2.0,3,1,21.075,Master,4
1,3,Mrs. Oscar W (Elisabeth Vilhelmina Berg) Johnson,female,27.0,0,2,11.1333,Mrs,2
1,2,Mrs. Nicholas (Adele Achem) Nasser,female,14.0,1,0,30.0708,Mrs,1


In [17]:
# Encode the categorical columns Sex and Initial as numeric index columns
# (<column>_index).
# The indexers are handed to the Pipeline unfitted: Pipeline.fit() fits each
# estimator stage itself, whereas fitting them up front bypasses that step and
# leaves pipeline.fit() with nothing to learn.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index")
    for column in ["Sex", "Initial"]
]
pipeline = Pipeline(stages=indexers)
titanic_df = pipeline.fit(data).transform(data)

In [18]:
# Check the Sex_index / Initial_index columns appended by the indexing pipeline.
display(titanic_df)

Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,Initial,FamilySize,Sex_index,Initial_index
0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25,Mr,1,0.0,0.0
1,1,Mrs. John Bradley (Florence Briggs Thayer) Cumings,female,38.0,1,0,71.2833,Mrs,1,1.0,2.0
1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925,Miss,0,1.0,1.0
1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1,Mrs,1,1.0,2.0
0,3,Mr. William Henry Allen,male,35.0,0,0,8.05,Mr,0,0.0,0.0
0,3,Mr. James Moran,male,27.0,0,0,8.4583,Mr,0,0.0,0.0
0,1,Mr. Timothy J McCarthy,male,54.0,0,0,51.8625,Mr,0,0.0,0.0
0,3,Master. Gosta Leonard Palsson,male,2.0,3,1,21.075,Master,4,0.0,3.0
1,3,Mrs. Oscar W (Elisabeth Vilhelmina Berg) Johnson,female,27.0,0,2,11.1333,Mrs,2,1.0,2.0
1,2,Mrs. Nicholas (Adele Achem) Nasser,female,14.0,1,0,30.0708,Mrs,1,1.0,2.0


In [19]:
# Drop the text columns; Sex and Initial remain available through their
# *_index counterparts.
modelling_data = titanic_df.drop('Name', 'Sex', 'Initial')

In [20]:
# Preview the frame left after the text columns were dropped.
display(modelling_data)

Survived,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,FamilySize,Sex_index,Initial_index
0,3,22.0,1,0,7.25,1,0.0,0.0
1,1,38.0,1,0,71.2833,1,1.0,2.0
1,3,26.0,0,0,7.925,0,1.0,1.0
1,1,35.0,1,0,53.1,1,1.0,2.0
0,3,35.0,0,0,8.05,0,0.0,0.0
0,3,27.0,0,0,8.4583,0,0.0,0.0
0,1,54.0,0,0,51.8625,0,0.0,0.0
0,3,2.0,3,1,21.075,4,0.0,3.0
1,3,27.0,0,2,11.1333,2,1.0,2.0
1,2,14.0,1,0,30.0708,1,1.0,2.0


In [21]:
# List the remaining column names (used to pick the assembler inputs below).
modelling_data.columns

In [22]:
# Columns packed into the single "features" vector: every column of
# modelling_data except the Survived label.
feature_cols = [
    'Pclass',
    'Age',
    'Siblings/Spouses Aboard',
    'Parents/Children Aboard',
    'Fare',
    'FamilySize',
    'Sex_index',
    'Initial_index',
]
vector = VectorAssembler(outputCol="features", inputCols=feature_cols)

In [23]:
# Append the assembled "features" vector column to the modelling frame.
model_data = vector.transform(dataset=modelling_data)

In [24]:
# Inspect the assembled `features` column alongside its source columns.
display(model_data)

Survived,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,FamilySize,Sex_index,Initial_index,features
0,3,22.0,1,0,7.25,1,0.0,0.0,"List(1, 8, List(), List(3.0, 22.0, 1.0, 0.0, 7.25, 1.0, 0.0, 0.0))"
1,1,38.0,1,0,71.2833,1,1.0,2.0,"List(1, 8, List(), List(1.0, 38.0, 1.0, 0.0, 71.2833, 1.0, 1.0, 2.0))"
1,3,26.0,0,0,7.925,0,1.0,1.0,"List(1, 8, List(), List(3.0, 26.0, 0.0, 0.0, 7.925, 0.0, 1.0, 1.0))"
1,1,35.0,1,0,53.1,1,1.0,2.0,"List(1, 8, List(), List(1.0, 35.0, 1.0, 0.0, 53.1, 1.0, 1.0, 2.0))"
0,3,35.0,0,0,8.05,0,0.0,0.0,"List(0, 8, List(0, 1, 4), List(3.0, 35.0, 8.05))"
0,3,27.0,0,0,8.4583,0,0.0,0.0,"List(0, 8, List(0, 1, 4), List(3.0, 27.0, 8.4583))"
0,1,54.0,0,0,51.8625,0,0.0,0.0,"List(0, 8, List(0, 1, 4), List(1.0, 54.0, 51.8625))"
0,3,2.0,3,1,21.075,4,0.0,3.0,"List(1, 8, List(), List(3.0, 2.0, 3.0, 1.0, 21.075, 4.0, 0.0, 3.0))"
1,3,27.0,0,2,11.1333,2,1.0,2.0,"List(1, 8, List(), List(3.0, 27.0, 0.0, 2.0, 11.1333, 2.0, 1.0, 2.0))"
1,2,14.0,1,0,30.0708,1,1.0,2.0,"List(1, 8, List(), List(2.0, 14.0, 1.0, 0.0, 30.0708, 1.0, 1.0, 2.0))"


In [25]:
# 80/20 random train/test split; the fixed seed keeps the split repeatable.
trainingData, testData = model_data.randomSplit(weights=[0.8, 0.2], seed=11)

In [26]:
from pyspark.ml.classification import LogisticRegression

# Fit a logistic-regression classifier on the training split.
lr = LogisticRegression(featuresCol="features", labelCol="Survived")
lrModel = lr.fit(trainingData)

# Score the held-out split and preview predictions next to the true label.
lr_prediction = lrModel.transform(testData)
lr_prediction.select("prediction", "Survived", "features").show()

# Accuracy evaluator; later cells reuse it for other models.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="Survived",
)

In [27]:
# Report accuracy and its complement (the error rate) on the test split.
lr_accuracy = evaluator.evaluate(lr_prediction)
lr_test_error = 1.0 - lr_accuracy
print("Accuracy of LogisticRegression is = %g" % lr_accuracy)
print("Test Error of LogisticRegression = %g " % lr_test_error)

In [28]:
from pyspark.ml.classification import DecisionTreeClassifier

In [29]:
# Decision-tree classifier reading the "features" vector and predicting "Survived".
dtree = DecisionTreeClassifier(featuresCol='features', labelCol='Survived')

In [30]:
# Train the decision tree on the training split.
dtree_fit = dtree.fit(dataset=trainingData)

In [31]:
# Score the held-out split with the fitted tree.
dt_pred = dtree_fit.transform(dataset=testData)

In [32]:
# Store and label the decision-tree score in the same form as the
# logistic-regression report, instead of leaving an anonymous cell output.
dt_accuracy = evaluator.evaluate(dt_pred)
print("Accuracy of DecisionTreeClassifier is = %g" % dt_accuracy)
print("Test Error of DecisionTreeClassifier = %g " % (1.0 - dt_accuracy))