## To predict whether a person survived or not in Titanic using titanic dataset

In [72]:
# Initialize pyspark
import findspark
findspark.init()
import pyspark

In [73]:
# Initialize and create a spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Titanic').getOrCreate()

In [74]:
# Import statements to setup ML
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.linalg import Vectors

In [75]:
# Using Spark to read the titanic data set
data = spark.read.csv('titanic.csv', header=True, inferSchema=True)

In [76]:
# Printing the first row of the dataframe
data.head()

Row(PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=None, Embarked='S')

In [77]:
# Printing the schema of the dataframe
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [78]:
data.count()

891

__Checking out whether the string columns "Sex", "Name", "Ticket", "Cabin" and "Embarked" are useful or not__

In [79]:
data.groupBy('Sex').count().count()

2

In [80]:
data.groupBy('Name').count().count()

891

In [81]:
data.groupBy('Ticket').count().count()

681

In [82]:
data.groupBy('Cabin').count().count()

148

In [83]:
data.select('Cabin').describe().show()

+-------+-----+
|summary|Cabin|
+-------+-----+
|  count|  204|
|   mean| null|
| stddev| null|
|    min|  A10|
|    max|    T|
+-------+-----+



In [84]:
data.groupBy('Embarked').count().count()

4

_Filtering out other columns_

In [85]:
data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [86]:
#Checking whether the feature "PassengerId" is useful or not
data.groupBy('PassengerId').count().count()

891

In [87]:
filtered_data = data.select('Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked')

In [88]:
#Dropping out the null value records
filtered_data = filtered_data.na.drop()

In [89]:
filtered_data.count()

712

__Since the "Sex" and "Embarked" columns are categorical type, we can convert it to numerical and to vector using StringIndexer and OneHotEncoder__

In [90]:
#Using String Indexer to convert categorical string columns to numerical type

genderIndexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
embarkIndexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkIndex')

In [91]:
#Using One Hot Encoder to convert categorical numeric type columns to Vector type

genderEncoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')
embarkEncoder = OneHotEncoder(inputCol='EmbarkIndex', outputCol='EmbarkVec')

In [92]:
#Assembling all the features to a single vector column "features"

assembler = VectorAssembler(inputCols=['Pclass','SexVec','Age','SibSp','Parch','Fare','EmbarkVec']
                            ,outputCol='features')

__Splitting the resultant data into training data and testing data, Training data is to train the model, Testing data is to test the builted model__

In [93]:
train_data,test_data = filtered_data.randomSplit([0.7,0.3])

In [94]:
train_data.count()

509

In [95]:
test_data.count()

203

In [96]:
#Creating a logistic regression model object
lr = LogisticRegression(labelCol='Survived', featuresCol='features')

In [97]:
#Setting Up the Pipeline
from pyspark.ml import Pipeline

In [98]:
pipeline = Pipeline(stages=[genderIndexer,embarkIndexer,genderEncoder,embarkEncoder,assembler,lr])

In [99]:
#Fitting the pipeline to training set.
model = pipeline.fit(train_data)

In [100]:
#Getting Results on Test Set
results = model.transform(test_data)

In [101]:
results.printSchema()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- SexIndex: double (nullable = false)
 |-- EmbarkIndex: double (nullable = false)
 |-- SexVec: vector (nullable = true)
 |-- EmbarkVec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [102]:
output = results.select('Survived','rawPrediction','prediction','probability','features')

In [103]:
output.show(3)

+--------+--------------------+----------+--------------------+--------------------+
|Survived|       rawPrediction|prediction|         probability|            features|
+--------+--------------------+----------+--------------------+--------------------+
|       0|[-0.2575595314904...|       1.0|[0.43596372376282...|[1.0,1.0,19.0,1.0...|
|       0|[0.26915005263411...|       0.0|[0.56688423223917...|[1.0,1.0,19.0,3.0...|
|       0|[-0.6913078851033...|       1.0|[0.33374219087673...|[1.0,1.0,21.0,0.0...|
+--------+--------------------+----------+--------------------+--------------------+
only showing top 3 rows



### MODEL EVALUATION

__1) Converting the data to rdd and evaluating using MulticlassMetrics to print the confusion matrix__

In [104]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [105]:
clean_result = output.withColumn('Survived',output['Survived'].cast('double'))

In [106]:
clean_result.select('prediction','Survived').show(3)

+----------+--------+
|prediction|Survived|
+----------+--------+
|       1.0|     0.0|
|       0.0|     0.0|
|       1.0|     0.0|
+----------+--------+
only showing top 3 rows



In [107]:
predictionAndLabel = clean_result.select('prediction','Survived').rdd

In [108]:
metrics = MulticlassMetrics(predictionAndLabel)

In [109]:
#Printing the confusion matrix
print(metrics.confusionMatrix())

DenseMatrix([[112.,  14.],
             [ 15.,  62.]])


In [110]:
#Printing the Accuracy
print(metrics.accuracy)

0.8571428571428571


In [111]:
metrics.recall()

0.8571428571428571

In [112]:
metrics.precision()

0.8571428571428571

__2) Evaluating using BinaryClassificationEvaluator__

In [113]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [114]:
bin_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='Survived')

In [115]:
#Calculating Area Under ROC
AOC = bin_eval.evaluate(output)

In [116]:
#Printing Area Under ROC
print(AOC)

0.9188311688311686


__3) Evaluating using MulticlassClassificationEvaluator__

In [117]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [118]:
multi_eval = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='Survived')

In [119]:
#Calculating Area Under ROC
AOC_2 = multi_eval.evaluate(output)

In [120]:
#Printing Area Under ROC
print(AOC_2)

0.8569583301041102


In [None]:
#Closing spark session
spark.stop()