# Titanic Survival Prediction
### by Logistic Regression using PySpark

## Prepare data and environment

In [None]:
# Download Java and Spark

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
# Export the Java and Spark locations as environment variables.
# NOTE(review): the /content prefix presumes the Spark archive was unpacked
# in /content (e.g. a Colab working directory) - confirm for other hosts.

import os

os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.2.1-bin-hadoop3.2",
    }
)

In [None]:
# Start (or reuse) a Spark session running on the local master

import findspark
findspark.init()  # kept ahead of the pyspark import, as in the original cell order
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Session-level SQL options applied after the session exists
for option_key, option_value in (
    # Property used to format output tables better
    ("spark.sql.repl.eagerEval.enabled", True),
    # Avoid error "Found duplicate column(s) in the data schema"
    ("spark.sql.caseSensitive", True),
):
    spark.conf.set(option_key, option_value)

# Last expression of the cell: display the session object
spark

In [None]:
# Read the training CSV into a Spark DataFrame.
# The first line is treated as a header and column types are inferred from the data.
# NOTE(review): the path is relative and no visible cell downloads the file,
# so titanic_train.csv is assumed to already be in the working directory.

titanic_train = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("titanic_train.csv")
)
titanic_train.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [None]:
# Explore the target variable "Survived"

# Overall class balance of the label
print("Count the number of surviver")
titanic_train.groupBy("Survived").count().show()

# Cross-tabulate the label against one categorical column at a time,
# sorted so the rows of each table appear in a stable order
for heading, factor in (
    ("Correlation between gender and number of suviver", "Sex"),
    ("Correlation between Passenger class and number of suviver", "Pclass"),
):
    print(heading)
    titanic_train.groupBy(factor, "Survived").count().orderBy(factor, "Survived").show()

Count the number of surviver
+--------+-----+
|Survived|count|
+--------+-----+
|       1|  342|
|       0|  549|
+--------+-----+

Correlation between gender and number of suviver
+------+--------+-----+
|   Sex|Survived|count|
+------+--------+-----+
|female|       0|   81|
|female|       1|  233|
|  male|       0|  468|
|  male|       1|  109|
+------+--------+-----+

Correlation between Passenger class and number of suviver
+------+--------+-----+
|Pclass|Survived|count|
+------+--------+-----+
|     1|       0|   80|
|     1|       1|  136|
|     2|       0|   97|
|     2|       1|   87|
|     3|       0|  372|
|     3|       1|  119|
+------+--------+-----+



In [None]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# The "count" row is the number of non-null values, so columns whose count is
# below the row total contain missing data.

summary_stats = titanic_train.describe()
summary_stats

summary,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
mean,446.0,0.3838383838383838,2.308641975308642,,,29.69911764705882,0.5230078563411896,0.3815937149270482,260318.54916792738,32.2042079685746,,
stddev,257.3538420152301,0.4865924542648575,0.8360712409770491,,,14.526497332334037,1.1027434322934315,0.8060572211299488,471609.26868834975,49.69342859718089,,
min,1.0,0.0,1.0,"""Andersson, Mr. A...",female,0.42,0.0,0.0,110152,0.0,A10,C
max,891.0,1.0,3.0,"van Melkebeke, Mr...",male,80.0,8.0,6.0,WE/P 5735,512.3292,T,S


In [None]:
# Remove the columns that are not used as model inputs

dropped_columns = ["PassengerId", "Name", "Ticket", "Cabin", "Embarked"]
titanic_train = titanic_train.drop(*dropped_columns)
titanic_train.show()

+--------+------+------+----+-----+-----+-------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|
+--------+------+------+----+-----+-----+-------+
|       0|     3|  male|22.0|    1|    0|   7.25|
|       1|     1|female|38.0|    1|    0|71.2833|
|       1|     3|female|26.0|    0|    0|  7.925|
|       1|     1|female|35.0|    1|    0|   53.1|
|       0|     3|  male|35.0|    0|    0|   8.05|
|       0|     3|  male|null|    0|    0| 8.4583|
|       0|     1|  male|54.0|    0|    0|51.8625|
|       0|     3|  male| 2.0|    3|    1| 21.075|
|       1|     3|female|27.0|    0|    2|11.1333|
|       1|     2|female|14.0|    1|    0|30.0708|
|       1|     3|female| 4.0|    1|    1|   16.7|
|       1|     1|female|58.0|    0|    0|  26.55|
|       0|     3|  male|20.0|    0|    0|   8.05|
|       0|     3|  male|39.0|    1|    5| 31.275|
|       0|     3|female|14.0|    0|    0| 7.8542|
|       1|     2|female|55.0|    0|    0|   16.0|
|       0|     3|  male| 2.0|    4|    1| 29.125|


In [None]:
from pyspark.sql import functions as f

# FamilySize = SibSp (siblings/spouses) + Parch (parents/children)
family_size = f.col("SibSp") + f.col("Parch")

# Add the combined column and drop the two source columns in one chain
titanic_train = (
    titanic_train
    .withColumn("FamilySize", family_size)
    .drop("SibSp", "Parch")
)

titanic_train.show()

+--------+------+------+----+-------+----------+
|Survived|Pclass|   Sex| Age|   Fare|FamilySize|
+--------+------+------+----+-------+----------+
|       0|     3|  male|22.0|   7.25|         1|
|       1|     1|female|38.0|71.2833|         1|
|       1|     3|female|26.0|  7.925|         0|
|       1|     1|female|35.0|   53.1|         1|
|       0|     3|  male|35.0|   8.05|         0|
|       0|     3|  male|null| 8.4583|         0|
|       0|     1|  male|54.0|51.8625|         0|
|       0|     3|  male| 2.0| 21.075|         4|
|       1|     3|female|27.0|11.1333|         2|
|       1|     2|female|14.0|30.0708|         1|
|       1|     3|female| 4.0|   16.7|         2|
|       1|     1|female|58.0|  26.55|         0|
|       0|     3|  male|20.0|   8.05|         0|
|       0|     3|  male|39.0| 31.275|         6|
|       0|     3|female|14.0| 7.8542|         0|
|       1|     2|female|55.0|   16.0|         0|
|       0|     3|  male| 2.0| 29.125|         5|
|       1|     2|  m

In [None]:
# Encode the string column "Sex" as a numeric column "SexIndex"
# (in the recorded output, male maps to 0.0 and female to 1.0)
from pyspark.ml.feature import StringIndexer

stringIndexer = StringIndexer(outputCol="SexIndex", inputCol="Sex")
myFit = stringIndexer.fit(titanic_train)

# Append the index column, then discard the original string column
titanic_train = myFit.transform(titanic_train).drop("Sex")

titanic_train.show()

+--------+------+----+-------+----------+--------+
|Survived|Pclass| Age|   Fare|FamilySize|SexIndex|
+--------+------+----+-------+----------+--------+
|       0|     3|22.0|   7.25|         1|     0.0|
|       1|     1|38.0|71.2833|         1|     1.0|
|       1|     3|26.0|  7.925|         0|     1.0|
|       1|     1|35.0|   53.1|         1|     1.0|
|       0|     3|35.0|   8.05|         0|     0.0|
|       0|     3|null| 8.4583|         0|     0.0|
|       0|     1|54.0|51.8625|         0|     0.0|
|       0|     3| 2.0| 21.075|         4|     0.0|
|       1|     3|27.0|11.1333|         2|     1.0|
|       1|     2|14.0|30.0708|         1|     1.0|
|       1|     3| 4.0|   16.7|         2|     1.0|
|       1|     1|58.0|  26.55|         0|     1.0|
|       0|     3|20.0|   8.05|         0|     0.0|
|       0|     3|39.0| 31.275|         6|     0.0|
|       0|     3|14.0| 7.8542|         0|     1.0|
|       1|     2|55.0|   16.0|         0|     1.0|
|       0|     3| 2.0| 29.125| 

In [None]:
# Fill missing "Age" values with the mean age.
# NOTE(review): the mean is computed over all rows, before the train/test split
# performed later in the notebook, so held-out rows influence the imputed value.

avg_age = titanic_train.agg(f.avg("Age")).first()[0]
titanic_train = titanic_train.fillna({"Age": avg_age})

titanic_train.show()

+--------+------+-----------------+-------+----------+--------+
|Survived|Pclass|              Age|   Fare|FamilySize|SexIndex|
+--------+------+-----------------+-------+----------+--------+
|       0|     3|             22.0|   7.25|         1|     0.0|
|       1|     1|             38.0|71.2833|         1|     1.0|
|       1|     3|             26.0|  7.925|         0|     1.0|
|       1|     1|             35.0|   53.1|         1|     1.0|
|       0|     3|             35.0|   8.05|         0|     0.0|
|       0|     3|29.69911764705882| 8.4583|         0|     0.0|
|       0|     1|             54.0|51.8625|         0|     0.0|
|       0|     3|              2.0| 21.075|         4|     0.0|
|       1|     3|             27.0|11.1333|         2|     1.0|
|       1|     2|             14.0|30.0708|         1|     1.0|
|       1|     3|              4.0|   16.7|         2|     1.0|
|       1|     1|             58.0|  26.55|         0|     1.0|
|       0|     3|             20.0|   8.

## Logistic Regression Procedure and Prediction Model Evaluation

In [None]:
# Assemble every predictor column into a single "features" vector column

from pyspark.ml.feature import VectorAssembler

# Pick the inputs by name (everything except the label) instead of slicing
# columns[1:], which silently assumed "Survived" is the first column.
# Column order is preserved, so the vector layout is unchanged for the current frame.
feature_cols = [c for c in titanic_train.columns if c != "Survived"]

feature = VectorAssembler(inputCols=feature_cols, outputCol="features")
feature_vector = feature.transform(titanic_train)

feature_vector.show()

+--------+------+-----------------+-------+----------+--------+--------------------+
|Survived|Pclass|              Age|   Fare|FamilySize|SexIndex|            features|
+--------+------+-----------------+-------+----------+--------+--------------------+
|       0|     3|             22.0|   7.25|         1|     0.0|[3.0,22.0,7.25,1....|
|       1|     1|             38.0|71.2833|         1|     1.0|[1.0,38.0,71.2833...|
|       1|     3|             26.0|  7.925|         0|     1.0|[3.0,26.0,7.925,0...|
|       1|     1|             35.0|   53.1|         1|     1.0|[1.0,35.0,53.1,1....|
|       0|     3|             35.0|   8.05|         0|     0.0|[3.0,35.0,8.05,0....|
|       0|     3|29.69911764705882| 8.4583|         0|     0.0|[3.0,29.699117647...|
|       0|     1|             54.0|51.8625|         0|     0.0|[1.0,54.0,51.8625...|
|       0|     3|              2.0| 21.075|         4|     0.0|[3.0,2.0,21.075,4...|
|       1|     3|             27.0|11.1333|         2|     1.0|[3

In [None]:
# Split the assembled rows into a training set and a test set
# using 0.8 / 0.2 weights and a fixed seed of 56

training, test = feature_vector.randomSplit(weights=[0.8, 0.2], seed=56)

In [None]:
# Fit a logistic-regression classifier and predict on the test split
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Estimator: "Survived" is the label, "features" is the assembled vector column
lr = LogisticRegression(featuresCol="features", labelCol="Survived")

# Accuracy evaluator; built here and used by the cell that reports the accuracy
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="Survived",
)

# Train on the training split, then score the test split
lrModel = lr.fit(training)
lr_prediction = lrModel.transform(test)

# Show up to 50 predictions beside the true label, rows with Survived = 1 first
(
    lr_prediction
    .select("prediction", "Survived", "features")
    .orderBy("Survived", ascending=False)
    .show(50)
)

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       1.0|       1|[1.0,15.0,211.337...|
|       1.0|       1|[3.0,5.0,12.475,0...|
|       1.0|       1|[1.0,16.0,39.4,1....|
|       1.0|       1|[1.0,17.0,57.0,1....|
|       1.0|       1|[1.0,18.0,79.65,2...|
|       1.0|       1|[1.0,19.0,91.0792...|
|       1.0|       1|[1.0,23.0,113.275...|
|       1.0|       1|[1.0,23.0,263.0,5...|
|       1.0|       1|[1.0,29.699117647...|
|       1.0|       1|[1.0,29.699117647...|
|       1.0|       1|[1.0,29.699117647...|
|       1.0|       1|[1.0,29.699117647...|
|       1.0|       1|[1.0,29.699117647...|
|       0.0|       1|[1.0,31.0,57.0,1....|
|       0.0|       1|[1.0,34.0,26.55,0...|
|       1.0|       1|[1.0,35.0,53.1,1....|
|       1.0|       1|[1.0,35.0,83.475,...|
|       1.0|       1|[1.0,35.0,135.633...|
|       0.0|       1|[1.0,36.0,26.2875...|
|       1.0|       1|[1.0,36.0,120.0,3...|
|       1.0

In [None]:
# Evaluate the test-split predictions and print the resulting accuracy

message_template = "Accuracy of this Logistic Regression model is %g"
lr_accuracy = evaluator.evaluate(lr_prediction)
print(message_template % lr_accuracy)

Accuracy of this Logistic Regression model is 0.8125
