### Import Libraries

Use Jupyter Notebook as Spark IDE

In [1]:
import findspark
findspark.init()

Import required libraries

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [3]:
sc=SparkContext.getOrCreate()
ss=SparkSession(sc)

### Load Data

Load data

In [4]:
data=ss.read.csv("../data/titanic.csv",inferSchema=True, header=True)

In [5]:
data.cache()

DataFrame[pclass: int, survived: int, name: string, sex: string, age: double, sibsp: int, parch: int, ticket: string, fare: double, cabin: string, embarked: string, boat: string, body: int, home.dest: string]

Get data structure

In [6]:
data.printSchema()

root
 |-- pclass: integer (nullable = true)
 |-- survived: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: double (nullable = true)
 |-- sibsp: integer (nullable = true)
 |-- parch: integer (nullable = true)
 |-- ticket: string (nullable = true)
 |-- fare: double (nullable = true)
 |-- cabin: string (nullable = true)
 |-- embarked: string (nullable = true)
 |-- boat: string (nullable = true)
 |-- body: integer (nullable = true)
 |-- home.dest: string (nullable = true)



Get first 5 records

In [7]:
data.show(5)

+------+--------+--------------------+------+------+-----+-----+------+--------+-------+--------+----+----+--------------------+
|pclass|survived|                name|   sex|   age|sibsp|parch|ticket|    fare|  cabin|embarked|boat|body|           home.dest|
+------+--------+--------------------+------+------+-----+-----+------+--------+-------+--------+----+----+--------------------+
|     1|       1|Allen, Miss. Elis...|female|  29.0|    0|    0| 24160|211.3375|     B5|       S|   2|null|        St Louis, MO|
|     1|       1|Allison, Master. ...|  male|0.9167|    1|    2|113781|  151.55|C22 C26|       S|  11|null|Montreal, PQ / Ch...|
|     1|       0|Allison, Miss. He...|female|   2.0|    1|    2|113781|  151.55|C22 C26|       S|null|null|Montreal, PQ / Ch...|
|     1|       0|Allison, Mr. Huds...|  male|  30.0|    1|    2|113781|  151.55|C22 C26|       S|null| 135|Montreal, PQ / Ch...|
|     1|       0|Allison, Mrs. Hud...|female|  25.0|    1|    2|113781|  151.55|C22 C26|       S|

In [8]:
data=data[['pclass','survived','sex','age','sibsp','parch','fare','embarked','cabin']]

In [9]:
data.show(5)

+------+--------+------+------+-----+-----+--------+--------+-------+
|pclass|survived|   sex|   age|sibsp|parch|    fare|embarked|  cabin|
+------+--------+------+------+-----+-----+--------+--------+-------+
|     1|       1|female|  29.0|    0|    0|211.3375|       S|     B5|
|     1|       1|  male|0.9167|    1|    2|  151.55|       S|C22 C26|
|     1|       0|female|   2.0|    1|    2|  151.55|       S|C22 C26|
|     1|       0|  male|  30.0|    1|    2|  151.55|       S|C22 C26|
|     1|       0|female|  25.0|    1|    2|  151.55|       S|C22 C26|
+------+--------+------+------+-----+-----+--------+--------+-------+
only showing top 5 rows



In [10]:
data=data.dropna()

In [11]:
data.show(5)

+------+--------+------+------+-----+-----+--------+--------+-------+
|pclass|survived|   sex|   age|sibsp|parch|    fare|embarked|  cabin|
+------+--------+------+------+-----+-----+--------+--------+-------+
|     1|       1|female|  29.0|    0|    0|211.3375|       S|     B5|
|     1|       1|  male|0.9167|    1|    2|  151.55|       S|C22 C26|
|     1|       0|female|   2.0|    1|    2|  151.55|       S|C22 C26|
|     1|       0|  male|  30.0|    1|    2|  151.55|       S|C22 C26|
|     1|       0|female|  25.0|    1|    2|  151.55|       S|C22 C26|
+------+--------+------+------+-----+-----+--------+--------+-------+
only showing top 5 rows



### Preprocess Data

Create Feature

In [12]:
feature=['pclass','sibsp','parch']

In [13]:
from pyspark.ml.feature import VectorAssembler

In [14]:
assembler=VectorAssembler(inputCols=feature, outputCol="features")
new_df=assembler.transform(data)
new_df.show(5)

+------+--------+------+------+-----+-----+--------+--------+-------+-------------+
|pclass|survived|   sex|   age|sibsp|parch|    fare|embarked|  cabin|     features|
+------+--------+------+------+-----+-----+--------+--------+-------+-------------+
|     1|       1|female|  29.0|    0|    0|211.3375|       S|     B5|[1.0,0.0,0.0]|
|     1|       1|  male|0.9167|    1|    2|  151.55|       S|C22 C26|[1.0,1.0,2.0]|
|     1|       0|female|   2.0|    1|    2|  151.55|       S|C22 C26|[1.0,1.0,2.0]|
|     1|       0|  male|  30.0|    1|    2|  151.55|       S|C22 C26|[1.0,1.0,2.0]|
|     1|       0|female|  25.0|    1|    2|  151.55|       S|C22 C26|[1.0,1.0,2.0]|
+------+--------+------+------+-----+-----+--------+--------+-------+-------------+
only showing top 5 rows



In [15]:
new_df=new_df[['features','survived']]
new_df.show(5)

+-------------+--------+
|     features|survived|
+-------------+--------+
|[1.0,0.0,0.0]|       1|
|[1.0,1.0,2.0]|       1|
|[1.0,1.0,2.0]|       0|
|[1.0,1.0,2.0]|       0|
|[1.0,1.0,2.0]|       0|
+-------------+--------+
only showing top 5 rows



Split data into training and testing sets

In [16]:
train,test=new_df.randomSplit([0.7,0.3])

In [17]:
train.show(5),test.show(5)

+-------------+--------+
|     features|survived|
+-------------+--------+
|[1.0,0.0,0.0]|       0|
|[1.0,0.0,0.0]|       0|
|[1.0,0.0,0.0]|       0|
|[1.0,0.0,0.0]|       0|
|[1.0,0.0,0.0]|       0|
+-------------+--------+
only showing top 5 rows

+-------------+--------+
|     features|survived|
+-------------+--------+
|[1.0,0.0,0.0]|       0|
|[1.0,0.0,0.0]|       0|
|[1.0,0.0,0.0]|       0|
|[1.0,0.0,0.0]|       0|
|[1.0,0.0,0.0]|       0|
+-------------+--------+
only showing top 5 rows



(None, None)

### Build ML Model

In [18]:
from pyspark.ml.classification import RandomForestClassifier

In [19]:
model=RandomForestClassifier(labelCol='survived').fit(train)

In [20]:
test_model=model.transform(train)

### Evaliate ML Model

In [21]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [22]:
print("Accuracy :")
MulticlassClassificationEvaluator(labelCol='survived',metricName='accuracy').evaluate(test_model)

Accuracy :


0.6684210526315789

In [23]:
print("Precision :")
MulticlassClassificationEvaluator(labelCol='survived', metricName='weightedPrecision').evaluate(test_model)

Precision :


0.6678487663007786

In [28]:
test_model.show(5)

+-------------+--------+--------------------+--------------------+----------+
|     features|survived|       rawPrediction|         probability|prediction|
+-------------+--------+--------------------+--------------------+----------+
|[1.0,0.0,0.0]|       0|[9.09553343072947...|[0.45477667153647...|       1.0|
|[1.0,0.0,0.0]|       0|[9.09553343072947...|[0.45477667153647...|       1.0|
|[1.0,0.0,0.0]|       0|[9.09553343072947...|[0.45477667153647...|       1.0|
|[1.0,0.0,0.0]|       0|[9.09553343072947...|[0.45477667153647...|       1.0|
|[1.0,0.0,0.0]|       0|[9.09553343072947...|[0.45477667153647...|       1.0|
+-------------+--------+--------------------+--------------------+----------+
only showing top 5 rows

