## Importing libraries

In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.\
        builder.\
        appName('Titanic').\
        getOrCreate()

## Preparing the Titanic data

### Importing the data

In [3]:
df = spark.read.csv('titanic.csv', header = True, inferSchema = True)

In [4]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



### Selecting columns and dropping missing values

In [5]:
my_cols = df.select(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])
data = my_cols.na.drop()

### Handling Categorical Data

In [6]:
from pyspark.ml.feature import (VectorAssembler, VectorIndexer,
                                OneHotEncoder, StringIndexer)
# For gender column
gender_indexer = StringIndexer(inputCol = 'Sex', outputCol = 'SexIndex')
gender_encoder = OneHotEncoder(inputCol = 'SexIndex', outputCol='SexVec')

# For Embarked column
embark_indexer = StringIndexer(inputCol = 'Embarked', outputCol = 'EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol = 'EmbarkIndex', outputCol = 'EmbarkVec')

# Creating an assembler
assembler = VectorAssembler(inputCols = ['Pclass', 'SexVec', 'EmbarkVec', 'Age', 'SibSp', 'Parch', 'Fare'],
                            outputCol = 'features')

### Building the Logistic regression model

In [7]:
from pyspark.ml.classification import LogisticRegression
model = LogisticRegression(featuresCol = 'features', labelCol = 'Survived')

### Creating the Pipeline

In [8]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = [gender_indexer, embark_indexer,
                              gender_encoder, embark_encoder,
                              assembler, model])

### Splitting the data

In [9]:
train_data, test_data = data.randomSplit([0.7, 0.3])

### Training and predicting with Logistic Regression

In [10]:
fit_model = pipeline.fit(train_data)
results = fit_model.transform(test_data)
results.select('Survived', 'prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



### Evaluating the model

In [11]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator as BCE
my_eval = BCE(rawPredictionCol = 'prediction', labelCol = 'Survived', metricName='areaUnderROC')
my_eval.evaluate(results)

0.7719979296066254