In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://mirrors.sonic.net/apache/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xzf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"


import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [None]:
# Both Titanic CSVs are read the same way: first row is the header and
# column types are inferred from the data.
csv_options = {'inferSchema': 'True', 'header': 'True'}

train = (
    spark.read.format('csv')
    .options(**csv_options)
    .load('/content/drive/MyDrive/Colab Notebooks/Titanic/train.csv')
)
test = (
    spark.read.format('csv')
    .options(**csv_options)
    .load('/content/drive/MyDrive/Colab Notebooks/Titanic/test.csv')
)

In [None]:
# Keep only the label and the columns used as model inputs, then discard
# every row that has a null in any of them.
selected_columns = [
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]
mycols = train.select(selected_columns)
final_train = mycols.na.drop()


In [None]:
# Preview the cleaned training rows (same as the show() defaults).
final_train.show(n=20, truncate=True)

+--------+------+------+----+-----+-----+-------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+--------+------+------+----+-----+-----+-------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|
|       1|     1|female|38.0|    1|    0|71.2833|       C|
|       1|     3|female|26.0|    0|    0|  7.925|       S|
|       1|     1|female|35.0|    1|    0|   53.1|       S|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|
|       0|     1|  male|54.0|    0|    0|51.8625|       S|
|       0|     3|  male| 2.0|    3|    1| 21.075|       S|
|       1|     3|female|27.0|    0|    2|11.1333|       S|
|       1|     2|female|14.0|    1|    0|30.0708|       C|
|       1|     3|female| 4.0|    1|    1|   16.7|       S|
|       1|     1|female|58.0|    0|    0|  26.55|       S|
|       0|     3|  male|20.0|    0|    0|   8.05|       S|
|       0|     3|  male|39.0|    1|    5| 31.275|       S|
|       0|     3|female|14.0|    0|    0| 7.8542|       

In [None]:
from pyspark.ml.feature import (
    VectorAssembler,
    OneHotEncoder,
    VectorIndexer,
    StringIndexer,
)

# Step 1 for each categorical column: map the string value to a numeric index.
gender_indexer = StringIndexer(outputCol='SexIndex', inputCol='Sex')
embarked_indexer = StringIndexer(outputCol='EmbarkedIndex', inputCol='Embarked')

# Step 2: one-hot encode the numeric index into a vector column.
gender_encoder = OneHotEncoder(outputCol='SexVec', inputCol='SexIndex')
embarked_encoder = OneHotEncoder(outputCol='EmbarkedVec', inputCol='EmbarkedIndex')

In [None]:
# Columns combined (in this order) into the single `features` vector
# that the classifier reads.
assembler_inputs = [
    'Pclass',
    'SexVec',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'EmbarkedVec',
]
assembler = VectorAssembler(outputCol='features', inputCols=assembler_inputs)

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression predicting `Survived` from the assembled `features` vector.
lr = LogisticRegression(
    labelCol='Survived',
    featuresCol='features',
)

In [None]:
from pyspark.ml import Pipeline

# Stage order matters: index the categoricals, one-hot encode the indices,
# assemble the feature vector, then fit the classifier.
pipeline_stages = [
    gender_indexer,
    embarked_indexer,
    gender_encoder,
    embarked_encoder,
    assembler,
    lr,
]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Fit every pipeline stage on the cleaned training data.
model = pipeline.fit(dataset=final_train)

In [None]:
# Drop only the rows that are missing a value the model actually needs.
# A bare na.drop() would also discard rows with nulls in columns the pipeline
# never reads (e.g. Cabin), unlike the training data, which was filtered
# after selecting the model columns.
model_input_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
final_test = test.na.drop(subset=model_input_cols)

In [None]:
# Run the fitted pipeline over the cleaned test rows.
predictions = model.transform(dataset=final_test)

In [None]:
# Preview the scored test rows (same as the show() defaults).
predictions.show(n=20, truncate=True)

+-----------+------+--------------------+------+----+-----+-----+-----------+-------+---------------+--------+--------+-------------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|PassengerId|Pclass|                Name|   Sex| Age|SibSp|Parch|     Ticket|   Fare|          Cabin|Embarked|SexIndex|EmbarkedIndex|       SexVec|  EmbarkedVec|            features|       rawPrediction|         probability|prediction|
+-----------+------+--------------------+------+----+-----+-----+-----------+-------+---------------+--------+--------+-------------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|        904|     1|Snyder, Mrs. John...|female|23.0|    1|    0|      21228|82.2667|            B45|       S|     1.0|          0.0|    (1,[],[])|(2,[0],[1.0])|[1.0,0.0,23.0,1.0...|[-2.7944551541697...|[0.05762454542235...|       1.0|
|        906|     1|Chaffee, Mrs. Her...|female|47.0|   

As final_test has no label, it cannot be used to evaluate the quality of the classifier.

So we refit the pipeline on a random 70% of the labelled training set and evaluate it on the remaining 30%.

In [None]:
# Hold out 30% of the labelled rows for evaluation. The seed pins the random
# assignment so a re-run starts from the same split request.
# The splits get their own names so the raw `train` / `test` DataFrames loaded
# earlier are not overwritten; re-running earlier cells stays safe.
train_split, test_split = final_train.randomSplit([0.7, 0.3], seed=42)
model = pipeline.fit(train_split)
# The held-out rows still carry `Survived`, so predictions can be scored.
predictionAndLabels = model.transform(test_split)


In [None]:
from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator,
)

# Accuracy: fraction of held-out rows whose hard 0/1 prediction equals the label.
# BinaryClassificationEvaluator has no accuracy metric (its default is
# areaUnderROC), so the multiclass evaluator is used for `acc`.
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='Survived',
    metricName='accuracy',
)
acc = evaluator.evaluate(predictionAndLabels)

# Area under the ROC curve, computed from the model's continuous scores.
# Feeding the hard `prediction` column here would collapse the curve to a
# single threshold.
auc_evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Survived',
    metricName='areaUnderROC',
)
auc = auc_evaluator.evaluate(predictionAndLabels)


In [None]:
# Display the value stored in `acc` as the cell output.
acc

0.783564238014552