In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import corr
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [3]:
sc = SparkContext()
spark = SparkSession(sc)

In [4]:
data = spark.read.csv('./titanic.csv',inferSchema=True,header=True)

In [5]:
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [6]:
data.show(3)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 3 rows



### Pre-processing

In [7]:
from pyspark.ml.feature import VectorAssembler,VectorIndexer,OneHotEncoder,StringIndexer

In [8]:
my_cols = data.drop('PassengerId','Name','Ticket','Cabin') #Drop cols no meaning

In [9]:
my_cols = my_cols.drop_duplicates()
final_data = my_cols.na.drop()

In [10]:
final_data.columns

['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [11]:
#Sex column

In [12]:
Sex_index = StringIndexer(inputCol='Sex',outputCol='Sex_index')

In [13]:
Sex_1hot = OneHotEncoder(inputCol='Sex_index',outputCol='Sex_1hot')

In [14]:
#Embarked

In [15]:
Embarked_index = StringIndexer(inputCol='Embarked',outputCol='Embarked_index')

In [16]:
Embarked_1hot = OneHotEncoder(inputCol='Embarked_index',outputCol='Embarked_1hot')

In [17]:
assembler = VectorAssembler(inputCols=['Survived', 'Pclass',
                                       'Sex_1hot', 'Age', 'SibSp',
                                       'Parch', 'Fare', 'Embarked_1hot'],
                           outputCol='features')

In [18]:
from pyspark.ml import Pipeline

In [19]:
model = LogisticRegression(featuresCol='features',labelCol='Survived')

In [20]:
pipeline = Pipeline(stages=[Sex_index,Sex_1hot,Embarked_index,Embarked_1hot,assembler,model])

In [21]:
train_set, test_set = final_data.randomSplit([0.7,0.3])

In [22]:
trained_model = pipeline.fit(train_set)

In [23]:
result = trained_model.transform(test_set)

In [25]:
result.select('Survived','prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       0.0|
|       1|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       1|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       1|       1.0|
|       0|       0.0|
|       1|       1.0|
|       1|       1.0|
|       0|       0.0|
|       1|       1.0|
|       1|       1.0|
|       1|       1.0|
|       0|       0.0|
|       1|       1.0|
|       1|       1.0|
+--------+----------+
only showing top 20 rows



In [26]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [27]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Survived')

In [29]:
my_eval.evaluate(result)

1.0