In [11]:
import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

print([(x.__name__,x.__version__) for x in [np, pd, pyspark]])

spark = pyspark.sql.SparkSession.builder.appName('titanic').getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
sc.setLogLevel("INFO")

[('numpy', '1.21.6'), ('pandas', '1.3.5'), ('pyspark', '3.2.1')]




In [12]:
from pyspark.ml.feature import (VectorAssembler, VectorIndexer,
                               OneHotEncoder, StringIndexer)

from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [14]:
df = spark.read.csv('titanic-training-data.csv',header=True,inferSchema=True)
print(df.count())
df.show()

891
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|  

In [15]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [16]:
print(df.columns)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [17]:
my_cols = [ 'Survived', 'Pclass', 'Sex', 'Age',
           'SibSp', 'Parch', 'Fare', 'Embarked']
df = df.select(my_cols)

In [18]:
my_final_data = df.dropna()

In [19]:
from pyspark.ml.feature import (VectorAssembler, VectorIndexer,
                               OneHotEncoder, StringIndexer)

In [20]:
gender_indexer = StringIndexer(inputCol='Sex', outputCol='Sex_index')

gender_encoder = OneHotEncoder(inputCol='Sex_index', outputCol='Sex_vec')

In [21]:
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='Embarked_index')

embark_encoder = OneHotEncoder(inputCol='Embarked_index', outputCol='Embarked_vec')

In [22]:
my_final_data.columns

['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [23]:
assembler = VectorAssembler(inputCols=['Pclass','Sex_vec','Embarked_vec',
                                      'Age','SibSp','Parch','Fare'],
                           outputCol='features')


In [24]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline


In [25]:
lr = LogisticRegression(featuresCol='features',labelCol='Survived')

In [26]:
pipeline = Pipeline(stages=[gender_indexer, embark_indexer,
                           gender_encoder, embark_encoder,
                           assembler, lr])

In [27]:
train, test = my_final_data.randomSplit([0.7, 0.3])

In [28]:
lr_model = pipeline.fit(train)

In [30]:
results = lr_model.transform(test)
results.show(5)

+--------+------+------+----+-----+-----+-------+--------+---------+--------------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|Sex_index|Embarked_index|      Sex_vec| Embarked_vec|            features|       rawPrediction|         probability|prediction|
+--------+------+------+----+-----+-----+-------+--------+---------+--------------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|       0|     1|female|25.0|    1|    2| 151.55|       S|      1.0|           0.0|    (1,[],[])|(2,[0],[1.0])|[1.0,0.0,1.0,0.0,...|[-2.9124863409042...|[0.05153975902903...|       1.0|
|       0|     1|female|50.0|    0|    0|28.7125|       C|      1.0|           1.0|    (1,[],[])|(2,[1],[1.0])|(8,[0,3,4,7],[1.0...|[-2.2222272778996...|[0.09777215387463...|       1.0|
|       0|     1|  male|19.0|    3|    2|  263.0|       S|      0.0|  

In [31]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [32]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='Survived')

In [33]:
results.select('Survived','prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [34]:
auc = my_eval.evaluate(results)
auc

0.7866735537190083