In [0]:

%python

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean,col,split, col, regexp_extract, when, lit,countDistinct
from pyspark.ml.feature import StringIndexer


# Create (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("Stanislas Titanic ML version on Spark ")
    .getOrCreate()
)

# Location of the Titanic training CSV.
titanic_train_file_path = "dbfs:/FileStore/train.csv"

# Read the CSV with a header row and let Spark infer the column types.
# Real booleans are passed instead of the strings 'True'.
titanic_df = spark.read.csv(titanic_train_file_path, header=True, inferSchema=True)

titanic_df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [0]:

# Count the rows and verify that every PassengerId is unique.

passengers_row = titanic_df.count()
print(f"number of row: {passengers_row}")  # 891.

# One-row DataFrame holding the number of distinct passenger ids.
unique_id_count = countDistinct('PassengerId').alias('number of unique passenger')
passengers_count = titanic_df.select(unique_id_count)

passengers_count.show()  # 891 -> ids are unique



number of row: 891
+--------------------------+
|number of unique passenger|
+--------------------------+
|                       891|
+--------------------------+



In [0]:
# Summary statistics for every column of the raw DataFrame.
print("SUM UP :")
summary_df = titanic_df.describe()
summary_df.show()



SUM UP :
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738|

In [0]:
# Show the column names and the types Spark inferred when reading the CSV.
schema_title = "DF SCHEMA :"
print(schema_title)
titanic_df.printSchema()

DF SCHEMA :
root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [0]:

# Preview the label next to the passenger class and embarkation port.
label_preview_df = titanic_df.select("Survived", "Pclass", "Embarked")
label_preview_df.show()

+--------+------+--------+
|Survived|Pclass|Embarked|
+--------+------+--------+
|       0|     3|       S|
|       1|     1|       C|
|       1|     3|       S|
|       1|     1|       S|
|       0|     3|       S|
|       0|     3|       Q|
|       0|     1|       S|
|       0|     3|       S|
|       1|     3|       S|
|       1|     2|       C|
|       1|     3|       S|
|       1|     1|       S|
|       0|     3|       S|
|       0|     3|       S|
|       0|     3|       S|
|       1|     2|       S|
|       0|     3|       Q|
|       1|     2|       S|
|       0|     3|       S|
|       1|     3|       C|
+--------+------+--------+
only showing top 20 rows



In [0]:
# Overall number of survivors (1) and non-survivors (0).
print("Analyze survivor number")
titanic_df.groupBy("Survived").count().show()

# Survival split by sex. A single sort is enough (the original sorted on 'Sex'
# twice); 'Survived' is a secondary key so the row order is deterministic.
print("Sex Repartition")
titanic_df.groupBy("Sex", "Survived").count().orderBy("Sex", "Survived").show()

# Survival split by passenger class, with the same deterministic ordering.
print('Pclass repartition')
titanic_df.groupBy("Pclass", "Survived").count().orderBy("Pclass", "Survived").show()



Analyze survivor number
+--------+-----+
|Survived|count|
+--------+-----+
|       1|  342|
|       0|  549|
+--------+-----+

Sex Repartition
+------+--------+-----+
|   Sex|Survived|count|
+------+--------+-----+
|female|       1|  233|
|female|       0|   81|
|  male|       0|  468|
|  male|       1|  109|
+------+--------+-----+

Pclass repartition
+------+--------+-----+
|Pclass|Survived|count|
+------+--------+-----+
|     1|       0|   80|
|     1|       1|  136|
|     2|       1|   87|
|     2|       0|   97|
|     3|       1|  119|
|     3|       0|  372|
+------+--------+-----+



In [0]:
# Count the null entries of every column (one Spark count per column).
null_counts = []
for column in titanic_df.columns:
    rows_with_null = titanic_df.where(titanic_df[column].isNull())
    null_counts.append((column, rows_with_null.count()))

# Report the result column by column.
for column, count in null_counts:
    print(f"Column '{column}' has {count} null values.")

Column 'PassengerId' has 0 null values.
Column 'Survived' has 0 null values.
Column 'Pclass' has 0 null values.
Column 'Name' has 0 null values.
Column 'Sex' has 0 null values.
Column 'Age' has 177 null values.
Column 'SibSp' has 0 null values.
Column 'Parch' has 0 null values.
Column 'Ticket' has 0 null values.
Column 'Fare' has 0 null values.
Column 'Cabin' has 687 null values.
Column 'Embarked' has 2 null values.


In [0]:

##########################################
#                                        #
#             Feature Engineering        #
#                                        #
##########################################


#
#             FIX Feature: AGE
#

# Extract the title ("Mr", "Mrs", "Miss", ...) from Name: the run of letters
# immediately followed by a dot. A raw string is used so that `\.` is not an
# invalid Python escape sequence; the regex itself is unchanged.
titanic_df = titanic_df.withColumn("Initial", regexp_extract(col("Name"), r"([A-Za-z]+)\.", 1))

titanic_df.show()
# List the distinct titles found in the data.
titanic_df.select("Initial").distinct().show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Initial|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|     Mr|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|    Mrs|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|   Miss|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|    Mrs|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|     Mr|
|          6|   

In [0]:


# Collapse rare titles into the main groups (Miss / Mr / Mrs / Other).
# A dict keeps each title next to its replacement instead of relying on two
# parallel lists staying aligned.
title_replacements = {
    'Mlle': 'Miss',
    'Mme': 'Miss',
    'Ms': 'Miss',
    'Dr': 'Mr',
    'Major': 'Mr',
    'Lady': 'Mrs',
    'Countess': 'Mrs',
    'Jonkheer': 'Other',
    'Col': 'Other',
    'Rev': 'Other',
    'Capt': 'Mr',
    'Sir': 'Mr',
    'Don': 'Mr',
}

# Restrict the replacement to "Initial"; without `subset`, replace() would also
# rewrite any other string column whose value happens to equal one of the keys.
titanic_df = titanic_df.replace(title_replacements, subset=["Initial"])

titanic_df.select("Initial").distinct().show()

+-------+
|Initial|
+-------+
|   Miss|
|  Other|
| Master|
|     Mr|
|    Mrs|
+-------+



In [0]:
# Fill missing ages with a fixed age per title, leaving known ages untouched.
# The (title, age) pairs are applied in this exact order.
age_by_title = [
    ("Miss", 22),
    ("Other", 46),
    ("Master", 5),
    ("Mr", 33),
    ("Mrs", 36),
]

for title, fill_age in age_by_title:
    # Only rows with this title AND no recorded age are filled.
    needs_fill = (titanic_df["Initial"] == title) & (titanic_df["Age"].isNull())
    titanic_df = titanic_df.withColumn(
        "Age",
        when(needs_fill, fill_age).otherwise(titanic_df["Age"]),
    )

titanic_df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Initial|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|     Mr|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|    Mrs|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|   Miss|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|    Mrs|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|     Mr|
|          6|   

In [0]:
#
#             FIX Feature: Embarked
#

# Distribution of embarkation ports, including the null bucket.
embarked_counts = titanic_df.groupBy("Embarked").count()
embarked_counts.show()

# Fill missing ports with 'S', the port most passengers embarked from.
titanic_df = titanic_df.fillna({"Embarked": 'S'})




+--------+-----+
|Embarked|count|
+--------+-----+
|       Q|   77|
|    null|    2|
|       C|  168|
|       S|  644|
+--------+-----+



In [0]:
#
#           REWORK FARE
#

# Create Fareband: bucket Fare into four bands delimited by its approximate
# quartiles (relative error 0.01).
quantiles = titanic_df.approxQuantile("Fare", [0.25, 0.5, 0.75], 0.01)

print(quantiles)

# One `when` chain replaces four sequential rewrites that temporarily stored
# the raw Fare in Fareband. Branches are tested in order, so each one only
# sees fares not matched by the previous branch:
#   1: Fare < Q1,  2: Q1 <= Fare < Q2,  3: Q2 <= Fare < Q3,  4: Fare >= Q3
# A null Fare matches no branch and yields a null Fareband, as before.
# Column names use one consistent case so the cell also works when
# spark.sql.caseSensitive is enabled; the cast keeps Fareband a double.
fare_col = col("Fare")
titanic_df = titanic_df.withColumn(
    "Fareband",
    when(fare_col < quantiles[0], 1)
    .when(fare_col < quantiles[1], 2)
    .when(fare_col < quantiles[2], 3)
    .when(fare_col >= quantiles[2], 4)
    .cast("double"),
)


titanic_df.show()

[7.8958, 14.4542, 30.5]
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Initial|Fareband|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-------+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|     Mr|     1.0|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|    Mrs|     4.0|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|   Miss|     2.0|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|    Mrs|     4.0|
|          5|       0|     3|Allen, Mr. Willia...|  

In [0]:


# Three schema changes applied in one chain:
#   * DROP Feature: Cabin       - too many null values to be exploitable
#   * ADD Feature:  Family_Size - siblings/spouses plus parents/children aboard
#   * ADD Feature:  Alone       - 1 when Family_Size is 0, otherwise 0
titanic_df = (
    titanic_df
    .drop("Cabin")
    .withColumn("Family_Size", col('SibSp') + col('Parch'))
    .withColumn("Alone", when(col("Family_Size") == 0, 1).otherwise(lit(0)))
)

titanic_df.show()




+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+--------+-------+--------+-----------+-----+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Embarked|Initial|Fareband|Family_Size|Alone|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+--------+-------+--------+-----------+-----+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25|       S|     Mr|     1.0|          1|    0|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|       C|    Mrs|     4.0|          1|    0|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925|       S|   Miss|     2.0|          0|    1|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1|       S|    Mrs|     4.0|          1

In [0]:

#
#            ENCODE Features: "Sex", "Embarked", "Initial"
#

# Fit one StringIndexer per categorical column; each writes a numeric
# "<column>_index" column.
indexers = []
for column in ["Sex", "Embarked", "Initial"]:
    indexer = StringIndexer(inputCol=column, outputCol=column+"_index")
    indexers.append(indexer.fit(titanic_df))

# Chain the fitted indexers and apply them in one pass.
pipeline = Pipeline(stages=indexers)
indexing_model = pipeline.fit(titanic_df)
titanic_df = indexing_model.transform(titanic_df)

#
#            DROP Unused Features
#

# Remove identifiers, free-text columns and the raw categorical columns that
# now have an indexed counterpart.
unused_columns = ["PassengerId", "Name", "Ticket", "Cabin", "Embarked", "Sex", "Initial"]
titanic_df = titanic_df.drop(*unused_columns)
titanic_df.show()




+--------+------+----+-----+-----+-------+--------+-----------+-----+---------+--------------+-------------+
|Survived|Pclass| Age|SibSp|Parch|   Fare|Fareband|Family_Size|Alone|Sex_index|Embarked_index|Initial_index|
+--------+------+----+-----+-----+-------+--------+-----------+-----+---------+--------------+-------------+
|       0|     3|22.0|    1|    0|   7.25|     1.0|          1|    0|      0.0|           0.0|          0.0|
|       1|     1|38.0|    1|    0|71.2833|     4.0|          1|    0|      1.0|           1.0|          2.0|
|       1|     3|26.0|    0|    0|  7.925|     2.0|          0|    1|      1.0|           0.0|          1.0|
|       1|     1|35.0|    1|    0|   53.1|     4.0|          1|    0|      1.0|           0.0|          2.0|
|       0|     3|35.0|    0|    0|   8.05|     2.0|          0|    1|      0.0|           0.0|          0.0|
|       0|     3|33.0|    0|    0| 8.4583|     2.0|          0|    1|      0.0|           2.0|          0.0|
|       0|     1|54

In [0]:
# Display the first 20 rows of the current DataFrame (show()'s default).
titanic_df.show(20)

+--------+------+----+-----+-----+-------+--------+-----------+-----+---------+--------------+-------------+
|Survived|Pclass| Age|SibSp|Parch|   Fare|Fareband|Family_Size|Alone|Sex_index|Embarked_index|Initial_index|
+--------+------+----+-----+-----+-------+--------+-----------+-----+---------+--------------+-------------+
|       0|     3|22.0|    1|    0|   7.25|     1.0|          1|    0|      0.0|           0.0|          0.0|
|       1|     1|38.0|    1|    0|71.2833|     4.0|          1|    0|      1.0|           1.0|          2.0|
|       1|     3|26.0|    0|    0|  7.925|     2.0|          0|    1|      1.0|           0.0|          1.0|
|       1|     1|35.0|    1|    0|   53.1|     4.0|          1|    0|      1.0|           0.0|          2.0|
|       0|     3|35.0|    0|    0|   8.05|     2.0|          0|    1|      0.0|           0.0|          0.0|
|       0|     3|33.0|    0|    0| 8.4583|     2.0|          0|    1|      0.0|           2.0|          0.0|
|       0|     1|54

In [0]:

#
#            Vectorize data
#

# Every column except the label is a feature. The label is excluded by name
# rather than by position (columns[1:]) so the cell does not silently depend
# on "Survived" being the first column.
label_column = "Survived"
feature_columns = [name for name in titanic_df.columns if name != label_column]

feature = VectorAssembler(inputCols=feature_columns, outputCol="features")
feature_vector = feature.transform(titanic_df)

feature_vector.show()

#
#            Train / test split
#

# 80% training / 20% test, with a fixed seed.
(trainingData, testData) = feature_vector.randomSplit([0.8, 0.2], seed=11)

+--------+------+----+-----+-----+-------+--------+-----------+-----+---------+--------------+-------------+--------------------+
|Survived|Pclass| Age|SibSp|Parch|   Fare|Fareband|Family_Size|Alone|Sex_index|Embarked_index|Initial_index|            features|
+--------+------+----+-----+-----+-------+--------+-----------+-----+---------+--------------+-------------+--------------------+
|       0|     3|22.0|    1|    0|   7.25|     1.0|          1|    0|      0.0|           0.0|          0.0|(11,[0,1,2,4,5,6]...|
|       1|     1|38.0|    1|    0|71.2833|     4.0|          1|    0|      1.0|           1.0|          2.0|[1.0,38.0,1.0,0.0...|
|       1|     3|26.0|    0|    0|  7.925|     2.0|          0|    1|      1.0|           0.0|          1.0|[3.0,26.0,0.0,0.0...|
|       1|     1|35.0|    1|    0|   53.1|     4.0|          1|    0|      1.0|           0.0|          2.0|[1.0,35.0,1.0,0.0...|
|       0|     3|35.0|    0|    0|   8.05|     2.0|          0|    1|      0.0|           

In [0]:
#
#            Logistic Regression
#

from pyspark.ml.classification import LogisticRegression

# Fit on the training split, then predict on the held-out test split.
lr = LogisticRegression(featuresCol="features", labelCol="Survived")
lrModel = lr.fit(trainingData)
lr_prediction = lrModel.transform(testData)
lr_prediction.select("prediction", "Survived", "features").show()

# Accuracy evaluator; the later model cells reuse this same object.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Survived",
    predictionCol="prediction",
    metricName="accuracy",
)

lr_accuracy = evaluator.evaluate(lr_prediction)
lr_test_error = 1.0 - lr_accuracy
print("Accuracy of LogisticRegression is = %g" % lr_accuracy)
print("Test Error of LogisticRegression = %g " % lr_test_error)
     

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       1.0|       0|(11,[0,1,4,5,7,9]...|
|       1.0|       0|(11,[0,1,4,5,7,9]...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,4,5,7,9]...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,5,7],[1....|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       1.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       1.0|       0|(11,[0,1,3,4,5,6]...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|[1.0,58.0,0.0,2.0...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7,9]...|
|       0.0|       0|[2.0,19.0,1.0,1.0...|
+----------

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree: fit on the training split, predict on the test split.
dt = DecisionTreeClassifier(featuresCol="features", labelCol="Survived")
dt_model = dt.fit(trainingData)
dt_prediction = dt_model.transform(testData)
dt_prediction.select("prediction", "Survived", "features").show()

# Score with the accuracy evaluator created in the logistic-regression cell.
dt_accuracy = evaluator.evaluate(dt_prediction)
dt_test_error = 1.0 - dt_accuracy
print("Accuracy of DecisionTreeClassifier is = %g" % dt_accuracy)
print("Test Error of DecisionTreeClassifier = %g " % dt_test_error)

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       0.0|       0|(11,[0,1,4,5,7,9]...|
|       0.0|       0|(11,[0,1,4,5,7,9]...|
|       1.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       1.0|       0|(11,[0,1,4,5,7,9]...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,5,7],[1....|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,3,4,5,6]...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|[1.0,58.0,0.0,2.0...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7,9]...|
|       0.0|       0|[2.0,19.0,1.0,1.0...|
+----------

In [0]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest: fit on the training split, predict on the test split.
rf = RandomForestClassifier(featuresCol="features", labelCol="Survived")
rf_model = rf.fit(trainingData)
rf_prediction = rf_model.transform(testData)
rf_prediction.select("prediction", "Survived", "features").show()

# Score with the accuracy evaluator created in the logistic-regression cell.
rf_accuracy = evaluator.evaluate(rf_prediction)
rf_test_error = 1.0 - rf_accuracy
print("Accuracy of RandomForestClassifier is = %g" % rf_accuracy)
print("Test Error of RandomForestClassifier  = %g " % rf_test_error)
     

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       0.0|       0|(11,[0,1,4,5,7,9]...|
|       0.0|       0|(11,[0,1,4,5,7,9]...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,4,5,7,9]...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,5,7],[1....|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,3,4,5,6]...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|[1.0,58.0,0.0,2.0...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7,9]...|
|       0.0|       0|[2.0,19.0,1.0,1.0...|
+----------

In [0]:
from pyspark.ml.classification import GBTClassifier

# Gradient-boosted trees (10 boosting iterations): fit on the training split,
# predict on the test split.
gbt = GBTClassifier(labelCol="Survived", featuresCol="features", maxIter=10)
gbt_model = gbt.fit(trainingData)
gbt_prediction = gbt_model.transform(testData)
gbt_prediction.select("prediction", "Survived", "features").show()

# Score with the accuracy evaluator created in the logistic-regression cell.
# Messages fixed: "classifie" -> "classifier", and the test-error line now has
# the same "= value" form as the other model cells.
gbt_accuracy = evaluator.evaluate(gbt_prediction)
print("Accuracy of Gradient-boosted tree classifier is = %g" % gbt_accuracy)
print("Test Error of Gradient-boosted tree classifier = %g" % (1.0 - gbt_accuracy))

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       1.0|       0|(11,[0,1,4,5,7,9]...|
|       0.0|       0|(11,[0,1,4,5,7,9]...|
|       1.0|       0|(11,[0,1,4,5,7],[...|
|       1.0|       0|(11,[0,1,2,4,5,6]...|
|       1.0|       0|(11,[0,1,4,5,7,9]...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,5,7],[1....|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,3,4,5,6]...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|[1.0,58.0,0.0,2.0...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7,9]...|
|       0.0|       0|[2.0,19.0,1.0,1.0...|
+----------

In [0]:
from pyspark.ml.classification import NaiveBayes

# Naive Bayes: fit on the training split, predict on the test split.
nb = NaiveBayes(featuresCol="features", labelCol="Survived")
nb_model = nb.fit(trainingData)
nb_prediction = nb_model.transform(testData)
nb_prediction.select("prediction", "Survived", "features").show()

# Score with the accuracy evaluator created in the logistic-regression cell.
nb_accuracy = evaluator.evaluate(nb_prediction)
nb_test_error = 1.0 - nb_accuracy
print("Accuracy of NaiveBayes is  = %g" % nb_accuracy)
print("Test Error of NaiveBayes  = %g " % nb_test_error)

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       1.0|       0|(11,[0,1,4,5,7,9]...|
|       1.0|       0|(11,[0,1,4,5,7,9]...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       1.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,4,5,7,9]...|
|       1.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,5,7],[1....|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       1.0|       0|(11,[0,1,4,5,7],[...|
|       1.0|       0|(11,[0,1,4,5,7],[...|
|       1.0|       0|(11,[0,1,2,4,5,6]...|
|       1.0|       0|(11,[0,1,3,4,5,6]...|
|       1.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       1.0|       0|[1.0,58.0,0.0,2.0...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7,9]...|
|       1.0|       0|[2.0,19.0,1.0,1.0...|
+----------

In [0]:
from pyspark.ml.classification import LinearSVC

# Linear SVM: fit on the training split, predict on the test split.
svm = LinearSVC(featuresCol="features", labelCol="Survived")
svm_model = svm.fit(trainingData)
svm_prediction = svm_model.transform(testData)
svm_prediction.select("prediction", "Survived", "features").show()

# Score with the accuracy evaluator created in the logistic-regression cell.
svm_accuracy = evaluator.evaluate(svm_prediction)
svm_test_error = 1.0 - svm_accuracy
print("Accuracy of Support Vector Machine is = %g" % svm_accuracy)
print("Test Error of Support Vector Machine = %g " % svm_test_error)


+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       0.0|       0|(11,[0,1,4,5,7,9]...|
|       0.0|       0|(11,[0,1,4,5,7,9]...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,4,5,7,9]...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,5,7],[1....|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,3,4,5,6]...|
|       0.0|       0|(11,[0,1,2,4,5,6]...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|[1.0,58.0,0.0,2.0...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7],[...|
|       0.0|       0|(11,[0,1,4,5,7,9]...|
|       0.0|       0|[2.0,19.0,1.0,1.0...|
+----------