In [3]:
from pyspark.sql import SparkSession

In [4]:
# Reuse the active SparkSession if one exists; otherwise start one with default settings.
spark = SparkSession.builder.getOrCreate()

In [100]:
# Load the Titanic CSV: the first line supplies the column names and Spark
# infers each column's type from the data.
df = spark.read.options(header=True, inferSchema=True).csv("titanic.csv")

In [101]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [102]:
df.show(3)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 3 rows



In [103]:
# Names of all columns whose Spark dtype string starts with 'string'.
# df.dtypes yields (column_name, dtype_string) pairs.
columnList = [name for name, dtype in df.dtypes if dtype.startswith('string')]

In [104]:
columnList

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [105]:
# Drop every string-typed column collected in columnList, leaving only numeric columns.
df = df.drop(*columnList)

In [106]:
df

DataFrame[PassengerId: int, Survived: int, Pclass: int, Age: double, SibSp: int, Parch: int, Fare: double]

In [107]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)



In [108]:
# Remove PassengerId so it is not carried forward into the modelling columns.
df = df.drop('PassengerId')

In [109]:
df.printSchema()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)



In [110]:
df.columns

['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [95]:
# Discard every row that contains a null in any remaining column.
# NOTE(review): this cell's execution count (95) is lower than the data-load cell's (100),
# so in the saved run it was not executed after the CSV was re-read. Restart the kernel
# and run all cells in order so the displayed outputs match the code.
# Presumably required because VectorAssembler rejects null inputs by default — TODO confirm.
df = df.dropna()

In [96]:
df

DataFrame[Pclass: int, Age: double, SibSp: int, Parch: int, Fare: double]

In [97]:
from pyspark.ml.feature import VectorAssembler

In [111]:
# Columns used as model features: every remaining column except the label, Survived.
train_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [112]:
train_cols

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [113]:
# Assembler that packs the train_cols columns into one vector column named 'features'.
vector = VectorAssembler(
    outputCol='features',
    inputCols=train_cols,
)

In [114]:
# Run the assembler over df, producing a frame that also carries the 'features' vector column.
v_df = vector.transform(df)

In [115]:
# Keep only what the classifier needs: the assembled feature vector and the label.
v_df = v_df.select('features', 'Survived')

In [116]:
v_df.show(3)

+--------------------+--------+
|            features|Survived|
+--------------------+--------+
|[3.0,22.0,1.0,0.0...|       0|
|[1.0,38.0,1.0,0.0...|       1|
|[3.0,26.0,0.0,0.0...|       1|
+--------------------+--------+
only showing top 3 rows



In [117]:
# Hold out roughly 20% of the rows for evaluation and train on the remaining ~80%.
# A fixed seed keeps the split, and therefore the accuracy computed from it,
# repeatable across runs on the same input data; without it every run samples
# a different train/test partition.
(train_df, test_df) = v_df.randomSplit([0.8, 0.2], seed=42)

In [118]:
from pyspark.ml.classification import DecisionTreeClassifier

In [119]:
# Decision-tree classifier that reads inputs from 'features' and the target from 'Survived'.
dtc = DecisionTreeClassifier(
    labelCol='Survived',
    featuresCol='features',
)

In [120]:
# Fit the decision tree on the training split only.
dtc_model = dtc.fit(train_df)

In [121]:
train_df

DataFrame[features: vector, Survived: int]

In [122]:
test_df.head()

Row(features=SparseVector(5, {0: 2.0, 1: 28.0}), Survived=0)

In [123]:
# Score the held-out split; the result includes a 'prediction' column alongside the inputs.
dtc_prediction = dtc_model.transform(test_df)

In [124]:
# Eyeball predicted vs. actual labels next to the feature vectors.
dtc_prediction.select("prediction","Survived","features").show()

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       0.0|       0|(5,[0,1],[2.0,28.0])|
|       0.0|       0|(5,[0,1],[2.0,28.0])|
|       0.0|       0|(5,[0,1],[3.0,49.0])|
|       1.0|       1|[1.0,15.0,0.0,1.0...|
|       1.0|       1|[1.0,17.0,0.0,2.0...|
|       1.0|       1|[1.0,17.0,1.0,0.0...|
|       1.0|       1|[1.0,18.0,1.0,0.0...|
|       1.0|       1|[1.0,18.0,2.0,2.0...|
|       1.0|       1|[1.0,19.0,0.0,2.0...|
|       1.0|       1|[1.0,22.0,0.0,2.0...|
|       1.0|       1|[1.0,22.0,1.0,0.0...|
|       1.0|       1|[1.0,23.0,1.0,0.0...|
|       1.0|       1|[1.0,24.0,0.0,0.0...|
|       1.0|       1|[1.0,26.0,0.0,0.0...|
|       1.0|       0|[1.0,27.0,0.0,2.0...|
|       1.0|       0|[1.0,28.0,0.0,0.0...|
|       1.0|       1|[1.0,28.0,0.0,0.0...|
|       1.0|       0|[1.0,28.0,0.0,0.0...|
|       1.0|       0|[1.0,28.0,0.0,0.0...|
|       1.0|       1|[1.0,28.0,1.0,0.0...|
+----------+--------+--------------------+

In [125]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [126]:
dtс_evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="Survived", metricName="accuracy")

In [127]:
dtс_evaluator.evaluate(dtc_prediction)

0.7114427860696517