In [0]:
df=spark.sql("Select * from titanic")

In [0]:
df.printSchema()

root
 |-- PassengerId: long (nullable = true)
 |-- Survived: long (nullable = true)
 |-- Pclass: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: long (nullable = true)
 |-- Parch: long (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [0]:
df.columns

Out[3]: ['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [0]:
my_cols=df.select(['Survived',
 'Pclass','Sex',
 'Age',
 'SibSp',
 'Parch','Fare','Embarked'])

In [0]:
my_final_data=my_cols.na.drop()

In [0]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder,StringIndexer

In [0]:
gender_indexer=StringIndexer(inputCol='Sex',outputCol='SexIndex')

In [0]:
gender_encoder=OneHotEncoder(inputCol='SexIndex',outputCol='SexVec')

In [0]:
embark_indexer=StringIndexer(inputCol='Embarked',outputCol='EmbarkIndex')
embark_encoder=OneHotEncoder(inputCol='EmbarkIndex',outputCol='EmbarkVec')

In [0]:
assembler=VectorAssembler(inputCols=[
 'Pclass','SexVec',
 'Age',
 'SibSp',
 'Parch','Fare','EmbarkVec'],outputCol='features')

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [0]:
log_reg_titanic=LogisticRegression(featuresCol='features',labelCol='Survived')

In [0]:
pipeline=Pipeline(stages=[gender_indexer,embark_indexer,gender_encoder,embark_encoder,assembler,log_reg_titanic])

In [0]:
train_data,test_data=my_final_data.randomSplit([0.7,0.3])

In [0]:
train_data.show()

+--------+------+------+----+-----+-----+--------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|    Fare|Embarked|
+--------+------+------+----+-----+-----+--------+--------+
|       0|     1|female| 2.0|    1|    2|  151.55|       S|
|       0|     1|female|50.0|    0|    0| 28.7125|       C|
|       0|     1|  male|21.0|    0|    1| 77.2875|       S|
|       0|     1|  male|22.0|    0|    0|135.6333|       C|
|       0|     1|  male|24.0|    0|    0|    79.2|       C|
|       0|     1|  male|27.0|    0|    2|   211.5|       C|
|       0|     1|  male|28.0|    0|    0|    47.1|       S|
|       0|     1|  male|29.0|    1|    0|    66.6|       S|
|       0|     1|  male|30.0|    0|    0|   27.75|       C|
|       0|     1|  male|31.0|    0|    0| 50.4958|       S|
|       0|     1|  male|33.0|    0|    0|     5.0|       S|
|       0|     1|  male|36.0|    0|    0|  40.125|       C|
|       0|     1|  male|36.0|    1|    0|   78.85|       S|
|       0|     1|  male|37.0|    0|    1

In [0]:
fit_model=pipeline.fit(train_data)

In [0]:
results=fit_model.transform(test_data)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
my_eval=BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Survived')

In [0]:
my_eval.rawPredictionCol

Out[32]: Param(parent='BinaryClassificationEvaluator_854d71e7d498', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.')

In [0]:
auc=my_eval.evaluate(results)

In [0]:
auc

Out[34]: 0.816045066045066