In [45]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoderEstimator,VectorAssembler,MinMaxScaler

In [107]:
# Display the kernel's current working directory (IPython automagic for %pwd).
# NOTE(review): the CSV files below are read via relative paths, presumably
# resolved against this directory — confirm before running elsewhere.
pwd

'/Users/jimmy/Documents/workspacePython/TensorFlow'

In [55]:
# Read the training and evaluation splits with identical reader options:
# the first row is the header and column types are inferred from the data.
csv_options = {"header": True, "inferSchema": True}
dataTitanic = spark.read.csv("titanic_train.csv", **csv_options)
dataTitanic_test = spark.read.csv("titanic_eval.csv", **csv_options)

In [19]:
# Summary statistics for every column of the training split.
# DataFrame.show() prints the table itself and returns None, so it must not be
# wrapped in print() — doing so emits a stray "None" line after the table.
dataTitanic.describe().show()

# Number of rows in the training split.
print(dataTitanic.count())

+-------+------------------+------+------------------+------------------+------------------+------------------+-----+-------+-----------+-----+
|summary|          survived|   sex|               age|n_siblings_spouses|             parch|              fare|class|   deck|embark_town|alone|
+-------+------------------+------+------------------+------------------+------------------+------------------+-----+-------+-----------+-----+
|  count|               627|   627|               627|               627|               627|               627|  627|    627|        627|  627|
|   mean|0.3875598086124402|  null|29.631307814992027|0.5454545454545454| 0.379585326953748|34.385398564593245| null|   null|       null| null|
| stddev|0.4875821656114251|  null|12.511817629565812|1.1510895973422302|0.7929992125432801|54.597730499456304| null|   null|       null| null|
|    min|                 0|female|              0.75|                 0|                 0|               0.0|First|      A|  Cherbourg

In [6]:
# Print the column names and the types inferred while reading the training CSV.
dataTitanic.printSchema()

root
 |-- survived: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: double (nullable = true)
 |-- n_siblings_spouses: integer (nullable = true)
 |-- parch: integer (nullable = true)
 |-- fare: double (nullable = true)
 |-- class: string (nullable = true)
 |-- deck: string (nullable = true)
 |-- embark_town: string (nullable = true)
 |-- alone: string (nullable = true)



In [16]:
# Preview the first ten rows of the training split.
dataTitanic.show(n=10)

+--------+------+----+------------------+-----+-------+------+-------+-----------+-----+
|survived|   sex| age|n_siblings_spouses|parch|   fare| class|   deck|embark_town|alone|
+--------+------+----+------------------+-----+-------+------+-------+-----------+-----+
|       0|  male|22.0|                 1|    0|   7.25| Third|unknown|Southampton|    n|
|       1|female|38.0|                 1|    0|71.2833| First|      C|  Cherbourg|    n|
|       1|female|26.0|                 0|    0|  7.925| Third|unknown|Southampton|    y|
|       1|female|35.0|                 1|    0|   53.1| First|      C|Southampton|    n|
|       0|  male|28.0|                 0|    0| 8.4583| Third|unknown| Queenstown|    y|
|       0|  male| 2.0|                 3|    1| 21.075| Third|unknown|Southampton|    n|
|       1|female|27.0|                 0|    2|11.1333| Third|unknown|Southampton|    n|
|       1|female|14.0|                 1|    0|30.0708|Second|unknown|  Cherbourg|    n|
|       1|female| 4.0

In [22]:
# Inspect the Python type of the object returned by spark.read.csv.
type(dataTitanic)

pyspark.sql.dataframe.DataFrame

In [23]:
# Register the training DataFrame as the temporary view "titanic" so that it
# can be queried by name through spark.sql.
dataTitanic.createOrReplaceTempView("titanic")

In [27]:
# List the distinct values of the `deck` column in the "titanic" view.
distinct_decks = spark.sql(""" SELECT DISTINCT deck from titanic""")
distinct_decks.show()

+-------+
|   deck|
+-------+
|      F|
|unknown|
|      E|
|      B|
|      D|
|      C|
|      A|
|      G|
+-------+



In [72]:
# List the distinct values of the `embark_town` column in the "titanic" view.
distinct_embark_towns = spark.sql(""" SELECT DISTINCT embark_town from titanic""")
distinct_embark_towns.show()

+-----------+
|embark_town|
+-----------+
|    unknown|
| Queenstown|
|Southampton|
|  Cherbourg|
+-----------+



In [75]:
def _make_indexer(column, order="alphabetAsc"):
    """Build a StringIndexer writing the indices of `column` to `<column>_num`.

    `order` is passed through as the indexer's `stringOrderType`.
    """
    return StringIndexer(
        inputCol=column,
        outputCol=column + "_num",
        stringOrderType=order,
    )


# One indexer per string-typed column; labels are ordered alphabetically ...
enc_sex = _make_indexer("sex")
enc_class = _make_indexer("class")
enc_deck = _make_indexer("deck")
enc_embark = _make_indexer("embark_town")
# ... except `alone`, which is ordered by ascending label frequency.
enc_alone = _make_indexer("alone", order="frequencyAsc")

In [76]:
# One-hot encode the indexed nominal columns. `class_num` and `alone_num` are
# not encoded here; they are fed to the assembler as plain numeric indices.
# NOTE(review): OneHotEncoderEstimator is the Spark 2.3/2.4 name; in Spark 3.x
# the same estimator is called OneHotEncoder — confirm the target Spark version.
ohe_base_columns = ["sex", "deck", "embark_town"]
ohe = OneHotEncoderEstimator(
    inputCols=[base + "_num" for base in ohe_base_columns],
    outputCols=[base + "_ohe" for base in ohe_base_columns],
)

In [77]:
# Columns merged into the single feature vector, in exactly this order.
feature_columns = [
    "sex_ohe",
    "age",
    "n_siblings_spouses",
    "parch",
    "fare",
    "class_num",
    "deck_ohe",
    "embark_town_ohe",
    "alone_num",
]
vector = VectorAssembler(inputCols=feature_columns, outputCol="covariables")

In [78]:
# Rescale the assembled feature vector with a min-max scaler (default settings)
# and write the result to `sca_covariables`.
scala = MinMaxScaler(
    inputCol="covariables",
    outputCol="sca_covariables",
)

In [79]:
from pyspark.ml.classification import LinearSVC

In [84]:
# Linear support-vector classifier trained on the scaled feature vector,
# with `survived` as the label; all other hyper-parameters keep their defaults.
model = LinearSVC(
    labelCol="survived",
    featuresCol="sca_covariables",
)

In [85]:
from pyspark.ml import Pipeline

In [86]:
# Full pipeline: string indexing -> one-hot encoding -> vector assembly ->
# min-max scaling -> classifier. Stage order matters because each stage
# consumes columns produced by an earlier one.
preprocessing_stages = [
    enc_sex,
    enc_class,
    enc_deck,
    enc_embark,
    enc_alone,
    ohe,
    vector,
    scala,
]
tuberia = Pipeline(stages=preprocessing_stages + [model])

In [87]:
# Fit every stage of the pipeline (encoders, scaler and classifier) on the
# training split; the result is the fitted model used for prediction below.
mod_train=tuberia.fit(dataTitanic)

In [99]:
# Apply the fitted pipeline to the evaluation split, then preview the
# predicted label next to the true `survived` label for the first ten rows.
prediccion = mod_train.transform(dataTitanic_test)
prediction_vs_label = prediccion.select("prediction", "survived")
prediction_vs_label.show(n=10)

+----------+--------+
|prediction|survived|
+----------+--------+
|       0.0|       0|
|       1.0|       0|
|       1.0|       1|
|       1.0|       1|
|       0.0|       1|
|       1.0|       1|
|       1.0|       0|
|       0.0|       0|
|       1.0|       0|
|       1.0|       1|
+----------+--------+
only showing top 10 rows



In [93]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [105]:
# Evaluator comparing the `prediction` column against the `survived` label,
# reporting plain accuracy.
evaluator_settings = {
    "predictionCol": "prediction",
    "labelCol": "survived",
    "metricName": "accuracy",
}
score = MulticlassClassificationEvaluator(**evaluator_settings)

In [106]:
# Accuracy of the fitted pipeline on the evaluation-split predictions.
score.evaluate(prediccion)

0.7537878787878788