In [1]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoderEstimator,VectorAssembler,MinMaxScaler

In [4]:
if __name__=="__main__":
    # Import pyspark directly when it is importable; otherwise fall back to
    # findspark.init() and retry. Only an ImportError triggers the fallback,
    # so unrelated failures (e.g. KeyboardInterrupt) are no longer swallowed.
    try:
        from pyspark.sql import SparkSession
    except ImportError:
        import findspark
        findspark.init()
        from pyspark.sql import SparkSession
    # Create (or reuse) the session used by every cell below: local master
    # "local[8]", application name "ejemplo".
    spark = (
        SparkSession.builder
        .master("local[8]")
        .appName("ejemplo")
        .getOrCreate()
    )

In [5]:
# Both CSV files are read with the same options: first row as header,
# column types inferred from the data.
_csv_options = dict(header=True, inferSchema=True)

# Training split
dataTitanic = spark.read.csv("titanic_train.csv", **_csv_options)
# Evaluation split
dataTitanic_test = spark.read.csv("titanic_eval.csv", **_csv_options)

In [7]:
# Summary statistics of the training data. show() prints the table itself
# and returns None, so it must not be wrapped in print() (that would emit a
# stray "None" line after the table).
dataTitanic.describe().show()
# Number of rows in the training set.
print(dataTitanic.count())
# Column names and inferred types. printSchema() also prints directly and
# returns None.
dataTitanic.printSchema()


+-------+--------------------+--------------------+------------------+------+------------------+------------------+------------------+------------------+-----+-------+-----------+-----+
|summary|          Start.Time|           Stop Time|          survived|   sex|               age|n_siblings_spouses|             parch|              fare|class|   deck|embark_town|alone|
+-------+--------------------+--------------------+------------------+------+------------------+------------------+------------------+------------------+-----+-------+-----------+-----+
|  count|                 158|                 158|               627|   627|               627|               627|               627|               627|  627|    627|        627|  627|
|   mean|1.558541201464107...|1.558541205959171E12|0.3875598086124402|  null|29.631307814992027|0.5454545454545454| 0.379585326953748|34.385398564593245| null|   null|       null| null|
| stddev|  19793.234457004008|   3181.834167280871|0.4875821656114251|

In [8]:
# Inspect the Python type of the object returned by spark.read.csv.
type(dataTitanic)

pyspark.sql.dataframe.DataFrame

In [23]:
# Register the training DataFrame as the temporary view "titanic" so that it
# can be queried by name through spark.sql.
dataTitanic.createOrReplaceTempView("titanic")

In [27]:
# List the distinct values of the "deck" column in the "titanic" view.
spark.sql(
    """ SELECT DISTINCT deck from titanic"""
).show()

+-------+
|   deck|
+-------+
|      F|
|unknown|
|      E|
|      B|
|      D|
|      C|
|      A|
|      G|
+-------+



In [72]:
# List the distinct values of the "embark_town" column in the "titanic" view.
spark.sql(
    """ SELECT DISTINCT embark_town from titanic"""
).show()

+-----------+
|embark_town|
+-----------+
|    unknown|
| Queenstown|
|Southampton|
|  Cherbourg|
+-----------+



In [9]:
def _make_indexer(column, order="alphabetAsc"):
    """Build a StringIndexer that maps `column` to `<column>_num` using the given label ordering."""
    return StringIndexer(inputCol=column, outputCol=column + "_num", stringOrderType=order)


# Categorical columns indexed in ascending alphabetical label order.
enc_sex = _make_indexer("sex")
enc_class = _make_indexer("class")
enc_deck = _make_indexer("deck")
enc_embark = _make_indexer("embark_town")
# "alone" is the only column indexed by ascending label frequency instead.
enc_alone = _make_indexer("alone", order="frequencyAsc")

In [10]:
# Indexed columns that get one-hot encoded; each "<name>_num" input produces
# a "<name>_ohe" output, in the same order.
_ohe_columns = ["sex", "deck", "embark_town"]

ohe = OneHotEncoderEstimator(
    inputCols=[name + "_num" for name in _ohe_columns],
    outputCols=[name + "_ohe" for name in _ohe_columns],
)

In [11]:
# Assemble every model input into the single vector column "covariables".
# The order of the entries determines the layout of the assembled vector.
vector = VectorAssembler(
    inputCols=[
        "sex_ohe",
        "age",
        "n_siblings_spouses",
        "parch",
        "fare",
        "class_num",
        "deck_ohe",
        "embark_town_ohe",
        "alone_num",
    ],
    outputCol="covariables",
)

In [12]:
# Min-max scaler applied to the assembled feature vector; the scaled vector
# is written to "sca_covariables".
scala = MinMaxScaler(
    inputCol="covariables",
    outputCol="sca_covariables",
)

In [13]:
from pyspark.ml.classification import LinearSVC

In [14]:
# Linear SVM classifier trained on the scaled features to predict "survived".
model = LinearSVC(
    labelCol="survived",
    featuresCol="sca_covariables",
)

In [15]:
from pyspark.ml import Pipeline

In [16]:
# Full pipeline, in execution order: string indexing, one-hot encoding,
# feature assembly, scaling, and finally the classifier.
tuberia = Pipeline(
    stages=[
        enc_sex,
        enc_class,
        enc_deck,
        enc_embark,
        enc_alone,
        ohe,
        vector,
        scala,
        model,
    ]
)

In [17]:
# Fit every stage of the pipeline on the training DataFrame.
mod_train=tuberia.fit(dataTitanic)

In [18]:
# Apply the fitted pipeline to the evaluation split.
prediccion = mod_train.transform(dataTitanic_test)

# Preview the first ten predictions next to the true labels.
(
    prediccion
    .select("prediction", "survived")
    .show(10)
)

+----------+--------+
|prediction|survived|
+----------+--------+
|       0.0|       0|
|       1.0|       0|
|       1.0|       1|
|       1.0|       1|
|       0.0|       1|
|       1.0|       1|
|       1.0|       0|
|       0.0|       0|
|       1.0|       0|
|       1.0|       1|
+----------+--------+
only showing top 10 rows



In [93]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [105]:
# Evaluator that compares the "prediction" column against the "survived"
# label and reports accuracy.
score = MulticlassClassificationEvaluator(
    labelCol="survived",
    predictionCol="prediction",
    metricName="accuracy",
)

In [106]:
# Compute the evaluator's metric on the evaluation-set predictions.
score.evaluate(prediccion)

0.7537878787878788