In [1]:
!pip install pyspark



In [2]:
# Import packages
import time
import pyspark
import os
import csv
import numpy as np
import pandas as pd
from numpy import array
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext, SparkConf

In [3]:
from pyspark.sql import SparkSession

# Maven coordinates of the MMLSpark package that Spark resolves at session start-up.
mmlspark_package = "com.microsoft.ml.spark:mmlspark_2.11:0.18.1"

session_builder = SparkSession.builder.appName("MyApp")
session_builder = session_builder.config("spark.jars.packages", mmlspark_package)
spark = session_builder.getOrCreate()

# Imported only after the session exists, as in the original cell order.
# NOTE(review): presumably the Python bindings become importable once the package
# above has been resolved -- confirm before moving this import to the top.
import mmlspark

In [4]:
# Load a .csv file (first row is the header; column types are inferred by Spark)
file_name = "Data/train.csv"

csv_reader = spark.read
titanic = csv_reader.csv(file_name, header=True, inferSchema=True, sep=',')

# 85% / 15% train/test split with a fixed seed
split_weights = [0.85, 0.15]
train, test = titanic.randomSplit(split_weights, seed=1)

In [5]:
# Check for missing values

from pyspark.sql import functions as F

# Count the nulls of every column in ONE aggregation pass instead of launching a
# separate Spark job per column. F.when(...) yields 1 for null cells and NULL
# otherwise, and F.count only counts non-NULL values, so each aggregate is the
# number of nulls in that column. A global aggregate always returns exactly one
# row, even when `train` is empty (all counts are then 0).
null_counts = train.select(
    [F.count(F.when(F.col(column).isNull(), 1)).alias(column) for column in train.columns]
).first()

# The result row is positional, so pair it with the column list by position.
for column, n_nulls in zip(train.columns, null_counts):
    if n_nulls != 0:
        print("\tBe careful: there are null values in the column '{}'".format(column))
    else:
        print("The column '{}' does not have null values".format(column))

The column 'PassengerId' does not have null values
The column 'Survived' does not have null values
The column 'Pclass' does not have null values
The column 'Name' does not have null values
The column 'Sex' does not have null values
	Be careful: there are null values in the column 'Age'
The column 'SibSp' does not have null values
The column 'Parch' does not have null values
The column 'Ticket' does not have null values
The column 'Fare' does not have null values
	Be careful: there are null values in the column 'Cabin'
	Be careful: there are null values in the column 'Embarked'


In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorIndexer, OneHotEncoder, IndexToString
# Explicit import instead of `from pyspark.sql.functions import *`, which would
# shadow Python builtins such as sum/min/max/abs/round for the rest of the notebook.
from pyspark.sql.functions import col

# Variable types: keep only the modelling columns and cast the numeric features to float.
selected_columns = [col("Survived"), col("Sex"), col("Embarked")] + [
    col(name).cast("float") for name in ("Pclass", "Age", "SibSp", "Fare")
]
train = train.select(*selected_columns)
test = test.select(*selected_columns)

# Drop the rows whose null values were not imputed.
train = train.dropna()
test = test.dropna()

# Index the string labels and one-hot encode the indexed features.
#
# The encoders are fitted ONCE, on the training split, and the same fitted model is
# applied to both splits. Fitting a separate StringIndexer on the test split (as the
# previous version did) orders categories by the *test* frequencies, so the same
# category could receive a different index -- and the one-hot vectors a different
# width -- in train and test, silently corrupting the features seen by the model.
#
# StringIndexer keeps its default handleInvalid="error": a category present in test
# but absent from train raises instead of being encoded arbitrarily.
encoding_stages = [
    StringIndexer(inputCol="Sex", outputCol="indexedSex"),
    StringIndexer(inputCol="Embarked", outputCol="indexedEmbarked"),
    StringIndexer(inputCol="Survived", outputCol="indexedSurvived"),
    OneHotEncoder(inputCol="indexedSex", outputCol="sexVec"),
    OneHotEncoder(inputCol="indexedEmbarked", outputCol="embarkedVec"),
]
encoder = Pipeline(stages=encoding_stages).fit(train)
train = encoder.transform(train)
test = encoder.transform(test)

In [7]:
# Build the vector assembler
from pyspark.ml.feature import VectorAssembler

# Input columns that are concatenated, in this order, into the single "features" vector.
feature_columns = ["Pclass", "sexVec", "embarkedVec", "Age", "SibSp", "Fare"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# The target column is renamed to 'label' on both splits before assembling.
train = assembler.transform(train.withColumnRenamed('Survived', 'label'))
test = assembler.transform(test.withColumnRenamed('Survived', 'label'))


In [8]:
from mmlspark.lightgbm import LightGBMClassifier
from mmlspark.train import TrainClassifier

# Configure the LightGBM classifier, then fit it on the training split.
classifier = LightGBMClassifier(
    learningRate=0.3,
    numIterations=100,
    numLeaves=31,
)
model = classifier.fit(train)

# Score the held-out test split with the fitted model.
scoredData = model.transform(test)

In [16]:
# Model evaluation on the scored test split
from mmlspark.train import ComputeModelStatistics

test_evaluator = ComputeModelStatistics(
    evaluationMetric='classification',
    labelCol='label',
    scoresCol='rawPrediction',
    scoredLabelsCol='prediction',
)
metrics = test_evaluator.transform(scoredData)
metrics.toPandas()

Unnamed: 0,evaluation_type,confusion_matrix,accuracy,precision,recall,AUC
0,Classification,"DenseMatrix([[64., 8.],\n [12., 3...",0.824561,0.789474,0.714286,0.887897


In [17]:
# Score the training split with the same model and compute the same statistics,
# so the numbers can be compared with the test-split metrics.
trainData = model.transform(train)

statistics_options = {
    'evaluationMetric': 'classification',
    'labelCol': 'label',
    'scoresCol': 'rawPrediction',
    'scoredLabelsCol': 'prediction',
}
metrics = ComputeModelStatistics(**statistics_options).transform(trainData)
metrics.toPandas()

Unnamed: 0,evaluation_type,confusion_matrix,accuracy,precision,recall,AUC
0,Classification,"DenseMatrix([[349., 3.],\n [ 9....",0.979933,0.9875,0.963415,0.99877
