In [87]:
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import lit, col
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql import Row
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import DenseVector
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.classification import NaiveBayes, MultilayerPerceptronClassifier
import time
%pylab inline
start = time.time()

Populating the interactive namespace from numpy and matplotlib


In [88]:
## Load csv file as RDD
trainRDD = sc.textFile("train.csv")
testRDD = sc.textFile("test.csv")
#print(trainRDD.take(3))

In [89]:
# Convert RDD to DF
def convert(rdd):
    header = rdd.first()
    body = rdd.filter(lambda r:r!=header)
    def rowHelper(row):
        l = row.replace('"', '').split(",")
        t = tuple(l)
        return t
    
    parsed = body.map(rowHelper)
    headers = header.split(",")
    if headers[1] == "Survived":
        headers.insert(3,"FirstName")
    else:
        headers.insert(2,"FirstName")
        
    return parsed.toDF(headers)

train = convert(trainRDD)
test = convert(testRDD)
#print(train.show(3))

In [90]:
# Combine Data
train = train.withColumn("Datatype", lit("train"))
test = test.withColumn("Datatype",lit("test")).withColumn("Survived", lit(0))
test = test[train.columns]
combined = train.unionAll(test)

# Drop Columns
toDrop = ["FirstName", "PassengerId", "Ticket", "Cabin"]
for i in toDrop:
    combined = combined.drop(i)

# Convert To Double
combined = (combined.withColumn("Age", combined["Age"].cast("double"))
            .withColumn("SibSp", combined["SibSp"].cast("double"))
            .withColumn("Parch", combined["Parch"].cast("double"))
            .withColumn("Fare", combined["Fare"].cast("double"))
            .withColumn("Survived", combined["Survived"].cast("double")))
combined.printSchema()

# Handle Missing Values
ageMean = combined.groupBy().mean("Age").first()[0]
fareMean = combined.groupBy().mean("Fare").first()[0]
combined = combined.na.fill({"Age":ageMean,"Fare":fareMean,"Embarked":"S"})
#print(combined.show(3))

root
 |-- Survived: double (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: double (nullable = true)
 |-- Parch: double (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Datatype: string (nullable = false)



In [91]:
# Name & Title
titles = udf(lambda name: name.split(".")[0].strip(),StringType())
combined = combined.withColumn("Title", titles(combined["Name"]))
combined = combined.withColumn("Title", regexp_replace("Title", "Mlle", "Miss"))
combined = combined.withColumn("Title", regexp_replace("Title", "Ms", "Miss"))
combined = combined.withColumn("Title", regexp_replace("Title", "Mme", "Mrs"))
for i in ["Lady", "Countess","Capt", "Col","Don", "Dr", "Major", "Rev", "Sir", "Jonkheer", "Dona"]:
    combined = combined.withColumn("Title", regexp_replace("Title", i, "Others"))
#print(combined.show(3))

In [92]:
# Age
combined = combined.withColumn("AgeRange", F.when(combined["Age"] < 16, lit(0)))
combined = combined.withColumn("AgeRange", F.when((combined["Age"] >= 16) & (combined["Age"] < 36), lit(1)).otherwise(combined["AgeRange"]))
combined = combined.withColumn("AgeRange", F.when(combined["Age"] >= 36, lit(2)).otherwise(combined["AgeRange"]))
#print(combined.show(3))

In [93]:
# Party Size
combined = combined.withColumn("PartySize", F.when((combined["SibSp"] + combined["Parch"] == 0), lit(0)))
combined = combined.withColumn("PartySize", F.when(((combined["SibSp"] + combined["Parch"] > 0) & (combined["SibSp"] + combined["Parch"] <= 3)), lit(1)).otherwise(combined["PartySize"]))
combined = combined.withColumn("PartySize", F.when(((combined["SibSp"] + combined["Parch"] > 3)), lit(2)).otherwise(combined["PartySize"]))
#print(combined.show(3))

In [94]:
# Fare
combined = combined.withColumn("FareRange", F.when(combined["Fare"] <= 7.91, lit(0)))
combined = combined.withColumn("FareRange",F.when((combined["Fare"] > 7.91) & (combined["Fare"] <= 14.454), lit(1)).otherwise(combined["FareRange"]))
combined = combined.withColumn("FareRange",F.when((combined["Fare"] > 14.454) & (combined["Fare"] <= 31), lit(2)).otherwise(combined["FareRange"]))
combined = combined.withColumn("FareRange",F.when(combined["Fare"]> 31, lit(3)).otherwise(combined["FareRange"]))
#print(combined.show(3))

In [95]:
# Index Features
cats = ["Pclass", "Title", "Sex", "Embarked"]
def indexer(data, col):
    res = StringIndexer(inputCol = col, outputCol = col + "I").fit(data)
    return res

indexers = [indexer(combined, col) for col in cats]
pipeline = Pipeline(stages = indexers)
combined = pipeline.fit(combined).transform(combined)
for i in ["Pclass", "Title", "Sex", "Embarked", "Fare", "SibSp", "Parch", "Age", "Name"]:
    combined = combined.drop(i)
#print(combined.show(3))

In [96]:
# Convert To Label/Features Vector
nums = ["Survived", "AgeRange", "PartySize", "FareRange"]
catsI = [i + "I" for i in cats]
featCol = nums + catsI
featCol.remove("Survived")
labelCol = ["Datatype", "Survived"]
row = Row("datatype", "label", "features")

combined = combined[labelCol + featCol]
lf = (combined.rdd.map(lambda r : (row(r[0], r[1], DenseVector(r[2:])))).toDF())
#lf = (StringIndexer(inputCol = "label", outputCol = "index").fit(lf).transform(lf))
#lf.show(5)

+--------+-----+--------------------+
|datatype|label|            features|
+--------+-----+--------------------+
|   train|  0.0|[1.0,1.0,0.0,0.0,...|
|   train|  1.0|[2.0,1.0,3.0,1.0,...|
|   train|  1.0|[1.0,0.0,1.0,0.0,...|
|   train|  1.0|[1.0,1.0,3.0,1.0,...|
|   train|  0.0|[1.0,0.0,1.0,0.0,...|
+--------+-----+--------------------+
only showing top 5 rows



In [249]:
# seperate train/test data
trainAll = lf.where(lf.datatype == "train")
test = lf.where(lf.datatype == "test")

# Select A Partial of Training Data
#trainLimited, hold = trainAll.randomSplit([0.88, 0.12])

# random split further to get train/validate
train, validate = trainLimited.randomSplit([0.8, 0.2])
evl = BinaryClassificationEvaluator(labelCol="label")
#print("# Train Row: " + str(train.count()))
#print("# Validate Row: " + str(validate.count()))
#print("# Total Row: " + str(trainAll.count()))

# Train Row: 634
# Validate Row: 135
# Total Row: 891


In [245]:
delta = time.time() - start
print("Time of Data Processing: " + str(delta))

Time of Data Processing: 64.7177131176


In [250]:
# Logistic Regression
start = time.time()
log = LogisticRegression(labelCol="label").fit(train)
pred1 = log.transform(train)
pred2 = log.transform(validate)
delta = time.time() - start
print("Logistic Regression " + str(evl.evaluate(pred1)) + " " + str(evl.evaluate(pred2)) + " Time: " + str(delta))

Logistic Regression 0.864155431362 0.866523508137


In [251]:
# Decision Tree
start = time.time()
dtree = DecisionTreeClassifier(labelCol="label").fit(train)
pred1 = dtree.transform(train)
pred2 = dtree.transform(validate)
delta = time.time() - start
print("Decision Tree " + str(evl.evaluate(pred1)) + " " + str(evl.evaluate(pred2)) + " Time: " + str(delta))

Decision Tree 0.852646479955 0.819846292948 Time: 0.811095952988


In [252]:
# Random Forest
start = time.time()
randForest = RandomForestClassifier(labelCol="label").fit(train)
pred1 = randForest.transform(train)
pred2 = randForest.transform(validate)
delta = time.time() - start
print("Random Forest " + str(evl.evaluate(pred1)) + " " + str(evl.evaluate(pred2)) + " Time: " + str(delta))

Random Forest 0.902266005704 0.869914104882 Time: 0.806475162506


In [253]:
# Kaggle Submission
#rf = RandomForestClassifier(labelCol="label").fit(trainAll)
#pred = randForest.transform(test)
#pred.rdd.map(lambda x: ",".join(map(str, x))).coalesce(1).saveAsTextFile("a.csv")