In [32]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.sql.functions import col, unix_timestamp, expr, desc, regexp_extract, isnan, when, count
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, OneHotEncoderEstimator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml import Pipeline

In [33]:
spark.stop()

In [34]:
spark = SparkSession \
    .builder \
    .config("spark.driver.maxResultSize",  "0") \
    .appName("08_predict_the_survivors_of_the_titanic_shipwreck") \
    .master("yarn") \
    .getOrCreate()

In [35]:
spark

In [36]:
test = spark.read.csv("hdfs:///data/lsml/6-spark-ml/test.csv", inferSchema = True, header=True)
train = spark.read.csv("hdfs:///data/lsml/6-spark-ml/train.csv", inferSchema = True, header=True)

In [37]:
test.show(2)

+-----------+------+--------------------+------+----+-----+-----+------+------+-----+--------+
|PassengerId|Pclass|                Name|   Sex| Age|SibSp|Parch|Ticket|  Fare|Cabin|Embarked|
+-----------+------+--------------------+------+----+-----+-----+------+------+-----+--------+
|        892|     3|    Kelly, Mr. James|  male|34.5|    0|    0|330911|7.8292| null|       Q|
|        893|     3|Wilkes, Mrs. Jame...|female|47.0|    1|    0|363272|   7.0| null|       S|
+-----------+------+--------------------+------+----+-----+-----+------+------+-----+--------+
only showing top 2 rows



In [38]:
test.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [39]:
train.show(2)    

+-----------+--------+------+--------------------+------+----+-----+-----+---------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|   Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+---------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0| PC 17599|71.2833|  C85|       C|
+-----------+--------+------+--------------------+------+----+-----+-----+---------+-------+-----+--------+
only showing top 2 rows



In [40]:
test.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [41]:
train_cut = train.withColumn('Title', regexp_extract('Name', r'\s([A-Z]{1}[a-z]+)\.', 1)) \
     .withColumn('FamilySize', expr("CASE WHEN SibSp = 0 then 'Single' WHEN SibSp > 0 and SibSp <4 then 'Small' else 'Large' end")) \
     .select('Survived', 'Pclass', 'Title', 'Sex', 'Age', 'FamilySize', 'Parch'
             , 'Fare' , 'Embarked')

In [42]:
df_count = train_cut.select([count(when(col(c).contains('None') | \
                            col(c).contains('NULL') | \
                            (col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in train_cut.columns])
df_count.show()

+--------+------+-----+---+---+----------+-----+----+--------+
|Survived|Pclass|Title|Sex|Age|FamilySize|Parch|Fare|Embarked|
+--------+------+-----+---+---+----------+-----+----+--------+
|       0|     0|    0|  0|177|         0|    0|   0|       2|
+--------+------+-----+---+---+----------+-----+----+--------+



In [43]:
avg_age = round(train_cut.groupBy().avg('Age').collect()[0]['avg(Age)'], 0)
train_cut_na = train_cut.na.fill(avg_age, subset = ['Age'])

In [44]:
train_cut.groupBy('Embarked').count().show()

+--------+-----+
|Embarked|count|
+--------+-----+
|       Q|   77|
|    null|    2|
|       C|  168|
|       S|  644|
+--------+-----+



In [45]:
train_cut_na = train_cut_na.na.fill("S", subset = ['Embarked']) 

In [46]:
df_count_na = train_cut_na.select([count(when(col(c).contains('None') | \
                            col(c).contains('NULL') | \
                            (col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in train_cut_na.columns])
df_count_na.show()

+--------+------+-----+---+---+----------+-----+----+--------+
|Survived|Pclass|Title|Sex|Age|FamilySize|Parch|Fare|Embarked|
+--------+------+-----+---+---+----------+-----+----+--------+
|       0|     0|    0|  0|  0|         0|    0|   0|       0|
+--------+------+-----+---+---+----------+-----+----+--------+



In [47]:
train_cut_na.describe().toPandas()

Unnamed: 0,summary,Survived,Pclass,Title,Sex,Age,FamilySize,Parch,Fare,Embarked
0,count,891.0,891.0,891,891,891.0,891,891.0,891.0,891
1,mean,0.3838383838383838,2.308641975308642,,,29.758888888888887,,0.3815937149270482,32.2042079685746,
2,stddev,0.4865924542648575,0.8360712409770491,,,13.002570039820949,,0.8060572211299488,49.69342859718089,
3,min,0.0,1.0,Capt,female,0.42,Large,0.0,0.0,C
4,max,1.0,3.0,Sir,male,80.0,Small,6.0,512.3292,S


In [48]:
categorical = [col for (col, dataType) in train_cut_na.dtypes if dataType == 'string']
numerical = [col for (col, dataType) in train_cut_na.dtypes if (dataType != 'string') & (col != 'Survived')]

In [67]:
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label", maxDepth=15)
stages = []
for i in categorical:
    si = StringIndexer(handleInvalid="skip", inputCol=i, outputCol=i+"_si")
    #ohe = OneHotEncoderEstimator(handleInvalid='keep', inputCols=[si.getOutputCol()], outputCols=[i+"_ohe"])
    stages += [si]#, ohe]

stages.append(StringIndexer(inputCol="Survived", outputCol="label"))

assembler = [col + "_si" for col in categorical] + numerical
assembler = VectorAssembler(inputCols=assembler, outputCol="features")
stages += [assembler] + [dt]

In [68]:
stages

[StringIndexer_493e8e4a3aa75604d6f2,
 StringIndexer_44318672eab3404a7ef0,
 StringIndexer_475e8ddcd443a536ce68,
 StringIndexer_4719baa50ad2c25341ac,
 StringIndexer_4d97857da2491afbe5c7,
 VectorAssembler_4fda9e45abb9770e6a10,
 DecisionTreeClassifier_4fab95614b22c69b2e09]

In [69]:
pipeline = Pipeline(stages=stages)
model = pipeline.fit(train_cut_na)
data = model.transform(train_cut_na)

In [70]:
data.select('label','features', 'prediction').show()

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|(8,[2,4,5,7],[1.0...|       0.0|
|  1.0|[2.0,1.0,1.0,1.0,...|       1.0|
|  1.0|[1.0,1.0,0.0,0.0,...|       1.0|
|  1.0|[2.0,1.0,1.0,0.0,...|       1.0|
|  0.0|(8,[4,5,7],[3.0,3...|       0.0|
|  0.0|(8,[3,4,5,7],[2.0...|       0.0|
|  0.0|(8,[4,5,7],[1.0,5...|       0.0|
|  0.0|[3.0,0.0,1.0,0.0,...|       0.0|
|  1.0|[2.0,1.0,0.0,0.0,...|       1.0|
|  1.0|[2.0,1.0,1.0,1.0,...|       1.0|
|  1.0|[1.0,1.0,1.0,0.0,...|       1.0|
|  1.0|[1.0,1.0,0.0,0.0,...|       1.0|
|  0.0|(8,[4,5,7],[3.0,2...|       0.0|
|  0.0|[0.0,0.0,1.0,0.0,...|       0.0|
|  0.0|[1.0,1.0,0.0,0.0,...|       0.0|
|  1.0|[2.0,1.0,0.0,0.0,...|       1.0|
|  0.0|[3.0,0.0,2.0,2.0,...|       0.0|
|  1.0|(8,[4,5,7],[2.0,3...|       0.0|
|  0.0|[2.0,1.0,1.0,0.0,...|       0.0|
|  1.0|[2.0,1.0,0.0,1.0,...|       1.0|
+-----+--------------------+----------+
only showing top 20 rows



In [71]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(data)
print("Accuracy:" , (accuracy))


Accuracy: 0.9528619528619529


In [72]:
bcEvaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
AUR = bcEvaluator.evaluate(data)
print("Area under ROC curve:", AUR)


Area under ROC curve: 0.9688801542410975


In [73]:
with open("answers/week6/task1/output.txt", "w") as f:
    f.write(str(AUR))

In [74]:
!cat answers/week6/task1/output.txt

0.9688801542410975

In [29]:
!pip install --upgrade pip
!pip install matplotlib
!pip install enaml
!pip install seaborn

[33mDEPRECATION: Python 2.7 reached the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 is no longer maintained. pip 21.0 will drop support for Python 2.7 in January 2021. More details about Python 2 support in pip can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support pip 21.0 will remove support for this functionality.[0m
Defaulting to user installation because normal site-packages is not writeable
Requirement already up-to-date: pip in /u3/shared/anaconda/lib/python2.7/site-packages (20.3.4)
[33mDEPRECATION: Python 2.7 reached the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 is no longer maintained. pip 21.0 will drop support for Python 2.7 in January 2021. More details about Python 2 support in pip can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support pip 21.0 will remove support for this functionality.[0m
Defaulting to user installation be

In [46]:
plt.figure(figsize=(5,5))
plt.plot([0, 1], [0, 1], 'r--')
plt.plot(madel.summary.roc.select('FPR').collect(),
         madel.summary.roc.select('TPR').collect())
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.show()

ImportError: No module named 'matplotlib'

In [31]:
!sudo apt-get install python-matplotlib

[sudo] password for nabramov: 
