In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

## Load Data

In [3]:
# Load the feature file in LIBSVM format with the "numFeatures" option set to 500.
# `spark` is not created in the visible cells -- presumably the session object
# provided by the notebook kernel; TODO confirm.
features = spark.read.format("libsvm").option("numFeatures", "500").load("features.libsvm")

In [4]:
# Inspect the schema of the loaded frame (recorded output: a double `label`
# column and a vector `features` column).
features.schema

StructType(List(StructField(label,DoubleType,true),StructField(features,VectorUDT,true)))

In [5]:
# Cast the `label` column of the loaded frame to an integer type.
# The cast is applied to `features` itself, so re-running this cell is harmless:
# it simply casts an already-integer column again.
features = features.withColumn("label", col("label").cast(IntegerType()))

In [6]:
# Number of rows in the full feature frame.
features.count()

50620

In [7]:
# Print rows with column contents left untruncated.
# NOTE(review): with untruncated feature vectors the recorded output is extremely
# wide; consider limiting rows or truncating before sharing the notebook.
features.show(truncate=False)

+-----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [8]:
# 80/20 train/test split of the full feature frame.
# A fixed seed keeps the split -- and therefore every metric computed from
# trainingData/testData -- stable across kernel restarts.
(trainingData, testData) = features.randomSplit([0.8, 0.2], seed=42)

## Logistic Regression

In [9]:
from pyspark.ml.classification import LogisticRegression

# Logistic-regression estimator constructed with no arguments, i.e. with the
# library's default settings.
lr = LogisticRegression()

# Fit the model on the training split.
lrModel = lr.fit(trainingData)

In [10]:
# Report the first ten fitted coefficients and the intercept of the
# logistic-regression model.
leading_coefficients = lrModel.coefficients[:10]
print("Coefficients[:10]: " + str(leading_coefficients))

intercept_text = str(lrModel.intercept)
print("Intercept: " + intercept_text)

Coefficients[:10]: [ 0.17921518  0.28500437  0.04143275  0.18858752  0.0458465   0.0804834
  0.00407339  0.15883926 -0.34930828  0.05788241]
Intercept: -3.53384003131


In [11]:
# Keep a handle on the fitted model's summary object; the ROC table and
# areaUnderROC are read from it in the next cell.
trainingSummary = lrModel.summary

In [12]:
# Display the ROC points (FPR/TPR pairs) from the model summary, then report
# the area under that curve.
roc_points = trainingSummary.roc
roc_points.show()

area_under_roc = trainingSummary.areaUnderROC
print("areaUnderROC: " + str(area_under_roc))

+--------------------+-------------------+
|                 FPR|                TPR|
+--------------------+-------------------+
|                 0.0|                0.0|
|0.001290426647310267|0.10304308526664659|
|0.003333602172218...|0.19734859897559506|
| 0.00623706212866629| 0.2823139499849352|
|0.010430948732424658|0.35281711358843026|
|0.015108745328924375|0.41759566134377823|
|0.019840309702395354|  0.481771617957216|
| 0.02591606850014786| 0.5308827960228985|
| 0.03234131784821356| 0.5760771316661645|
| 0.03938489663144877| 0.6146429647484182|
|0.046751082076511546| 0.6492919554082555|
|0.054251686964002475| 0.6824344682133172|
|  0.0549775519531145| 0.6848448327809581|
|  0.0629889507218324| 0.7122627297378729|
| 0.07110788504449284| 0.7384754444109671|
| 0.07933435492109579| 0.7640855679421512|
| 0.08793719923649757| 0.7848749623380536|
|  0.0967551146597844| 0.8032539921663151|
| 0.10578810119095626| 0.8195239529978909|
| 0.11498239105304191| 0.8339861404037361|
+----------

In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [14]:
# Number of rows in the held-out test split.
testData.count()

10104

In [15]:
# Apply the fitted logistic-regression model to the held-out split.
predictions = lrModel.transform(testData)

# Accuracy of the predicted labels against the true labels on the test set.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.939133016627


In [16]:
# Count test rows where both the true label and the predicted label equal 1.
label_is_one = predictions.label == 1
prediction_is_one = predictions.prediction == 1
predictions.where(label_is_one & prediction_is_one).count()

295

In [17]:
# F1 score of the logistic-regression predictions on the test set.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)
f1 = evaluator.evaluate(predictions)
print("Test set f1 score = " + str(f1))

Test set f1 score = 0.931397995177


In [18]:
# Weighted precision of the logistic-regression predictions on the test set.
evaluator = MulticlassClassificationEvaluator(
    metricName="weightedPrecision",
    labelCol="label",
    predictionCol="prediction",
)
precision = evaluator.evaluate(predictions)
print("Test set precision = " + str(precision))

Test set precision = 0.930094376644


In [19]:
# Weighted recall of the logistic-regression predictions on the test set.
evaluator = MulticlassClassificationEvaluator(
    metricName="weightedRecall",
    labelCol="label",
    predictionCol="prediction",
)
recall = evaluator.evaluate(predictions)
print("Test set recall = " + str(recall))

Test set recall = 0.939133016627


## Without NLP features

In [168]:
# Load the feature file that excludes the NLP features, in LIBSVM format with
# the "numFeatures" option set to 150.
features_no_nlp = spark.read.format("libsvm").option("numFeatures", "150").load("features_no_nlp.libsvm")

In [240]:
# Print rows of the no-NLP frame with column contents left untruncated.
features_no_nlp.show(truncate=False)

+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                                                                                                                      |
+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(150,[4,7,26,36,41,43,70,145,148],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0])                                    

In [241]:
# 80/20 train/test split of the no-NLP feature frame.
# A fixed seed keeps the split -- and the metrics derived from it -- stable
# across kernel restarts.
(trainingData_no_nlp, testData_no_nlp) = features_no_nlp.randomSplit([0.8, 0.2], seed=42)

In [247]:
from pyspark.ml.classification import LogisticRegression

# Logistic-regression estimator for the no-NLP feature set, constructed with no
# arguments, i.e. with the library's default settings.
lr_no_nlp = LogisticRegression()

# Fit the model on the no-NLP training split.
lrModel_no_nlp = lr_no_nlp.fit(trainingData_no_nlp)

In [248]:
# Report the first ten fitted coefficients and the intercept of the
# logistic-regression model trained without NLP features.
leading_coefficients_no_nlp = lrModel_no_nlp.coefficients[:10]
print("Coefficients[:10]: " + str(leading_coefficients_no_nlp))

intercept_text_no_nlp = str(lrModel_no_nlp.intercept)
print("Intercept: " + intercept_text_no_nlp)

Coefficients[:10]: [ 0.18992387  0.26471058  0.07279275  0.22631037  0.01603386  0.02633943
  0.05526346  0.20970611 -0.24639487 -0.03954323]
Intercept: -3.54750938407


In [249]:
# Keep a handle on the no-NLP model's summary object; areaUnderROC is read from
# it in the next cell.
trainingSummary_no_nlp = lrModel_no_nlp.summary

In [250]:
# Report the area under the ROC curve from the no-NLP model summary.
# (The ROC table itself is not displayed for this model.)
print("areaUnderROC: " + str(trainingSummary_no_nlp.areaUnderROC))

areaUnderROC: 0.920768316496


In [251]:
# Apply the no-NLP logistic-regression model to its held-out split.
predictions_no_nlp = lrModel_no_nlp.transform(testData_no_nlp)

# Accuracy of the predicted labels against the true labels on the test set.
evaluator_no_nlp = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy_no_nlp = evaluator_no_nlp.evaluate(predictions_no_nlp)
print("Test set accuracy = " + str(accuracy_no_nlp))

Test set accuracy = 0.93937610359


In [4]:
# NOTE(review): `pandas` is imported here but not referenced by name in this cell.
import pandas
# Preview the predictions of the model trained without NLP features.
predictions_no_nlp.show()
# Convert a predictions frame to pandas and write it to predictions.csv.
# NOTE(review): the frame exported here is `predictions` (full-feature model),
# not the `predictions_no_nlp` frame previewed above -- confirm this is intended.
predictions.toPandas().to_csv('predictions.csv')

+-----+-----------+--------------------+--------------------+----------+
|label|   features|       rawPrediction|         probability|prediction|
+-----+-----------+--------------------+--------------------+----------+
|  0.0|(150,[],[])|[3.54750938406521...|[0.97200974412441...|       0.0|
|  0.0|(150,[],[])|[3.54750938406521...|[0.97200974412441...|       0.0|
|  0.0|(150,[],[])|[3.54750938406521...|[0.97200974412441...|       0.0|
|  0.0|(150,[],[])|[3.54750938406521...|[0.97200974412441...|       0.0|
|  0.0|(150,[],[])|[3.54750938406521...|[0.97200974412441...|       0.0|
|  0.0|(150,[],[])|[3.54750938406521...|[0.97200974412441...|       0.0|
|  0.0|(150,[],[])|[3.54750938406521...|[0.97200974412441...|       0.0|
|  0.0|(150,[],[])|[3.54750938406521...|[0.97200974412441...|       0.0|
|  0.0|(150,[],[])|[3.54750938406521...|[0.97200974412441...|       0.0|
|  0.0|(150,[],[])|[3.54750938406521...|[0.97200974412441...|       0.0|
|  0.0|(150,[],[])|[3.54750938406521...|[0.97200974

In [252]:
# F1 score of the no-NLP logistic-regression predictions on the test set.
evaluator_no_nlp = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)
f1_no_nlp = evaluator_no_nlp.evaluate(predictions_no_nlp)
print("Test set f1 score = " + str(f1_no_nlp))

Test set f1 score = 0.931117154275


## Feature Importance

In [43]:
# Read the raw feature table from CSV, taking column names from the header row
# and letting Spark infer column types. Only its column names are used below.
features_raw = spark.read.csv("features_df.csv", header=True, inferSchema=True)

In [44]:
# Column names of the raw table, skipping the first two columns.
# NOTE(review): presumably the first two columns are non-feature columns
# (e.g. an id and the label) -- confirm against features_df.csv.
feature_names = features_raw.schema.names[2:]

In [45]:
# Coefficient vector of the logistic regression fitted without NLP features.
coefficients = lrModel_no_nlp.coefficients

In [46]:
import pandas as pd
import numpy as np

# Pair each feature name with its coefficient, position by position, and hold
# the pairs in a two-column pandas table. Pairing stops at the shorter of the
# two sequences.
name_coefficient_pairs = [
    (name, weight) for name, weight in zip(feature_names, coefficients)
]
feature_importance = pd.DataFrame(
    name_coefficient_pairs,
    columns=["feature_name", "coefficient"],
)

In [258]:
# Magnitude of each coefficient, used as the ranking key.
feature_importance["coef_abs"] = feature_importance["coefficient"].abs()

In [None]:
# Rank features by absolute coefficient, largest first. `feature_importance` is
# already a pandas DataFrame (built with pd.DataFrame), so it is written with
# to_csv directly; pandas frames have no toPandas() method.
feature_importance_ranked = feature_importance.sort_values("coef_abs", ascending=False)

# Persist the ranked table, then display it as the cell's output.
feature_importance_ranked.to_csv('feature-importance.csv')
feature_importance_ranked

In [36]:
from pyspark.ml.classification import DecisionTreeClassifier

In [20]:
# Decision-tree classifier constructed with no arguments, i.e. with the
# library's default settings.
dt = DecisionTreeClassifier()

In [21]:
# Fit the decision tree on the training split of the full feature frame.
dtModel = dt.fit(trainingData)

In [42]:
# Display the tree's per-feature importance vector.
# NOTE(review): the recorded output is a vector of size 250, while the feature
# file is loaded with numFeatures set to 500 -- the output looks stale; re-run
# from a fresh kernel to confirm.
dtModel.featureImportances

(250,[4,9,10,15,31,44,66,74,95,105,123,141,145,148,154,193,212,218],[0.00251970030448,0.000952658687502,0.00178323233412,0.0304553902185,0.00225582352972,0.0705666384121,0.18113325785,0.00737393078548,0.01061957385,0.00260312278863,0.0603206228675,0.0249645282369,0.105118237508,0.481483948728,0.00290988469595,0.00670669255092,0.00271938699011,0.00551336966223])

In [22]:
# NOTE(review): this rebinds `coefficients` (earlier the no-NLP logistic-regression
# coefficients) to the decision tree's feature importances, so the name no longer
# describes its contents from here on.
coefficients = dtModel.featureImportances

In [23]:
# Pair each feature name with the value at the same position in `coefficients`
# (at this point the decision tree's feature importances) and hold the pairs in
# a two-column pandas table. Pairing stops at the shorter of the two sequences.
name_importance_pairs = [
    (name, weight) for name, weight in zip(feature_names, coefficients)
]
feature_importance = pd.DataFrame(
    name_importance_pairs,
    columns=["feature_name", "coefficient"],
)

In [24]:
# Magnitude of each value in the `coefficient` column, used as the ranking key.
feature_importance["coef_abs"] = feature_importance["coefficient"].abs()

In [25]:
# Display the table ranked by absolute value, largest first. sort_values returns
# a new frame here; `feature_importance` itself is left in its original order.
feature_importance.sort_values("coef_abs", ascending=False)

                             feature_name  coefficient  coef_abs
148                                   992     0.486448  0.486448
66                         Norepinephrine     0.182024  0.182024
145                                   940     0.108198  0.108198
44                   Iso-Osmotic Dextrose     0.069423  0.069423
123                                   365     0.067908  0.067908
141                                   909     0.031116  0.031116
95                             Vancomycin     0.011785  0.011785
1    0.9% Sodium Chloride (Mini Bag Plus)     0.009333  0.009333
65                          Nitroglycerin     0.007544  0.007544
14                              CefazoLIN     0.003762  0.003762
88                     Sodium Bicarbonate     0.002882  0.002882
4                           Acetaminophen     0.002658  0.002658
105                                   226     0.001725  0.001725
0                    0.9% Sodium Chloride     0.001643  0.001643
41                       

In [26]:
# Apply the fitted decision tree to the held-out split.
predictions_DT = dtModel.transform(testData)

# F1 score of the decision-tree predictions on the test set.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)
f1 = evaluator.evaluate(predictions_DT)
print("Test set f1 = " + str(f1))

Test set f1 = 0.921665559236


In [27]:
# Apply the fitted decision tree to the held-out split.
predictions_DT = dtModel.transform(testData)

# Weighted precision of the decision-tree predictions on the test set.
evaluator = MulticlassClassificationEvaluator(
    metricName="weightedPrecision",
    labelCol="label",
    predictionCol="prediction",
)
precision = evaluator.evaluate(predictions_DT)
print("Test set precision = " + str(precision))

Test set precision = 0.91955324265


In [28]:
# Apply the fitted decision tree to the held-out split.
predictions_DT = dtModel.transform(testData)

# Weighted recall of the decision-tree predictions on the test set.
evaluator = MulticlassClassificationEvaluator(
    metricName="weightedRecall",
    labelCol="label",
    predictionCol="prediction",
)
recall = evaluator.evaluate(predictions_DT)
print("Test set recall = " + str(recall))

Test set recall = 0.932403008709


In [29]:
from pyspark.ml.classification import GBTClassifier

In [30]:
# Gradient-boosted-trees classifier constructed with no arguments (library
# defaults), fitted on the training split of the full feature frame.
gbt = GBTClassifier()
gbtModel = gbt.fit(trainingData)

In [32]:
# Apply the fitted gradient-boosted-trees model to the held-out split.
predictions_gbt = gbtModel.transform(testData)

# F1 score of the gradient-boosted-trees predictions on the test set.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)
f1 = evaluator.evaluate(predictions_gbt)
print("Test set f1 = " + str(f1))

Test set f1 = 0.924731553276


In [33]:
# Weighted precision and weighted recall of the gradient-boosted-trees
# predictions on the test set. One evaluator is built and pointed at each
# metric in turn.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

precision = evaluator.setMetricName("weightedPrecision").evaluate(predictions_gbt)
print("Test set precision = " + str(precision))

recall = evaluator.setMetricName("weightedRecall").evaluate(predictions_gbt)
print("Test set recall = " + str(recall))

Test set precision = 0.923632908999
Test set recall = 0.93517418844


In [35]:
from pyspark.ml.classification import RandomForestClassifier

In [36]:
# Random-forest classifier constructed with no arguments (library defaults),
# fitted on the training split of the full feature frame.
rf = RandomForestClassifier()
rfModel = rf.fit(trainingData)

In [38]:
# Apply the fitted random forest to the held-out split.
predictions_rf = rfModel.transform(testData)

# F1, weighted precision and weighted recall on the test set. One evaluator is
# built and pointed at each metric in turn.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

f1 = evaluator.setMetricName("f1").evaluate(predictions_rf)
print("Test set f1 = " + str(f1))

precision = evaluator.setMetricName("weightedPrecision").evaluate(predictions_rf)
print("Test set precision = " + str(precision))

recall = evaluator.setMetricName("weightedRecall").evaluate(predictions_rf)
print("Test set recall = " + str(recall))

Test set f1 = 0.902572651921
Test set precision = 0.931193931584
Test set recall = 0.930324623911


In [49]:
from pyspark.ml.classification import NaiveBayes
# Naive Bayes classifier constructed with no arguments (library defaults),
# fitted on the training split of the full feature frame.
nb = NaiveBayes()
nbModel = nb.fit(trainingData)

In [50]:
# Apply the fitted Naive Bayes model to the held-out split.
predictions_nb = nbModel.transform(testData)

In [51]:
# F1, weighted precision and weighted recall of the Naive Bayes predictions on
# the test set. One evaluator is built and pointed at each metric in turn.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

f1 = evaluator.setMetricName("f1").evaluate(predictions_nb)
print("Test set f1 = " + str(f1))

precision = evaluator.setMetricName("weightedPrecision").evaluate(predictions_nb)
print("Test set precision = " + str(precision))

recall = evaluator.setMetricName("weightedRecall").evaluate(predictions_nb)
print("Test set recall = " + str(recall))

Test set f1 = 0.845325703653
Test set precision = 0.915845948876
Test set recall = 0.805522565321
