In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
from sklearn.metrics import confusion_matrix

from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import udf, col
from pyspark.sql.types import FloatType

In [2]:
# Read the semicolon-separated CSV with a header row, letting Spark infer
# column types, then print the resulting schema.
csvReader = (
    spark.read
    .option('sep', ';')
    .option('header', True)
    .option('inferSchema', True)
)
df = csvReader.csv('/FileStore/tables/bank.csv')
df.printSchema()

In [3]:
# Pull the first three rows to the driver and show them transposed,
# so each source column is displayed as a row.
previewRows = df.take(3)
pd.DataFrame(previewRows, columns=df.columns).T

Unnamed: 0,0,1,2
age,30,33,35
job,unemployed,services,management
marital,married,married,single
education,primary,secondary,tertiary
default,no,no,no
balance,1787,4789,1350
housing,no,yes,yes
loan,no,yes,no
contact,cellular,cellular,cellular
day,19,11,16


In [5]:
# Unbalanced Dataset: row count per value of the target column, rarest first.

targetColName = 'y'
targetCounts = df.groupBy(targetColName).count()
targetCounts.orderBy('count').show()

In [6]:
# Summarize numeric features

# df.dtypes yields (column name, type name) pairs. Accept every integral and
# floating-point type name so that columns inferred as 'bigint' or 'double'
# are not silently left out of the numeric feature list.
numericTypeNames = ('tinyint', 'smallint', 'int', 'bigint', 'float', 'double')
numColNames = [colName for colName, dtype in df.dtypes if dtype in numericTypeNames]
# NOTE(review): Spark appears to report boolean columns as 'boolean', so the
# 'bool' entry presumably never matches -- confirm before relying on it.
catgColNames = [colName for colName, dtype in df.dtypes if dtype in ('string', 'bool')]

In [7]:
# Summary statistics for the numeric columns only.
numericSummary = df.select(*numColNames).describe()
numericSummary.show()

In [8]:
# Build the preprocessing stages: index and one-hot encode every categorical
# feature, index the target 'y' into 'label', then assemble all encoded and
# numeric columns into a single 'features' vector.

# Filter instead of list.remove() so that re-running this cell does not raise
# ValueError once 'y' has already been dropped from the list.
catgColNames = [c for c in catgColNames if c != 'y']

stages = list()
for catgColName in catgColNames:
    stringIndexer = StringIndexer(inputCol=catgColName, outputCol=catgColName + 'Index')
    oneHotEncoder = OneHotEncoderEstimator(
        inputCols=[stringIndexer.getOutputCol()],
        outputCols=[catgColName + "classVec"])
    stages.extend([stringIndexer, oneHotEncoder])

# The target is only indexed (not one-hot encoded); its index is the label.
yStringIndexer = StringIndexer(inputCol='y', outputCol='label')
stages.append(yStringIndexer)

assemblerInputs = [c + "classVec" for c in catgColNames] + numColNames
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages.append(assembler)

In [9]:
# Fit all preprocessing stages on the full frame and append their output
# columns. Note that df is rebound to the transformed frame here.
pipeline = Pipeline().setStages(stages)
pipelineModel = pipeline.fit(df)
df = pipelineModel.transform(df)
df.printSchema()

Logistic Regression

In [11]:
# 80/20 train/test split; the fixed seed keeps the split repeatable.
splitWeights = [0.8, 0.2]
train, test = df.randomSplit(splitWeights, seed=2018)

In [12]:
# Baseline logistic regression: fit on the training split, then score the test split.
lr = LogisticRegression(
    maxIter=10,
    labelCol='label',
    featuresCol='features',
)
lrModel = lr.fit(train)
predictions = lrModel.transform(test)

In [13]:
# predictions is a Spark DataFrame holding every column of test plus three
# additional columns: rawPrediction, probability and prediction.
predictions.printSchema()

In [14]:
# Inspect the first scored row: the raw prediction alone, how many rows
# take(1) returned, and then all three model output columns together.
firstRawPrediction = predictions.select('rawPrediction').take(1)
print(firstRawPrediction[0])
print(len(firstRawPrediction))

modelOutputCols = ['rawPrediction', 'probability', 'prediction']
print(predictions.select(*modelOutputCols).take(1)[0])

In [15]:
# Show the three model output columns for the first three scored rows.
predictions.select('rawPrediction', 'probability', 'prediction').show(3)

In [16]:
# Fitted coefficients sorted in ascending order; the bare name displays them.
beta = np.sort(lrModel.coefficients)
beta

In [17]:
# Summary object exposed by the fitted model; preview the first three rows of its ROC table.
trainingSummary = lrModel.summary
trainingSummary.roc.show(3)


In [18]:
# Preview the first three rows of the summary's precision-recall table.
trainingSummary.pr.show(3)

In [19]:
# Evaluator API:  https://people.eecs.berkeley.edu/~jegonzal/pyspark/_modules/pyspark/ml/evaluation.html

# Score the test-set predictions with the evaluator's default settings.
evaluator = BinaryClassificationEvaluator()
testAreaUnderRoc = evaluator.evaluate(predictions)
print('Test Area Under ROC', testAreaUnderRoc)

In [20]:
# Hyper-parameter grid for cross-validation: 3 x 3 x 3 = 27 combinations of
# regularisation strength, elastic-net mixing and iteration budget.
paramGrid = (ParamGridBuilder()
    .addGrid(lr.regParam, [0.01, 0.5, 2.0])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .addGrid(lr.maxIter, [1, 5, 10])
    .build()
)

# 3-fold CrossValidator. The seed pins the fold assignment so the selected
# model is repeatable, matching the seed already used for the train/test split.
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=2018)
cvModel = cv.fit(train)


In [21]:
# Attributes of the model are documented at:
# https://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html#pyspark.ml.classification.LogisticRegressionModel

# Coefficients of the best model selected by cross-validation.
cvModel.bestModel.coefficients

In [22]:
# Score the test split with the fitted CrossValidator model and report its ROC area.
predictions = cvModel.transform(test)

evaluator = BinaryClassificationEvaluator()
cvTestAreaUnderRoc = evaluator.evaluate(predictions)
print('Test Area Under ROC', cvTestAreaUnderRoc)

In [23]:
# Compare the original target 'y' with its indexed 'label' for ten training rows.
train.select('y', 'label').show(10)

**Adding a new column with weights to account for the imbalance in the labels of the dataset**

Reference: https://stackoverflow.com/questions/33372838/dealing-with-unbalanced-datasets-in-spark-mllib

In [26]:
# Count rows per label once and reuse the result for both the display and the
# weight computation, instead of scanning df three separate times.
labelCounts = df.groupBy('label').count()
labelCounts.show()

countsByLabel = {row['label']: row['count'] for row in labelCounts.collect()}
nZeros = countsByLabel.get(0.0, 0)
nRows = sum(countsByLabel.values())

# Weight for label 0 = share of non-zero rows; weight for label 1 = share of
# zero rows. The rarer class therefore receives the larger weight.
# float() forces true division so the ratio is not truncated to 0 where `/`
# on two ints performs integer division (Python 2).
balancingRatioZeros = (nRows - nZeros) / float(nRows)
balancingRatioOnes = 1 - balancingRatioZeros


def calculateWeights(label):
  """Return the balancing weight for one row.

  Args:
    label: the row's indexed label value.

  Returns:
    balancingRatioOnes when label equals 1.0, otherwise balancingRatioZeros.
  """
  if (label == 1.0):
    return balancingRatioOnes
  else:
    return balancingRatioZeros


calculateWeightsUdf = udf(calculateWeights, FloatType())

# Attach the per-row weight as a new column.
df = df.withColumn('labelWeight', calculateWeightsUdf(col("label")))

df.select('label', 'labelWeight').show()

In [27]:
# Re-split now that df carries the labelWeight column. The weights sum to 1
# and match the 80/20 split used for the unweighted model, so the two runs are
# comparable (the previous [0.7, 0.2] did not sum to 1).
train, test = df.randomSplit([0.8, 0.2], seed = 2018)

evaluator = BinaryClassificationEvaluator()
# Same logistic regression as before, but each row is weighted by labelWeight.
lr = LogisticRegression(
  featuresCol='features',
  labelCol ='label',
  weightCol='labelWeight',
  maxIter=10)

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
    .addGrid(lr.regParam, [0.01, 0.5, 2.0])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .addGrid(lr.maxIter, [1, 5, 10])
    .build()
)

# 3-fold CrossValidator; the seed pins the fold assignment so the run is repeatable.
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=2018)
cvModel = cv.fit(train)
predictions = cvModel.transform(test)

print('Test Area Under ROC', evaluator.evaluate(predictions))

In [28]:
# Show the true label next to the predicted class for the first eight rows.
predictions.select('label', 'prediction').show(8)

In [29]:
# Collect label and prediction in a single job so the two columns are
# guaranteed to stay row-aligned; two independent toPandas() calls run as
# separate jobs whose row order is not guaranteed to match.
labelAndPrediction = predictions.select('label', 'prediction').toPandas()
yTrue = labelAndPrediction['label']
yPred = labelAndPrediction['prediction']
confMatrix = confusion_matrix(y_true=yTrue, y_pred=yPred)

In [30]:
# Display the confusion matrix computed from the test-set predictions.
confMatrix

In [31]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix.

    Args:
        cm: square 2-D numpy array; it is indexed as cm[true, predicted].
        classes: tick labels, one per class, in matrix order.
        normalize: when True, each row is divided by its row total so the
            cells show per-true-class fractions instead of counts.
        title: plot title.
        cmap: matplotlib colormap used for the heat map.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no true samples has an all-zero row; divide it by 1
        # so it stays zero instead of turning into NaN (0 / 0).
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Annotate each cell; switch to white text on dark cells for legibility.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out last so the axis labels set above are included in the layout pass.
    plt.tight_layout()

plot_confusion_matrix(confMatrix, classes=['0', '1'])

Decision Tree

In [33]:
# Decision tree tuned over tree depth and number of feature bins (4 x 3 = 12 combinations).
evaluator = BinaryClassificationEvaluator()
dt = DecisionTreeClassifier(labelCol='label', featuresCol='features')

paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [1, 2, 6, 10])
             .addGrid(dt.maxBins, [20, 40, 80])
             .build())

# 3-fold CrossValidator; the seed pins the fold assignment so the run is repeatable.
cv = CrossValidator(
    estimator=dt,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=2018)
cvModel = cv.fit(train)

In [34]:
# Score the test split with the fitted CrossValidator model and report its ROC area.
predictions = cvModel.transform(test)

dtTestAreaUnderRoc = evaluator.evaluate(predictions)
print('Test Area Under ROC', dtTestAreaUnderRoc)

Area under ROC is unusually low. Calculate AUC using sklearn

In [36]:
# Collect label and prediction in a single job so the two columns are
# guaranteed to stay row-aligned.
labelAndPrediction = predictions.select('label', 'prediction').toPandas()
yTrue = labelAndPrediction['label']
yPred = labelAndPrediction['prediction']

def print_conf_matrix(y_test, y_pred, class_names=(0, 1)):
    """Print the confusion matrix as a labelled pandas DataFrame.

    Args:
        y_test: true class labels.
        y_pred: predicted class labels, aligned with y_test.
        class_names: class values in the order rows and columns should
            appear. They are passed to confusion_matrix as `labels`, so the
            matrix always has one row and column per listed class, even when
            a class is absent from the data.
    """
    class_names = list(class_names)
    confusion_df = pd.DataFrame(
        data=confusion_matrix(y_test, y_pred, labels=class_names),
        columns=["Predicted Class " + str(class_name) for class_name in class_names],
        index=["Class " + str(class_name) for class_name in class_names]
    )
    print(confusion_df)

print_conf_matrix(yTrue, yPred)

sklearn's `metrics.auc` gives a different area under the ROC curve than Spark's evaluator:

In [38]:
# udf and FloatType are already imported in the notebook's import cell.
from sklearn.metrics import roc_curve, auc

# Pull element 1 of the probability vector into a plain float column.
secondElement = udf(lambda v:float(v[1]), FloatType())
predictions = predictions.withColumn('class1Prob', secondElement('probability'))

# Fetch labels and scores in one job so they are guaranteed to be row-aligned.
labelAndProb = predictions.select('label', 'class1Prob').toPandas()
yTrue = labelAndProb['label']
yProb = labelAndProb['class1Prob']

fpr, tpr, threshold = roc_curve(yTrue, yProb)
# Use a distinct name for the result: rebinding `auc` would shadow the
# imported function and make this cell fail when it is run a second time.
rocAuc = auc(fpr, tpr)
rocAuc

Random Forest Classifier

In [40]:
# Random forest tuned over depth, bins and number of trees (3 x 2 x 2 = 12 combinations).
evaluator = BinaryClassificationEvaluator()
# Seed the forest so its random sampling is repeatable between runs.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=2018)

paramGrid = (ParamGridBuilder()
    .addGrid(rf.maxDepth, [2, 4, 6])
    .addGrid(rf.maxBins, [20, 60])
    .addGrid(rf.numTrees, [5, 20])
    .build()
)
# 3-fold CrossValidator; the seed pins the fold assignment so the run is repeatable.
cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=2018)
cvModel = cv.fit(train)
predictions = cvModel.transform(test)

print('Test Area Under ROC', evaluator.evaluate(predictions))

Gradient Boosting

In [42]:
# Gradient-boosted trees tuned over depth, bins and boosting iterations (3 x 2 x 2 = 12 combinations).
evaluator = BinaryClassificationEvaluator()
# Name the label and feature columns explicitly, as the other classifiers in
# this notebook do, rather than relying on the estimator's defaults.
gbt = GBTClassifier(labelCol='label', featuresCol='features', maxIter=10)

paramGrid = (ParamGridBuilder()
    .addGrid(gbt.maxDepth, [2, 4, 6])
    .addGrid(gbt.maxBins, [20, 60])
    .addGrid(gbt.maxIter, [10, 20])
    .build()
)

# 3-fold CrossValidator; the seed pins the fold assignment so the run is repeatable.
cv = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=2018)
cvModel = cv.fit(train)
predictions = cvModel.transform(test)

print('Test Area Under ROC', evaluator.evaluate(predictions))