In [1]:
%matplotlib inline

import pandas as pd
import seaborn as sns
import graphviz 
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing, linear_model, decomposition
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import numpy as np
from math import floor

from pyspark.sql import SparkSession
import pyspark.sql.types as ST
from pyspark.sql import Row
from pyspark.sql import functions as F
from datetime import datetime

# Notebook-wide display config: allow up to 500 columns to be shown before
# pandas truncates the rendering of wide DataFrames.
pd.set_option('display.max_columns', 500)

# Data Correlation

In [None]:
def plot_corr_matrix(df, corr_method="pearson"):
    """Draw the lower triangle of ``df``'s pairwise correlation matrix as a heatmap.

    Adapted from
    https://seaborn.pydata.org/examples/many_pairwise_correlations.html

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated with ``df.corr``.
    corr_method : str, default "pearson"
        Passed to ``df.corr`` as the correlation method and reused as the
        figure title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Notes
    -----
    ``df.corr`` is called with ``min_periods=100``, and the colour scale is
    capped at ``vmax=.3`` and centred on 0.
    """
    # Compute the correlation matrix
    corr = df.corr(corr_method, min_periods=100)

    # Generate a mask for the upper triangle (diagonal included).
    # The builtin ``bool`` is used because the ``np.bool`` alias was deprecated
    # in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    _fig, ax = plt.subplots(figsize=(11, 9))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio, explicitly on
    # the axes created above rather than on whatever axes is "current".
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, square=True,
                linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
    ax.set_title(corr_method)
    plt.show()

# Model performance

In [None]:
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and render it as an annotated image.

    When ``normalize`` is True every row of ``cm`` is divided by its row sum
    before printing and plotting; otherwise ``cm`` is shown as given.

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix; rows are drawn on the "True label" axis and columns
        on the "Predicted label" axis.
    classes : sequence
        Tick labels applied to both axes.
    normalize : bool
        Whether to row-normalize ``cm``.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap handed to ``plt.imshow``.

    Adapted from:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        banner = "Normalized confusion matrix"
    else:
        banner = 'Confusion matrix, without normalization'
    print(banner)

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # Same tick positions and labels on both axes.
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Two decimals for ratios, plain integers for raw counts.
    cell_format = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text for contrast.
    midpoint = cm.max() / 2.
    row_range = range(cm.shape[0])
    col_range = range(cm.shape[1])
    for row in row_range:
        for col in col_range:
            value = cm[row, col]
            label = format(value, cell_format)
            text_color = "white" if value > midpoint else "black"
            plt.text(col, row, label,
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
def report_confustion_mtx(clf):
    """Plot row-normalized confusion matrices for ``clf`` on the train and test splits.

    Reads the notebook-level globals ``x_train``, ``y_train``, ``x_test`` and
    ``y_test``; they must be defined before this function is called.

    Parameters
    ----------
    clf : object
        Fitted model exposing ``predict``.

    Notes
    -----
    Tick labels are hard-coded to "0" and "1".
    NOTE(review): this presumes a binary target encoded as 0/1 -- confirm.
    """
    _train_pre = clf.predict(x_train)
    _test_pre = clf.predict(x_test)

    # confusion_matrix takes (y_true, y_pred). Passing predictions first
    # transposes the matrix, which mislabels the "True label" / "Predicted
    # label" axes and makes the row normalization divide by predicted-class
    # totals instead of true-class totals.
    reports = (
        ("train performance", confusion_matrix(y_train, _train_pre)),
        ("test performance", confusion_matrix(y_test, _test_pre)),
    )

    for title, matrix in reports:
        # Fresh figure per split so each plot is rendered on its own canvas.
        plt.figure()
        plot_confusion_matrix(matrix, classes=["0", "1"], title=title, normalize=True)
        plt.show()