Notebook to hold all functions for capstone project.

Sources are cited in a comment directly above any function that was adapted from an online source.

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import plot_confusion_matrix, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC

In [None]:
def scaler_w_column_names_tags(df_to_scale, scaler_type, tags='yes'):
    """Scale the feature columns of a genre-tagged DataFrame, keeping names.

    The 'genre_group' column is excluded from scaling. The scaled values are
    wrapped back into a DataFrame that carries the original feature column
    names and the original index, so rows stay aligned with their tags.

    Args:
        df_to_scale: DataFrame holding a 'genre_group' column plus the
            columns to be scaled.
        scaler_type: Scaler class (not an instance), e.g. ``StandardScaler``.
            It is instantiated with no arguments and must provide
            ``fit_transform``.
        tags: When 'yes' (the default) the 'genre_group' column is appended
            as the last column of the result; any other value omits it.

    Returns:
        DataFrame of scaled features, optionally followed by 'genre_group'.
    """
    features = df_to_scale.drop('genre_group', axis=1)

    # instantiate scaler
    scaler = scaler_type()

    # Name the columns from the features that were actually scaled rather
    # than renaming by position, so labels stay correct wherever
    # 'genre_group' sits in the input.
    scaled_df = pd.DataFrame(
        scaler.fit_transform(features),
        columns=features.columns,
        index=features.index,
    )

    if tags == 'yes':
        # Index was preserved above, so this assignment aligns row-for-row.
        scaled_df['genre_group'] = df_to_scale['genre_group']

    return scaled_df

In [None]:
def scaler_w_column_names(df_to_scale, scaler_type):
    """Scale every column of a DataFrame and keep its column names and index.

    Args:
        df_to_scale: DataFrame whose columns are all to be scaled.
        scaler_type: Either a scaler instance (e.g. ``StandardScaler()``) or
            a scaler class (e.g. ``StandardScaler``). A class is instantiated
            with no arguments, matching ``scaler_w_column_names_tags``.
            The scaler must provide ``fit_transform``.

    Returns:
        DataFrame of scaled values with the same column names and index as
        ``df_to_scale``.
    """
    # Accept both call styles: instances are used as-is, classes are built.
    scaler = scaler_type() if isinstance(scaler_type, type) else scaler_type

    return pd.DataFrame(
        scaler.fit_transform(df_to_scale),
        columns=df_to_scale.columns,
        index=df_to_scale.index,
    )

In [1]:
def kmeans_k(scaled_df):
    """Plot silhouette score and inertia for a range of K-Means cluster counts.

    One K-Means model is fitted for every k from 2 up to (and including) the
    number of columns in ``scaled_df``. A progress line is printed after each
    fit, then two figures are shown: silhouette score by k, and inertia by k.

    Args:
        scaled_df: Feature table to cluster; its column count sets the
            largest k that is tried.

    Returns:
        None. The scores are only plotted.
    """
    n_features = scaled_df.shape[1]
    candidate_ks = np.arange(2, n_features + 1)
    inertias, silhouettes = [], []

    for n_clusters in candidate_ks:
        model = KMeans(n_clusters=n_clusters)
        labels = model.fit_predict(scaled_df)

        inertias.append(model.inertia_)
        silhouettes.append(silhouette_score(scaled_df, labels))

        # progress marker so a long run is visibly still alive
        print(f'Computing scores for k = {n_clusters}')

    # Both charts share the same layout; silhouette first, then inertia.
    charts = (
        (silhouettes, 'K', 'Silhouette Score', 'Silhouette by No. of Clusters'),
        (inertias, 'Number of Clusters', 'Inertia', 'Inertia by No. of Clusters'),
    )
    for values, x_label, y_label, title in charts:
        plt.figure()
        plt.plot(candidate_ks, values, marker='o')
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title(title)
        plt.xticks(candidate_ks)
        plt.show()

In [None]:
def kmeans_model(scaled_df, k):
    """Fit K-Means with ``k`` clusters and visualise the clusters with t-SNE.

    The cluster labels are attached to a copy of ``scaled_df``, a 40% sample
    (fixed seed) is embedded into three t-SNE dimensions, and a seaborn
    pairplot of those dimensions is drawn, coloured by cluster label.

    Args:
        scaled_df: DataFrame of features to cluster.
        k: Number of clusters passed to ``KMeans``.

    Returns:
        None. The result is a plot.
    """
    # run kmeans model for chosen number of clusters
    KMeans_model = KMeans(n_clusters=k)
    kmeans_y_labels = KMeans_model.fit_predict(scaled_df)

    # Add the labels to a copy so the clusters can be coloured on graphs.
    kmeans_df = scaled_df.copy()
    kmeans_df['kmeans_labels'] = kmeans_y_labels

    # create sample for t-sne
    sample = kmeans_df.sample(frac=0.4, random_state=1)

    # Instantiate t-SNE
    tsne = TSNE(n_components=3, random_state=1, verbose=1)

    # Embed the features only: leaving the label column in would feed the
    # cluster assignment into the embedding it is meant to illustrate.
    tsne_data = tsne.fit_transform(sample.drop('kmeans_labels', axis=1))

    # create tSNE dataframe
    tsne_df = pd.DataFrame(
        tsne_data,
        columns=[f'tSNE D{i+1}' for i in range(tsne_data.shape[1])],
    )

    # .values attaches labels by position; tsne_df has a fresh index while
    # sample keeps the original row labels.
    tsne_df['kmeans_labels'] = sample['kmeans_labels'].values

    # visualise
    sns.pairplot(tsne_df, hue='kmeans_labels', plot_kws={'alpha': 0.5})

In [3]:
def dbscan_epsilon(scaled_df, eps_start, eps_end, counter):
    """Print cluster count and silhouette score for a sweep of DBSCAN epsilons.

    For each epsilon in ``np.arange(eps_start, eps_end, counter)`` a DBSCAN
    model with ``min_samples=5`` is fitted and one summary line is printed.

    Args:
        scaled_df: Features to cluster.
        eps_start: First epsilon to try (inclusive).
        eps_end: End of the epsilon range (exclusive, as in ``np.arange``).
        counter: Step between successive epsilons.

    Returns:
        None. Results are printed.
    """
    # try a range of epsilons
    for epsilon in np.arange(eps_start, eps_end, counter):
        y_labels = DBSCAN(eps=epsilon, min_samples=5).fit_predict(scaled_df)

        # silhouette_score needs 2..n_samples-1 distinct labels; report nan
        # for degenerate results (e.g. everything noise, or one cluster)
        # instead of aborting the whole sweep.
        # NOTE: the noise label -1 is scored as if it were a cluster.
        n_labels = len(np.unique(y_labels))
        if 2 <= n_labels <= len(y_labels) - 1:
            silhouette = silhouette_score(scaled_df, y_labels)
        else:
            silhouette = np.nan

        # cluster count excludes the noise label -1
        n_clusters = len(np.unique(y_labels[y_labels != -1]))

        print(f'eps = {round(epsilon, 2)}, --- n_clusters: {n_clusters} --- silhouette: {silhouette}')

In [None]:
def dbscan_model(scaled_df, e):
    """Fit DBSCAN with epsilon ``e`` and visualise the clusters with t-SNE.

    Labels from a DBSCAN fit (``min_samples=5``) are attached to a copy of
    ``scaled_df``. A 40% sample (fixed seed) is embedded into three t-SNE
    dimensions using the feature columns only, and a seaborn pairplot of the
    embedding is drawn, coloured by DBSCAN label.

    Args:
        scaled_df: DataFrame of features to cluster.
        e: Value passed to DBSCAN as ``eps``.

    Returns:
        None. The result is a plot.
    """
    label_col = 'dbscan_labels'

    clusterer = DBSCAN(eps=e, min_samples=5)
    labelled = scaled_df.copy()
    labelled[label_col] = clusterer.fit_predict(scaled_df)

    subset = labelled.sample(frac=0.4, random_state=1)
    subset_features = subset.drop(label_col, axis=1)

    # three-component embedding of the sampled features
    embedder = TSNE(n_components=3, random_state=1, verbose=1)
    embedding = embedder.fit_transform(subset_features)

    dim_names = [f'tSNE D{i+1}' for i in range(embedding.shape[1])]
    embedding_df = pd.DataFrame(embedding, columns=dim_names)

    # positional attach: embedding_df has a fresh index, subset does not
    embedding_df[label_col] = subset[label_col].values

    sns.pairplot(embedding_df, hue=label_col, plot_kws={'alpha': 0.5})

In [8]:
def plot_roc_train_test(model, X_train, y_train, X_test=None, y_test=None):
    """Plot the ROC curve of a fitted binary classifier and print its AUC.

    The train curve is always drawn. The test curve is drawn only when both
    ``X_test`` and ``y_test`` are supplied. Earlier versions read the test
    curve from the undefined names ``fprs``, ``tprs`` and ``roc_auc``, which
    raised NameError unless those happened to exist as notebook globals.

    Args:
        model: Fitted classifier exposing ``predict_proba``; column 1 of its
            output is used as the score.
        X_train: Training features.
        y_train: Training labels.
        X_test: Optional test features.
        y_test: Optional test labels.

    Returns:
        None. A figure is shown and the AUC score(s) are printed.
    """
    # get the probability for each point in the train set.
    y_proba_train = model.predict_proba(X_train)[:, 1]

    # Compute ROC curve and AUC for the one class
    fprs_train, tprs_train, _ = roc_curve(y_train, y_proba_train)
    roc_auc_train = roc_auc_score(y_train, y_proba_train)

    has_test = X_test is not None and y_test is not None
    if has_test:
        y_proba_test = model.predict_proba(X_test)[:, 1]
        fprs_test, tprs_test, _ = roc_curve(y_test, y_proba_test)
        roc_auc_test = roc_auc_score(y_test, y_proba_test)

    # Plot the ROC curve.
    plt.figure()
    plt.plot(fprs_train, tprs_train, color='darkorange', lw=2, label='train')
    if has_test:
        plt.plot(fprs_test, tprs_test, lw=2, label='test')
    # diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC and AUC')
    plt.legend(loc="best")
    plt.show()

    if has_test:
        print(f"Test AUC score: {roc_auc_test}")
    print(f"Train AUC score: {roc_auc_train}")

In [6]:
def feature_importance(fittedgrid, feature_names=None):
    """Bar-plot the feature importances of a grid search's best pipeline.

    The importances are read from the final step of
    ``fittedgrid.best_estimator_`` (which must expose ``steps``). Error bars
    show the standard deviation of the importances across the final
    estimator's fitted sub-estimators, when it has any.

    Args:
        fittedgrid: Fitted search object whose ``best_estimator_`` is a
            pipeline ending in a model with ``feature_importances_``.
        feature_names: Optional labels for the bars, one per importance.
            When omitted, the notebook-level ``X_train.columns`` is used, as
            in earlier versions of this function.

    Returns:
        None. The result is a plot.
    """
    final_model = fittedgrid.best_estimator_.steps[-1][1]

    # calculate feature importances
    importances = final_model.feature_importances_

    if feature_names is None:
        # Legacy behaviour: relies on a global X_train defined by the caller.
        feature_names = X_train.columns
    forest_importances = pd.Series(importances, index=feature_names)

    # Spread across the ensemble's members. The previous code repeated the
    # same aggregated vector, so the error bars were always zero.
    try:
        std = np.std(
            [tree.feature_importances_ for tree in final_model.estimators_],
            axis=0,
        )
    except AttributeError:
        # Not an ensemble, or its members do not expose importances.
        std = None

    # plot feature importances
    fig, ax = plt.subplots()
    forest_importances.plot.bar(yerr=std, ax=ax)
    ax.set_title("Feature importances using MDI")
    ax.set_ylabel("Mean decrease in impurity")
    fig.tight_layout()

In [17]:
# https://github.com/vinyluis/Articles/blob/main/ROC%20Curve%20and%20ROC%20AUC/ROC%20Curve%20-%20Multiclass.ipynb
def calculate_tpr_fpr(y_real, y_pred):
    '''
    Computes the True Positive Rate (tpr) and the False Positive Rate (fpr)
    from real and predicted class labels.

    The rates are derived from the four cells [0, 0], [0, 1], [1, 0], [1, 1]
    of the confusion matrix, read as TN, FP, FN and TP respectively.
    NOTE(review): this presumes a binary problem whose negative class sorts
    first in the confusion matrix - confirm against callers.

    Args:
        y_real: The list or series with the real classes
        y_pred: The list or series with the predicted classes

    Returns:
        tpr: The True Positive Rate of the classifier
        fpr: The False Positive Rate of the classifier
    '''
    matrix = confusion_matrix(y_real, y_pred)

    # first row: actual negatives; second row: actual positives
    true_neg, false_pos = matrix[0, 0], matrix[0, 1]
    false_neg, true_pos = matrix[1, 0], matrix[1, 1]

    sensitivity = true_pos / (true_pos + false_neg)
    specificity = true_neg / (true_neg + false_pos)

    # fpr is expressed as 1 - specificity
    return sensitivity, 1 - specificity

In [16]:
# https://github.com/vinyluis/Articles/blob/main/ROC%20Curve%20and%20ROC%20AUC/ROC%20Curve%20-%20Multiclass.ipynb
def get_all_roc_coordinates(y_real, y_proba):
    '''
    Calculates all the ROC Curve coordinates (tpr and fpr) by considering each point as a threshold for the prediction of the class.

    Every score in ``y_proba`` is used once as a threshold: observations with
    a score greater than or equal to it are predicted positive, and the
    resulting tpr/fpr pair is recorded. Both lists start with 0, so each has
    ``len(y_proba) + 1`` entries.

    Args:
        y_real: The list or series with the real classes.
        y_proba: One score per observation (list, array or series).
            NOTE(review): presumably the positive-class column of
            `.predict_proba()` - confirm against callers.

    Returns:
        tpr_list: The list of TPRs representing each threshold.
        fpr_list: The list of FPRs representing each threshold.
    '''
    # Work on a plain array: thresholds are then taken by position (a pandas
    # Series with a non-default index would be looked up by label) and plain
    # lists support the element-wise comparison below.
    scores = np.asarray(y_proba)

    tpr_list = [0]
    fpr_list = [0]
    for threshold in scores:
        tpr, fpr = calculate_tpr_fpr(y_real, scores >= threshold)
        tpr_list.append(tpr)
        fpr_list.append(fpr)
    return tpr_list, fpr_list

In [15]:
# https://github.com/vinyluis/Articles/blob/main/ROC%20Curve%20and%20ROC%20AUC/ROC%20Curve%20-%20Multiclass.ipynb
def plot_roc_curve(tpr, fpr, scatter = True, ax = None):
    '''
    Plots the ROC Curve by using the list of coordinates (tpr and fpr).

    A green diagonal from (0, 0) to (1, 1) is drawn for reference, and both
    axes are limited to [-0.05, 1.05].

    Args:
        tpr: The list of TPRs representing each coordinate.
        fpr: The list of FPRs representing each coordinate.
        scatter: When True, the points used on the calculation will be plotted with the line (default = True).
        ax: Axes to draw on. When None (default) a new 5x5 figure with its own axes is created.
    '''
    if ax is None:
        plt.figure(figsize = (5, 5))
        ax = plt.axes()

    if scatter:
        sns.scatterplot(x = fpr, y = tpr, ax = ax)
    sns.lineplot(x = fpr, y = tpr, ax = ax)
    sns.lineplot(x = [0, 1], y = [0, 1], color = 'green', ax = ax)

    # Configure the axes that were drawn on; the pyplot-level calls used
    # before act on whichever axes is current, which need not be `ax`.
    ax.set_xlim(-0.05, 1.05)
    ax.set_ylim(-0.05, 1.05)
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")