In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns



In [None]:
# Defining function for printing metrics
def scores(labels, predictions):
    """
    Print classification metrics for a set of predictions.
    Prints macro-averaged precision, recall and F-score, the overall
    accuracy, and the confusion matrix. Nothing is returned.
    args:
      labels:       true labels, any 1-D array-like (pandas Series,
                    numpy array or list).
      predictions:  predicted labels, same length and order as `labels`.
    """
    # Coerce both inputs to arrays so the element-wise comparison below works
    # for any array-like and always compares by position, never by pandas index.
    y_true = np.asarray(labels)
    y_pred = np.asarray(predictions)

    print('Precision: ', precision_score(y_true, y_pred, average = 'macro'))
    print('Recall: ', recall_score(y_true, y_pred, average = 'macro'))
    print('F-score: ', f1_score(y_true, y_pred, average = 'macro'))
    # Fraction of positions where the prediction equals the true label.
    print('Accuracy: ', np.mean(y_true == y_pred))
    print('Confusion matrix: \n', confusion_matrix(y_true, y_pred))
    
# Defining function for plotting confusion matrix
def cm_analysis(y_true, y_pred, labels, ymap=None, figsize=(10,10), filename='conf.pdf'):
    """
    Generate matrix plot of confusion matrix with pretty annotations.
    The plot is shown and, unless `filename` is None, saved to disk.
    Each cell is annotated with its share of the row (true class) in percent
    and the raw count; diagonal cells additionally show the row total.
    Off-diagonal cells with a zero count are left blank.
    args: 
      y_true:    true label of the data, with shape (nsamples,)
      y_pred:    prediction of the data, with shape (nsamples,)
      labels:    string array, name the order of class labels in the confusion matrix.
                 use `clf.classes_` if using scikit-learn models.
                 with shape (nclass,).
      ymap:      dict: any -> string, length == nclass.
                 if not None, map the labels & ys to more understandable strings.
                 Caution: original y_true, y_pred and labels must align.
      figsize:   the size of the figure plotted.
      filename:  filename of figure file to save (default 'conf.pdf').
                 Pass None to skip saving.
    """
    if ymap is not None:
        y_pred = [ymap[yi] for yi in y_pred]
        y_true = [ymap[yi] for yi in y_true]
        labels = [ymap[yi] for yi in labels]
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Number of true samples per class (row sums), kept 1-D so each entry is a
    # scalar that can be formatted with %d.
    row_totals = cm.sum(axis=1)
    has_samples = row_totals[:, None] > 0

    # Row-normalised percentages. Rows for classes with no true samples are
    # left at 0 instead of dividing by zero and producing nan.
    cm_perc = np.zeros(cm.shape, dtype=float)
    np.divide(cm, row_totals[:, None], out=cm_perc, where=has_samples)
    cm_perc *= 100

    # Object dtype so annotation strings of any length are stored untruncated.
    annot = np.empty(cm.shape, dtype=object)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, row_totals[i])
            elif c == 0:
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)

    cm_df = pd.DataFrame(cm, index=labels, columns=labels)
    cm_df.index.name = 'Actual'
    cm_df.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    # fmt='' because `annot` already holds fully formatted strings.
    sns.heatmap(cm_df, annot=annot, fmt='', ax=ax)
    plt.xticks(rotation=45)
    if filename is not None:
        fig.savefig(filename, dpi = 300)
    plt.show()



In [None]:
# Loading the dataset (first CSV column is used as the index) and shuffling its rows.
# A fixed random_state makes the shuffled order identical on every run.

data = pd.read_csv('final_dataset.csv', index_col = 0).sample(frac = 1, random_state = 42)


In [None]:
# Splitting data into train (85%) and test (15%) sets.
# A fixed random_state makes the split identical on every run.
train_data, test_data = train_test_split(data, test_size = 0.15, random_state = 42)

# The last column is used as the target; all preceding columns are the features.
train_X, train_y = train_data.iloc[:, :-1], train_data.iloc[:, -1]
test_X, test_y = test_data.iloc[:, :-1], test_data.iloc[:, -1]

In [None]:
# Defining and fitting the classifier.
# random_state is fixed so the fitted forest (and therefore the reported
# metrics) are reproducible across runs; n_jobs=-1 uses all available cores.
clf = RandomForestClassifier(
    n_estimators = 20000,
    max_depth = 24,
    n_jobs = -1,
    min_samples_leaf = 1,
    random_state = 42,
)
clf.fit(train_X, train_y)

# Predicting labels for the held-out test set
predictions = clf.predict(test_X)

In [None]:
# Printing and plotting scores
scores(test_y, predictions)
# The class order comes from the fitted classifier (`clf.classes_`); the
# previously used `labs_rec` is not defined anywhere in this notebook and
# raised a NameError on a fresh kernel.
cm_analysis(test_y, predictions, list(clf.classes_), ymap=None, figsize=(10,10))