# Model training & Data Analysis

## Imports

In [1]:
import pandas as pd
import bigml.api
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'bigml'

## Instantiate BigML - needs the BigML project id

In [None]:
# BigML client bound to one hard-coded project id; requires the `bigml` package.
# NOTE(review): no credentials are passed here -- presumably they are read from
# the environment; confirm against the BigML bindings documentation.
api = bigml.api.BigML(project='project/5db1644859f5c33b3c00076c')

## Creating datasets

In [None]:
# Source files: create one BigML source per local CSV (paths are relative to
# the working directory).
train_full_source = api.create_source('./full_train_edit.csv')
test_source = api.create_source('./test_edit.csv')
# NOTE(review): only test_source is passed to api.ok; train_full_source is not
# checked in this cell. Presumably api.ok waits until the resource is ready --
# confirm against the BigML bindings documentation.
api.ok(test_source)

In [None]:
# Datasets: one per source created above.
train_full_dataset = api.create_dataset(train_full_source)
# NOTE: test_dataset is reassigned further down in this file to the validation
# split, so the dataset built from test_source is not used after that point.
test_dataset = api.create_dataset(test_source)
# Only train_full_dataset is checked with api.ok in this cell.
api.ok(train_full_dataset)

Splitting train_full into a training dataset and a validation dataset

In [None]:
# Training split: sample of train_full_dataset at rate 0.8 with a fixed seed.
train_dataset = api.create_dataset(
    train_full_dataset, {"name": "Train Dataset",
                         "sample_rate": 0.8, "seed": "my seed"})
api.ok(train_dataset)
# Validation split: same sample rate and seed, plus "out_of_bag": True --
# presumably this selects the rows NOT drawn for the training split (confirm
# against the BigML dataset sampling documentation).
# NOTE: the result is stored in test_dataset, overwriting the dataset that was
# built from test_source earlier in this file.
test_dataset = api.create_dataset(
    train_full_dataset, {"name": "Validation Dataset",
                         "sample_rate": 0.8, "seed": "my seed",
                         "out_of_bag": True})
api.ok(test_dataset)

## Training a model

In [None]:
# Create an ensemble from the training split, with the "SeriousDlqin2yrs"
# column as the objective (target) field.
ensemble = api.create_ensemble(
    train_dataset, {"objective_field": "SeriousDlqin2yrs"})
api.ok(ensemble)

## Batching a prediction

### making the prediction

In [None]:
# Batch prediction of the ensemble over test_dataset (at this point in the file
# that name holds the validation split, not the dataset built from test_source).
# "prediction_name" matches the 'Prediction' column read by the analysis code
# below; "probabilities": True is presumably what yields the '1 probability'
# column used later -- confirm against the BigML batch prediction documentation.
prediction = api.create_batch_prediction(ensemble, test_dataset, {
    "name": "1st shot",
    "all_fields": True,
    "prediction_name": "Prediction",
    "probabilities": True
})
api.ok(prediction)

### downloading the prediction

In [None]:
# Save a batch prediction to ./prediction.csv.
# NOTE(review): the id is hard-coded rather than taken from the `prediction`
# variable created above, so this may not be the batch prediction that was just
# computed -- confirm this is intentional.
api.download_batch_prediction(
    'batchprediction/5dc036bc5299632024000e4f', filename="./prediction.csv")

FILE TO LAUNCH

In [None]:
# Load the downloaded predictions; the first CSV column becomes the index.
prediction_df = pd.read_csv("./prediction.csv", index_col=0)
# Bare expression: displays the frame as the notebook cell output.
prediction_df

## Analysing the prediction's results

In [None]:
# Writing the confusion (FP, FN, TP, TN)


def set_confusion(row):
    """Label one row as 'TN', 'TP', 'FN' or 'FP'.

    Compares the actual value in ``row['SeriousDlqin2yrs']`` with the model
    output in ``row['Prediction']``.

    Args:
        row: mapping-like object (e.g. a DataFrame row) exposing both keys.

    Returns:
        'TN' when both are equal and the actual value is 0, 'TP' when both
        are equal and the actual value is 1, 'FN' when the actual value is
        greater than the prediction, and 'FP' in every other case.
    """
    actual = row['SeriousDlqin2yrs']
    predicted = row['Prediction']

    if actual == predicted:
        if actual == 0:
            return 'TN'
        if actual == 1:
            return 'TP'

    # Disagreement (or agreement on a value other than 0/1): a larger actual
    # value means the model missed a positive.
    return 'FN' if actual > predicted else 'FP'

In [None]:
# Row-wise labelling: store the TN/TP/FN/FP label of each row in 'confusion'.
prediction_df['confusion'] = prediction_df.apply(set_confusion, axis=1)
# Display the first 50 rows as the cell output.
prediction_df.head(50)

## Exporting in csv the 100 biggest errors

In [None]:
# Order the rows by confusion label, then by the probability of class 1
# (both ascending).
# NOTE(review): nothing is written to CSV in this cell.
prediction_df_classified = prediction_df.sort_values(
    by=['confusion', '1 probability'], ascending=True)
# Number of rows per confusion label.
confusion = prediction_df_classified.groupby('confusion')['confusion'].count()
confusion

## Optimisation du seuil

Création d'une colonne d'erreur en fonction d'un seuil

In [None]:
def set_prediction_with_threshold(row, threshold):
    """Turn the class-1 probability of a row into a 0/1 prediction.

    Args:
        row: mapping-like object exposing the '1 probability' key.
        threshold: cut-off; the probability must be strictly greater than it
            for the row to be predicted as 1.

    Returns:
        1 when ``row['1 probability'] > threshold``, otherwise 0.
    """
    return 1 if row['1 probability'] > threshold else 0

In [None]:
# Row-wise 0/1 prediction using a 0.5 threshold, stored in 'my_prediction'.
prediction_df['my_prediction'] = prediction_df.apply(set_prediction_with_threshold, args=(0.5,), axis=1)

In [None]:
def set_confusion(row):
    """Label one row as 'TN', 'TP', 'FN' or 'FP' from the thresholded prediction.

    Same labelling as the earlier ``set_confusion`` in this file, which this
    definition replaces, but reading the prediction from
    ``row['my_prediction']`` instead of ``row['Prediction']``.

    Args:
        row: mapping-like object exposing 'SeriousDlqin2yrs' and
            'my_prediction'.

    Returns:
        'TN' / 'TP' when both values are equal to 0 / 1 respectively, 'FN'
        when the actual value is greater than the prediction, 'FP' otherwise.
    """
    truth, guess = row['SeriousDlqin2yrs'], row['my_prediction']

    if truth == guess and truth in (0, 1):
        return 'TP' if truth == 1 else 'TN'

    return 'FN' if truth > guess else 'FP'

In [None]:
# Recompute the 'confusion' labels, this time from 'my_prediction' (uses the
# set_confusion definition given just above).
prediction_df['confusion'] = prediction_df.apply(set_confusion, axis=1)

Confusion matrix

In [None]:
def count_confusion_values(col):
    """Count the confusion labels found in a column.

    Args:
        col: pandas Series holding 'TN' / 'FN' / 'TP' / 'FP' labels.

    Returns:
        Tuple ``(TN, FN, TP, FP)`` of occurrence counts; a label that does
        not appear in ``col`` is counted as 0.
    """
    # Count once, then look each label up with a default, instead of
    # recounting per label and hiding every possible error behind a bare
    # ``except``.
    counts = col.value_counts().to_dict()
    return tuple(counts.get(label, 0) for label in ('TN', 'FN', 'TP', 'FP'))

In [None]:
# Counts in the order (TN, FN, TP, FP), as returned by count_confusion_values.
confusion_matrix = count_confusion_values(prediction_df['confusion'])
print(confusion_matrix)

Cost matrix

In [None]:
def profit_matrix(confusion_matrix):
    """Compute the total profit associated with a confusion matrix.

    Each outcome has a fixed value per row: TN = 500, FN = -2500, TP = 0,
    FP = -500.

    Args:
        confusion_matrix: indexable sequence of counts in the order
            (TN, FN, TP, FP).

    Returns:
        Sum of each count multiplied by the value of its outcome.
    """
    true_neg, false_neg, true_pos, false_pos = (
        confusion_matrix[position] for position in range(4))
    value_per_outcome = {'TN': 500, 'FN': -2500, 'TP': 0, 'FP': -500}

    total_profit = true_neg * value_per_outcome['TN']
    total_profit += false_neg * value_per_outcome['FN']
    total_profit += true_pos * value_per_outcome['TP']
    total_profit += false_pos * value_per_outcome['FP']
    return total_profit

In [None]:
# Total profit for the confusion matrix computed above.
profit = profit_matrix(confusion_matrix)
print(profit)

## Computing AUC

En fonction du seuil:

- courbe ROC => TPR / FPR
    - TPR = TP / (TP + FN)
    - FPR = FP / (FP + TN)

In [None]:
def set_tpr_fpr(confusion_matrix):
    """Compute the true and false positive rates of a confusion matrix.

    TPR = TP / (TP + FN) and FPR = FP / (FP + TN).

    Args:
        confusion_matrix: indexable sequence of counts in the order
            (TN, FN, TP, FP).

    Returns:
        List ``[tpr, fpr]``. A rate whose denominator is zero (no actual
        positives, respectively no actual negatives) is reported as 0.0
        instead of raising ZeroDivisionError.
    """
    true_neg, false_neg, true_pos, false_pos = (
        confusion_matrix[position] for position in range(4))

    actual_positives = true_pos + false_neg
    actual_negatives = false_pos + true_neg

    tpr = true_pos / actual_positives if actual_positives else 0.0
    fpr = false_pos / actual_negatives if actual_negatives else 0.0
    return [tpr, fpr]

In [None]:
def computing_with_threshold():
    """Sweep the decision threshold from 0.00 to 1.00 in steps of 0.01.

    For every threshold, the module-level ``prediction_df`` gets its
    'my_prediction' and 'confusion' columns overwritten, then the confusion
    counts, the profit and the TPR/FPR pair are computed. When the function
    returns, those two columns hold the values for the last threshold (1.0).

    Returns:
        Tuple ``(tpr_list, fpr_list, profit_list, threshold_list)`` of four
        lists with 101 entries each, aligned by position.
    """
    thresholds = [step / 100 for step in range(101)]
    true_positive_rates = []
    false_positive_rates = []
    profits = []

    for cutoff in thresholds:
        # 0/1 prediction for this cut-off, then the matching TN/TP/FN/FP label.
        prediction_df['my_prediction'] = prediction_df.apply(
            set_prediction_with_threshold, args=(cutoff,), axis=1)
        prediction_df['confusion'] = prediction_df.apply(set_confusion, axis=1)

        counts = count_confusion_values(prediction_df['confusion'])
        profits.append(profit_matrix(counts))

        rates = set_tpr_fpr(counts)
        true_positive_rates.append(rates[0])
        false_positive_rates.append(rates[1])

    return true_positive_rates, false_positive_rates, profits, thresholds

In [None]:
# Run the threshold sweep; the four lists are aligned by position.
tpr_list, fpr_list, profit_list, threshold_list = computing_with_threshold()

### Best profit

In [None]:
# Profit as a function of the decision threshold.
plt.plot(threshold_list, profit_list)

# Highest profit and the threshold where it occurs. list.index returns the
# first occurrence, so on ties the earliest entry of threshold_list is kept.
ymax = max(profit_list)
xpos = profit_list.index(ymax)
xmax = threshold_list[xpos]

# Mark the best threshold with a red vertical line and label it; the label is
# anchored at y = 0, slightly to the right of the line.
plt.axvline(x=xmax, color='r')
plt.annotate(f"{xmax}", xy=(xmax, 0), xytext=(xmax + 0.05, 0))

plt.xlabel('Threshold')
plt.ylabel('Profit')
plt.show()

print(f"Best profit: {ymax} reached at threshold: {xmax}")

#### ROC curve

In [None]:
# ROC curve: one (FPR, TPR) point per threshold of the sweep.
plt.plot(fpr_list, tpr_list, color="orange")
# set options here
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# Dashed reference diagonal (y = x), drawn through the observed FPR values.
plt.plot(fpr_list, fpr_list, color="navy", linestyle="--")
plt.show()

### AUC calculation

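- auc = P / (N + P), where P is the number of (positive row, negative row) pairs in which the positive row has the higher `1 probability`, and N is the number of remaining pairs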

In [None]:
# AUC computed over every (positive row, negative row) pair: the share of
# pairs in which the positive row has the higher '1 probability'. Pairs with
# equal probabilities count as half a win, which is the usual convention and
# matters here because predicted probabilities can repeat.
positive = prediction_df.loc[prediction_df["SeriousDlqin2yrs"] == 1]
negative = prediction_df.loc[prediction_df["SeriousDlqin2yrs"] == 0]

# Number of (positive, negative) pairs.
total_number = len(positive) * len(negative)
if total_number == 0:
    raise ValueError(
        "AUC is undefined: need at least one positive and one negative row")

# Rank all scores together (tied scores share their average rank) rather than
# comparing every pair in a Python double loop. The rank sum of the positives,
# minus the part that comes from positives being ranked against each other,
# equals the number of pairs won by the positive row (ties counted as 0.5).
# Assumes '1 probability' has no missing values -- TODO confirm.
scores = pd.concat(
    [positive["1 probability"], negative["1 probability"]], ignore_index=True)
positive_ranks = scores.rank(method="average").iloc[:len(positive)]
positive_over_negative = (
    positive_ranks.sum() - len(positive) * (len(positive) + 1) / 2)

auc = positive_over_negative / total_number

In [None]:
# Report the AUC computed in the previous cell.
print(f"auc = {auc}")

## Learning curves

We need to know the performance of the model bit by bit, i.e. when it is trained on 10%, 20%, ..., 100% of the training data.

In [None]:
# Need to split the train dataset and evaluate the model on each split.
# Creates one sample of train_dataset per step, at rates 0.1, 0.2, ..., 1.0,
# always with the same seed.
# TODO: train_split is overwritten on every iteration and no model is trained
# or evaluated on it yet.
for i in range(1, 11):
    train_split = api.create_dataset(
        train_dataset, {
            "name": f"split {i}",
            "sample_rate": i / 10,
            "seed": "my seed"})