# Model training & Data Analysis

## Imports

In [2]:
import pandas as pd
import bigml.api
import matplotlib.pyplot as plt

## Instantiate BigML (requires the BigML project ID)

In [3]:
# Create the BigML API client, bound to a single project id.
# NOTE(review): no username / API key is passed here; presumably the bindings
# pick them up from the environment (BIGML_USERNAME / BIGML_API_KEY) — confirm.
api = bigml.api.BigML(project='project/5db1644859f5c33b3c00076c')

## Creating datasets

In [None]:
# Source files: upload the two local CSV files to BigML as sources.
train_full_source = api.create_source('./full_train_edit.csv')
test_source = api.create_source('./test_edit.csv')
# Source creation is asynchronous on the BigML side; wait for BOTH sources
# (not only the test one) so a failed upload is noticed in this cell.
api.ok(train_full_source)
api.ok(test_source)

In [None]:
# Datasets: build one BigML dataset from each uploaded source.
train_full_dataset = api.create_dataset(train_full_source)
test_dataset = api.create_dataset(test_source)
# Dataset creation is asynchronous; wait for BOTH datasets (not only the
# training one) before any later cell relies on them.
api.ok(test_dataset)
api.ok(train_full_dataset)

Splitting train_full into a training dataset and a validation dataset

In [None]:
# Both splits are sampled from `train_full_dataset` with the same rate and
# seed; the second call additionally sets "out_of_bag", which presumably
# returns the rows left out of the first sample (see BigML sampling docs).
split_args = {"sample_rate": 0.8, "seed": "my seed"}

train_dataset = api.create_dataset(
    train_full_dataset,
    {"name": "Train Dataset", **split_args},
)
api.ok(train_dataset)

# NOTE: this rebinds `test_dataset`, which previously referred to the dataset
# built from `test_source`; from here on it is the "Validation Dataset".
test_dataset = api.create_dataset(
    train_full_dataset,
    {"name": "Validation Dataset", **split_args, "out_of_bag": True},
)
api.ok(test_dataset)

## Training a model

In [None]:
# Train an ensemble on the training split, predicting the
# "SeriousDlqin2yrs" field, and wait until BigML reports it finished.
ensemble = api.create_ensemble(
    train_dataset,
    dict(objective_field="SeriousDlqin2yrs"),
)
api.ok(ensemble)

## Batching a prediction

### making the prediction

In [None]:
# Score `test_dataset` with the ensemble in one batch job and wait for it.
# The later analysis cells read the 'SeriousDlqin2yrs', 'Prediction' and
# '1 probability' columns of the resulting CSV, which is why the input fields,
# a named prediction column and the probabilities are requested here.
prediction = api.create_batch_prediction(
    ensemble,
    test_dataset,
    dict(
        name="1st shot",
        all_fields=True,
        prediction_name="Prediction",
        probabilities=True,
    ),
)
api.ok(prediction)

### downloading the prediction

In [4]:
# Download the batch prediction output to ./prediction.csv.
# NOTE: the batch prediction id is hard-coded here instead of using the
# `prediction` variable, so this cell can run without re-creating the job.
api.download_batch_prediction('batchprediction/5dc036bc5299632024000e4f', filename="./prediction.csv")

'./prediction.csv'

In [5]:
# Load the downloaded predictions; the first CSV column becomes the index.
prediction_df = pd.read_csv("./prediction.csv", index_col=0)


## Analysing the prediction's results

In [None]:
# Writing the confusion (FP, FN, TP, TN)

def set_confusion(row, prediction_col='Prediction', target_col='SeriousDlqin2yrs'):
    """Return the confusion-matrix label for one row of predictions.

    Args:
        row: Mapping-like row (e.g. a pandas Series) indexable by column name.
        prediction_col: Name of the column holding the predicted class.
            Defaults to 'Prediction'.
        target_col: Name of the column holding the actual class.
            Defaults to 'SeriousDlqin2yrs'.

    Returns:
        'TN' when actual and predicted are both 0, 'TP' when both are 1,
        'FN' when the actual value is greater than the predicted one,
        and 'FP' for every remaining combination.
    """
    actual = row[target_col]
    predicted = row[prediction_col]

    if actual == predicted:
        if actual == 0:
            return 'TN'
        if actual == 1:
            return 'TP'
    # Disagreement (or an unexpected equal value): a positive that was
    # predicted lower is a miss, everything else counts as a false positive.
    if actual > predicted:
        return 'FN'
    return 'FP'
        
# Label every row with its confusion class (row-wise apply), then display
# the first 50 rows as the cell output.
prediction_df['confusion'] = prediction_df.apply(set_confusion, axis=1)
prediction_df.head(50)

## Exporting the 100 biggest errors to CSV

In [None]:
# Order the rows by confusion class, then by ascending '1 probability'
# within each class.
prediction_df_classified = prediction_df.sort_values(
    by=['confusion', '1 probability'],
    ascending=True,
)
# Number of rows in each confusion class; displayed as the cell output.
confusion = prediction_df_classified.groupby(['confusion'])['confusion'].count()
confusion

## Threshold optimisation

Creating a prediction column as a function of a threshold

In [27]:
def set_error_with_threshold(row):
    """Return 1 if the row's '1 probability' exceeds the threshold, else 0.

    The cut-off is read from the module-level name `threshold`, which must be
    bound before this function is called. The comparison is strict, so a
    probability exactly equal to the threshold yields 0.
    """
    return 1 if row['1 probability'] > threshold else 0

In [28]:
# Cut-off read as a global by set_error_with_threshold; it must be assigned
# before the apply below runs.
threshold = 0.5
# Derive a 0/1 prediction per row from '1 probability' and the threshold.
prediction_df['my_prediction'] = prediction_df.apply(set_error_with_threshold, axis=1)

In [29]:
def set_confusion(row, prediction_col='my_prediction', target_col='SeriousDlqin2yrs'):
    """Return the confusion-matrix label for one row of predictions.

    This definition replaces any earlier `set_confusion` in the notebook
    namespace; by default it compares against the thresholded
    'my_prediction' column.

    Args:
        row: Mapping-like row (e.g. a pandas Series) indexable by column name.
        prediction_col: Name of the column holding the predicted class.
            Defaults to 'my_prediction'.
        target_col: Name of the column holding the actual class.
            Defaults to 'SeriousDlqin2yrs'.

    Returns:
        'TN' when actual and predicted are both 0, 'TP' when both are 1,
        'FN' when the actual value is greater than the predicted one,
        and 'FP' for every remaining combination.
    """
    actual = row[target_col]
    predicted = row[prediction_col]

    if actual == predicted:
        if actual == 0:
            return 'TN'
        if actual == 1:
            return 'TP'
    # Disagreement (or an unexpected equal value): a positive that was
    # predicted lower is a miss, everything else counts as a false positive.
    if actual > predicted:
        return 'FN'
    return 'FP'
        
# Overwrite the 'confusion' column with labels computed row by row by the
# set_confusion defined just above.
prediction_df['confusion'] = prediction_df.apply(set_confusion, axis=1)

In [33]:
def count_confusion_values(col):
    """Count the confusion labels in a pandas Series.

    Args:
        col: pandas Series of confusion labels ('TN', 'FN', 'TP', 'FP').

    Returns:
        A 4-tuple of ints ``(TN, FN, TP, FP)``. A label that does not occur
        in `col` is counted as 0.
    """
    # Compute the frequency table once and look labels up with a default,
    # instead of recomputing it per label behind bare `except:` clauses that
    # would also hide unrelated errors.
    counts = col.value_counts()
    return tuple(int(counts.get(label, 0)) for label in ('TN', 'FN', 'TP', 'FP'))
# Tally the labels of the 'confusion' column as a (TN, FN, TP, FP) tuple
# and print it.
confusion_matrix = count_confusion_values(prediction_df['confusion'])
print(confusion_matrix)

(27736, 1668, 347, 249)


In [32]:
def profit_matrix(confusion_matrix, tn_value=500, fn_value=-2500,
                  tp_value=0, fp_value=-500):
    """Return the total payoff of a confusion matrix.

    Each count is multiplied by the payoff of its outcome and the four
    products are summed. Negative payoffs represent losses.

    Args:
        confusion_matrix: Sequence whose first four items are the counts
            ``(TN, FN, TP, FP)``, in that order.
        tn_value: Payoff per true negative. Defaults to 500.
        fn_value: Payoff per false negative. Defaults to -2500.
        tp_value: Payoff per true positive. Defaults to 0.
        fp_value: Payoff per false positive. Defaults to -500.

    Returns:
        The summed payoff.
    """
    total_cost = (confusion_matrix[0] * tn_value
                  + confusion_matrix[1] * fn_value
                  + confusion_matrix[2] * tp_value
                  + confusion_matrix[3] * fp_value)
    return total_cost

# Total payoff of the confusion matrix computed above; printed as the cell output.
profit = profit_matrix(confusion_matrix)
print(profit)

9573500
