In [None]:
# Import des libraries

from bigml.api import BigML
import pandas
from pandas import read_csv
import matplotlib

In [None]:
# Fonctions

def threshold(row, cutoff=None):
    """Turn a row's class-1 probability into a hard 0/1 prediction.

    Args:
        row: Mapping-like row (for instance the row handed over by
            ``df.apply(threshold, axis=1)``) exposing a ``'1 probability'``
            entry.
        cutoff: Decision threshold. When omitted, the notebook-level
            ``threshold_value`` is read at call time, so existing
            ``df.apply(threshold, axis=1)`` calls keep working unchanged.

    Returns:
        0 when the probability is strictly below the cutoff, 1 otherwise
        (a probability exactly equal to the cutoff yields 1).
    """
    if cutoff is None:
        # Fall back to the notebook-wide setting only when no explicit
        # cutoff was supplied.
        cutoff = threshold_value
    return 0 if row['1 probability'] < cutoff else 1

def error_column(row):
    """Label a row as 'TN', 'FN', 'FP' or 'TP'.

    Args:
        row: Mapping-like row with a ``'SeriousDlqin2yrs'`` entry (actual
            label) and a ``'prediction'`` entry (predicted label), both
            expected to be 0 or 1.

    Returns:
        'TN' for (actual 0, predicted 0), 'FN' for (1, 0),
        'FP' for (0, 1) and 'TP' for (1, 1).

    Raises:
        ValueError: if either label is neither 0 nor 1 (for example a
            missing value), instead of failing on an unbound local.
    """
    actual = row['SeriousDlqin2yrs']
    predicted = row['prediction']

    if actual == 0 and predicted == 0:
        return 'TN'
    if actual == 1 and predicted == 0:
        return 'FN'
    if actual == 0 and predicted == 1:
        return 'FP'
    if actual == 1 and predicted == 1:
        return 'TP'

    raise ValueError(
        "cannot classify row: expected 0/1 labels, got "
        f"SeriousDlqin2yrs={actual!r}, prediction={predicted!r}"
    )

def confusion_matrix(data=None):
    """Count how many rows carry each label of the ``'error'`` column.

    Args:
        data: DataFrame with an ``'error'`` column. When omitted, the
            notebook-level ``df`` is used, as before.

    Returns:
        A pandas Series indexed by the distinct ``'error'`` values, holding
        the number of rows for each of them.
    """
    frame = df if data is None else data
    return pandas.Index(frame['error']).value_counts()

def get_accuracy(data=None):
    """Compute the accuracy, as a percentage, from the ``'error'`` column.

    Args:
        data: DataFrame with an ``'error'`` column holding 'TN', 'FN', 'TP'
            or 'FP'. When omitted, the notebook-level ``df`` is used.

    Returns:
        ``(TP + TN) / (TN + FN + TP + FP) * 100`` as a float. Rows carrying
        any other label are ignored. Returns NaN when no row carries one of
        the four labels, instead of dividing by zero.
    """
    frame = df if data is None else data
    labels = frame['error']

    tn = int((labels == 'TN').sum())
    fn = int((labels == 'FN').sum())
    tp = int((labels == 'TP').sum())
    fp = int((labels == 'FP').sum())

    total = tn + fn + tp + fp
    if total == 0:
        # Accuracy is undefined without any labelled row.
        return float('nan')
    return ((tp + tn) / total) * 100

def get_profits(data=None):
    """Sum the payoff of every labelled row of the ``'error'`` column.

    Each row contributes +500 for 'TN', -2500 for 'FN', 0 for 'TP' and
    -500 for 'FP'; rows with any other label contribute nothing.

    Args:
        data: DataFrame with an ``'error'`` column. When omitted, the
            notebook-level ``df`` is used.

    Returns:
        The total payoff as an int.
    """
    frame = df if data is None else data
    labels = frame['error']

    # Payoff per outcome, kept in one place so the values are easy to audit.
    payoff = {'TN': 500, 'FN': -2500, 'TP': 0, 'FP': -500}
    return sum(
        int((labels == outcome).sum()) * value
        for outcome, value in payoff.items()
    )

# Among the false negatives (prediction = 0, reality = 1), look for the
# smallest '1 probability' values, i.e. the rows where the model was most
# confident in predicting 0.
def biggest_mistakes(data=None, n=100, path="100_biggest_mistakes.csv"):
    """Extract the false negatives with the lowest class-1 probability.

    Args:
        data: DataFrame with ``'error'`` and ``'1 probability'`` columns.
            When omitted, the notebook-level ``df`` is used.
        n: Maximum number of rows to keep (default 100).
        path: Destination of the CSV export
            (default ``"100_biggest_mistakes.csv"``).

    Returns:
        The selected rows, ordered by increasing ``'1 probability'``. The
        same rows are written to ``path`` as a side effect.
    """
    frame = df if data is None else data
    false_negatives = frame.loc[frame['error'].isin(["FN"])]
    worst = false_negatives.nsmallest(n, '1 probability')
    worst.to_csv(path)
    return worst

In [None]:
# Decision threshold: threshold() labels a row 0 when its '1 probability'
# is below this value and 1 otherwise.

threshold_value = 0.5

In [None]:
# Load the CSV sheet, then derive the 'prediction' and 'error' columns
import pandas

# Read the predictions and rename the model's own prediction column in one
# chained expression.
df = pandas.read_csv('Prediction2.csv', index_col=0).rename(
    columns={"SeriousDlqin2yrs.1": "prediction"}
)

# Recompute the hard prediction from '1 probability' with the notebook's
# threshold, then tag every row as TN / FN / FP / TP.
df['prediction'] = df.apply(threshold, axis=1)
df['error'] = df.apply(error_column, axis=1)

print("Load & Modifications : OK")

In [None]:
# Confusion matrix: number of rows per label found in df['error']

confusion_matrix()

In [None]:
# Accuracy of the thresholded predictions, as a percentage

get_accuracy()

In [None]:
# Total profit over the labelled rows (TN +500, FN -2500, FP -500, TP 0)

get_profits()

In [None]:
# The model's 100 biggest mistakes (among the FN rows, the smallest '1 probability' values);
# biggest_mistakes() also writes them to a local CSV file.

biggest_mistakes()
print("Enregistrement du fichier .csv en local :  OK")

In [None]:
# Profit as a function of the decision threshold

def cost_threshold(threshold, data=None):
    """Compute the total profit obtained with a given decision threshold.

    A row is predicted positive when its ``'1 probability'`` is not below
    ``threshold``. This is the same rule as the notebook's ``threshold()``
    function, so a probability exactly equal to the threshold (or a missing
    probability) counts as a positive prediction in both places.

    Args:
        threshold: Decision threshold applied to ``'1 probability'``.
        data: DataFrame with ``'1 probability'`` and ``'SeriousDlqin2yrs'``
            columns. When omitted, the notebook-level ``df`` is used.

    Returns:
        The total profit as an int: +500 per true negative, -500 per false
        positive, -2500 per false negative and 0 per true positive. Rows
        whose actual label is neither 0 nor 1 contribute nothing.
    """
    frame = df if data is None else data

    # Predicted classes: "not below the threshold" means positive.
    predicted_positive = ~(frame['1 probability'] < threshold)
    predicted_negative = ~predicted_positive

    # Actual classes.
    actual_positive = frame['SeriousDlqin2yrs'] == 1
    actual_negative = frame['SeriousDlqin2yrs'] == 0

    # Summing the boolean masks counts the rows without building a filtered
    # copy of the frame for every outcome.
    tp = int((predicted_positive & actual_positive).sum())
    tn = int((predicted_negative & actual_negative).sum())
    fp = int((predicted_positive & actual_negative).sum())
    fn = int((predicted_negative & actual_positive).sum())

    tp_cost = 0
    fn_cost = -2500
    fp_cost = -500
    tn_cost = 500

    return tp * tp_cost + tn * tn_cost + fp * fp_cost + fn * fn_cost

In [None]:
# Plot the profit curve and print the best threshold

import matplotlib.pyplot as plt
# Sweep the threshold over 0/1000, 1/1000, ..., 1000/1000 (0.000 to 1.000 in
# steps of 0.001, both ends included) and record the profit returned by
# cost_threshold for each value. table[i] is the profit at threshold i/1000.
table = [cost_threshold(n/1000) for n in range(0, 1001)]
plt.xlabel("seuil/1000")
plt.ylabel("Gains")
plt.title("GiveMeCredit - Kaggle")

plt.plot(table)

# The list index is the threshold expressed in thousandths; on ties the
# lowest threshold reaching the maximum is reported.
max_val = max(table)
max_threshold = table.index(max_val)/1000
print(f'gain max {max_val} $ | Seuil {max_threshold}')

In [None]:
# calcul de l'AUC

In [None]:
# Laurent's method

# Sort the rows by decreasing class-1 probability to prepare the AUC computation

df = read_csv('Prediction2.csv', index_col=0)
df_sorted = df.sort_values('1 probability', ascending=False)

# For every positive row, count the negatives ranked below it (lower
# '1 probability') and add those counts up.
# A cumulative sum gives, for each position, the number of negatives at or
# above it; subtracting it from the total leaves the negatives strictly
# below. This is a single pass and does not fail when no negative remains
# below a positive.
labels_sorted = df_sorted['SeriousDlqin2yrs']
is_negative = labels_sorted == 0
is_positive = labels_sorted == 1
negatives_below = is_negative.sum() - is_negative.cumsum()
# Positional boolean mask, so the selection does not depend on index labels.
N_sum_below = int(negatives_below[is_positive.to_numpy()].sum())

print(N_sum_below)

# Product of the number of negatives by the number of positives
N_number = int(is_negative.sum())
P_number = int(is_positive.sum())
NP_product = N_number * P_number

# AUC; undefined (NaN) when one of the two classes is absent
AUC = N_sum_below / NP_product if NP_product else float('nan')
AUC

In [None]:
# Maud's method

df = read_csv('Prediction2.csv', index_col=0)

# Boolean mask of the rows whose actual label is 1; everything else is
# counted as negative.
positive = (df['SeriousDlqin2yrs'] == 1)
nb_positive = len(df[positive])
nb_negative = len(df[~positive])

# Actual labels ordered by decreasing class-1 probability.
result = df[['SeriousDlqin2yrs', '1 probability']]
ranked = result.sort_values(by='1 probability', ascending=False)
threshold_list = ranked['SeriousDlqin2yrs'].values

# Walk down the ranking: every non-positive row adds the number of positives
# already seen above it.
auc = 0
P_cumul = 0
for label in threshold_list:
    if label == 1:
        P_cumul += 1
        continue
    auc += P_cumul

# Normalise by the number of (positive, negative) pairs.
auc = auc / (nb_positive * nb_negative)

auc

In [None]:
# # # affichage des 100 plus grandes erreurs du modele

# filtered = df.loc[df['Classification'].isin(["FN"])]
# filtered = (filtered.nsmallest(100, 'My Prediction'))
# # filtered

In [None]:
# # # Calcul du nombre de FP/FN/VP/VN

# df = read_csv('Prediction_with_error.csv', index_col=0)
# df.loc[(df["SeriousDlqin2yrs"] == 0) & (df["My Prediction"] == 0), 'Classification'] = 'TN'
# df.loc[(df["SeriousDlqin2yrs"] == 0) & (df["My Prediction"] == 1), 'Classification'] = 'FP'
# df.loc[(df["SeriousDlqin2yrs"] == 1) & (df["My Prediction"] == 0), 'Classification'] = 'FN'
# df.loc[(df["SeriousDlqin2yrs"] == 1) & (df["My Prediction"] == 1), 'Classification'] = 'TP'

# idx = pandas.Index(df['Classification'])
# idx.value_counts()
# count_matrix = idx.value_counts()

# accuracy = (count_matrix[0] + count_matrix[2]) / ( count_matrix[0] + count_matrix[1] + count_matrix[2] + count_matrix[3]) * 100
# print(accuracy)
# df

# gains = count_matrix[0] * 500 + count_matrix[1] * -2500 + count_matrix[2] * 0 + count_matrix[3] * -500
# print(gains)

In [None]:
# # # Insertion colonne d'erreur avec seuil defini

# df = read_csv('Prediction.csv', index_col=0)
# df.rename(columns={"1 probability": "Probability1"}, inplace= True)
# df.rename(columns={"0 probability": "Probability0"}, inplace= True)
# df.loc[df.Probability1 > 0.25 , 'My Prediction'] = '1'
# df.loc[df.Probability1 < 0.25 , 'My Prediction'] = '0'
# df.to_csv('Prediction_with_error.csv')
