In [1]:
import pandas

In [2]:
prediction = pandas.read_csv("./ensemble_train_prediction.csv")

In [15]:
prediction.describe()

Unnamed: 0,Id,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,SeriousDlqin2yrs,DebtAbsolute,MonthlyBalance,BalancePerPerson,Prediction,probability,0 probability,1 probability
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,29237.0,30000.0,30000.0,30000.0,29237.0,30000.0,30000.0,30000.0,30000.0
mean,75184.962033,5.221595,52.1713,0.409767,349.15855,5362.946,8.451967,0.264767,1.015267,0.236267,0.757157,0.068567,1978.177339,3384.769,2382.365,0.019767,0.936945,0.933041,0.066959
std,43286.522991,219.42409,14.795051,4.122478,2526.704877,13473.5,5.157297,4.105008,1.129399,4.085576,1.111296,0.25272,3050.460551,13496.94,12608.85,0.1392,0.097992,0.114013,0.114013
min,1.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-326442.0,-326442.0,0.0,0.5001,0.2614,0.00192
25%,37689.0,0.029766,41.0,0.0,0.170918,1600.0,5.0,0.0,0.0,0.0,0.0,0.0,521.0,201.481,329.3302,0.0,0.93233,0.93233,0.0082
50%,74657.5,0.155847,52.0,0.0,0.362473,4381.5,8.0,0.0,1.0,0.0,0.0,0.0,1524.813544,2760.143,1699.424,0.0,0.97871,0.97871,0.02129
75%,112785.25,0.565905,63.0,0.0,0.83832,7416.0,11.0,0.0,2.0,0.0,1.0,0.0,2808.093979,5181.26,3532.003,0.0,0.9918,0.9918,0.06767
max,150000.0,29110.0,103.0,98.0,326442.0,1794060.0,57.0,98.0,26.0,98.0,10.0,1.0,326442.0,1789080.0,1789080.0,1.0,0.99808,0.99808,0.7386


We create the confusion matrix logic and the method to get the result.

In [3]:
def confusion_matrix(row):
    x = "TN"
    if row['SeriousDlqin2yrs'] == 1:
        if row['Prediction'] == 1:
            x = "TP"
        else:
            x = "FN"
    else:
        if row['Prediction'] == 1:
            x = "FP"
    return x

In [4]:
def get_confusion_matrix(): 
    prediction['Error'] = prediction.apply(confusion_matrix, axis=1)
    return prediction['Error'].value_counts()

In [5]:
print(get_confusion_matrix())

TN    27708
FN     1699
TP      358
FP      235
Name: Error, dtype: int64


To get the **Accuracy** we add the **True Positives** and **True Negatives** and we divide them by **sample size/100**.

In [6]:
final_confusion_matrix = get_confusion_matrix()
accuracy = (final_confusion_matrix['TP'] + final_confusion_matrix['TN'])/300
print(accuracy)

93.55333333333333


We can have a look at the **100 largest errors**, with the **highest probability**.

In [7]:
filtered = prediction.loc[prediction['Error'].isin(["FN", "FP"])]

filtered = (filtered.nlargest(100, 'probability'))

From the given cost matrix we evaluate the best threshold to maximize our cash gain.

In [8]:
def cost_threshold(threshold):
    prediction_threshold = prediction['1 probability'] > threshold

    P_pred = prediction_threshold
    N_pred = ~prediction_threshold

    P_real = (prediction['SeriousDlqin2yrs'] == 1)
    N_real = (prediction['SeriousDlqin2yrs'] == 0)

    TP = len(prediction.loc[P_pred & P_real])
    TN = len(prediction.loc[N_pred & N_real])
    FP = len(prediction.loc[P_pred & N_real])
    FN = len(prediction.loc[N_pred & P_real])

    TP_cost = 0
    FN_cost = -2500
    FP_cost = -500
    TN_cost = 500

    cost = TP * TP_cost
    cost += TN * TN_cost
    cost += FP * FP_cost
    cost += FN * FN_cost

    return cost

With the **matplotlib** library we draw a plot to get the maximum cash gain value with it's matching threshold.

In [9]:
import matplotlib.pyplot as plt
table = [cost_threshold(n/1000) for n in range(0,1000)]
plt.plot(table)

max_val = max(table)
max_threshold = table.index(max_val)/3000
print(f'Le gain maximum est {max_val} et il est obtenu avec un seuil de {max_threshold}')

Le gain maximum est 9970500 et il est obtenu avec un seuil de 0.09433333333333334


To get the **AUC** we compare the **TP** probability score to the **TN** probability score.

In [17]:
def auc_method():
    auc_ratio = 0
    auc_count = 0
    auc_p_filter = prediction.loc[prediction['Error'].isin(["TP", "FN"])]
    auc_n_filter = prediction.loc[prediction['Error'].isin(["TN", "FP"])]
    
    for p_value in auc_p_filter['1 probability']:
        for n_value in auc_n_filter['1 probability']:
            if p_value > n_value:
                auc_ratio += 1
            auc_count += 1

    print(f"Nombre de comparaisons : {auc_ratio}")
    return auc_ratio/auc_count

In [18]:
print(auc_method())

Nombre de comparaisons : 49267257
0.8571386145812389
