# Give Me Some Credit

### Traitement du fichier Train

* Import et modification du Dataset d'entrainement

In [1]:
# Load the raw training file

from bigml.api import BigML
from pandas import read_csv

# Missing values are replaced by 0 right after loading, before any feature is derived.
df = read_csv('https://oml-data.s3.amazonaws.com/kaggle-give-me-credit-train.csv').fillna(0)

# Derived features (appended in this order so the CSV layout stays the same):
#   MonthlyPerPerson - income divided by household size (dependents + 1)
#   isOld            - '1' when age > 80, '0' otherwise (kept as strings)
#   Monthlydebt      - debt ratio times monthly income
#   LatePayment      - sum of the three past-due counters
#   NumberOfCredits  - real-estate loans plus open credit lines
df = df.assign(
    MonthlyPerPerson=df['MonthlyIncome'] / (df['NumberOfDependents'] + 1),
    isOld=(df['age'] > 80).map({True: '1', False: '0'}),
    Monthlydebt=df['DebtRatio'] * df['MonthlyIncome'],
    LatePayment=(
        df['NumberOfTime30-59DaysPastDueNotWorse']
        + df['NumberOfTimes90DaysLate']
        + df['NumberOfTime60-89DaysPastDueNotWorse']
    ),
    NumberOfCredits=(
        df['NumberRealEstateLoansOrLines']
        + df['NumberOfOpenCreditLinesAndLoans']
    ),
)

# Persist the engineered frame; the next cell uploads this file to BigML.
df.to_csv('GMSC.csv')


* Création du fichier source et du dataset modifié

In [5]:
# Create the BigML source
# Opens a BigML connection scoped to the given project and uploads the
# engineered CSV written by the previous cell.
# NOTE(review): no credentials are passed here, so they presumably come from
# the environment - confirm.

api = BigML(project='project/5d94a3e85a213962e2000304')
source = api.create_source('GMSC.csv')
# api.ok(...) is presumably a readiness check on the remote resource - confirm.
api.ok(source)

# Create the training dataset from the uploaded source

origin_dataset = api.create_dataset(source)
api.ok(origin_dataset)

True

* Split 80/20% du Dataset (Train & Validation)

In [None]:
# Split the full dataset into training and hold-out parts.
# Both calls share the same sampling rate and seed; the hold-out call adds
# "out_of_bag", which presumably selects the rows left out of the 80% sample
# (TODO confirm against the BigML documentation).

split_args = {"sample_rate": 0.8, "seed": "my seed"}

train_dataset = api.create_dataset(
    origin_dataset,
    {"name": "GMSC-Training", **split_args},
)
test_dataset = api.create_dataset(
    origin_dataset,
    {"name": "GMSC-Test", **split_args, "out_of_bag": True},
)

* Création du Model (Ensemble)

In [None]:
# Train an ensemble on the training dataset, with SeriousDlqin2yrs as the
# objective field.

ensemble_args = {"objective_field": "SeriousDlqin2yrs"}
ensemble = api.create_ensemble(train_dataset, ensemble_args)

# Evaluate the ensemble against the hold-out dataset.
evaluation = api.create_evaluation(ensemble, test_dataset)

* Prédiction par lot (« batch prediction ») et enregistrement du CSV

In [None]:
# Batch prediction on the hold-out dataset.
# The options request every input field, a header row, the confidence and the
# per-class probabilities in the output.
holdout_prediction_args = {
    "all_fields": True,
    "header": True,
    "confidence": True,
    "probabilities": True,
}
batch_prediction = api.create_batch_prediction(
    ensemble, test_dataset, holdout_prediction_args
)
api.ok(batch_prediction)

# Save the batch prediction as a local CSV file.

api.download_batch_prediction(
    batch_prediction,
    filename='BatchPrediction/GMSC_Prediction_Ensemble.csv',
)

### Traitement du fichier Test

* Import et modification du Dataset Test

In [None]:
# Load the raw Kaggle test file

# Missing values are replaced by 0 right after loading, before any feature is derived.
df2 = read_csv('https://oml-data.s3.amazonaws.com/kaggle-give-me-credit-test.csv').fillna(0)

# Derived features - the same five columns, in the same order, as the ones
# built for the training file:
#   MonthlyPerPerson - income divided by household size (dependents + 1)
#   isOld            - '1' when age > 80, '0' otherwise (kept as strings)
#   Monthlydebt      - debt ratio times monthly income
#   LatePayment      - sum of the three past-due counters
#   NumberOfCredits  - real-estate loans plus open credit lines
df2 = df2.assign(
    MonthlyPerPerson=df2['MonthlyIncome'] / (df2['NumberOfDependents'] + 1),
    isOld=(df2['age'] > 80).map({True: '1', False: '0'}),
    Monthlydebt=df2['DebtRatio'] * df2['MonthlyIncome'],
    LatePayment=(
        df2['NumberOfTime30-59DaysPastDueNotWorse']
        + df2['NumberOfTimes90DaysLate']
        + df2['NumberOfTime60-89DaysPastDueNotWorse']
    ),
    NumberOfCredits=(
        df2['NumberRealEstateLoansOrLines']
        + df2['NumberOfOpenCreditLinesAndLoans']
    ),
)

# Persist the engineered frame; the next cell uploads this file to BigML.
df2.to_csv('KaggleGMSC.csv')

* Création de la source et du Dataset modifié

In [None]:
# Source
# Re-opens the BigML connection on the same project and uploads the engineered
# Kaggle test CSV. Note that this rebinds `api` and `source`, replacing the
# objects created for the training file.

api = BigML(project='project/5d94a3e85a213962e2000304')
source = api.create_source('KaggleGMSC.csv')
# api.ok(...) is presumably a readiness check on the remote resource - confirm.
api.ok(source)

# Dataset built from the uploaded Kaggle test source

kaggle_dataset = api.create_dataset(source)
api.ok(kaggle_dataset)

* Confrontation Ensemble et Dataset Test

In [None]:
# Batch prediction of the trained ensemble on the Kaggle test dataset.
# The options request every input field, a header row, the confidence and the
# per-class probabilities in the output.

kaggle_prediction_args = {
    "all_fields": True,
    "header": True,
    "confidence": True,
    "probabilities": True,
}
kaggle_test = api.create_batch_prediction(
    ensemble, kaggle_dataset, kaggle_prediction_args
)
api.ok(kaggle_test)

In [None]:
# CSV file
# Download the Kaggle batch prediction to a local CSV; the submission cells
# below read this path back.

api.download_batch_prediction(kaggle_test, filename='BatchPrediction/EnvoiKaggle.csv')

### Préparation à l'envoi sur Kaggle

* Import du fichier

In [None]:
# Load the downloaded batch prediction file.
# index_col=False keeps pandas from using the first column as the row index,
# so it stays available as a regular column.
prediction = read_csv('BatchPrediction/EnvoiKaggle.csv', index_col=False)

* Préparation d'un dataframe pour l'export

In [None]:
# Build the submission frame with the two columns Id and Probability.
# Id comes from the unnamed first column of the prediction file and
# Probability from the "1 probability" column.

from pandas import DataFrame
kaggle_prediction = DataFrame({
    'Id': prediction['Unnamed: 0'],
    'Probability': prediction['1 probability'],
})

### Envoi à Kaggle

In [None]:
# Export
# Write the submission frame to CSV without the row index, then submit that
# file to the "GiveMeSomeCredit" competition with the message "BigML ensemble".
# NOTE(review): the kaggle client presumably needs credentials configured
# outside this notebook - confirm.

import kaggle
kaggle_prediction_file="kaggle_prediction.csv"
kaggle_prediction.to_csv(kaggle_prediction_file, index=False)
kaggle.api.competition_submit(kaggle_prediction_file, "BigML ensemble", "GiveMeSomeCredit")

## Quelques interprétations des résultats

* Définition de la colonne de prédiction à partir de la probabilité et du seuil

In [100]:
# Reload the hold-out batch prediction saved earlier in the notebook.
df = read_csv('BatchPrediction/GMSC_Prediction_Ensemble.csv')

# Rename the second-to-last column to "prob".
# A fresh list of labels is assigned to df.columns instead of writing into
# df.columns.values: a pandas Index is immutable, and mutating its backing
# array in place is unsupported (label lookups such as df.prob may not see the
# change, or the write may be rejected). The rename stays positional.
column_names = list(df.columns)
column_names[-2] = "prob"
df.columns = column_names

# Decision threshold: "prob" is treated as the probability of class 0, so a
# row is predicted 0 when prob >= seuil and 1 otherwise.
seuil = 0.5
df.loc[df.prob >= seuil, 'predict'] = 0
df.loc[df.prob < seuil, 'predict'] = 1

* Création de la Matrice de confusion 

In [101]:
# Tag every row with its confusion-matrix cell.
# Keys are (predicted class, actual class); class 1 is the positive class.
confusion_labels = {
    (0, 0): "TN",
    (0, 1): "FN",
    (1, 0): "FP",
    (1, 1): "TP",
}
for (predicted, actual), label in confusion_labels.items():
    matches = (df.predict == predicted) & (df.SeriousDlqin2yrs == actual)
    df.loc[matches, 'confusion'] = label

* Décompte des valeurs de confusion

In [102]:
# Count how many rows fall into each confusion-matrix cell.
err = df.confusion.tolist()
TP, TN, FP, FN = (err.count(label) for label in ('TP', 'TN', 'FP', 'FN'))

print("TP:",TP, "TN:",TN, "FP:", FP, "FN:",FN)

TP: 334 TN: 27746 FP: 239 FN: 1681


In [92]:
# Disabled cross-check: the two lines below would fetch the evaluation from
# BigML and print its confusion matrix, for comparison with the counts
# computed by hand in this notebook.
# NOTE(review): if re-enabled, rename `eval`, which shadows the Python builtin.
#eval = api.get_evaluation(evaluation)
#print(eval["object"]["result"]["model"]["confusion_matrix"])

* Création de la colonne d'erreur et affichage des 100 erreurs les plus importantes

In [99]:
# Error magnitude per row: 1 - |actual - prob|.
# "prob" is thresholded as the class-0 probability in the cell that defines
# `predict`, so this value is large when the model gave little probability to
# the class that was actually observed.
gap_to_label = (df['SeriousDlqin2yrs'] - df['prob']).abs()
df['errMAJ'] = 1 - gap_to_label

# Show the 100 rows with the largest error.
df.sort_values('errMAJ', ascending=False).iloc[:100]

* Calculs sur la Matrice de gain

In [74]:
# With threshold = 0.5
# Gain matrix, expressed as (count, gain per case):
# TP -> 0, TN -> +500, FP -> -500, FN -> -2500.
payoffs = ((TP, 0), (TN, 500), (FP, -500), (FN, -2500))
gain = sum(count * unit_gain for count, unit_gain in payoffs)
print("Vous avez empoché:", gain,"$")

Vous avez empoché: 9551000 $
