# BigML and Kaggle API's

Import needed libraries:

In [1]:
import bigml.api
import pandas as pd
import numpy as np
import kaggle
from pprint import pprint
import matplotlib.pyplot as plt
import time

Connect to BigML Api:

In [2]:
api = bigml.api.BigML()

Create dataframes from kaggle training and test datasets and store them in a dataframe:

In [3]:
training_full_df = pd.read_csv('./datasets/cs-training.csv')
test_full_df = pd.read_csv('./datasets/cs-test.csv')
dataframes = [training_full_df, test_full_df]

Modify and add features on both df:

In [4]:
for df in dataframes:
    df.columns.values[0]="Id"
    df.fillna(0, inplace=True)
    df['NumberOfTimes30DaysOrMoreLate'] = df['NumberOfTime30-59DaysPastDueNotWorse']+df['NumberOfTime60-89DaysPastDueNotWorse']+df['NumberOfTimes90DaysLate']
    df['IncomePerPerson'] = df['MonthlyIncome']/(df['NumberOfDependents']+1)
    df['MonthlyDebt'] = (
        np.where(
            df['DebtRatio']>=5,
            df['DebtRatio'],
            df['DebtRatio']*df['MonthlyIncome'],))
    df['MonthlyBalance'] = df['MonthlyIncome']-df['MonthlyDebt']

training_full_df

Unnamed: 0,Id,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,NumberOfTimes30DaysOrMoreLate,IncomePerPerson,MonthlyDebt,MonthlyBalance
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0,2,3040.000000,7323.197016,1796.802984
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0,0,1300.000000,316.878123,2283.121877
2,3,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0,2,3042.000000,258.914887,2783.085113
3,4,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0,0,3300.000000,118.963951,3181.036049
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,1,63588.000000,1584.975094,62003.024906
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,149996,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0,0,2100.000000,472.774869,1627.225131
149996,149997,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0,0,1861.333333,4001.283436,1582.716564
149997,149998,0,0.246044,58,0,3870.000000,0.0,18,0,1,0,0.0,0,0.000000,3870.000000,-3870.000000
149998,149999,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0,0,5716.000000,0.000000,5716.000000


Store modified dataframes in csv:

In [5]:
training_full_df.to_csv('./results/full_train_modif.csv', index=False)
test_full_df.to_csv('./results/test_modif.csv', index=False)

Create sources:

In [6]:
train_full_source = api.create_source('./results/full_train_modif.csv')
test_source = api.create_source('./results/test_modif.csv')

Create datasets:

In [7]:
train_full_dataset = api.create_dataset(train_full_source)
test_dataset = api.create_dataset(test_source)

Split full_train into train(80%) and validation(20%)

In [8]:
train_dataset = api.create_dataset(train_full_dataset,{"name":"Train", "sample_rate":0.8, "seed":"my seed"})
validation_dataset = api.create_dataset(train_full_dataset, {"name":"Validation", "sample_rate":0.8, "seed":"my seed", "out_of_bag":True})

Create ensemble:

In [None]:
ensemble = api.create_ensemble(train_dataset, {"objective_field":"SeriousDlqin2yrs"})
api.ok(ensemble)

Get ensemble AUC from BigML:

In [None]:
ensemble_evaluation = api.create_evaluation(ensemble, validation_dataset)
api.ok(ensemble_evaluation)

In [None]:
ensemble_auc = ensemble_evaluation['object']['result']['model']['average_area_under_roc_curve']
print(ensemble_auc)

Create deepnet:

In [None]:
deepnet = api.create_deepnet(train_dataset, {"objective_field":"SeriousDlqin2yrs"})
api.ok(deepnet)

Get deepnet AUC from BigML

In [None]:
deep_evaluation = api.create_evaluation(deepnet, validation_dataset)
api.ok(deep_evaluation)

In [None]:
deepnet_auc = deep_evaluation['object']['result']['model']['average_area_under_roc_curve']
print(deepnet_auc)

Comparaison deepnet and ensemble AUC's:

In [None]:
is_deep_better_ensemble = (deepnet_auc > ensemble_auc)
is_deep_better_ensemble

Error column with pandas:

In [None]:
batch = api.create_batch_prediction(ensemble, validation_dataset, {'all_fields':True, 'probabilities':True})
api.ok(batch)

In [None]:
api.download_batch_prediction(batch, filename= './results/my_prediction.csv')

In [None]:
batch_prediction_df = pd.read_csv('./results/my_prediction.csv',index_col=0)
batch_prediction_df

Create 'error' col with pandas:

In [None]:
def error(row):
    if (row['SeriousDlqin2yrs'] == row['SeriousDlqin2yrs.1'] and row['SeriousDlqin2yrs'] == 0):
        row['error'] = 'TN'
    elif (row['SeriousDlqin2yrs'] == row['SeriousDlqin2yrs.1'] and row['SeriousDlqin2yrs'] == 1):
        row['error'] = 'TP'
    elif row['SeriousDlqin2yrs'] > row['SeriousDlqin2yrs.1']:
        row['error'] = 'FN'
    else:
        row['error'] = 'FP'
    return row['error']


batch_prediction_df['error'] = batch_prediction_df.apply(error, axis=1)
batch_prediction_df

In [None]:
count = batch_prediction_df.error.value_counts().to_dict()
count

In [None]:
TN = count['TN']
FN = count['FN']
TP = count['TP']
FP = count['FP']

In [None]:
d = {'0':[TN, FN], '1':[FP, TP]}
confusion_matrix = pd.DataFrame(data=d)
confusion_matrix

In [None]:
accuracy = (TP+TN)/(TP+TN+FP+FN)
accuracy

In [None]:
accuracy_bigml = ensemble_evaluation['object']['result']['model']['accuracy']
accuracy_bigml

In [None]:
batch_prediction_df = batch_prediction_df.sort_values(['error','1 probability'],ascending=[True,True])
batch_prediction_df

In [None]:
batch_prediction_df[['1 probability','error']]

In [None]:
batch_prediction_df.loc[:,'error_rate'] = abs(batch_prediction_df['1 probability'] - 0.5)
error_df = batch_prediction_df[batch_prediction_df['error'].isin(['FN', 'FP'])]
error_df[['1 probability', 'error', 'error_rate']].sort_values('error_rate', ascending=False)

In [None]:
top_100_error_df = error_df.sort_values('error_rate', ascending=False).head(100)
top_100_error_df

In [None]:
top_100_error_df.to_csv('top100-error.csv')

In [9]:
def make_prediction(df,threshold):
    df['prediction'] = df[['1 probability']].apply(lambda x: x > threshold)

def pred_error(row):
    if (row['SeriousDlqin2yrs'] == row['prediction'] and row['SeriousDlqin2yrs'] == 0):
        row['prediction_error'] = 'TN'
    elif (row['SeriousDlqin2yrs'] == row['prediction'] and row['SeriousDlqin2yrs'] == 1):
        row['prediction_error'] = 'TP'
    elif row['SeriousDlqin2yrs'] > row['prediction']:
        row['prediction_error'] = 'FN'
    else:
        row['prediction_error'] = 'FP'
    return row['prediction_error']

def count_error(col):
    try: 
        TN = col.value_counts().to_dict()['TN'] 
    except: 
        TN = 0
    try: 
        FN = col.value_counts().to_dict()['FN'] 
    except: 
        FN = 0
    try: 
        TP = col.value_counts().to_dict()['TP'] 
    except: 
        TP = 0
    try: 
        FP = col.value_counts().to_dict()['FP']
    except: 
        FP = 0
    return TN, FN, TP, FP

In [10]:
FPR=[]
TPR=[]
start = time.time()
for n in range(0,100):       
    make_prediction(n/100)
    batch_prediction_df['prediction_error'] = batch_prediction_df.apply(pred_error, axis=1)
    TN, FN, TP, FP = count_error(batch_prediction_df['prediction_error'])
    TPR.append(TP/(TP+FN))
    FPR.append(FP/(FP+TN))
    #print(f"seuil={thresholds[threshold]}\nTN={TN}, FN={FN}, TP={TP}, FP={FP}\nTPR={TP/(TP+FN)}\nFPR={FP/(FP+TN)}\n")
end = time.time()
print(f"{int((end-start)//60)} minutes {int((end-start)%60)} secondes")

TypeError: make_prediction() missing 1 required positional argument: 'threshold'

In [None]:
plt.plot(FPR,TPR,color='orange')
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.show()

Learning curves

In [16]:
from sklearn.metrics import auc

threshold = 0.5
AUC = []
sizes = []
start = time.time()
for i in range(1,11):
    train_dataset_split = api.create_dataset(train_dataset,{"name":f"Train split {i/10}", "sample_rate":i/10, "seed":"my seed"})
    model = api.create_ensemble(train_dataset_split, {"objective_field":"SeriousDlqin2yrs"})
    batch = api.create_batch_prediction(model, validation_dataset, {'probabilities':True})
    api.ok(batch)
    api.download_batch_prediction(batch, filename= f'./results/my_prediction_{i/10}.csv')
    batch_df = pd.read_csv(f'./results/my_prediction_{i/10}.csv')
    make_prediction(batch_df, threshold)
    batch_df['prediction_error'] = batch_df.apply(pred_error, axis=1)
    TN, FN, TP, FP = count_error(batch_df['prediction_error'])
    tpr = [TP/(TP+FN)]
    fpr = [FP/(FP+TN)]
    myauc = auc(fpr,tpr)
    AUC.append(myauc)
    sizes.appen(i/10)
end = time.time()
print(f"{(end-start)//60} min {(end-start)%60} sec")

TypeError: Expected sequence or array-like, got <class 'float'>

In [None]:
plt.plot(sizes,AUC,color='orange')
plt.xlabel('Training set size')
plt.ylabel('AUC')
plt.title('Learning curve')
plt.show()