# BigML : Evaluation 

Check Install and launch BigML ; 
for help : https://bigml.readthedocs.io/en/latest/index.html#local-predictions ; 
Use this link to retreive the key with your account :  https://bigml.com/account/apikey

In [73]:
import bigml.api
from bigml.api import BigML
api = BigML()

Building Source

In [76]:
source = api.create_source('cs-training.csv')
api.ok(source)

True

Building Dataset

In [77]:
dataset = api.create_dataset(source)
api.ok(dataset) 

True

Random Split Building dataset into 80% train and 20% Validation

In [78]:
train_dataset = api.create_dataset(dataset, {"name": "train", "sample_rate": 0.8, "seed": "my seed"})
api.ok(train_dataset)
validation_dataset = api.create_dataset(dataset, {"name": "validation", "sample_rate": 0.8, "seed": "my seed", "out_of_bag": True})
api.ok(validation_dataset)

True

Building model

In [79]:
ensemble = api.create_ensemble(train_dataset)
api.ok(ensemble)

True

Building evaluation and print evaluation

In [80]:
evaluation = api.create_evaluation(ensemble,validation_dataset)
api.ok (evaluation)

True

Print

In [81]:
api.pprint(evaluation['object']['result'])

{   'class_names': ['0', '1'],
    'mode': {   'accuracy': 0.93283,
                'average_area_under_pr_curve': 0,
                'average_area_under_roc_curve': 0,
                'average_balanced_accuracy': 0.5,
                'average_f_measure': 0.48262,
                'average_kendalls_tau_b': 0,
                'average_ks_statistic': 0,
                'average_max_phi': 0,
                'average_phi': 0,
                'average_precision': 0.46642,
                'average_recall': 0.5,
                'average_spearmans_rho': 0,
                'confusion_matrix': [[27985, 0], [2015, 0]],
                'per_class_statistics': [   {   'accuracy': 0.93283,
                                                'balanced_accuracy': 0.5,
                                                'class_name': '0',
                                                'f_measure': 0.96525,
                                                'phi_coefficient': 0,
                                   

                                                                       0.99828,
                                                                       0.34],
                                                                   [   0.99573,
                                                                       0.99846,
                                                                       0.33],
                                                                   [   0.99743,
                                                                       0.99911,
                                                                       0.31],
                                                                   [   0.9978,
                                                                       0.99921,
                                                                       0.3],
                                                                   [   0.99963,
                                                  

                                                                     [   0.6041,
                                                                         0.13201,
                                                                         0.96557],
                                                                     [   0.60843,
                                                                         0.133,
                                                                         0.96496],
                                                                     [   0.61283,
                                                                         0.13449,
                                                                         0.96429],
                                                                     [   0.6235,
                                                                         0.13846,
                                                                         0.96278],
                

                                                                                                 2661],
                                                                                             0.86493],
                                                                                         [   [   25191,
                                                                                                 777,
                                                                                                 1238,
                                                                                                 2794],
                                                                                             0.87],
                                                                                         [   [   25066,
                                                                                                 760,
                                                                          

                                                                 [   0.66336,
                                                                     0.98478,
                                                                     0.96194],
                                                                 [   0.6734,
                                                                     0.98469,
                                                                     0.96054],
                                                                 [   0.67733,
                                                                     0.98468,
                                                                     0.96],
                                                                 [   0.68258,
                                                                     0.98449,
                                                                     0.9592],
                                                                 

                                                                   [   0.0135,
                                                                       0.12208,
                                                                       0.541],
                                                                   [   0.0136,
                                                                       0.12308,
                                                                       0.54],
                                                                   [   0.01493,
                                                                       0.13449,
                                                                       0.53],
                                                                   [   0.01537,
                                                                       0.13697,
                                                                       0.52703],
                                              

                                                                   [   0.04907,
                                                                       6.98902,
                                                                       0.32],
                                                                   [   0.04977,
                                                                       6.99044,
                                                                       0.31795],
                                                                   [   0.05327,
                                                                       6.86652,
                                                                       0.31],
                                                                   [   0.05517,
                                                                       6.81895,
                                                                       0.3044],
                                           

                                                                         0.39532,
                                                                         0.03014],
                                                                     [   0.45307,
                                                                         0.42123,
                                                                         0.02682],
                                                                     [   0.4611,
                                                                         0.42959,
                                                                         0.02588],
                                                                     [   0.4689,
                                                                         0.43766,
                                                                         0.02496],
                                                                     [   0.49747,
              

                                                                                             0.55139],
                                                                                         [   [   222,
                                                                                                 141,
                                                                                                 27844,
                                                                                                 1793],
                                                                                             0.55884],
                                                                                         [   [   220,
                                                                                                 137,
                                                                                                 27848,
                                                                          

                                                                  [   0.05103,
                                                                      0.47792,
                                                                      0.22671],
                                                                  [   0.05271,
                                                                      0.48387,
                                                                      0.22275],
                                                                  [   0.05389,
                                                                      0.48933,
                                                                      0.21893],
                                                                  [   0.05621,
                                                                      0.49578,
                                                                      0.21352],
                                                

In [82]:
api.pprint(evaluation['object']['result']['model']['accuracy'])
api.pprint(evaluation['object']['result']['model']['average_area_under_roc_curve'])
api.pprint(evaluation['object']['result']['model']['confusion_matrix'])


0.93593
0.85452
[[27749, 236], [1686, 329]]


In [83]:
batch_prediction = api.create_batch_prediction(ensemble, validation_dataset, {"name": "my val prediction", "all_fields":True, "header":True,"probabilities":True})
api.ok(batch_prediction)
api.download_batch_prediction(batch_prediction,filename='my_predictions_val.csv')

'my_predictions_val.csv'

## Matrix confusion via Pandas

In [84]:
import pandas
from pandas import read_csv
df_val = read_csv('my_predictions_val.csv')
df_val.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 17 columns):
field1                                  30000 non-null int64
RevolvingUtilizationOfUnsecuredLines    30000 non-null float64
age                                     30000 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    30000 non-null int64
DebtRatio                               30000 non-null float64
MonthlyIncome                           23999 non-null float64
NumberOfOpenCreditLinesAndLoans         30000 non-null int64
NumberOfTimes90DaysLate                 30000 non-null int64
NumberRealEstateLoansOrLines            30000 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    30000 non-null int64
NumberOfDependents                      29192 non-null float64
NumberOfTimes90DaysLate_TOTAL           30000 non-null int64
IsOld                                   30000 non-null bool
SeriousDlqin2yrs                        30000 non-null int64
SeriousDlqin2yrs.1            

In [106]:
def CalculError(row):
    v = row['SeriousDlqin2yrs']
    w = row['SeriousDlqin2yrs.1']
    if v == 0 :
        if w == 0 :
            return 'TN'
        else :
            return 'FP'
    else :
        if w == 0 :
            return 'FN'
        else :
            return 'TP'

In [86]:
df_val['error']= df_val.apply (CalculError, axis = 1)
df_val.head (10)

Unnamed: 0,field1,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,NumberOfTimes90DaysLate_TOTAL,IsOld,SeriousDlqin2yrs,SeriousDlqin2yrs.1,0 probability,1 probability,error
0,5,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,1,False,0,0,0.82094,0.17906,TN
1,20,0.602794,25,0,0.065868,333.0,2,0,0,0,0.0,0,False,0,0,0.95714,0.04286,TN
2,27,0.052436,58,0,0.097672,8333.0,22,0,1,0,0.0,0,False,0,0,0.98582,0.01418,TN
3,28,0.034421,69,0,0.042383,2500.0,17,0,0,0,1.0,0,False,0,0,0.98958,0.01042,TN
4,31,0.704074,28,1,0.155201,4200.0,8,0,0,0,0.0,1,False,0,0,0.86787,0.13213,TN
5,50,8e-05,70,0,0.25634,6900.0,21,1,1,0,0.0,1,False,0,0,0.88169,0.11831,TN
6,51,0.818978,73,0,3095.0,0.0,9,0,1,1,0.0,1,True,0,0,0.72062,0.27938,TN
7,61,0.651603,58,0,0.241136,7783.0,11,0,1,0,0.0,0,False,0,0,0.96489,0.03511,TN
8,74,0.059669,31,0,3162.0,0.0,11,0,2,0,1.0,0,False,0,0,0.96505,0.03495,TN
9,84,0.054497,56,0,0.492022,4950.0,18,0,2,0,0.0,0,False,0,0,0.98371,0.01629,TN


In [120]:
resultat = df_val.groupby('error')['field1'].nunique()
print(resultat)
print(resultat[1])

df_matrixConfusion = pandas.DataFrame({'Prediction 0' : [resultat[2],resultat[0]], 'Prediction 1':[resultat[1],resultat[3]]},index = ['Actual 0', 'Actual 1'] )
print(df_matrixConfusion)

error
FN     1686
FP      236
TN    27749
TP      329
Name: field1, dtype: int64
236
          Prediction 0  Prediction 1
Actual 0         27749           236
Actual 1          1686           329


In [187]:
seuil = 0.5
def CalculError_seuil(row):
    v = row['SeriousDlqin2yrs']
    w = row['1 probability']
    if v == 0 :
        if w < seuil :
            return 'TN'
        else :
            return 'FP'
    else :
        if w < seuil :
            return 'FN'
        else :
            return 'TP'

In [186]:
df_val['error']= df_val.apply (CalculError_seuil, axis = 1)
df_val.head (10)

Unnamed: 0,field1,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,NumberOfTimes90DaysLate_TOTAL,IsOld,SeriousDlqin2yrs,SeriousDlqin2yrs.1,0 probability,1 probability,error
0,5,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,1,False,0,0,0.82094,0.17906,TN
1,20,0.602794,25,0,0.065868,333.0,2,0,0,0,0.0,0,False,0,0,0.95714,0.04286,TN
2,27,0.052436,58,0,0.097672,8333.0,22,0,1,0,0.0,0,False,0,0,0.98582,0.01418,TN
3,28,0.034421,69,0,0.042383,2500.0,17,0,0,0,1.0,0,False,0,0,0.98958,0.01042,TN
4,31,0.704074,28,1,0.155201,4200.0,8,0,0,0,0.0,1,False,0,0,0.86787,0.13213,TN
5,50,8e-05,70,0,0.25634,6900.0,21,1,1,0,0.0,1,False,0,0,0.88169,0.11831,TN
6,51,0.818978,73,0,3095.0,0.0,9,0,1,1,0.0,1,True,0,0,0.72062,0.27938,TN
7,61,0.651603,58,0,0.241136,7783.0,11,0,1,0,0.0,0,False,0,0,0.96489,0.03511,TN
8,74,0.059669,31,0,3162.0,0.0,11,0,2,0,1.0,0,False,0,0,0.96505,0.03495,TN
9,84,0.054497,56,0,0.492022,4950.0,18,0,2,0,0.0,0,False,0,0,0.98371,0.01629,TN


In [188]:
resultat = df_val.groupby('error')['field1'].nunique()
df_matrixConfusion = pandas.DataFrame({'Prediction 0' : [resultat[2],resultat[0]], 'Prediction 1':[resultat[1],resultat[3]]},index = ['Actual 0', 'Actual 1'] )
print(df_matrixConfusion)

          Prediction 0  Prediction 1
Actual 0         27749           236
Actual 1          1686           329


In [189]:
df_matrixCost = pandas.DataFrame({'Prediction 0' : [500,-2500], 'Prediction 1':[-500, 0]},index = ['Actual 0', 'Actual 1'] )
print(df_matrixCost)

          Prediction 0  Prediction 1
Actual 0           500          -500
Actual 1         -2500             0


In [190]:
total_gain = df_matrixCost * df_matrixConfusion
print (total_gain)
total_gain2 = total_gain.sum (axis = 1)
total_gain3 = total_gain2.sum (axis = 0)
print (total_gain3)

          Prediction 0  Prediction 1
Actual 0      13874500       -118000
Actual 1      -4215000             0
9541500


In [140]:
#Sort by bigerror
df_bigError = df_val.sort_values(by = ['SeriousDlqin2yrs','1 probability'], ascending = [True,False] )
df_bigError.head(100)

Unnamed: 0,field1,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,NumberOfTimes90DaysLate_TOTAL,IsOld,SeriousDlqin2yrs,SeriousDlqin2yrs.1,0 probability,1 probability,error
26571,133069,1.081836,35,3,0.291469,3375.0,3,12,1,0,2.0,15,False,0,1,0.26858,0.73142,FP
29184,146045,0.687749,45,2,0.787285,2500.0,24,6,1,4,2.0,12,False,0,1,0.27221,0.72779,FP
17056,84956,1.165834,47,2,440.000000,,3,3,1,2,0.0,7,False,0,1,0.27826,0.72174,FP
25392,127279,1.445042,38,0,0.136132,4715.0,5,3,0,7,2.0,10,False,0,1,0.28135,0.71865,FP
23370,116889,0.999939,60,1,0.533593,3333.0,8,6,1,1,1.0,8,False,0,1,0.28635,0.71365,FP
12110,60228,1.100895,39,0,0.394822,2780.0,8,2,0,4,0.0,6,False,0,1,0.28958,0.71042,FP
17845,89072,1.000000,68,6,1.308753,1450.0,8,3,2,3,0.0,12,False,0,1,0.28969,0.71031,FP
29586,148013,0.845101,36,3,0.944528,2000.0,7,4,1,1,0.0,8,False,0,1,0.28969,0.71031,FP
13915,69191,0.898204,59,2,0.198976,6834.0,6,3,1,2,0.0,7,False,0,1,0.29227,0.70773,FP
8566,42688,0.645352,46,1,0.096162,4845.0,10,4,0,1,2.0,6,False,0,1,0.29414,0.70586,FP


# Recap

In [160]:
import pandas
from pandas import read_csv
df_val = read_csv('my_predictions_val.csv')
df_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 17 columns):
field1                                  30000 non-null int64
RevolvingUtilizationOfUnsecuredLines    30000 non-null float64
age                                     30000 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    30000 non-null int64
DebtRatio                               30000 non-null float64
MonthlyIncome                           23999 non-null float64
NumberOfOpenCreditLinesAndLoans         30000 non-null int64
NumberOfTimes90DaysLate                 30000 non-null int64
NumberRealEstateLoansOrLines            30000 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    30000 non-null int64
NumberOfDependents                      29192 non-null float64
NumberOfTimes90DaysLate_TOTAL           30000 non-null int64
IsOld                                   30000 non-null bool
SeriousDlqin2yrs                        30000 non-null int64
SeriousDlqin2yrs.1            

In [191]:
def CalculError_seuil2 (seuil):
    v = row['SeriousDlqin2yrs']
    w = row['1 probability']
    if v == 0 :
        if w < seuil :
            return 'TN'
        else :
            return 'FP'
    else :
        if w < seuil :
            return 'FN'
        else :
            return 'TP'

In [192]:
def calcul_gain (s): 
    seuil = s
    df_val['error']= df_val.apply (CalculError_seuil2, axis = 1)
    df_val.head (10)
    resultat = df_val.groupby('error')['field1'].nunique()
    df_matrixConfusion = pandas.DataFrame({'Prediction 0' : [resultat[2],resultat[0]], 'Prediction 1':[resultat[1],resultat[3]]},index = ['Actual 0', 'Actual 1'] )
    df_matrixCost = pandas.DataFrame({'Prediction 0' : [500,-2500], 'Prediction 1':[-500, 0]},index = ['Actual 0', 'Actual 1'] )
    total_gain = df_matrixCost * df_matrixConfusion
    total_gain2 = total_gain.sum (axis = 1)
    total_gain3 = total_gain2.sum (axis = 0)
    return total_gain3



In [195]:
gain = []

for i in range(0,10):
    i=i/10
    gain.append(calcul_gain(i))

df_seuil = gain
print(gain)                


[9541500, 9541500, 9541500, 9541500, 9541500, 9541500, 9541500, 9541500, 9541500, 9541500]


# PREDICTION

Building Source for TEST

In [None]:
source_Test = api.create_source('Test_New1.csv')
api.ok(source_Test)

True

Building Dataset for TEST

In [None]:
dataset_Test = api.create_dataset(source_Test)
api.ok(dataset_Test) 

True

Building model

In [None]:
ensemble = api.create_ensemble(dataset)
api.ok(ensemble)

True

Batch Prediction on Test


In [None]:
batch_prediction = api.create_batch_prediction(ensemble, dataset_Test, {"name": "my batch predction", "all_fields":True, "header":True,"probabilities":True})
api.ok(batch_prediction)

True

downloading the results to your computer

In [None]:
api.download_batch_prediction(batch_prediction,filename='my_predictions.csv')

'my_predictions.csv'