# BigML : Evaluation 

Check Install and launch BigML ; 
for help : https://bigml.readthedocs.io/en/latest/index.html#local-predictions ; 
Use this link to retreive the key with your account :  https://bigml.com/account/apikey

In [1]:
import bigml.api
from bigml.api import BigML
api = BigML('ALEXISD', '8fe5e17917b5a9ef7a0ec327a53c3f1d507f0911', project='project/5d94a45aeba31d46690001d3')

Building Source

In [5]:
source = api.create_source('cs-training.csv')
api.ok(source)

True

Building Dataset

In [6]:
dataset = api.create_dataset(source)
api.ok(dataset) 

True

Random Split Building dataset into 80% train and 20% Validation

In [7]:
train_dataset = api.create_dataset(dataset, {"name": "train", "sample_rate": 0.8, "seed": "my seed"})
api.ok(train_dataset)
validation_dataset = api.create_dataset(dataset, {"name": "validation", "sample_rate": 0.8, "seed": "my seed", "out_of_bag": True})
api.ok(validation_dataset)

True

Building model

In [8]:
ensemble = api.create_ensemble(train_dataset)
api.ok(ensemble)

True

Building evaluation and print evaluation

In [9]:
evaluation = api.create_evaluation(ensemble,validation_dataset)
api.ok (evaluation)

True

Print

In [10]:
api.pprint(evaluation['object']['result'])

{   'mean': {   'mean_absolute_error': 0.89952,
                'mean_squared_error': 1.2345,
                'r_squared': 0},
    'model': {   'mean_absolute_error': 0.75826,
                 'mean_squared_error': 1.00026,
                 'per_class_statistics': [],
                 'r_squared': 0.18974},
    'random': {   'mean_absolute_error': 9.34728,
                  'mean_squared_error': 120.41366,
                  'r_squared': -96.54074}}


In [11]:
api.pprint(evaluation['object']['result']['model']['accuracy'])
api.pprint(evaluation['object']['result']['model']['average_area_under_roc_curve'])
api.pprint(evaluation['object']['result']['model']['confusion_matrix'])


KeyError: 'accuracy'

In [12]:
batch_prediction = api.create_batch_prediction(ensemble, validation_dataset, {"name": "my val prediction", "all_fields":True, "header":True,"probabilities":True})
api.ok(batch_prediction)
api.download_batch_prediction(batch_prediction,filename='my_predictions_val.csv')

'my_predictions_val.csv'

## Matrix confusion via Pandas

In [13]:
import pandas
from pandas import read_csv
df_val = read_csv('my_predictions_val.csv')
df_val.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 17 columns):
field1                                  30000 non-null int64
RevolvingUtilizationOfUnsecuredLines    30000 non-null float64
age                                     30000 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    30000 non-null int64
DebtRatio                               30000 non-null float64
MonthlyIncome                           23999 non-null float64
NumberOfOpenCreditLinesAndLoans         30000 non-null int64
NumberOfTimes90DaysLate                 30000 non-null int64
NumberRealEstateLoansOrLines            30000 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    30000 non-null int64
NumberOfDependents                      29192 non-null float64
NumberOfTimes90DaysLate_TOTAL           30000 non-null int64
IsOld                                   30000 non-null bool
SeriousDlqin2yrs                        30000 non-null int64
SeriousDlqin2yrs.1            

In [14]:
def CalculError(row):
    v = row['SeriousDlqin2yrs']
    w = row['SeriousDlqin2yrs.1']
    if v == 0 :
        if w == 0 :
            return 'TN'
        else :
            return 'FP'
    else :
        if w == 0 :
            return 'FN'
        else :
            return 'TP'

In [15]:
df_val['error']= df_val.apply (CalculError, axis = 1)
df_val.head (10)

Unnamed: 0,field1,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,NumberOfTimes90DaysLate_TOTAL,IsOld,SeriousDlqin2yrs,SeriousDlqin2yrs.1,0 probability,1 probability,error
0,5,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,1,False,0,0,0.82094,0.17906,TN
1,20,0.602794,25,0,0.065868,333.0,2,0,0,0,0.0,0,False,0,0,0.95714,0.04286,TN
2,27,0.052436,58,0,0.097672,8333.0,22,0,1,0,0.0,0,False,0,0,0.98582,0.01418,TN
3,28,0.034421,69,0,0.042383,2500.0,17,0,0,0,1.0,0,False,0,0,0.98958,0.01042,TN
4,31,0.704074,28,1,0.155201,4200.0,8,0,0,0,0.0,1,False,0,0,0.86787,0.13213,TN
5,50,8e-05,70,0,0.25634,6900.0,21,1,1,0,0.0,1,False,0,0,0.88169,0.11831,TN
6,51,0.818978,73,0,3095.0,0.0,9,0,1,1,0.0,1,True,0,0,0.72062,0.27938,TN
7,61,0.651603,58,0,0.241136,7783.0,11,0,1,0,0.0,0,False,0,0,0.96489,0.03511,TN
8,74,0.059669,31,0,3162.0,0.0,11,0,2,0,1.0,0,False,0,0,0.96505,0.03495,TN
9,84,0.054497,56,0,0.492022,4950.0,18,0,2,0,0.0,0,False,0,0,0.98371,0.01629,TN


In [16]:
resultat = df_val.groupby('error')['field1'].nunique()
print(resultat)
print(resultat[1])

df_matrixConfusion = pandas.DataFrame({'Prediction 0' : [resultat[2],resultat[0]], 'Prediction 1':[resultat[1],resultat[3]]},index = ['Actual 0', 'Actual 1'] )
print(df_matrixConfusion)

error
FN     1686
FP      236
TN    27749
TP      329
Name: field1, dtype: int64
236
          Prediction 0  Prediction 1
Actual 0         27749           236
Actual 1          1686           329


In [17]:
seuil = 0.5
def CalculError_seuil(row):
    v = row['SeriousDlqin2yrs']
    w = row['1 probability']
    if v == 0 :
        if w < seuil :
            return 'TN'
        else :
            return 'FP'
    else :
        if w < seuil :
            return 'FN'
        else :
            return 'TP'

In [18]:
df_val['error']= df_val.apply (CalculError_seuil, axis = 1)
df_val.head (10)

Unnamed: 0,field1,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,NumberOfTimes90DaysLate_TOTAL,IsOld,SeriousDlqin2yrs,SeriousDlqin2yrs.1,0 probability,1 probability,error
0,5,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,1,False,0,0,0.82094,0.17906,TN
1,20,0.602794,25,0,0.065868,333.0,2,0,0,0,0.0,0,False,0,0,0.95714,0.04286,TN
2,27,0.052436,58,0,0.097672,8333.0,22,0,1,0,0.0,0,False,0,0,0.98582,0.01418,TN
3,28,0.034421,69,0,0.042383,2500.0,17,0,0,0,1.0,0,False,0,0,0.98958,0.01042,TN
4,31,0.704074,28,1,0.155201,4200.0,8,0,0,0,0.0,1,False,0,0,0.86787,0.13213,TN
5,50,8e-05,70,0,0.25634,6900.0,21,1,1,0,0.0,1,False,0,0,0.88169,0.11831,TN
6,51,0.818978,73,0,3095.0,0.0,9,0,1,1,0.0,1,True,0,0,0.72062,0.27938,TN
7,61,0.651603,58,0,0.241136,7783.0,11,0,1,0,0.0,0,False,0,0,0.96489,0.03511,TN
8,74,0.059669,31,0,3162.0,0.0,11,0,2,0,1.0,0,False,0,0,0.96505,0.03495,TN
9,84,0.054497,56,0,0.492022,4950.0,18,0,2,0,0.0,0,False,0,0,0.98371,0.01629,TN


In [19]:
resultat = df_val.groupby('error')['field1'].nunique()
df_matrixConfusion = pandas.DataFrame({'Prediction 0' : [resultat[2],resultat[0]], 'Prediction 1':[resultat[1],resultat[3]]},index = ['Actual 0', 'Actual 1'] )
print(df_matrixConfusion)

          Prediction 0  Prediction 1
Actual 0         27749           236
Actual 1          1686           329


In [20]:
df_matrixCost = pandas.DataFrame({'Prediction 0' : [500,-2500], 'Prediction 1':[-500, 0]},index = ['Actual 0', 'Actual 1'] )
print(df_matrixCost)

          Prediction 0  Prediction 1
Actual 0           500          -500
Actual 1         -2500             0


In [21]:
total_gain = df_matrixCost * df_matrixConfusion
print (total_gain)
total_gain2 = total_gain.sum (axis = 1)
total_gain3 = total_gain2.sum (axis = 0)
print (total_gain3)

          Prediction 0  Prediction 1
Actual 0      13874500       -118000
Actual 1      -4215000             0
9541500


In [22]:
#Sort by bigerror
df_bigError = df_val.sort_values(by = ['SeriousDlqin2yrs','1 probability'], ascending = [True,False] )
df_bigError.head(100)

Unnamed: 0,field1,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,NumberOfTimes90DaysLate_TOTAL,IsOld,SeriousDlqin2yrs,SeriousDlqin2yrs.1,0 probability,1 probability,error
26571,133069,1.081836,35,3,0.291469,3375.0,3,12,1,0,2.0,15,False,0,1,0.26858,0.73142,FP
29184,146045,0.687749,45,2,0.787285,2500.0,24,6,1,4,2.0,12,False,0,1,0.27221,0.72779,FP
17056,84956,1.165834,47,2,440.000000,,3,3,1,2,0.0,7,False,0,1,0.27826,0.72174,FP
25392,127279,1.445042,38,0,0.136132,4715.0,5,3,0,7,2.0,10,False,0,1,0.28135,0.71865,FP
23370,116889,0.999939,60,1,0.533593,3333.0,8,6,1,1,1.0,8,False,0,1,0.28635,0.71365,FP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7320,36549,4.937063,33,0,0.766168,3540.0,12,2,1,0,0.0,2,False,0,1,0.39416,0.60584,FP
15959,79476,1.519308,23,0,0.055925,750.0,2,2,0,0,0.0,2,False,0,1,0.39416,0.60584,FP
22528,112819,1.031011,59,4,0.638397,4916.0,13,1,0,0,1.0,5,False,0,1,0.39468,0.60533,FP
10016,49976,1.000000,59,4,0.386850,3132.0,5,1,1,0,0.0,5,False,0,1,0.39605,0.60395,FP


# Recap

In [23]:
import pandas
from pandas import read_csv
df_val = read_csv('my_predictions_val.csv')
df_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 17 columns):
field1                                  30000 non-null int64
RevolvingUtilizationOfUnsecuredLines    30000 non-null float64
age                                     30000 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    30000 non-null int64
DebtRatio                               30000 non-null float64
MonthlyIncome                           23999 non-null float64
NumberOfOpenCreditLinesAndLoans         30000 non-null int64
NumberOfTimes90DaysLate                 30000 non-null int64
NumberRealEstateLoansOrLines            30000 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    30000 non-null int64
NumberOfDependents                      29192 non-null float64
NumberOfTimes90DaysLate_TOTAL           30000 non-null int64
IsOld                                   30000 non-null bool
SeriousDlqin2yrs                        30000 non-null int64
SeriousDlqin2yrs.1            

In [24]:
def CalculError_seuil2 (row):
    v = row['SeriousDlqin2yrs']
    w = row['1 probability']
    if v == 0 :
        if w < seuil :
            return 'TN'
        else :
            return 'FP'
    else :
        if w < seuil :
            return 'FN'
        else :
            return 'TP'

In [25]:
def calcul_gain (j): 
    seuil = j
    df_val['error']= df_val.apply (CalculError_seuil2, axis = 1)
    df_val.head (10)
    resultat = df_val.groupby('error')['field1'].nunique()
    df_matrixConfusion = pandas.DataFrame({'Prediction 0' : [resultat[2],resultat[0]], 'Prediction 1':[resultat[1],resultat[3]]},index = ['Actual 0', 'Actual 1'] )
    df_matrixCost = pandas.DataFrame({'Prediction 0' : [500,-2500], 'Prediction 1':[-500, 0]},index = ['Actual 0', 'Actual 1'] )
    total_gain = df_matrixCost * df_matrixConfusion
    total_gain2 = total_gain.sum (axis = 1)
    total_gain3 = total_gain2.sum (axis = 0)
    return total_gain3



In [26]:
gain = []

for i in range(0,10):
    j=i/10
    gain.append(calcul_gain(j))

df_seuil = gain
print(gain)                


[9541500, 9541500, 9541500, 9541500, 9541500, 9541500, 9541500, 9541500, 9541500, 9541500]


# PREDICTION

Building Source for TEST

In [None]:
source_Test = api.create_source('Test_New1.csv')
api.ok(source_Test)

True

Building Dataset for TEST

In [None]:
dataset_Test = api.create_dataset(source_Test)
api.ok(dataset_Test) 

True

Building model

In [None]:
ensemble = api.create_ensemble(dataset)
api.ok(ensemble)

True

Batch Prediction on Test


In [None]:
batch_prediction = api.create_batch_prediction(ensemble, dataset_Test, {"name": "my batch predction", "all_fields":True, "header":True,"probabilities":True})
api.ok(batch_prediction)

True

downloading the results to your computer

In [None]:
api.download_batch_prediction(batch_prediction,filename='my_predictions.csv')

'my_predictions.csv'