# BigML Kaggle API 

In [1]:
import pandas as pd
import bigml.api
from pprint import pprint

In [2]:
# Create the BigML API client that every later cell uses.
# No credentials are passed explicitly. NOTE(review): the bindings presumably
# read them from the environment (BIGML_USERNAME / BIGML_API_KEY) — confirm.
api = bigml.api.BigML()

In [3]:
# Load the raw training and test CSVs (training first, then test).
# `dataframes` holds the very same two objects as `training_df` / `test_df`,
# so in-place preprocessing applied through the list is visible via both names.
dataframes = [
    pd.read_csv(csv_path)
    for csv_path in ('./datasets/cs-training.csv', './datasets/cs-test.csv')
]
training_df, test_df = dataframes

In [4]:
# Apply the same cleaning and feature engineering to both frames, in place,
# so `training_df` and `test_df` end up with identical columns.
for df in dataframes:
    # Name the first column "Id". rename() goes through the public API;
    # assigning into `df.columns.values` writes to the Index's backing array,
    # which bypasses Index immutability and can leave lookups out of sync.
    df.rename(columns={df.columns[0]: "Id"}, inplace=True)

    # Missing income / dependents are treated as 0.
    df[['MonthlyIncome', 'NumberOfDependents']] = (
        df[['MonthlyIncome', 'NumberOfDependents']].fillna(0)
    )

    # Total count of late-payment events across the three delinquency columns.
    df['NumberOfTimes30DaysOrMoreLate'] = (
        df['NumberOfTime30-59DaysPastDueNotWorse']
        + df['NumberOfTime60-89DaysPastDueNotWorse']
        + df['NumberOfTimes90DaysLate']
    )

    # The +1 keeps the denominator non-zero when there are no dependents
    # (presumably it stands for the borrower — confirm).
    df['IncomePerPerson'] = df['MonthlyIncome'] / (df['NumberOfDependents'] + 1)

    # Debt derived from the debt ratio, and what is left of the income after it.
    df['MonthlyDebt'] = df['MonthlyIncome'] * df['DebtRatio']
    df['MonthlyBalance'] = df['MonthlyIncome'] - df['MonthlyDebt']

In [5]:
# Write the engineered frames to disk; these files are what later cells
# upload to BigML as sources.
# index=False leaves the pandas row index out of the CSV.
training_df.to_csv('./results/full_train_modif.csv', index=False)
test_df.to_csv('./results/test_modif.csv', index=False)

In [6]:
# Create a BigML source from the engineered training CSV.
# api.ok(...) is called before the resource is used by later cells.
# NOTE(review): per the BigML bindings it waits for the resource to finish — confirm.
full_train_source = api.create_source('./results/full_train_modif.csv')
api.ok(full_train_source)

True

In [7]:
# Build a BigML dataset from the full training source; the train/validation
# samples in the following cells are both derived from this dataset.
full_train_dataset = api.create_dataset(full_train_source)
api.ok(full_train_dataset)

True

In [8]:
# Training split: a sample of the full dataset at rate 0.8.
# The seed is shared with the validation split cell.
# NOTE(review): per BigML's sampling options a fixed seed should make the
# sample deterministic — confirm.
dataset_train80 = api.create_dataset(
    full_train_dataset,
    {
        "name": "Train",
        "sample_rate": 0.8,
        "seed": "my seed",
    },
)
api.ok(dataset_train80)

True

In [9]:
# Validation split: same sample_rate and seed as the training split, but with
# out_of_bag=True.
# NOTE(review): per BigML's sampling options this should select the rows the
# 0.8 sample left out (the remaining ~20%) — confirm.
dataset_valid20 = api.create_dataset(
    full_train_dataset,
    {
        "name": "Validation",
        "sample_rate": 0.8,
        "seed": "my seed",
        "out_of_bag": True,
    },
)
api.ok(dataset_valid20)

True

In [10]:
# Train an ensemble on the training split only, with `SeriousDlqin2yrs`
# as the field to predict. All other ensemble settings are left at the
# service defaults.
ensemble = api.create_ensemble(dataset_train80, {"objective_field":"SeriousDlqin2yrs"})
api.ok(ensemble)

True

In [11]:
# Create a BigML source from the engineered test CSV (same preprocessing
# as the training data).
test_source = api.create_source('./results/test_modif.csv')
api.ok(test_source)

True

In [12]:
# Build the dataset the batch prediction will score.
test_dataset = api.create_dataset(test_source)
api.ok(test_dataset)

True

In [13]:
# Score the whole test dataset with the ensemble.
# 'output_fields' asks for the Id column to be included in the output, and
# 'probabilities' for per-class probabilities. The downloaded CSV is later
# read with an Id index plus '0 probability' / '1 probability' columns.
batch_prediction = api.create_batch_prediction(
    ensemble,
    test_dataset,
    {
        'output_fields': ['Id'],
        'probabilities': True,
    },
)
api.ok(batch_prediction)

True

In [14]:
# Download the batch prediction output as a local CSV; the call's return value
# (the file path, as echoed below) is the cell output.
api.download_batch_prediction(batch_prediction, filename= './results/my_predictions.csv')

'./results/my_predictions.csv'

In [15]:
# Evaluate the ensemble on the validation split, which was not used to train it.
# The metrics are read from `evaluation['object']['result']` in later cells.
evaluation = api.create_evaluation(ensemble, dataset_valid20)
api.ok(evaluation)

True

In [16]:
# Shape the raw batch-prediction CSV into the submission table:
#   - the first CSV column becomes the index,
#   - the `SeriousDlqin2yrs` and `0 probability` columns are dropped,
#   - `1 probability` is renamed to `Probability`.
my_prediction_df = (
    pd.read_csv('./results/my_predictions.csv', index_col=0)
    .drop(columns=['SeriousDlqin2yrs', '0 probability'])
    .rename(columns={"1 probability": "Probability"})
)
# Bare expression so the notebook displays the resulting table.
my_prediction_df

Unnamed: 0_level_0,Probability
Id,Unnamed: 1_level_1
1,0.05625
2,0.03678
3,0.01792
4,0.07931
5,0.10969
...,...
101499,0.01862
101500,0.37527
101501,0.00305
101502,0.12441


In [17]:
# Write the submission file. The index is written too (to_csv default), so the
# Id index shown in the table above ends up as the first CSV column.
my_prediction_df.to_csv('./results/prediction-kaggle.csv')

In [18]:
# Pull the confusion matrix ("matrice_confusion") out of the evaluation
# resource and pretty-print it.
# NOTE(review): row/column orientation (actual vs. predicted) is not visible
# here — check the BigML evaluation docs before interpreting the counts.
matrice_confusion = evaluation['object']['result']['model']['confusion_matrix']
pprint(matrice_confusion)

[[27750, 235], [1680, 335]]


In [19]:
# Read the average area under the ROC curve from the same evaluation result
# and print it.
roc = evaluation['object']['result']['model']['average_area_under_roc_curve']
pprint(roc)

0.85618


## Save Prediction

In [None]:
# Save the Kaggle submission from `my_prediction_df`, the submission table
# built earlier in this notebook (no `kaggle_df` is defined anywhere here).
# The index is kept because it holds the Id column, and the path carries the
# `.csv` extension so it matches the submission file written earlier.
my_prediction_df.to_csv('./results/prediction-kaggle.csv')