# Introduction: Model Evaluation

In this notebook we will implement several different machine learning algorithms on the manually engineered features after feature selection. There are a total of around 350 features after the engineering plus selection process. We will test several different scikit-learn algorithms as well as the Gradient Boosting Machine as implemented in LightGBM.

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load the training and testing feature sets.
train = pd.read_csv('../input/m_train_small.csv')
test = pd.read_csv('../input/m_test_small.csv')

# Set the identifiers and the target aside before they are removed from the features.
train_ids, train_labels = train['SK_ID_CURR'], train['TARGET']
test_ids = test['SK_ID_CURR']

# Submission frame: starts with the test identifiers only; the `TARGET`
# column is filled in later by `make_submission`.
submission = pd.DataFrame({'SK_ID_CURR' : test_ids})

# Remove the identifier (and, for train, the target) columns from the feature frames.
train = train.drop(['TARGET', 'SK_ID_CURR'], axis = 1)
test = test.drop(['SK_ID_CURR'], axis = 1)

# Preprocessing: median imputation followed by min-max scaling with a (0, 1) target range.
pipeline = Pipeline(steps = [
    ('imputer', Imputer(strategy = 'median')),
    ('scaler', MinMaxScaler(feature_range = (0, 1))),
])

# Fit the preprocessing on the training features only, then reuse the fitted
# steps for the testing features. Both names now hold the transformed arrays.
train = pipeline.fit_transform(train)
test = pipeline.transform(test)

print('Training shape: ', train.shape)
print('Testing shape: ', test.shape)

Training shape:  (307511, 342)
Testing shape:  (48744, 342)


In [3]:
def make_submission(model, name):
    """Write a submission file holding a model's predicted probabilities.

    Scores the module-level `test` data with `model`, stores the second
    column of the `predict_proba` output in the `TARGET` column of the
    module-level `submission` frame (replacing any previous values), and
    saves that frame as `<name>_submission.csv` in the working directory.

    Parameters
    ----------
    model : fitted estimator
        Must expose `predict_proba`.
    name : str
        Prefix used for the output file name.

    Returns
    -------
    None
    """
    class_probabilities = model.predict_proba(test)

    # Column 1 of the probability matrix is what gets submitted.
    submission['TARGET'] = class_probabilities[:, 1]

    submission.to_csv('%s_submission.csv' % name, index = False)
    print('Submission saved to %s_submission.csv' % name)

In [None]:
# Logistic regression with built-in cross-validation: Cs = 20 and cv = 3,
# with n_jobs = -1 for parallelism and verbose = 1 for progress output.
logreg = LogisticRegressionCV(
    cv = 3,
    Cs = 20,
    verbose = 1,
    n_jobs = -1,
).fit(train, train_labels)

# Score the testing data and save the predictions to logreg_submission.csv.
make_submission(logreg, 'logreg')

In [None]:
# LogisticRegressionCV has no `cv_results_` attribute (that name belongs to the
# GridSearchCV-style searchers), so the original expression raised an
# AttributeError. The cross-validation scores are exposed through `scores_`.
logreg.scores_

In [None]:
# Random forests and extra-trees are randomized ensembles. Without a fixed
# `random_state` every run fits a different model and writes a different
# submission, so the seed is pinned to make both files reproducible.
rf = RandomForestClassifier(n_estimators = 1000, n_jobs = -1, verbose = 1, random_state = 42)
rf.fit(train, train_labels)
make_submission(rf, name = 'rf')

et = ExtraTreesClassifier(n_estimators = 1000, n_jobs = -1, verbose = 1, random_state = 42)
et.fit(train, train_labels)
make_submission(et, name = 'et')

In [4]:
# Scikit-Learn gradient boosting machine: 1000 estimators at a 0.01 learning
# rate; verbose = 1 prints the iteration / train-loss progress table.
gbm = GradientBoostingClassifier(
    learning_rate = 0.01,
    n_estimators = 1000,
    verbose = 1,
)
gbm.fit(train, train_labels)

# Score the testing data and save the predictions to gbm_submission.csv.
make_submission(model = gbm, name = 'gbm')

      Iter       Train Loss   Remaining Time 
         1           0.5602          120.59m
         2           0.5594          115.26m
         3           0.5586          113.31m
         4           0.5578          112.12m
         5           0.5570          111.62m
         6           0.5562          110.64m
         7           0.5555          110.10m
         8           0.5548          111.30m
         9           0.5541          111.37m
        10           0.5534          110.65m
        20           0.5475          107.29m
        30           0.5427          106.10m
        40           0.5388          105.57m
        50           0.5354          104.42m
        60           0.5325          104.10m
        70           0.5299          103.60m
        80           0.5277          103.66m
        90           0.5257          103.11m
       100           0.5239          102.47m
       200           0.5112           92.02m
       300           0.5033           80.92m
       40

In [5]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
import lightgbm as lgb

# LightGBM classifier with a binary objective and GOSS boosting, 1000
# estimators at a 0.01 learning rate, and balanced class weights.
# NOTE(review): `verbose` is passed to the constructor here, not to `fit` --
# confirm that 200 is the intended setting.
lgb_gbm = lgb.LGBMClassifier(
    boosting_type = 'goss',
    objective = 'binary',
    learning_rate = 0.01,
    n_estimators = 1000,
    class_weight = 'balanced',
    n_jobs = -1,
    verbose = 200,
).fit(train, train_labels)

# Score the testing data and save the predictions to lgb_gbm_submission.csv.
make_submission(lgb_gbm, name = 'lgb_gbm')

Submission saved to lgb_gbm_submission.csv


In [8]:
# Read in the submissions
logreg_sub = pd.read_csv('logreg_submission.csv')
rf_sub = pd.read_csv('rf_submission.csv')
et_sub = pd.read_csv('et_submission.csv')
gbm_sub = pd.read_csv('gbm_submission.csv')
lgb_gbm_sub = pd.read_csv('lgb_gbm_submission.csv')

# Every loaded submission takes part in the average. The logistic regression
# file used to be read but left out of the sum, so the result was not an
# average of all models.
all_subs = [logreg_sub, rf_sub, et_sub, gbm_sub, lgb_gbm_sub]

# Predictions are combined row by row, so each file must list the same ids in
# the same order as `test_ids`.
assert all(np.array_equal(sub['SK_ID_CURR'].values, test_ids.values) for sub in all_subs), \
    'Submission ids do not match test_ids'

average_sub = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': 0})

# Average the predictions together; dividing by len(all_subs) keeps the divisor
# in step with the number of models being averaged.
average_sub['TARGET'] = sum(sub['TARGET'] for sub in all_subs) / len(all_subs)

average_sub.to_csv('average_sub.csv', index = False)

Logistic Regression = 0.768
Random Forest with 1000 trees = 0.708
Extra Trees with 1000 trees = 0.725
Gradient Boosting Machine in Scikit-Learn with 1000 trees = 
Gradient Boosting Machine in LightGBM with 1000 trees = 
Average of all Models =
