In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import impute
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import ensemble
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from sklearn import decomposition
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the pre-merged train and test tables from ./data.
merged_train, merged_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/merged_train.csv', './data/merged_test.csv')
)

#### Train-Validation-Test Split (without PCA)

In [3]:
# train-validation split
# Hold out 30% of the labelled rows for validation, stratified on TARGET.
# TARGET is left inside the feature frames for now because the upsampling
# below filters on it; it is dropped once resampling is done.
X_train, X_validation, y_train, y_validation = model_selection.train_test_split(
    merged_train.drop('SK_ID_CURR', axis = 1), merged_train.TARGET,
    test_size=0.30, random_state=0, stratify=merged_train.TARGET)

# upsample
# Draw TARGET == 1 rows with replacement until there are as many of them as
# TARGET == 0 rows. random_state pins the draw so that Restart & Run All
# rebuilds the same training set (the draw used to be unseeded).
X_train_0 = X_train[X_train.TARGET == 0]
X_train_1 = X_train[X_train.TARGET == 1]
X_train_1 = X_train_1.sample(X_train_0.shape[0], replace=True, random_state=0)
X_train = pd.concat([X_train_0, X_train_1], axis = 0)
y_train = X_train.TARGET
X_train = X_train.drop('TARGET', axis = 1)

# The validation split is not resampled; only the label column is removed.
X_validation = X_validation.drop('TARGET', axis = 1)


# train-test split for KAGGLE entry
# Same upsampling, applied to the whole labelled table, for the final fit.
Xs_train = merged_train.copy()
Xs_train_0 = Xs_train[Xs_train.TARGET == 0]
Xs_train_1 = Xs_train[Xs_train.TARGET == 1]
Xs_train_1 = Xs_train_1.sample(Xs_train_0.shape[0], replace=True, random_state=0)
Xs_train = pd.concat([Xs_train_0, Xs_train_1], axis = 0)
ys_train = Xs_train.TARGET
Xs_train = Xs_train.drop(['SK_ID_CURR', 'TARGET'], axis = 1)

Xs_test = merged_test.drop('SK_ID_CURR', axis = 1)

#### Random Forest with gridsearch

In [21]:
# Validation
# Grid-search the random forest on the upsampled training split, then report
# ROC-AUC on that split and on the untouched validation split.
# random_state makes the forest reproducible across kernel restarts.
rfmodel = ensemble.RandomForestClassifier(random_state = 0)
param_grid = {'n_estimators': [250], 'max_depth':[11], 'max_features':[11]}
# return_train_score=True is needed for 'mean_train_score' to be present in
# cv_results_; scikit-learn >= 0.21 defaults it to False, in which case the
# column selection below raises KeyError.
gridmodel = model_selection.GridSearchCV(rfmodel, param_grid, scoring = 'roc_auc', cv = 5,
                                         return_train_score = True)
gridmodel.fit(X_train, y_train)

print('n_estimators:', gridmodel.best_estimator_.n_estimators, '  ',
      'max_depth:', gridmodel.best_estimator_.max_depth, '  ',
      'max_features:', gridmodel.best_estimator_.max_features)
print('train_roc:', round(gridmodel.score(X_train, y_train), 3), '  ',
      'validation_roc:', round(gridmodel.score(X_validation, y_validation), 3))


# NOTE(review): X_train was upsampled with replacement before this CV, so
# copies of one row can land in both the fitting folds and the held-out fold.
# That probably inflates mean_test_score relative to validation_roc - confirm
# before using the CV column for model selection.
rf = pd.DataFrame(gridmodel.cv_results_)[['param_n_estimators', 'param_max_depth', 'param_max_features',
                                      'mean_test_score', 'mean_train_score', 'std_test_score']]
# rf_results accumulates one row per grid point across manual re-runs of this
# cell. NameError means this is the first run in the current kernel; any other
# error is a real failure and is no longer swallowed.
try:
    rf_results = pd.concat([rf_results, rf])
except NameError:
    rf_results = rf.copy()
rf_results.sort_values(by = 'mean_test_score', ascending = False)

n_estimators: 250    max_depth: 11    max_features: 11
train_roc: 0.88    validation_roc: 0.758


Unnamed: 0,param_n_estimators,param_max_depth,param_max_features,mean_test_score,mean_train_score,std_test_score
0,250,11,11,0.866184,0.881395,0.001611
0,300,11,10,0.865569,0.880678,0.001613
1,250,11,10,0.865482,0.880594,0.001326
0,200,11,10,0.864678,0.879819,0.0018
0,200,10,10,0.837005,0.849773,0.001567
1,200,9,10,0.811644,0.821621,0.002192
0,250,8,10,0.791186,0.79847,0.002254
0,200,8,10,0.791097,0.798303,0.001714
0,200,7,10,0.774925,0.779924,0.001991
2,100,7,10,0.77349,0.77853,0.002006


In [None]:
# validation results
# n_estimators: 100    max_depth: 5    max_features: 10
# train_roc: 0.753    validation_roc: 0.743
#-----------------------------------------------------
# n_estimators: 100    max_depth: 5    max_features: 8
# train_roc: 0.75    validation_roc: 0.739
#-----------------------------------------------------
# n_estimators: 200    max_depth: 7    max_features: 10
# train_roc: 0.779    validation_roc: 0.751
#-----------------------------------------------------
# n_estimators: 250    max_depth: 8    max_features: 10
# train_roc: 0.798    validation_roc: 0.755
#------------------------------------------------------
# n_estimators: 250    max_depth: 11    max_features: 10
# train_roc: 0.879    validation_roc: 0.758
#------------------------------------------------------
# n_estimators: 300    max_depth: 11    max_features: 10
# train_roc: 0.879    validation_roc: 0.758
#------------------------------------------------------
# n_estimators: 250    max_depth: 11    max_features: 11
# train_roc: 0.88    validation_roc: 0.758

In [None]:
# Kaggle prediction
# Refit the forest on the full upsampled training table with the settings
# chosen during validation. random_state makes the submission reproducible;
# without it every run produced a slightly different forest.
model = ensemble.RandomForestClassifier(n_estimators = 250, max_depth = 11, max_features = 11,
                                        random_state = 0)
model.fit(Xs_train, ys_train)
y_pred = model.predict_proba(Xs_test)

# Column 1 of predict_proba is the probability of the second entry in
# model.classes_, i.e. TARGET == 1 for a 0/1 label.
sol = pd.DataFrame({'SK_ID_CURR': merged_test.SK_ID_CURR, 'TARGET': y_pred[:,1]})
sol.to_csv('./soln/sol_mergedrf.csv', index = False)       # kaggle: 0.74832 (score recorded from an earlier, unseeded run)

#### Principal Components + Logistic

In [62]:
# Feature matrices for PCA: the id column (and, for train, the label) are
# removed so that only predictors are decomposed.
pca_test = merged_test.drop(columns = 'SK_ID_CURR')
pca_train = merged_train.drop(columns = ['SK_ID_CURR', 'TARGET'])

# No n_components is given, so every component is kept and the variance
# spectrum can be inspected afterwards.
pca = decomposition.PCA()
pca.fit(pca_train)

In [33]:
# Running total of the variance share explained by the leading components.
np.cumsum(pca.explained_variance_ratio_)

array([0.09629084, 0.1434437 , 0.18086675, 0.21595072, 0.24429443,
       0.26771517, 0.29046498, 0.3114567 , 0.33148482, 0.34891427,
       0.36538338, 0.38127357, 0.39701726, 0.41227906, 0.42687093,
       0.44129709, 0.45515265, 0.46824045, 0.48062982, 0.4924454 ,
       0.50371793, 0.5148239 , 0.52546174, 0.53608312, 0.54641224,
       0.55661707, 0.56674783, 0.57663001, 0.58638171, 0.59607871,
       0.60555381, 0.61498931, 0.62441449, 0.63379657, 0.64315874,
       0.65249762, 0.66175729, 0.67098652, 0.6801726 , 0.6892577 ,
       0.69825182, 0.70711182, 0.71592715, 0.72467476, 0.73336655,
       0.74198072, 0.7505091 , 0.75898807, 0.76742898, 0.77562808,
       0.7838143 , 0.79191561, 0.79992325, 0.80783446, 0.81559754,
       0.82321767, 0.8308116 , 0.83824203, 0.8455526 , 0.85271439,
       0.85958735, 0.86631893, 0.87295211, 0.87948419, 0.88583125,
       0.89185639, 0.89769615, 0.90334177, 0.90893179, 0.91427557,
       0.9194328 , 0.92444586, 0.92941033, 0.93433102, 0.93923

In [65]:
# dropping last 40 which explain less than 1% variance
# (author's note; the total component count is not shown in this cell - the
# code simply keeps the first 95 components.)
#
# The inputs are rebuilt from merged_train / merged_test rather than read from
# pca_train / pca_test, which this cell overwrites. That keeps the cell
# idempotent: a re-run no longer fits PCA on already-projected data.
pca_features_train = merged_train.drop(['SK_ID_CURR', 'TARGET'], axis = 1)
pca_features_test = merged_test.drop('SK_ID_CURR', axis = 1)

pca = decomposition.PCA(n_components = 95)
pca.fit(pca_features_train)
# The test rows are projected with the components learned from train only.
pca_train = pca.transform(pca_features_train)
pca_test = pca.transform(pca_features_test)
# Untouched copies of the projected arrays; pca_train / pca_test themselves
# are reassigned in the next cell.
pca_train1 = pca_train.copy()
pca_test1 = pca_test.copy()

In [71]:
# Re-attach the id (and, for train, the label) to the projected components.
# The frames are built from pca_train1 / pca_test1 - the copies taken right
# after pca.transform - so re-running this cell does not stack a second set of
# SK_ID_CURR / TARGET columns onto an already-combined pca_train.
# Passing index= ties each component row to its source row explicitly instead
# of relying on both sides happening to share a default RangeIndex.
pca_train = pd.concat([merged_train[['SK_ID_CURR', 'TARGET']],
                       pd.DataFrame(pca_train1, index = merged_train.index)], axis = 1)
pca_test = pd.concat([merged_test[['SK_ID_CURR']],
                      pd.DataFrame(pca_test1, index = merged_test.index)], axis = 1)

In [72]:
# train-validation split for logistic after pca
# 30% stratified hold-out. TARGET stays in the feature frames until the
# upsampling below has used it to separate the classes.
X_train, X_validation, y_train, y_validation = model_selection.train_test_split(
    pca_train.drop('SK_ID_CURR', axis = 1), pca_train.TARGET,
    test_size=0.30, random_state=0, stratify=pca_train.TARGET)

# upsample
# Draw TARGET == 1 rows with replacement up to the TARGET == 0 row count.
# random_state pins the draw so the training split is reproducible.
X_train_0 = X_train[X_train.TARGET == 0]
X_train_1 = X_train[X_train.TARGET == 1]
X_train_1 = X_train_1.sample(X_train_0.shape[0], replace=True, random_state=0)
X_train = pd.concat([X_train_0, X_train_1], axis = 0)
y_train = X_train.TARGET
X_train = X_train.drop('TARGET', axis = 1)

# Validation rows are not resampled; only the label column is removed.
X_validation = X_validation.drop('TARGET', axis = 1)

In [73]:
# train-test split for kaggle entry with logistic regression after pca
# Upsample TARGET == 1 rows (with replacement) across the whole labelled PCA
# table for the final fit. random_state pins the draw so the submission can be
# regenerated exactly.
Xs_train = pca_train.copy()
Xs_train_0 = Xs_train[Xs_train.TARGET == 0]
Xs_train_1 = Xs_train[Xs_train.TARGET == 1]
Xs_train_1 = Xs_train_1.sample(Xs_train_0.shape[0], replace=True, random_state=0)
Xs_train = pd.concat([Xs_train_0, Xs_train_1], axis = 0)
ys_train = Xs_train.TARGET
Xs_train = Xs_train.drop(['SK_ID_CURR', 'TARGET'], axis = 1)

Xs_test = pca_test.drop('SK_ID_CURR', axis = 1)

In [83]:
# Validation
# Grid-search the inverse regularisation strength C for logistic regression on
# the PCA features, scored by ROC-AUC with 10-fold CV.
logitmodel = linear_model.LogisticRegression()
param_grid = {'C': [1e7, 1e8]}
# return_train_score=True is needed for 'mean_train_score' to be present in
# cv_results_; scikit-learn >= 0.21 defaults it to False, in which case the
# column selection below raises KeyError.
gridmodel = model_selection.GridSearchCV(logitmodel, param_grid, scoring = 'roc_auc', cv = 10,
                                         return_train_score = True)
gridmodel.fit(X_train, y_train)

print(gridmodel.best_estimator_.C, '  ', 'train_roc:', round(gridmodel.score(X_train, y_train), 3), '  ',
      'validation_roc:', round(gridmodel.score(X_validation, y_validation), 3))


pclog = pd.DataFrame(gridmodel.cv_results_)[['param_C', 'mean_test_score', 'mean_train_score', 'std_test_score']]
# pclog_results accumulates rows across manual re-runs of this cell with
# different grids. NameError means this is the first run in the current
# kernel; any other error is a real failure and is no longer swallowed.
try:
    pclog_results = pd.concat([pclog_results, pclog])
except NameError:
    pclog_results = pclog.copy()
pclog_results.sort_values(by = 'mean_test_score', ascending = False)

100000000.0    train_roc: 0.763    validation_roc: 0.759


Unnamed: 0,param_C,mean_test_score,mean_train_score,std_test_score
1,100000000.0,0.762813,0.76309,0.001681
0,10000000.0,0.762813,0.76309,0.001681
1,100.0,0.762767,0.763112,0.001639
2,10000.0,0.762767,0.763112,0.001639
0,1000000.0,0.762767,0.763112,0.001639
0,0.1,0.762767,0.763112,0.001639
0,1e-05,0.755,0.755224,0.001757


In [82]:
# Performance on validation set
# 1e-05    train_roc: 0.756    validation_roc: 0.754
# 1000000.0    train_roc: 0.763    validation_roc: 0.759

In [84]:
# Kaggle prediction
# Final logistic fit on the upsampled PCA training table; C is hard-coded here
# rather than read back from the grid search.
model = linear_model.LogisticRegression(C = 100000000)
model.fit(Xs_train, ys_train)
y_pred = model.predict_proba(Xs_test)

# Submission frame: test ids plus the second predict_proba column.
sol = merged_test[['SK_ID_CURR']].assign(TARGET = y_pred[:, 1])
sol.to_csv('./soln/sol_mergedpclog.csv', index = False)       # kaggle: 0.75518

#### PCA + Logistic Full

In [3]:
# Predictor-only matrices: the id column (and, for train, the label) are removed.
pca_train = merged_train.drop(columns = ['SK_ID_CURR', 'TARGET'])
pca_test = merged_test.drop(columns = 'SK_ID_CURR')

# No n_components is given, so every component is retained.
pca = decomposition.PCA()
pca.fit(pca_train)

# Project both tables with the train-fitted components and keep untouched
# copies of the raw arrays before the names are reused for the frames below.
pca_train = pca.transform(pca_train)
pca_test = pca.transform(pca_test)
pca_train1, pca_test1 = pca_train.copy(), pca_test.copy()

# Put the id / label columns back in front of the component columns.
train_keys = merged_train[['SK_ID_CURR', 'TARGET']]
test_keys = merged_test[['SK_ID_CURR']]
pca_train = pd.concat([train_keys, pd.DataFrame(pca_train)], axis = 1)
pca_test = pd.concat([test_keys, pd.DataFrame(pca_test)], axis = 1)

In [4]:
# train-validation split and upsample for logistic regression after pca
# 30% stratified hold-out. TARGET stays in the feature frames until the
# upsampling below has used it to separate the classes.
X_train, X_validation, y_train, y_validation = model_selection.train_test_split(
    pca_train.drop('SK_ID_CURR', axis = 1), pca_train.TARGET,
    test_size=0.30, random_state=0, stratify=pca_train.TARGET)

# Draw TARGET == 1 rows with replacement up to the TARGET == 0 row count.
# random_state pins the draw so the training split is reproducible.
X_train_0 = X_train[X_train.TARGET == 0]
X_train_1 = X_train[X_train.TARGET == 1]
X_train_1 = X_train_1.sample(X_train_0.shape[0], replace=True, random_state=0)     # upsample
X_train = pd.concat([X_train_0, X_train_1], axis = 0)
y_train = X_train.TARGET
X_train = X_train.drop('TARGET', axis = 1)

# Validation rows are not resampled; only the label column is removed.
X_validation = X_validation.drop('TARGET', axis = 1)


# train-test split and upsample for kaggle entry with logistic regression after pca
# Same seeded upsampling, applied to the whole labelled PCA table.
Xs_train = pca_train.copy()
Xs_train_0 = Xs_train[Xs_train.TARGET == 0]
Xs_train_1 = Xs_train[Xs_train.TARGET == 1]
Xs_train_1 = Xs_train_1.sample(Xs_train_0.shape[0], replace=True, random_state=0)
Xs_train = pd.concat([Xs_train_0, Xs_train_1], axis = 0)
ys_train = Xs_train.TARGET
Xs_train = Xs_train.drop(['SK_ID_CURR', 'TARGET'], axis = 1)

Xs_test = pca_test.drop('SK_ID_CURR', axis = 1)

In [6]:
# Validation
# Grid-search the inverse regularisation strength C for logistic regression on
# the full set of principal components, scored by ROC-AUC with 3-fold CV.
logitmodel = linear_model.LogisticRegression()
param_grid = {'C': [100, 10000]}
# return_train_score=True is needed for 'mean_train_score' to be present in
# cv_results_; scikit-learn >= 0.21 defaults it to False, in which case the
# column selection below raises KeyError.
gridmodel = model_selection.GridSearchCV(logitmodel, param_grid, scoring = 'roc_auc', cv = 3,
                                         return_train_score = True)
gridmodel.fit(X_train, y_train)

print(gridmodel.best_estimator_.C, '  ', 'train_roc:', round(gridmodel.score(X_train, y_train), 3), '  ',
      'validation_roc:', round(gridmodel.score(X_validation, y_validation), 3))


pclogfull = pd.DataFrame(gridmodel.cv_results_)[['param_C', 'mean_test_score', 'mean_train_score', 'std_test_score']]
# pclogfull_results accumulates rows across manual re-runs of this cell with
# different grids. NameError means this is the first run in the current
# kernel; any other error is a real failure and is no longer swallowed.
try:
    pclogfull_results = pd.concat([pclogfull_results, pclogfull])
except NameError:
    pclogfull_results = pclogfull.copy()
pclogfull_results.sort_values(by = 'mean_test_score', ascending = False)

100    train_roc: 0.77    validation_roc: 0.767


Unnamed: 0,param_C,mean_test_score,mean_train_score,std_test_score
0,1000,0.769309,0.769775,0.000751
0,100,0.769308,0.769774,0.00075
1,10000,0.769308,0.769775,0.00075


In [None]:
# 1000    train_roc: 0.77    validation_roc: 0.767
# 100    train_roc: 0.77    validation_roc: 0.767

In [7]:
# Kaggle prediction
# Final logistic fit on the upsampled full-PCA training table. C is hard-coded
# here rather than read from gridmodel.best_params_.
model = linear_model.LogisticRegression(C = 10000)
model.fit(Xs_train, ys_train)
y_pred = model.predict_proba(Xs_test)

# Submission frame: test ids plus the second predict_proba column.
sol = merged_test[['SK_ID_CURR']].assign(TARGET = y_pred[:, 1])
sol.to_csv('./soln/sol_merged_pclog_full.csv', index = False)       # kaggle: 0.76641