In [109]:
# example of grid searching key hyperparametres for ridge classifier
# importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 
from sklearn.ensemble import RandomForestClassifier



In [110]:

# import dataset
dataset = pd.read_csv('csv/cleaned_train_less.csv')

testdataset = pd.read_csv('csv/cleaned_test_all.csv')


x = dataset.drop(['credit_card_default'], axis = 1)
y = dataset['credit_card_default'].values
print(x.shape, y.shape)


(43508, 5) (43508,)


In [111]:
scalar =  preprocessing.StandardScaler()

# copy of datasets
X_train = x.copy()

# numerical features

#for all 
num_cols = ['net_yearly_income','no_of_days_employed','yearly_debt_payments',
             'credit_limit',"credit_limit_used(%)", "credit_score"]

#for less
num_cols = ["credit_limit_used(%)", "credit_score"]

#apply standardization on numerical features
for i in num_cols:
    
    # fit on training data column
    scale = scalar.fit(X_train[[i]])
    scale2 =  scalar.fit(testdataset[[i]])
    # transform the training data column
    X_train[i] = scale.transform(X_train[[i]])
    testdataset[i] = scale.transform(testdataset[[i]])

X_train = X_train.set_index('customer_id')
testdataset = testdataset.set_index('customer_id')



In [112]:
testdataset.head()

Unnamed: 0_level_0,age,gender,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
CST_142525,52,0.0,1.0,0,0.0,232640.53,998.0,2.0,0.0,14406.73,26524.4,-1.626042,-0.051326,0,0
CST_129215,48,0.0,0.0,0,1.0,284396.79,1338.0,3.0,0.0,57479.99,68998.72,0.620014,0.215533,0,0
CST_138443,50,0.0,0.0,0,1.0,149419.28,1210.0,3.0,0.0,21611.01,25187.8,0.654045,-2.53213,2,0
CST_123812,30,0.0,0.0,0,1.0,160437.54,503.0,2.0,1.0,28990.76,29179.39,-1.455886,0.304486,0,0
CST_144450,52,1.0,0.0,1,0.0,233480.37,157.0,2.0,1.0,54213.72,82331.82,1.028388,-1.692017,1,1


In [113]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train, y, test_size = 0.25, random_state = 0)



In [85]:
#GRID SEARCH CV
# define models and parameters
model = RandomForestClassifier()
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']

# define grid search
grid = dict(n_estimators=n_estimators,max_features=max_features)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [114]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import uniform, truncnorm, randint



model_params = {
    # randomly sample numbers from 4 to 204 estimators
    'n_estimators': randint(4,200),
    # normally distributed max_features, with mean .25 stddev 0.1, bounded between 0 and 1
    'max_features': truncnorm(a=0, b=1, loc=0.25, scale=0.1),
    # uniform distribution from 0.01 to 0.2 (0.01 + 0.199)
    'min_samples_split': uniform(0.01, 0.199)
}

# create random forest classifier model
rf_model = RandomForestClassifier()

# set up random search meta-estimator
# this will train 100 models over 5 folds of cross validation (500 models total)
clf = RandomizedSearchCV(rf_model, model_params, n_iter=100, cv=5, random_state=1)

# train the random search meta-estimator to find the best model out of 100 candidates
model = clf.fit(X_train, y_train) 

# print winning set of hyperparameters
from pprint import pprint
pprint(model.best_estimator_.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 0.2864742729236049,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 0.15334457419498948,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 141,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [115]:
y_pred_RF =model.predict(X_test)

#Parameter setting that gave the best results on the hold out data.
print(model.best_params_ ) 
#Mean cross-validated score of the best_estimator
print('Best Score - :', model.best_score_ )

print('F1 Score - :', metrics.f1_score(y_test, y_pred_RF , average="macro" ) )  

print('Accuracy Score -  ', metrics.accuracy_score(y_test, y_pred_RF)) 


{'max_features': 0.2864742729236049, 'min_samples_split': 0.15334457419498948, 'n_estimators': 141}
Best Score - : 0.9810303047747022
F1 Score - : 0.919936928827322
Accuracy Score -   0.9789464006619473


In [None]:
import pickle
 
# Save the trained model as a pickle string.
saved_model = pickle.dump(model, open('rfc_RS1.pkl','wb'))
 
# Load the pickled model
rfc = pickle.load(open('rfc_RS1.pkl', 'rb'))

In [93]:
y_pred_RF =model.predict(testdataset)

print(y_pred_RF)

[0 0 1 ... 0 0 0]


In [94]:
type(y_pred_RF)

numpy.ndarray

In [95]:
testdatasetexport = pd.read_csv('csv/cleaned_test_all.csv')


testdatasetexport['credit_card_default'] = y_pred_RF.tolist()



In [96]:
testdatasetexportdone =  testdatasetexport[[ 'customer_id' , 'credit_card_default' ]]

testdatasetexportdone.to_csv('csv/submissiontestv4.csv' , index=False)