# Loading Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import itertools

# Loading the Data

In [2]:
defpred_train = pd.read_csv('Training Data.csv')
defpred_test = pd.read_csv('Test Data.csv')

# Cleaning The Data

In [3]:
defpred_train['married'] = defpred_train['married'] == 'single'
defpred_train["married"] = defpred_train['married'].astype(int)

defpred_test['married'] = defpred_test['married'] == 'single'
defpred_test["married"] = defpred_test['married'].astype(int)

In [None]:
defpred_train['car_ownership'].unique()

In [4]:
defpred_train['house_ownership'] = defpred_train['house_ownership'].map({'rented': 0, 'norent_noown': 1, 
                                                                         'owned' : 2})

defpred_test['house_ownership'] = defpred_test['house_ownership'].map({'rented': 0, 'norent_noown': 1, 
                                                                         'owned' : 2})

In [5]:
defpred_train['car_ownership'] = defpred_train['car_ownership'].map({'no': 0, 'yes': 1, })
defpred_test['car_ownership'] = defpred_test['car_ownership'].map({'no': 0, 'yes': 1, })

# Feature Selection

In [6]:
train_data = defpred_train[['income','age','experience','married','house_ownership','car_ownership',
                            'current_job_years','current_house_years','risk_flag']]
#train_label = defpred_train[['risk_flag']]

test_data = defpred_test[['income','age','experience','married','house_ownership','car_ownership',
                            'current_job_years','current_house_years']]

In [7]:
X = train_data.iloc[:,:-1]
y = train_data.iloc[:,-1]

# Loading Some More Libraries

In [8]:
from sklearn.model_selection import KFold 
from collections import Counter
#from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import SVMSMOTE
from sklearn.metrics import accuracy_score

In [12]:
counter = Counter(y)
print(counter)

Counter({0: 221004, 1: 221004})


# Applying Data Augmentation For Imbalanced Data

In [26]:
oversample = SVMSMOTE()
from sklearn.model_selection import GridSearchCV

In [11]:
X, y = oversample.fit_resample(X, y)

In [23]:
counter = Counter(y)
print(counter)

Counter({0: 221004, 1: 221004})


In [43]:
X.head(5)

Unnamed: 0,income,age,experience,married,house_ownership,car_ownership,current_job_years,current_house_years
0,1303835,23,3,1,0,0,3,13
1,7574516,40,10,1,0,0,9,13
2,3991815,66,4,0,0,0,4,10
3,6256451,41,2,1,0,1,2,12
4,5768871,47,11,1,0,0,3,14


# Normalizing the Data

In [13]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [36]:
from sklearn import preprocessing

In [38]:
normalized_X = preprocessing.normalize(X)

In [39]:
normalized_X_test = preprocessing.normalize(test_data)

In [45]:
standardized_X = preprocessing.scale(X)

In [46]:
standardized_X_test = preprocessing.scale(test_data)

In [14]:
sc = StandardScaler()
sc.fit(X)
X_train_std = sc.transform(X)
X_test_std = sc.transform(test_data)

# Using Ensemble Model (RandomForestClassifier)

In [15]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=1000, 
                               bootstrap = True,
                               max_features = 'sqrt'
                              )

In [90]:
# from sklearn.ensemble import RandomForestClassifier

# model1 = RandomForestClassifier(n_estimators=1000, 
#                                bootstrap = True,
#                                max_features = 'sqrt')

# Performing Cross Validation

In [16]:
acc_score = []
k = 10
kf = KFold(n_splits=k, random_state=None)

In [95]:
# acc_score1 = []
# k = 10
# kf = KFold(n_splits=k, random_state=None)

# Training the Model

In [96]:
# c = 0
# for train_index , test_index in kf.split(X):
#     X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
#     y_train , y_test = y[train_index] , y[test_index]
    
# #     if c == 2:  
#     model1.fit(X_train,y_train)
#     pred_values = model1.predict(X_test)
     
#     acc = accuracy_score(pred_values , y_test)
#     acc_score1.append(acc)
# #         acc1 = acc
# #     c += 1
     
# avg_acc_score = sum(acc_score1)/k
# #print('best accuracy: {}'.format(acc))
# print('accuracy of each fold - {}'.format(acc_score1))
# print('Avg accuracy : {}'.format(avg_acc_score))

accuracy of each fold - [0.9357028121535712, 0.9449559964706681, 0.870116060722608, 0.8946403927512953, 0.8819257482862378, 0.6079500463790412, 0.940159724893102, 0.9414040406325649, 0.9838461538461538, 0.9950904977375565]
Avg accuracy : 0.89957914738728


In [49]:
c = 0
for train_index , test_index in kf.split(X):
    X_train , X_test = standardized_X[train_index,:],standardized_X[test_index,:]
    y_train , y_test = y[train_index] , y[test_index]
    
#     if c == 2:  
    model.fit(X_train,y_train)
    pred_values = model.predict(X_test)
     
    acc = accuracy_score(pred_values , y_test)
    acc_score.append(acc)
#         acc1 = acc
#     c += 1
     
avg_acc_score = sum(acc_score)/k
#print('best accuracy: {}'.format(acc))
print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))

accuracy of each fold - [0.9357028121535712, 0.9449559964706681, 0.870116060722608, 0.8946403927512953, 0.8862469174905545, 0.6341937965204407, 0.8636229949548653, 0.9632813737245763, 0.9981221719457014, 0.9971945701357466]
Avg accuracy : 0.8988077086870027


# Testing the Model

In [50]:
pred = model.predict(standardized_X_test)

# Data Frame Creation and Saving the results of prediction in CSV File

In [51]:
risk_flag = pd.DataFrame()

In [52]:
risk_flag['risk_flag'] = pd.Series(pred)
risk_flag.insert(0, 'id',risk_flag.index + 1)

In [53]:
risk_flag.to_csv('Prediction Dataset_finall.csv', index=False)

In [17]:
def holdout_grid_search(clf, X_train_hp, y_train_hp, X_val_hp, y_val_hp, hyperparams, fixed_hyperparams={}):
    '''
    Conduct hyperparameter grid search on hold out validation set. Use holdout validation.
    Hyperparameters are input as a dictionary mapping each hyperparameter name to the
    range of values they should iterate over. Use the cindex function as your evaluation
    function.

    Input:
        clf: sklearn classifier
        X_train_hp (dataframe): dataframe for training set input variables
        y_train_hp (dataframe): dataframe for training set targets
        X_val_hp (dataframe): dataframe for validation set input variables
        y_val_hp (dataframe): dataframe for validation set targets
        hyperparams (dict): hyperparameter dictionary mapping hyperparameter
                            names to range of values for grid search
        fixed_hyperparams (dict): dictionary of fixed hyperparameters that
                                  are not included in the grid search

    Output:
        best_estimator (sklearn classifier): fitted sklearn classifier with best performance on
                                             validation set
        best_hyperparams (dict): hyperparameter dictionary mapping hyperparameter
                                 names to values in best_estimator
    '''
    best_estimator = None
    best_hyperparams = {}
    
    # hold best running score
    best_score = 0.0

    # get list of param values
    lists = hyperparams.values()
    
    # get all param combinations
    param_combinations = list(itertools.product(*lists))
    total_param_combinations = len(param_combinations)

    # iterate through param combinations
    for i, params in enumerate(param_combinations, 1):
        # fill param dict with params
        param_dict = {}
        for param_index, param_name in enumerate(hyperparams):
            param_dict[param_name] = params[param_index]
            
        # create estimator with specified params
        estimator = clf(**param_dict, **fixed_hyperparams)

        # fit estimator
        estimator.fit(X_train_hp, y_train_hp)
        
        # get predictions on validation set
        preds = estimator.predict(X_val_hp)
        
        # compute cindex for predictions
        print(preds)
        print(y_val_hp)
        estimator_score = accuracy_score(preds, y_val_hp)

        print(f'[{i}/{total_param_combinations}] {param_dict}')
        print(f'Val Accuracy: {estimator_score}\n')

        # if new high score, update high score, best estimator
        # and best params 
        if estimator_score >= best_score:
                best_score = estimator_score
                best_estimator = estimator
                best_hyperparams = param_dict

    # add fixed hyperparamters to best combination of variable hyperparameters
    best_hyperparams.update(fixed_hyperparams)
    
    return best_estimator, best_hyperparams

In [21]:
def random_forest_grid_search(X_train_dropped, y_train_dropped, X_val_dropped, y_val_dropped):

    # Define ranges for the chosen random forest hyperparameters 
    hyperparams = {
        
        ### START CODE HERE (REPLACE array values with your code) ###

        # how many trees should be in the forest (int)
        'n_estimators': [10],

        # the maximum depth of trees in the forest (int)
        
        'max_depth': [200],
        
        # the minimum number of samples in a leaf as a fraction
        # of the total number of samples in the training set
        # Can be int (in which case that is the minimum number)
        # or float (in which case the minimum is that fraction of the
        # number of training set samples)
        'min_samples_leaf': [1],

        ### END CODE HERE ###
    }

    
    fixed_hyperparams = {
        'random_state': 10,
    }
    
    rf = RandomForestClassifier

    best_rf, best_hyperparams = holdout_grid_search(rf, X_train_dropped, y_train_dropped,
                                                    X_val_dropped, y_val_dropped, hyperparams,
                                                    fixed_hyperparams)

    print(f"Best hyperparameters:\n{best_hyperparams}")

    
    y_train_best = best_rf.predict(X_train_dropped)
    print(f"Train Accuracy: {accuracy_score(y_train_best, y_train_dropped)}")

    y_val_best = best_rf.predict(X_val_dropped)
    print(f"Val Accuracy: {accuracy_score(y_val_best, y_val_dropped)}")
    
    # add fixed hyperparamters to best combination of variable hyperparameters
    best_hyperparams.update(fixed_hyperparams)
    
    return best_rf, best_hyperparams

In [22]:
for train_index , test_index in kf.split(X):
    X_train , X_test = X_train_std[train_index,:],X_train_std[test_index,:]
    y_train , y_test = y[train_index] , y[test_index]
    best_rf, best_hyperparams = random_forest_grid_search(X_train, y_train, X_test, y_test)
    break

[0 0 0 ... 0 1 1]
0        0
1        0
2        0
3        1
4        1
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       1
15       0
16       0
17       1
18       0
19       0
20       0
21       0
22       0
23       0
24       0
25       0
26       0
27       0
28       1
29       0
        ..
44171    0
44172    0
44173    1
44174    0
44175    0
44176    1
44177    0
44178    1
44179    0
44180    0
44181    0
44182    0
44183    0
44184    0
44185    0
44186    0
44187    0
44188    0
44189    0
44190    0
44191    0
44192    0
44193    0
44194    0
44195    0
44196    1
44197    0
44198    0
44199    1
44200    1
Name: risk_flag, Length: 44201, dtype: int64
[1/1] {'n_estimators': 10, 'max_depth': 200, 'min_samples_leaf': 1}
Val Accuracy: 0.9357028121535712

Best hyperparameters:
{'n_estimators': 10, 'max_depth': 200, 'min_samples_leaf': 1, 'random_state': 10}
Train Accuracy: 0.9575120598682275
Val Accuracy: 0.935702812