# Loading Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

# Loading the Data

In [2]:
defpred_train = pd.read_csv('Training Data.csv')
defpred_test = pd.read_csv('Test Data.csv')

# Cleaning The Data

In [3]:
defpred_train['married'] = defpred_train['married'] == 'single'
defpred_train["married"] = defpred_train['married'].astype(int)

defpred_test['married'] = defpred_test['married'] == 'single'
defpred_test["married"] = defpred_test['married'].astype(int)

In [None]:
defpred_train['car_ownership'].unique()

In [4]:
defpred_train['house_ownership'] = defpred_train['house_ownership'].map({'rented': 0, 'norent_noown': 1, 
                                                                         'owned' : 2})

defpred_test['house_ownership'] = defpred_test['house_ownership'].map({'rented': 0, 'norent_noown': 1, 
                                                                         'owned' : 2})

In [5]:
defpred_train['car_ownership'] = defpred_train['car_ownership'].map({'no': 0, 'yes': 1, })
defpred_test['car_ownership'] = defpred_test['car_ownership'].map({'no': 0, 'yes': 1, })

# Feature Selection

In [6]:
train_data = defpred_train[['income','age','experience','married','house_ownership','car_ownership',
                            'current_job_years','current_house_years','risk_flag']]
#train_label = defpred_train[['risk_flag']]

test_data = defpred_test[['income','age','experience','married','house_ownership','car_ownership',
                            'current_job_years','current_house_years']]

In [4]:
X = train_data.iloc[:,:-1]
y = train_data.iloc[:,-1]

NameError: name 'train_data' is not defined

# Loading Some More Libraries

In [2]:
from sklearn.model_selection import KFold 
from collections import Counter
#from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import SVMSMOTE
from sklearn.metrics import accuracy_score

In [3]:
counter = Counter(y)
print(counter)

NameError: name 'y' is not defined

# Applying Data Augmentation For Imbalanced Data

In [10]:
oversample = SVMSMOTE()

In [11]:
X, y = oversample.fit_resample(X, y)

In [1]:
counter = Counter(y)
print(counter)

NameError: name 'Counter' is not defined

# Normalizing the Data

In [13]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [14]:
sc = StandardScaler()
sc.fit(X)
X_train_std = sc.transform(X)
X_test_std = sc.transform(test_data)

# Using Ensemble Model (RandomForestClassifier)

In [5]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=1000, 
                               bootstrap = True,
                               max_features = 'sqrt', ccp_alpha = 0.1)

In [90]:
from sklearn.ensemble import RandomForestClassifier

model1 = RandomForestClassifier(n_estimators=1000, 
                               bootstrap = True,
                               max_features = 'sqrt')

In [15]:
# fit model no training data
model = XGBClassifier()
#model.fit(X_train, y_train)

# Performing Cross Validation

In [22]:
acc_score = []
k = 10
kf = KFold(n_splits=k, random_state=None)

In [95]:
acc_score1 = []
k = 10
kf = KFold(n_splits=k, random_state=None)

# Training the Model

In [96]:
c = 0
for train_index , test_index in kf.split(X):
    X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
    y_train , y_test = y[train_index] , y[test_index]
    
#     if c == 2:  
    model1.fit(X_train,y_train)
    pred_values = model1.predict(X_test)
     
    acc = accuracy_score(pred_values , y_test)
    acc_score1.append(acc)
#         acc1 = acc
#     c += 1
     
avg_acc_score = sum(acc_score1)/k
#print('best accuracy: {}'.format(acc))
print('accuracy of each fold - {}'.format(acc_score1))
print('Avg accuracy : {}'.format(avg_acc_score))

accuracy of each fold - [0.9357028121535712, 0.9449559964706681, 0.870116060722608, 0.8946403927512953, 0.8819257482862378, 0.6079500463790412, 0.940159724893102, 0.9414040406325649, 0.9838461538461538, 0.9950904977375565]
Avg accuracy : 0.89957914738728


In [23]:
from tpot import TPOTClassifier
from tpot.config import classifier_config_dict

classifier_config_dict['xgboost.XGBClassifier'] = {
    'n_estimators': [100],
    'max_depth': range(1, 11),
    'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
    'subsample': np.arange(0.05, 1.01, 0.05),
    'min_child_weight': range(1, 21),
    'n_jobs': [1], # replace "nthread"
    'verbosity': [0] # add this line to slient warning message
}
        
# for a quick test
model = TPOTClassifier(generations=100, population_size=100, verbosity=2,
                      config_dict=classifier_config_dict)

In [None]:
c = 0
for train_index , test_index in kf.split(X):
    X_train , X_test = X_train_std[train_index,:],X_train_std[test_index,:]
    y_train , y_test = y[train_index] , y[test_index]
    
#     if c == 2:  
    model.fit(X_train,y_train)
    pred_values = model.predict(X_test)
     
    acc = accuracy_score(pred_values , y_test)
    acc_score.append(acc)
#         acc1 = acc
#     c += 1
     
avg_acc_score = sum(acc_score)/k
#print('best accuracy: {}'.format(acc))
print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=10100.0, style=ProgressStyle(…

# Testing the Model

In [85]:
pred = model.predict(X_test_std)

# Data Frame Creation and Saving the results of prediction in CSV File

In [87]:
risk_flag = pd.DataFrame()

In [88]:
risk_flag['risk_flag'] = pd.Series(pred)
risk_flag.insert(0, 'id',risk_flag.index + 1)

In [89]:
risk_flag.to_csv('Prediction Dataset_final.csv', index=False)