In [68]:
# Import important library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

### Read the input file and check the data dimension

In [69]:
# You can access from https://www.kaggle.com/uciml/german-credit
#Read input file and understand the data
# "default" is my dependent variable
df = pd.read_csv("german_credit.csv")
X = df.iloc[:, df.columns != 'default']
Y = df.iloc[:, df.columns == 'default']

### Q1 Randomly select 50% data for this use case( 1 Marks)
###### Hint: Use train_test_split

In [70]:
train_size=0.50
test_size = 0.50 # taking 70:30 training and test set
seed = 7 # Random number seeding for reapeatability of the code
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=train_size, test_size=test_size, random_state=seed)

In [71]:
# Lets build a Ensemble model but need to modify the dataset first

### Q2.Prepare the model data by converting non-numeric to dummy ( 1 Marks)
##### Hint: Use get_dummies

In [101]:
# Print Shape of model data
print("X Shape: ", X_train.shape)
print("Y Shape: ", y_train.shape)

X Shape:  (500, 20)
Y Shape:  (500, 1)


In [103]:
for col in X_train.columns:
    if(X_train[col].dtype == object):
        dummies = pd.get_dummies(X_train[col])
        X_train[dummies.columns] = dummies
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 262 to 175
Data columns (total 72 columns):
account_check_status                                                500 non-null object
duration_in_month                                                   500 non-null int64
credit_history                                                      500 non-null object
purpose                                                             500 non-null object
credit_amount                                                       500 non-null int64
savings                                                             500 non-null object
present_emp_since                                                   500 non-null object
installment_as_income_perc                                          500 non-null int64
personal_status_sex                                                 500 non-null object
other_debtors                                                       500 non-null object
present_res_sinc

### Check for highly correlated variables but don't required any treatment for this use case

In [104]:
#Highly Correlated Variables will be those with correlation above 0.3
corrTable = X_train.corr()
corr_triu = corrTable.where(~np.tril(np.ones(corrTable.shape)).astype(np.bool))
corr_triu = corr_triu.stack()
corr_triu.index.names = ['Col1', 'Col2']
corr_triu = corr_triu[corr_triu > 0.3].to_frame()
corr_triu

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Col1,Col2,Unnamed: 2_level_1
duration_in_month,credit_amount,0.669686
credit_amount,unknown / no property,0.307632
credit_amount,management/ self-employed/ highly qualified employee/ officer,0.334242
present_res_since,.. >= 7 years,0.319913
age,.. >= 7 years,0.352581
credits_this_bank,critical account/ other credits existing (not at this bank),0.54003
people_under_maintenance,male : single,0.300646
all credits at this bank paid back duly,bank,0.353811
unemployed,unemployed/ unskilled - non-resident,0.550908
unknown / no property,for free,0.808156


### Drop the original variables which are converted to dummy

In [117]:
for col in X_train.columns:
    if(X_train[col].dtype == object):
        X_train.drop(columns = col, axis = 1, inplace = True)
print("NaN Values (if any): ", X_train[X_train.isna().any(axis=1)].shape)
print("Null Values (if any): ", X_train[X_train.isnull().any(axis=1)].shape)
#No Null

NaN Values (if any):  (0, 59)
Null Values (if any):  (0, 59)


### Q3 Split Train/Test data 70:30 ratio( 1 Marks)
##### Hint:from sklearn.model_selection import train_test_split

In [193]:
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(X_train, y_train, train_size = 0.7, test_size = 0.3, random_state=seed)

### Q4 Build Random Forest Model( 1 Marks)
#### Hint:from sklearn.ensemble import RandomForestClassifier using n_jobs=2,n_estimators=500,criterion="entropy",random_state=9999

In [194]:
from sklearn.ensemble import RandomForestClassifier
rfcl = RandomForestClassifier(n_jobs = 2, n_estimators = 500, criterion = "entropy", random_state=9999)
rfcl = rfcl.fit(X_train_final, y_train_final)
rfcl_prediction = rfcl.predict(X_test_final)

### Q5 Calculate Confusion Matrix and Accuracy score (1 Marks)
##### Hint: Use confusion_matrix and accuracy_score

In [195]:
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, auc
confusion_matrix(y_test_final, rfcl_prediction)

array([[96,  7],
       [34, 13]])

In [196]:
accuracy_score(y_test_final, rfcl_prediction)

0.7266666666666667

### Q6 Show the list of the features importance( 1 Marks)

In [197]:
print("Number of Features: ", rfcl.feature_importances_.size)
feature_importance_list = pd.DataFrame(rfcl.feature_importances_)
feature_importance_list = feature_importance_list.T
feature_importance_list.columns = X_train_final.columns
feature_importance_list

Number of Features:  59


Unnamed: 0,duration_in_month,credit_amount,installment_as_income_perc,present_res_since,age,credits_this_bank,people_under_maintenance,0 <= ... < 200 DM,< 0 DM,>= 200 DM / salary assignments for at least 1 year,...,for free,own,rent,management/ self-employed/ highly qualified employee/ officer,skilled employee / official,unemployed/ unskilled - non-resident,unskilled - resident,"yes, registered under the customers name",no,yes
0,0.087359,0.109353,0.034252,0.035137,0.074353,0.015803,0.01056,0.017504,0.028865,0.007796,...,0.006268,0.016677,0.011936,0.008481,0.016435,0.003946,0.01158,0.014117,0.002102,0.001537


### Q7 K-fold cross-validation( 2 Marks)
##### k-fold cross validation( without stratification)
##### Usually k is set as 10-20 in practical settings, depends on data set size

In [198]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [199]:
# Use below values
num_folds = 10
seed = 77
k_fold = KFold(n_splits = num_folds, random_state = seed)

10

In [200]:
#Validate the Random Forest model build above using k fold

In [201]:
score_array = cross_val_score(rfcl, X_train_final, y_train_final, cv = k_fold)
print(score_array)

[0.82857143 0.77142857 0.71428571 0.71428571 0.74285714 0.74285714
 0.68571429 0.74285714 0.77142857 0.6       ]


In [202]:
#Calculate Mean score

In [203]:
avg_score = np.mean(score_array)
print("Average Score: ", avg_score)

Average Score:  0.7314285714285714


In [204]:
# Calculate score standard deviation using std()

In [205]:
print("Standard Score Deviation: ", score_array.std())

Standard Score Deviation:  0.057427860692119394


# Q8 Print the confusion matrix( 1 Marks)

In [210]:
for train_index, test_index in k_fold.split(X_train_final, y_train_final):
    print(train_index, test_index)
    X_train, X_test = X_train_final[train_index], X_test_final[test_index]
    y_train, y_test = y_train_final[train_index], y_test_final[test_index]
    y_pred=rfcl.predict(X_test)
    cnf_matrix = confusion_matrix(y_test, y_pred)
    np.set_printoptions(precision=2)

    # Plot non-normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')
    # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

    plt.show()

[ 35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52
  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70
  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88
  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106
 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196
 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214
 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232
 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268
 269 270 271 272 273 274 275 276 277 278 279 280 28

KeyError: '[ 35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52\n  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70\n  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88\n  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106\n 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124\n 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142\n 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160\n 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178\n 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196\n 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214\n 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232\n 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250\n 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268\n 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286\n 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304\n 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322\n 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340\n 341 342 343 344 345 346 347 348 349] not in index'

# Q9.Classification accuracy: 
percentage of correct predictions and Calculate sensitivity (or True Positive Rate or Recall) and Precision.
( 1 Marks)

# Q10.Plot Receiver Operating Characteristic (ROC) Curves( 1 Marks)

In [0]:
#Hint: Use roc_curve

ROC curve can help you to choose a threshold that balances sensitivity and specificity in a way that makes sense for your particular context

# Q11. Calculate AUC(the percentage of the ROC plot that is underneath the curve) - optional

### Bootstrapping ( Bonus)
##### Given a dataset of size n, a bootstrap sample is created by sampling n instances uniformly from the data (with/without replacement)
##### Create a model with each bootstrap sample and validate it with the test set
##### Final result is calculated by averaging the accuracy of models

In [0]:
# Number of iterations for bootstrapping
bootstrap_iteration = 10
accuracy = []

In [0]:
from sklearn.utils import resample
from sklearn.metrics import accuracy_score

for i in range(bootstrap_iteration):
    X_, y_ = resample(X_train, y_train)
    rfm.fit(X_, y_)
    y_pred = rfm.predict(X_test)
    
    acc = accuracy_score(y_pred, y_test)
    accuracy.append(acc)

In [0]:
accuracy = np.array(accuracy)
print('Accuracy Score')
print('Avearge: ', accuracy.mean())
print('Standard deviation: ', accuracy.std())

Accuracy Score
Avearge:  0.6893333333333334
Standard deviation:  0.014966629547095768
