In [1]:
# Loading necessary libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_csv('Training Data.csv')

## Model training
##### since the class label is imbalanced and the dataset is large, then some of the suitable algorithems ar:
- Decision Treee (as a baseline, can handle imbalance data because they make splits based on the information gain.
-  Random Forest (can handle class imbalance by using bootstrap aggregating (bagging)) which are going to be the models in subject


In [3]:
# viewing the data set after the preprocessing
df.head()

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [4]:
# Encoding Categorical Data into different data frame to be able to scale models
encoder = LabelEncoder()
categorical = ['Married/Single','House_Ownership','Car_Ownership','Profession', 'CITY', 'STATE']
for cols in categorical:
    df[cols] = encoder.fit_transform(df[cols])

In [5]:
# Setting the X (features) and Y (label/output/target)
X = df.drop(['Risk_Flag'], axis= 1)
Y = df.Risk_Flag

In [6]:
### Scaling the features using MinMaxScaler 
##### since most of the features has a uniform ditribution
mm_scaler = preprocessing.MinMaxScaler()
X_mm = mm_scaler.fit_transform(X)
# Split the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X_mm, Y, test_size = 0.4, random_state=42, stratify=Y) 

### Decision tree

In [7]:
# Initialize the decision tree classifier
dct = DecisionTreeClassifier(random_state=42)
# Fit the classifier to the training data
dct.fit(X_train, y_train)
# Make predictions on the testing data
dtc_y_pred = dct.predict(X_test)
print('Training Accuracy: {:.3f}'.format(dct.score(X_train, y_train)))
print('Test Accuracy: {:.3f}'.format(dct.score(X_test, y_test)))

Training Accuracy: 1.000
Test Accuracy: 0.861


### Random Forest

In [8]:
# Initialize the random forest classifier with class weights
RF_model = RandomForestClassifier(n_estimators=100, class_weight='balanced')
# Train the model on the training data
RF_model.fit(X_train, y_train)
# Make predictions for test data
RF_y_pred = RF_model.predict(X_test)
print('Training Accuracy : {:.3f}'.format(RF_model.score(X_train, y_train)))
print('Test Accuracy : {:.3f}'.format(RF_model.score(X_test, y_test)))

Training Accuracy : 1.000
Test Accuracy : 0.907


### Using undersampling 
##### since we have imbalance class labels and a very large dataset

In [9]:
# Initialize the random under sampler
rus = RandomUnderSampler(random_state=42)
# Resample the training data
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
### Decision tree with undersampled dataset
# Fit the classifier to the undersampled training data
dct.fit(X_train_res, y_train_res)
# Make predictions on the testing data
dtc_y_pred = dct.predict(X_test)
print('Training Accuracy: {:.3f}'.format(dct.score(X_train_res, y_train_res)))
print('Test Accuracy: {:.3f}'.format(dct.score(X_test, y_test)))

Training Accuracy: 1.000
Test Accuracy: 0.751


##### as noticed the model is overfitting , use cross-validation to try to fix the issue.

In [10]:
# Perform 10-fold cross-validation with undersampling
scores = cross_val_score(dct, X_train_res, y_train_res, cv=10, scoring='accuracy')
# Print the cross-validation scores
print(f"Cross-validation scores: {scores}")
print(f"Mean score: {scores.mean():.3f}")

Cross-validation scores: [0.75241935 0.7516129  0.75295699 0.75537634 0.74408602 0.72177419
 0.74589944 0.74697499 0.74885722 0.7058349 ]
Mean score: 0.743


### Random forest with undersampled dataset

In [11]:
# Train the model on the undersampled training data
RF_model.fit(X_train_res, y_train_res)
# Make predictions for test data
RF_y_pred = RF_model.predict(X_test)
print('Training Accuracy : {:.3f}'.format(RF_model.score(X_train_res, y_train_res)))
print('Test Accuracy : {:.3f}'.format(RF_model.score(X_test, y_test)))

Training Accuracy : 1.000
Test Accuracy : 0.809


In [12]:
# Perform 7-fold cross-validation with undersampling
scores = cross_val_score(RF_model, X_train_res, y_train_res, cv=7, scoring='accuracy')
# Print the cross-validation scores
print(f"Cross-validation scores: {scores}")
print(f"Mean score: {scores.mean():.3f}")

Cross-validation scores: [0.85133609 0.85735792 0.85491155 0.85773429 0.85152428 0.85431959
 0.85582533]
Mean score: 0.855


### Oversampling
##### it helps with imbalance data

In [13]:
# Instantiate the SMOTE algorithm
smote = SMOTE(random_state=42)
# Fit and apply SMOTE to the data
X_train_ovsam, y_train_ovsam = smote.fit_resample(X_train, y_train)
# Print the number of samples in each class before and after oversampling
print("Before oversampling:\n", Y.value_counts())
print("After oversampling:\n", y_train_ovsam.value_counts())
print("Shape of X_train_ovsam:", X_train_ovsam.shape)

Before oversampling:
 Risk_Flag
0    221004
1     30996
Name: count, dtype: int64
After oversampling:
 Risk_Flag
0    132602
1    132602
Name: count, dtype: int64
Shape of X_train_ovsam: (265204, 12)


### Decision tree on oversampled data

In [14]:
# Initialize the decision tree classifier
dct = DecisionTreeClassifier(random_state=42)
# Fit the classifier to the training data
dct.fit(X_train_ovsam, y_train_ovsam)
# Make predictions on the testing data
dtc_y_pred = dct.predict(X_test)
print('Training Accuracy: {:.3f}'.format(dct.score(X_train_ovsam, y_train_ovsam)))
print('Test Accuracy: {:.3f}'.format(dct.score(X_test, y_test)))

Training Accuracy: 1.000
Test Accuracy: 0.834


### Random forest on oversampled data

In [15]:
# Initialize the random forest classifier without class weights since the data is now balanced
RF_model = RandomForestClassifier(n_estimators=100)
# Train the model on the training data
RF_model.fit(X_train_ovsam, y_train_ovsam)
# Make predictions for test data
RF_y_pred = RF_model.predict(X_test)
print('Training Accuracy : {:.3f}'.format(RF_model.score(X_train_ovsam, y_train_ovsam)))
print('Test Accuracy : {:.3f}'.format(RF_model.score(X_test, y_test)))

Training Accuracy : 1.000
Test Accuracy : 0.891
