In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

## Reading the data

In [49]:
# Load the preprocessed training data from the CSV file in the working directory.
original_df=pd.read_csv('Processed_train.csv')

In [50]:
# original_df is the original dataframe

In [51]:
# Report (rows, columns) of the training frame.
print('Shape:',original_df.shape)

Shape: (204277, 17)


In [52]:
# Preview the first rows of the training frame.
original_df.head()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,18,137576,209136,846,26,2,10.47,60,0.81,1,3,2,1,0,0,0,0
1,47,57194,5970,748,30,2,19.72,36,0.73,1,1,1,0,1,1,0,0
2,26,84328,95065,453,7,2,24.25,12,0.45,3,3,3,0,0,3,1,0
3,53,49795,229582,533,107,3,14.44,60,0.17,2,3,2,1,0,2,1,1
4,49,115450,22072,840,0,4,24.48,12,0.11,2,2,2,0,1,1,1,0


In [53]:
# Summarise column dtypes and non-null counts to confirm there are no missing values.
original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204277 entries, 0 to 204276
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Age             204277 non-null  int64  
 1   Income          204277 non-null  int64  
 2   LoanAmount      204277 non-null  int64  
 3   CreditScore     204277 non-null  int64  
 4   MonthsEmployed  204277 non-null  int64  
 5   NumCreditLines  204277 non-null  int64  
 6   InterestRate    204277 non-null  float64
 7   LoanTerm        204277 non-null  int64  
 8   DTIRatio        204277 non-null  float64
 9   Education       204277 non-null  int64  
 10  EmploymentType  204277 non-null  int64  
 11  MaritalStatus   204277 non-null  int64  
 12  HasMortgage     204277 non-null  int64  
 13  HasDependents   204277 non-null  int64  
 14  LoanPurpose     204277 non-null  int64  
 15  HasCoSigner     204277 non-null  int64  
 16  Default         204277 non-null  int64  
dtypes: float64

# Trying Various Models

#### Note: The models used here are XGBoost and Gaussian Naive Bayes, and I have experimented with both scaled and unscaled input. Scaling is not necessary for XGBoost because it is a tree-based algorithm, and it is generally not needed for Naive Bayes either, because Naive Bayes classifiers work with the probabilities of features given a class rather than with distances or magnitudes.

### Train Test Split

In [54]:
# The label is the 'Default' column; every other column is used as a feature.
y = original_df['Default']
X = original_df.drop(columns=['Default'])

# Hold out 20% of the rows for evaluation. The fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

# Sanity-check the dimensions of the four partitions.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (163421, 16)
X_test shape: (40856, 16)
y_train shape: (163421,)
y_test shape: (40856,)


## XGBOOST

#### XGBoost, being a tree-based model, is generally insensitive to the scale of features because decision trees split on values rather than relying on distance or magnitude.

In [55]:
# Plain XGBOOST without any hyperparameter tuning at all

In [56]:
# Baseline: XGBoost with library-default hyperparameters (only the seed is fixed).
# accuracy_score comes from the import cell at the top of the notebook, so it is
# not re-imported here.
xgb_model = XGBClassifier(random_state=44)

# Train on the training partition.
xgb_model.fit(X_train, y_train)

# Predict labels for the held-out partition.
y_pred = xgb_model.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print("XGBoost Model Accuracy:", accuracy)

XGBoost Model Accuracy: 0.8871157235167417


In [57]:
# Let us try to improve model accuracy by hyperparameter tuning using gridSearchCV

In [58]:
# Candidate values for each hyperparameter. The same dictionary is also passed
# to the randomized search defined in the following cell.
param_grid = dict(
    n_estimators=[180, 190, 200, 210],
    learning_rate=[0.1, 0.2, 0.3],      # library default is 0.3
    max_depth=[3, 4, 5, 6, 7],          # library default is 6
    subsample=[0.7, 0.8, 0.9, 1.0],     # library default is 1
    colsample_bytree=[0.8, 0.9, 1.0],   # library default is 1
)

# Base estimator whose hyperparameters are being tuned.
xgb_clf = XGBClassifier(eval_metric='logloss', random_state=9)

# Exhaustive search: 4 * 3 * 5 * 4 * 3 = 720 combinations, each scored with
# 5-fold cross-validation on all available cores, printing progress.
# NOTE(review): xgb_grid is only constructed here; no visible cell fits it.
xgb_grid = GridSearchCV(xgb_clf, param_grid, cv=5, n_jobs=-1, verbose=1)

In [59]:
# Randomized alternative to the exhaustive grid: evaluate only 25 sampled
# parameter combinations, each with 5-fold cross-validation. random_state
# pins which combinations are drawn.
xgb_random_search = RandomizedSearchCV(
    xgb_clf,
    param_grid,
    n_iter=25,
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=9,
)

In [60]:
# Run the randomized search on the training partition (25 candidates x 5 folds).
xgb_random_search.fit(X_train,y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [61]:
# Show the estimator configured with the best parameter combination the search found.
print(xgb_random_search.best_estimator_)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.1, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=5,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=200,
              n_jobs=None, num_parallel_tree=None, random_state=9, ...)


In [62]:
# The search was created with the default refit=True, so best_estimator_ has
# already been re-trained on all of X_train with the winning parameters.
# An extra best_model.fit(X_train, y_train) would only repeat that work.
best_model = xgb_random_search.best_estimator_

# Predict labels for the held-out partition.
y_pred = best_model.predict(X_test)

# Evaluate the test set accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy with best model: ", accuracy)

Test set accuracy with best model:  0.8875562952809869


# Naive Bayes

####  Since data includes mostly continuous and encoded ordinal features, Gaussian Naive Bayes is appropriate

In [63]:
# Create a Gaussian Naive Bayes classifier with the library-default settings.
gnb = GaussianNB()

In [64]:
# Train Naive Bayes on the unscaled training partition.
gnb.fit(X_train,y_train)

In [65]:
# Predict labels for the held-out partition with the Naive Bayes model.
y_pred_guassian_naive_bayes=gnb.predict(X_test)

In [66]:
# Held-out accuracy of Naive Bayes, for comparison with the XGBoost results above.
print('Accuracy of Gaussian Naive Bayes:',accuracy_score(y_test,y_pred_guassian_naive_bayes))

Accuracy of Gaussian Naive Bayes: 0.8861366751517525


## Generating Output

### Read the processed test input

In [67]:
# Load the preprocessed, unlabeled test data from the CSV file in the working directory.
test_df=pd.read_csv('Processed_test.csv')

In [68]:
# Count missing values per column of the test frame.
test_df.isna().sum()

LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
dtype: int64

In [69]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
0,CKV34LU7V7,55,112656,92393,581,113,2,23.54,36,0.15,4,3,2,1,1,4,0
1,62KTYNH93J,56,91569,131575,641,54,1,15.19,12,0.43,1,2,1,1,1,1,1
2,JGFUSOIUH7,26,78169,75417,569,105,3,18.02,12,0.29,3,2,3,1,1,1,1
3,4538THBHOX,26,63033,10804,326,118,1,14.71,24,0.41,1,2,2,0,0,0,1
4,DXLNA06JHR,24,29665,21182,662,102,3,15.02,60,0.69,4,1,2,0,1,0,1


### Prediction using best_model

In [70]:
# LoanID is an identifier, not a model input, so remove it before predicting.
X_testing_df = test_df.drop(columns=['LoanID'])

# Note: this reuses the name y_pred, replacing the held-out predictions from the earlier cell.
y_pred = best_model.predict(X_testing_df)

In [71]:
# Start from the identifier column, then attach the predicted label for each loan.
results_df = test_df[['LoanID']].copy()
results_df['Default'] = y_pred

# Write the predictions to disk without the positional index.
results_df.to_csv('predictions_randomized_xg_boost.csv', index=False)

In [72]:
# Count how many test rows were assigned to each predicted class.
results_df['Default'].value_counts()

Default
0    50341
1      729
Name: count, dtype: int64

In [73]:
### this gave the best accuracy

### Prediction using regular XGBOOST model

In [74]:
# Predict with the default-parameter XGBoost model.
y_pred2 = xgb_model.predict(X_testing_df)

# Start from the identifier column, then attach the predicted label for each loan.
results_df = test_df[['LoanID']].copy()
results_df['Default'] = y_pred2

# Write the predictions to disk without the positional index.
results_df.to_csv('predictions_regular_xg_boost.csv', index=False)

In [75]:
# Count how many test rows were assigned to each predicted class.
results_df['Default'].value_counts()

Default
0    50094
1      976
Name: count, dtype: int64

In [76]:
### this one gave lower accuracy than the one with randomized search CV

### Prediction using Gaussian Naive Bayes

In [77]:
# Predict with the Naive Bayes model trained on unscaled features.
# Note: this overwrites y_pred once more.
y_pred = gnb.predict(X_testing_df)

# Start from the identifier column, then attach the predicted label for each loan.
results_using_naive_bayes_df = test_df[['LoanID']].copy()
results_using_naive_bayes_df['Default'] = y_pred

# Write the predictions to disk without the positional index.
results_using_naive_bayes_df.to_csv('predictions_naive_bayes.csv', index=False)

In [78]:
# Count how many test rows were assigned to each predicted class.
results_using_naive_bayes_df['Default'].value_counts()

Default
0    51009
1       61
Name: count, dtype: int64

In [79]:
# Gave an accuracy of 88.4%, the lowest of all the models so far. Naive Bayes assumes that all input features are independent of one another given the class.

In [80]:
# But that is often not the case in practice: for example, loan amount and income may be strongly related, and so may income and education. The correlation matrix below checks this for our data.

In [81]:
# Pairwise correlations between all columns (default method), used to check
# the feature-independence assumption behind Naive Bayes. Leaving the frame as
# the cell's last expression renders it as one table instead of wrapped plain text.
original_df.corr()

                     Age    Income  LoanAmount  CreditScore  MonthsEmployed  \
Age             1.000000  0.000029   -0.001023     0.000265       -0.002529   
Income          0.000029  1.000000   -0.000661    -0.002797        0.001690   
LoanAmount     -0.001023 -0.000661    1.000000     0.001075        0.002110   
CreditScore     0.000265 -0.002797    0.001075     1.000000        0.000182   
MonthsEmployed -0.002529  0.001690    0.002110     0.000182        1.000000   
NumCreditLines -0.002301 -0.003552    0.000293     0.000795        0.002687   
InterestRate   -0.001661 -0.004746   -0.003809    -0.000579       -0.001697   
LoanTerm        0.000873 -0.002284    0.002243     0.000103       -0.000778   
DTIRatio       -0.004562 -0.000954    0.001639    -0.001905        0.001435   
Education      -0.001579  0.002941    0.002322    -0.001177       -0.000789   
EmploymentType  0.001223  0.004298   -0.003011    -0.002151        0.000384   
MaritalStatus   0.001981  0.000363   -0.002527    -0

In [82]:
# Let us see if scaling improves accuracy

# XGBOOST on scaled data 

### Just to see if it can improve the accuracy

In [83]:
# Learn the scaling statistics from the training partition only, so nothing
# from the held-out rows leaks into them, then standardise the training features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [84]:
# transform the test data as well

In [85]:
# Apply the training-set scaling statistics to the held-out partition (no re-fitting).
X_test_scaled = scaler.transform(X_test)

In [86]:
# NOTE: this re-trains best_model in place on the standardised features. After
# this cell it expects scaled inputs, and its earlier unscaled fit is gone.
y_pred_on_scaled_input = best_model.fit(X_train_scaled, y_train).predict(X_test_scaled)

# Score the held-out predictions made from scaled inputs.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_on_scaled_input)
print("Test set accuracy with best model: ", accuracy)

Test set accuracy with best model:  0.8875562952809869


In [87]:
# Standardise the unlabeled test features with the scaler fitted on the training partition.
X_testing_df_scaled=scaler.transform(X_testing_df)

In [88]:
# Predict with the XGBoost model that was re-trained on standardised features.
y_pred_using_scaled_data = best_model.predict(X_testing_df_scaled)

# Start from the identifier column, then attach the predicted label for each loan.
results_df = test_df[['LoanID']].copy()
results_df['Default'] = y_pred_using_scaled_data

# Write the predictions to disk without the positional index.
results_df.to_csv('predictions_randomized_xg_boost_on_scaled_data.csv', index=False)

In [89]:
# Count how many test rows were assigned to each predicted class.
results_df['Default'].value_counts()

Default
0    50341
1      729
Name: count, dtype: int64

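### Gave exactly the same accuracy (and identical prediction counts) as XGBoost without scaling, which is consistent with our point that tree-based models are insensitive to feature scaling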

### Naive Bayes on Scaled Data

In [90]:
# Re-fit the same gnb object on the standardised training features.
# This overwrites its earlier fit on unscaled data.
gnb.fit(X_train_scaled,y_train)

In [91]:
# Predict labels for the standardised held-out partition.
y_pred_naive_bayes_using_scaled_input=gnb.predict(X_test_scaled)

In [92]:
# Held-out accuracy of Naive Bayes trained on standardised features.
print('Accuracy score:',accuracy_score(y_test,y_pred_naive_bayes_using_scaled_input))

Accuracy score: 0.8865527707068729


In [93]:
# Predict on the standardised, unlabeled test features.
# Note: this reuses the variable that held the held-out predictions in the earlier cell.
y_pred_naive_bayes_using_scaled_input = gnb.predict(X_testing_df_scaled)

# Start from the identifier column, then attach the predicted label for each loan.
results_df = test_df[['LoanID']].copy()
results_df['Default'] = y_pred_naive_bayes_using_scaled_input

# Write the predictions to disk without the positional index.
results_df.to_csv('predictions_naive_bayes_on_scaled_data.csv', index=False)

In [94]:
# Count how many test rows were assigned to each predicted class.
results_df['Default'].value_counts()

Default
0    50903
1      167
Name: count, dtype: int64

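### This worked slightly better than Naive Bayes on unscaled data (held-out accuracy 0.8866 vs 0.8861). A likely reason is GaussianNB's `var_smoothing`, which adds a fraction of the largest feature variance to every feature's variance and so has a larger effect when the unscaled features have very different ranges.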