# **Bureau ID Assignment by Sarang Deb Saha**

In [151]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier

##1.  Loading the data

In [152]:
train = pd.read_csv('/content/Assignment_Train.csv')
test = pd.read_csv('/content/Assignment_Test.csv')

##2. Data Overview

In [153]:
print("Training Data Overview:")
train.head()

Training Data Overview:


Unnamed: 0,DEALER ID,APPLICATION LOGIN DATE,HDB BRANCH NAME,HDB BRANCH STATE,FIRST NAME,MIDDLE NAME,LAST NAME,mobile,AADHAR VERIFIED,Cibil Score,...,Phone Social Premium.shaadi,Phone Social Premium.skype,Phone Social Premium.toi,Phone Social Premium.whatsapp,Phone Social Premium.yatra,Phone Social Premium.zoho,phone_digitalage,phone_nameMatchScore,phone_phoneFootprintStrengthOverall,Application Status
0,106989,07/20/2022,DELHI-SF,DELHI,SUNIL,,CHANDER,9210574080,NO,726.0,...,0.0,0.0,1.0,,,0.0,5324.0,67.222222,High,APPROVED
1,108975,07/28/2022,PATNA-SF,BIHAR,AMRIT,,KUMAR,8877987018,NO,,...,0.0,0.0,0.0,,,0.0,1998.0,100.0,High,APPROVED
2,111004,07/15/2022,DARJEELING-SF,WEST BENGAL,ANIMESH,,THAPA,8910862135,NO,737.0,...,0.0,0.0,0.0,,,0.0,-1.0,-1.0,Low,APPROVED
3,192020,07/04/22,SAHARANPUR-SF,UTTAR PRADESH,ADITYA,,SINGH,9758428017,NO,713.0,...,0.0,0.0,1.0,,,0.0,1998.0,72.777778,High,APPROVED
4,55095,07/15/2022,MODASA-SF,GUJARAT,PARMAR,HARESHBHAI,AMRUTBHAI,9687028486,NO,669.0,...,0.0,0.0,1.0,,,0.0,1998.0,68.095238,High,DECLINED


In [154]:
print("\nTest Data Overview:")
test.head()


Test Data Overview:


Unnamed: 0,UID,DEALER ID,APPLICATION LOGIN DATE,HDB BRANCH NAME,HDB BRANCH STATE,FIRST NAME,MIDDLE NAME,LAST NAME,mobile,AADHAR VERIFIED,...,Phone Social Premium.rummycircle,Phone Social Premium.shaadi,Phone Social Premium.skype,Phone Social Premium.toi,Phone Social Premium.whatsapp,Phone Social Premium.yatra,Phone Social Premium.zoho,phone_digitalage,phone_nameMatchScore,phone_phoneFootprintStrengthOverall
0,1844045271814558464,105615,07/12/22,HUBLI-SF,,VENUGOPAL,H,BHARADHVAJ,7019759674,NO,...,,0.0,0.0,0.0,0.0,,0.0,676,100.0,Medium
1,1840349097823778816,91593,07/14/2022,SATNA-SF,MADHYA PRADESH,SHIVDHAR,,CHAUDHARY,7697884828,NO,...,,0.0,0.0,,0.0,,0.0,897,-1.0,Low
2,1488102613362294272,74152,07/07/22,LUCKNOW-SF,UTTAR PRADESH,MANISH,,VERMA,8840079825,NO,...,,0.0,1.0,0.0,1.0,,0.0,839,100.0,Medium
3,555529923942874624,110164,07/13/2022,KOLLAM-SF,KERALA,DEEPU,RAVEENDRAN,RAVEENDRAN,8089276854,NO,...,,0.0,0.0,0.0,,,0.0,2050,-1.0,Low
4,1010213070486150912,113037,07/08/22,PURI-SF,ORISSA,AKASHA,,PRADHAN,9337511958,NO,...,,0.0,0.0,0.0,,,0.0,-1,-1.0,Very Low


## 3. Data Preprocessing

### a. Checking NULL values

In [156]:
train.isnull().sum()

Unnamed: 0,0
DEALER ID,0
APPLICATION LOGIN DATE,0
HDB BRANCH NAME,1
HDB BRANCH STATE,854
FIRST NAME,0
MIDDLE NAME,7145
LAST NAME,681
mobile,0
AADHAR VERIFIED,0
Cibil Score,4297


In [157]:
test.isnull().sum()

Unnamed: 0,0
UID,0
DEALER ID,0
APPLICATION LOGIN DATE,0
HDB BRANCH NAME,0
HDB BRANCH STATE,172
FIRST NAME,0
MIDDLE NAME,1405
LAST NAME,131
mobile,0
AADHAR VERIFIED,0


In [158]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 55 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   DEALER ID                            10000 non-null  int64  
 1   APPLICATION LOGIN DATE               10000 non-null  object 
 2   HDB BRANCH NAME                      9999 non-null   object 
 3   HDB BRANCH STATE                     9146 non-null   object 
 4   FIRST NAME                           10000 non-null  object 
 5   MIDDLE NAME                          2855 non-null   object 
 6   LAST NAME                            9319 non-null   object 
 7   mobile                               10000 non-null  int64  
 8   AADHAR VERIFIED                      10000 non-null  object 
 9   Cibil Score                          5703 non-null   object 
 10  MOBILE VERIFICATION                  10000 non-null  bool   
 11  DEALER NAME                  

### b. Preprocessing function

In [159]:
def preprocess(df):
    # Convert date to datetime and extract date-related features
    df['APPLICATION LOGIN DATE'] = pd.to_datetime(df['APPLICATION LOGIN DATE'], format='%m/%d/%Y', errors='coerce')
    df['APPLICATION_MONTH'] = df['APPLICATION LOGIN DATE'].dt.month
    df['APPLICATION_DAY'] = df['APPLICATION LOGIN DATE'].dt.day
    df['APPLICATION_DAYOFWEEK'] = df['APPLICATION LOGIN DATE'].dt.dayofweek

    # Convert 'Cibil Score' to numeric, forcing errors to NaN
    df['Cibil Score'] = pd.to_numeric(df['Cibil Score'], errors='coerce')

    # Fill missing categorical values with 'Unknown'
    categorical_df = df.select_dtypes(include=['object']).columns
    df[categorical_df] = df[categorical_df].fillna('Unknown')

    # Fill missing boolean (0/1) values and convert to int
    bool_features = [col for col in df.columns if col.startswith('Phone Social Premium.')]
    for feature in bool_features:
        df[feature] = df[feature].fillna(0).astype(int)

    # Impute missing integer and float values with the mean
    imputer = SimpleImputer(strategy='mean')
    df[df.select_dtypes(include=['int64', 'float64']).columns] = imputer.fit_transform(df.select_dtypes(include=['int64', 'float64']))

In [160]:
preprocess(train)
preprocess(test)

In [161]:
train.isna().sum()

Unnamed: 0,0
DEALER ID,0
APPLICATION LOGIN DATE,3540
HDB BRANCH NAME,0
HDB BRANCH STATE,0
FIRST NAME,0
MIDDLE NAME,0
LAST NAME,0
mobile,0
AADHAR VERIFIED,0
Cibil Score,0


In [162]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 58 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   DEALER ID                            10000 non-null  float64       
 1   APPLICATION LOGIN DATE               6460 non-null   datetime64[ns]
 2   HDB BRANCH NAME                      10000 non-null  object        
 3   HDB BRANCH STATE                     10000 non-null  object        
 4   FIRST NAME                           10000 non-null  object        
 5   MIDDLE NAME                          10000 non-null  object        
 6   LAST NAME                            10000 non-null  object        
 7   mobile                               10000 non-null  float64       
 8   AADHAR VERIFIED                      10000 non-null  object        
 9   Cibil Score                          10000 non-null  float64       
 10  MOBILE VERI

### 4. Label Encoding categorical values to Integer

In [163]:
#Converting categorical values to integer values
label_encoder = LabelEncoder()
categorical_df = train.select_dtypes(include=['object']).columns
for i in categorical_df:
   train[i] = label_encoder.fit_transform(train[i])

categorical_df = test.select_dtypes(include=['object']).columns
for i in categorical_df:
   test[i] = label_encoder.fit_transform(test[i])

In [164]:
train

Unnamed: 0,DEALER ID,APPLICATION LOGIN DATE,HDB BRANCH NAME,HDB BRANCH STATE,FIRST NAME,MIDDLE NAME,LAST NAME,mobile,AADHAR VERIFIED,Cibil Score,...,Phone Social Premium.whatsapp,Phone Social Premium.yatra,Phone Social Premium.zoho,phone_digitalage,phone_nameMatchScore,phone_phoneFootprintStrengthOverall,Application Status,APPLICATION_MONTH,APPLICATION_DAY,APPLICATION_DAYOFWEEK
0,106989.0,2022-07-20,140,4,4023,1204,490,9.210574e+09,0,726.000000,...,0.0,0.0,0.0,5324.0,67.222222,0,0,7.0,20.00000,2.000000
1,108975.0,2022-07-28,397,2,207,1204,1409,8.877987e+09,0,706.402118,...,0.0,0.0,0.0,1998.0,100.000000,0,0,7.0,28.00000,3.000000
2,111004.0,2022-07-15,130,24,239,1204,2898,8.910862e+09,0,737.000000,...,0.0,0.0,0.0,-1.0,-1.000000,1,0,7.0,15.00000,4.000000
3,192020.0,NaT,446,21,76,1204,2684,9.758428e+09,0,713.000000,...,0.0,0.0,0.0,1998.0,72.777778,0,0,7.0,21.58808,2.935759
4,55095.0,2022-07-15,340,5,2689,404,127,9.687028e+09,0,669.000000,...,0.0,0.0,0.0,1998.0,68.095238,0,1,7.0,15.00000,4.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,105101.0,NaT,163,21,105,1204,1916,8.400645e+09,0,706.402118,...,0.0,0.0,0.0,-1.0,60.576923,2,0,7.0,21.58808,2.935759
9996,85054.0,NaT,72,2,4042,594,2102,9.708884e+09,0,706.402118,...,0.0,0.0,0.0,1998.0,71.078431,0,0,7.0,21.58808,2.935759
9997,53710.0,NaT,317,15,3425,1204,2,9.888532e+09,0,706.402118,...,1.0,0.0,0.0,1988.0,100.000000,2,0,7.0,21.58808,2.935759
9998,89240.0,2022-07-29,337,21,3425,1204,2684,8.923338e+09,0,706.402118,...,0.0,0.0,0.0,1096.0,-1.000000,1,0,7.0,29.00000,4.000000


### 5. Model Fitting and Evaluation

In [165]:
# Dropping target and redundant features from the training and test set
X = train.drop(columns=['Application Status', 'DEALER ID', 'MOBILE VERIFICATION', 'AADHAR VERIFIED', 'APPLICATION LOGIN DATE'])
y = train['Application Status']
X_test = test.drop(columns=['DEALER ID', 'UID', 'MOBILE VERIFICATION', 'AADHAR VERIFIED', 'APPLICATION LOGIN DATE'])

In [166]:
# Split the data into training and validation sets (80-20 split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)

In [173]:
# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_val)

# Support Vector Machine (SVM)
svm = SVC(probability=True)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_val)

# K-Nearest Neighbors (KNN)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_val)

# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_val)

# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)

In [174]:
models = {
    "Logistic Regression": y_pred_log_reg,
    "Support Vector Machine": y_pred_svm,
    "K-Nearest Neighbors": y_pred_knn,
    "Naive Bayes": y_pred_nb,
    "Random Forest": y_pred_rf
}

for model_name, y_pred in models.items():
    print(f"{model_name} Accuracy: {accuracy_score(y_val, y_pred)}")
for model_name, y_pred in models.items():
    print(f"{model_name} Classification Report:\n", classification_report(y_val, y_pred))
    print("\n" + "*********************************************************" + "\n")

Logistic Regression Accuracy: 0.6635
Support Vector Machine Accuracy: 0.6635
K-Nearest Neighbors Accuracy: 0.618
Naive Bayes Accuracy: 0.6635
Random Forest Accuracy: 0.8835
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.66      1.00      0.80      1327
           1       0.00      0.00      0.00       673

    accuracy                           0.66      2000
   macro avg       0.33      0.50      0.40      2000
weighted avg       0.44      0.66      0.53      2000


*********************************************************

Support Vector Machine Classification Report:
               precision    recall  f1-score   support

           0       0.66      1.00      0.80      1327
           1       0.00      0.00      0.00       673

    accuracy                           0.66      2000
   macro avg       0.33      0.50      0.40      2000
weighted avg       0.44      0.66      0.53      2000


*********************

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**We notice that Random forest significantly outperforms the other models. Since it is a decision tree-based model, we might want to try a few more tree-based models before concluding our task.**

In [175]:
# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
y_pred_dt = decision_tree.predict(X_val)

# Extra Trees Classifier
extra_trees = ExtraTreesClassifier()
extra_trees.fit(X_train, y_train)
y_pred_et = extra_trees.predict(X_val)

In [176]:
# Evaluating Tree-Based Models
tree_based_models = {
    "Decision Tree": y_pred_dt,
    "Extra Trees": y_pred_et
}

for model_name, y_pred in tree_based_models.items():
    print(f"{model_name} Accuracy: {accuracy_score(y_val, y_pred)}")
for model_name, y_pred in tree_based_models.items():
    print(f"{model_name} Classification Report:\n", classification_report(y_val, y_pred))
    print("\n" + "*********************************************************" + "\n")

Decision Tree Accuracy: 0.8655
Extra Trees Accuracy: 0.8555
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.90      0.90      1327
           1       0.80      0.81      0.80       673

    accuracy                           0.87      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.87      0.87      0.87      2000


*********************************************************

Extra Trees Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.83      0.88      1327
           1       0.73      0.90      0.81       673

    accuracy                           0.86      2000
   macro avg       0.84      0.87      0.85      2000
weighted avg       0.87      0.86      0.86      2000


*********************************************************



**Still XGBoost is the best performing model. Thus using the xgb model**

### 6. Prediction

In [177]:
test_predictions = rf.predict(X_test)
test_predictions

array([0, 0, 0, ..., 1, 0, 0])

In [178]:
result = pd.DataFrame({
    'UID': test['UID'],
    'Prediction': test_predictions
})

result['Prediction'] = result['Prediction'].apply(lambda x: 'APPROVED' if x == 1 else 'DECLINED')
result.to_csv('predictions.csv', index=False)