**Step 1: Load and Preprocess the Data**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Load the training dataset
train_data = pd.read_csv('train.csv')

# Display basic information about the dataset
print(train_data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63978 entries, 0 to 63977
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        63978 non-null  object 
 1   Customer_ID               63977 non-null  object 
 2   Month                     63977 non-null  object 
 3   Name                      57586 non-null  object 
 4   Age                       63977 non-null  object 
 5   SSN                       63977 non-null  object 
 6   Occupation                63977 non-null  object 
 7   Annual_Income             63977 non-null  object 
 8   Monthly_Inhand_Salary     54404 non-null  float64
 9   Num_Bank_Accounts         63977 non-null  float64
 10  Num_Credit_Card           63977 non-null  float64
 11  Interest_Rate             63977 non-null  float64
 12  Num_of_Loan               63977 non-null  object 
 13  Type_of_Loan              56801 non-null  object 
 14  Delay_

In [2]:
# Load the testing dataset
test_data = pd.read_csv('test.csv')

# Display basic information about the dataset
print(test_data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        50000 non-null  object 
 1   Customer_ID               50000 non-null  object 
 2   Month                     50000 non-null  object 
 3   Name                      44985 non-null  object 
 4   Age                       50000 non-null  object 
 5   SSN                       50000 non-null  object 
 6   Occupation                50000 non-null  object 
 7   Annual_Income             50000 non-null  object 
 8   Monthly_Inhand_Salary     42502 non-null  float64
 9   Num_Bank_Accounts         50000 non-null  int64  
 10  Num_Credit_Card           50000 non-null  int64  
 11  Interest_Rate             50000 non-null  int64  
 12  Num_of_Loan               50000 non-null  object 
 13  Type_of_Loan              44296 non-null  object 
 14  Delay_

**Preprocessing and Feature Engineering**

In [3]:
# Handle missing values
train_data.fillna(method='ffill', inplace=True)
test_data.fillna(method='ffill', inplace=True)

In [4]:
# Combine training and test data for label encoding consistency
combined_data = pd.concat([train_data, test_data])

In [5]:
print(combined_data.dtypes)


ID                           object
Customer_ID                  object
Month                        object
Name                         object
Age                          object
SSN                          object
Occupation                   object
Annual_Income                object
Monthly_Inhand_Salary       float64
Num_Bank_Accounts           float64
Num_Credit_Card             float64
Interest_Rate               float64
Num_of_Loan                  object
Type_of_Loan                 object
Delay_from_due_date         float64
Num_of_Delayed_Payment       object
Changed_Credit_Limit         object
Num_Credit_Inquiries        float64
Credit_Mix                   object
Outstanding_Debt             object
Credit_Utilization_Ratio    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly      object
Payment_Behaviour            object
Monthly_Balance              object
Credit_Score                

In [6]:
combined_data['Monthly_Balance'] = combined_data['Monthly_Balance'].astype(str)


In [7]:
combined_data['Credit_Score'] = combined_data['Credit_Score'].astype(str)


In [8]:
print(combined_data.dtypes)


ID                           object
Customer_ID                  object
Month                        object
Name                         object
Age                          object
SSN                          object
Occupation                   object
Annual_Income                object
Monthly_Inhand_Salary       float64
Num_Bank_Accounts           float64
Num_Credit_Card             float64
Interest_Rate               float64
Num_of_Loan                  object
Type_of_Loan                 object
Delay_from_due_date         float64
Num_of_Delayed_Payment       object
Changed_Credit_Limit         object
Num_Credit_Inquiries        float64
Credit_Mix                   object
Outstanding_Debt             object
Credit_Utilization_Ratio    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly      object
Payment_Behaviour            object
Monthly_Balance              object
Credit_Score                

In [9]:
# Encode categorical variables using Label Encoding
label_encoders = {}
for column in combined_data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    combined_data[column] = le.fit_transform(combined_data[column])
    label_encoders[column] = le

In [10]:
# Split data into features and target
X_train = combined_data[:len(train_data)].drop('Credit_Score', axis=1)
y_train = combined_data[:len(train_data)]['Credit_Score']
X_test = combined_data[len(train_data):]

**Feature Scaling**

In [11]:
X_train

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance
0,24608,12320,4,84,338,10205,12,6134,1824.843333,3.0,...,4.0,3,12165,26.822620,180,1,49.574949,92963,3,48321
1,24625,12320,3,84,338,10205,12,6134,1824.843333,3.0,...,4.0,1,12165,31.944960,180,1,49.574949,8707,4,35905
2,24642,12320,7,84,0,10205,12,6134,1824.843333,3.0,...,4.0,1,12165,28.609352,184,1,49.574949,93724,5,55503
3,24659,12320,0,84,338,10205,12,6134,1824.843333,3.0,...,4.0,1,12165,31.377862,185,1,49.574949,34460,6,13095
4,24676,12320,8,84,338,10205,12,6134,1824.843333,3.0,...,4.0,1,12165,24.797347,186,1,49.574949,63037,2,59023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63973,36771,11258,6,3387,690,10662,5,1812,10426.133333,8.0,...,2.0,2,10741,28.088376,312,2,149.982516,16756,6,506
63974,36772,11258,5,3387,690,10662,5,1812,10426.133333,8.0,...,3.0,2,10741,29.237289,315,2,149.982516,68103,3,102350
63975,36773,11258,1,3387,690,10662,5,1812,10426.133333,8.0,...,3.0,2,10741,33.805365,316,2,149.982516,64657,3,104051
63976,36778,11732,4,5676,710,1533,7,2296,10886.000000,0.0,...,4.0,1,12022,44.567643,101,1,106.792038,29117,2,993


In [12]:
X_test

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,24744,12320,11,84,338,10205,12,6134,1824.843333,3.0,...,1,12165,35.030402,190,1,49.574949,41809,6,7552,3
1,24761,12320,10,84,370,10205,12,6134,1824.843333,3.0,...,1,12165,33.053114,181,1,49.574949,36840,2,65050,3
2,24778,12320,9,84,370,10205,12,6134,1824.843333,3.0,...,1,12165,33.811894,181,1,49.574949,20450,5,26555,3
3,24795,12320,2,84,396,10205,12,6134,1824.843333,3.0,...,1,12165,32.430559,191,1,49.574949,61029,2,59789,3
4,24948,1066,11,7491,477,53,15,10568,3037.986667,2.0,...,1,10895,25.926822,244,1,18.816215,61290,1,87636,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,58163,7857,2,7932,1046,406,1,6679,1929.906667,10.0,...,3,8199,34.780553,368,2,60.964772,19868,6,31543,3
49996,58164,8833,11,6528,397,1009,9,11749,1929.906667,4.0,...,1,10234,27.758522,302,0,35.104023,30273,6,76008,3
49997,58165,8833,10,6528,397,1009,9,11749,3359.415833,4.0,...,1,10234,36.858542,311,1,35.104023,103784,4,61640,3
49998,58167,8833,9,6528,397,1009,9,11749,3359.415833,4.0,...,1,10234,39.139840,312,1,35.104023,102508,3,84913,3


In [13]:
# Initialize StandardScaler
scaler = StandardScaler()

# Define the columns to be scaled
columns_to_scale = X_train.columns

# Fit and transform X_train
X_train_scaled = scaler.fit_transform(X_train[columns_to_scale])

# Transform X_test using the same scaler
X_test_scaled = scaler.transform(X_test[columns_to_scale])


In [14]:
# Encode the target variable 'y_train'
le_target = LabelEncoder()
y_train_encoded = le_target.fit_transform(y_train)

In [15]:
# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X_train_scaled, y_train_encoded, test_size=0.2, random_state=42)


In [16]:
# Initialize classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "K Neighbors": KNeighborsClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "XGBoost": xgb.XGBClassifier(),
    "ANN": keras.Sequential([
        keras.layers.Dense(units=128, activation='relu', input_dim=X_train.shape[1]),
        keras.layers.Dense(units=64, activation='relu'),
        keras.layers.Dense(units=len(np.unique(y_train)), activation='softmax')
    ])
}

# Train and evaluate classifiers
results = {}
for name, clf in classifiers.items():
    if name == "ANN":
        if y_train.dtype == 'O':  # Convert labels to integers for ANN
            le = LabelEncoder()
            y_train_encoded = le.fit_transform(y_train)
            y_val_encoded = le.transform(y_val)

            clf.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
            clf.fit(X_train, y_train_encoded, epochs=10, batch_size=32, verbose=0)
            y_pred_prob = clf.predict(X_val)
            y_pred = np.argmax(y_pred_prob, axis=1)
        else:
            clf.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
            clf.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
            y_pred_prob = clf.predict(X_val)
            y_pred = np.argmax(y_pred_prob, axis=1)
    else:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)
    results[name] = accuracy

# Display results
for name, accuracy in results.items():
    print(f"{name}: Accuracy = {accuracy:.4f}")
    print(f"{name} Classification Report:")
    print(classification_report(y_val, y_pred))
    print("="*50)


Logistic Regression: Accuracy = 0.5986
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.65      0.59      2221
           1       0.67      0.61      0.64      3785
           2       0.72      0.70      0.71      6790

    accuracy                           0.67     12796
   macro avg       0.64      0.66      0.65     12796
weighted avg       0.67      0.67      0.67     12796

SVM: Accuracy = 0.6460
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.65      0.59      2221
           1       0.67      0.61      0.64      3785
           2       0.72      0.70      0.71      6790

    accuracy                           0.67     12796
   macro avg       0.64      0.66      0.65     12796
weighted avg       0.67      0.67      0.67     12796

K Neighbors: Accuracy = 0.6354
K Neighbors Classification Report:
              precision    recall  f1-sc