**Step 1: Load and Preprocess the Data**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Load the training dataset
train_data = pd.read_csv('/content/train.csv')

# Display basic information about the dataset
print(train_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14918 entries, 0 to 14917
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        14918 non-null  object 
 1   Customer_ID               14918 non-null  object 
 2   Month                     14918 non-null  object 
 3   Name                      13431 non-null  object 
 4   Age                       14918 non-null  object 
 5   SSN                       14918 non-null  object 
 6   Occupation                14918 non-null  object 
 7   Annual_Income             14918 non-null  object 
 8   Monthly_Inhand_Salary     12666 non-null  float64
 9   Num_Bank_Accounts         14918 non-null  int64  
 10  Num_Credit_Card           14918 non-null  int64  
 11  Interest_Rate             14918 non-null  int64  
 12  Num_of_Loan               14918 non-null  object 
 13  Type_of_Loan              13270 non-null  object 
 14  Delay_

In [3]:
# Load the testing dataset
test_data = pd.read_csv('/content/test..csv')

# Display basic information about the dataset
print(test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        50000 non-null  object 
 1   Customer_ID               50000 non-null  object 
 2   Month                     50000 non-null  object 
 3   Name                      44985 non-null  object 
 4   Age                       50000 non-null  object 
 5   SSN                       50000 non-null  object 
 6   Occupation                50000 non-null  object 
 7   Annual_Income             50000 non-null  object 
 8   Monthly_Inhand_Salary     42502 non-null  float64
 9   Num_Bank_Accounts         50000 non-null  int64  
 10  Num_Credit_Card           50000 non-null  int64  
 11  Interest_Rate             50000 non-null  int64  
 12  Num_of_Loan               50000 non-null  object 
 13  Type_of_Loan              44296 non-null  object 
 14  Delay_

**Preprocessing and Feature Engineering**

In [4]:
# Handle missing values
train_data.fillna(method='ffill', inplace=True)
test_data.fillna(method='ffill', inplace=True)

In [5]:
# Combine training and test data for label encoding consistency
combined_data = pd.concat([train_data, test_data])

In [6]:
print(combined_data.dtypes)

ID                           object
Customer_ID                  object
Month                        object
Name                         object
Age                          object
SSN                          object
Occupation                   object
Annual_Income                object
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                  object
Type_of_Loan                 object
Delay_from_due_date           int64
Num_of_Delayed_Payment       object
Changed_Credit_Limit         object
Num_Credit_Inquiries        float64
Credit_Mix                   object
Outstanding_Debt             object
Credit_Utilization_Ratio    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly      object
Payment_Behaviour            object
Monthly_Balance              object
Credit_Score                

In [7]:
combined_data['Monthly_Balance'] = combined_data['Monthly_Balance'].astype(str)


In [8]:
combined_data['Credit_Score'] = combined_data['Credit_Score'].astype(str)

In [9]:
print(combined_data.dtypes)

ID                           object
Customer_ID                  object
Month                        object
Name                         object
Age                          object
SSN                          object
Occupation                   object
Annual_Income                object
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                  object
Type_of_Loan                 object
Delay_from_due_date           int64
Num_of_Delayed_Payment       object
Changed_Credit_Limit         object
Num_Credit_Inquiries        float64
Credit_Mix                   object
Outstanding_Debt             object
Credit_Utilization_Ratio    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly      object
Payment_Behaviour            object
Monthly_Balance              object
Credit_Score                

In [10]:
# Encode categorical variables using Label Encoding
label_encoders = {}
for column in combined_data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    combined_data[column] = le.fit_transform(combined_data[column])
    label_encoders[column] = le


In [11]:
# Split data into features and target
X_train = combined_data[:len(train_data)].drop('Credit_Score', axis=1)
y_train = combined_data[:len(train_data)]['Credit_Score']
X_test = combined_data[len(train_data):]

**Feature Scaling**

In [12]:
X_train

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance
0,8202,12320,4,84,206,10205,12,5304,1824.843333,3,...,4.0,3,11754,26.822620,180,1,49.574949,53013,3,27310
1,8209,12320,3,84,206,10205,12,5304,1824.843333,3,...,4.0,1,11754,31.944960,180,1,49.574949,4963,4,20322
2,8216,12320,7,84,0,10205,12,5304,1824.843333,3,...,4.0,1,11754,28.609352,184,1,49.574949,53420,5,31431
3,8221,12320,0,84,206,10205,12,5304,1824.843333,3,...,4.0,1,11754,31.377862,185,1,49.574949,19527,6,7480
4,8228,12320,8,84,206,10205,12,5304,1824.843333,3,...,4.0,1,11754,24.797347,186,1,49.574949,35946,2,33391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14913,52403,12294,3,4723,420,6821,12,5923,1991.882500,6,...,10.0,0,9651,41.536045,402,0,62.884172,40552,1,30281
14914,52404,12294,7,4723,420,6821,12,5923,1991.882500,6,...,10.0,0,9651,32.874989,403,3,62.884172,29167,2,35909
14915,52405,12294,0,4723,420,6821,12,5923,1991.882500,6,...,11.0,0,9651,28.613831,404,3,62.884172,40995,3,34021
14916,52406,12294,8,4723,420,6821,12,5923,1991.882500,6,...,11.0,0,9651,25.536009,405,3,62.884172,9033,6,21890


In [13]:
X_test

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,8254,12320,11,84,206,10205,12,5304,1824.843333,3,...,1,11754,35.030402,190,1,49.574949,23766,6,4320,3
1,8259,12320,10,84,227,10205,12,5304,1824.843333,3,...,1,11754,33.053114,181,1,49.574949,20913,2,36927,3
2,8266,12320,9,84,227,10205,12,5304,1824.843333,3,...,1,11754,33.811894,181,1,49.574949,11593,5,15039,3
3,8273,12320,2,84,244,10205,12,5304,1824.843333,3,...,1,11754,32.430559,191,1,49.574949,34816,2,33821,3
4,8330,1066,11,7491,296,53,15,9128,3037.986667,2,...,1,10535,25.926822,244,1,18.816215,34959,1,49753,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,34122,7857,2,7932,664,406,1,5776,1929.906667,10,...,3,7926,34.780553,368,3,60.964772,11267,6,17826,3
49996,34123,8833,11,6528,245,1009,9,10169,1929.906667,4,...,1,9895,27.758522,302,0,35.104023,17138,6,43104,3
49997,34124,8833,10,6528,245,1009,9,10169,3359.415833,4,...,1,9895,36.858542,311,1,35.104023,59077,4,34943,3
49998,34126,8833,9,6528,245,1009,9,10169,3359.415833,4,...,1,9895,39.139840,312,1,35.104023,58331,3,48197,3


In [14]:
# Initialize StandardScaler
scaler = StandardScaler()

# Define the columns to be scaled
columns_to_scale = X_train.columns

# Fit and transform X_train
X_train_scaled = scaler.fit_transform(X_train[columns_to_scale])

# Transform X_test using the same scaler
X_test_scaled = scaler.transform(X_test[columns_to_scale])


In [15]:
# Encode the target variable 'y_train'
le_target = LabelEncoder()
y_train_encoded = le_target.fit_transform(y_train)

In [16]:
# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X_train_scaled, y_train_encoded, test_size=0.2, random_state=42)

In [17]:
# Initialize classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "K Neighbors": KNeighborsClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "XGBoost": xgb.XGBClassifier(),
    "ANN": keras.Sequential([
        keras.layers.Dense(units=128, activation='relu', input_dim=X_train.shape[1]),
        keras.layers.Dense(units=64, activation='relu'),
        keras.layers.Dense(units=len(np.unique(y_train)), activation='softmax')
    ])
}

# Train and evaluate classifiers
results = {}
for name, clf in classifiers.items():
    if name == "ANN":
        if y_train.dtype == 'O':  # Convert labels to integers for ANN
            le = LabelEncoder()
            y_train_encoded = le.fit_transform(y_train)
            y_val_encoded = le.transform(y_val)

            clf.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
            clf.fit(X_train, y_train_encoded, epochs=10, batch_size=32, verbose=0)
            y_pred_prob = clf.predict(X_val)
            y_pred = np.argmax(y_pred_prob, axis=1)
        else:
            clf.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
            clf.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
            y_pred_prob = clf.predict(X_val)
            y_pred = np.argmax(y_pred_prob, axis=1)
    else:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)
    results[name] = accuracy

# Display results
for name, accuracy in results.items():
    print(f"{name}: Accuracy = {accuracy:.4f}")
    print(f"{name} Classification Report:")
    print(classification_report(y_val, y_pred))
    print("="*50)


Logistic Regression: Accuracy = 0.5995
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.56      0.57       519
           1       0.65      0.58      0.61       912
           2       0.67      0.72      0.70      1553

    accuracy                           0.65      2984
   macro avg       0.63      0.62      0.63      2984
weighted avg       0.65      0.65      0.65      2984

SVM: Accuracy = 0.6464
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.56      0.57       519
           1       0.65      0.58      0.61       912
           2       0.67      0.72      0.70      1553

    accuracy                           0.65      2984
   macro avg       0.63      0.62      0.63      2984
weighted avg       0.65      0.65      0.65      2984

K Neighbors: Accuracy = 0.6448
K Neighbors Classification Report:
              precision    recall  f1-sc