Step 1: Imports

In [None]:
!pip install deap
!pip install -U scikit-learn



In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from deap import base, creator, tools, algorithms
from google.colab import drive
import random
from sklearn.impute import SimpleImputer
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Step 1: Load the Data
application_df = pd.read_csv('/content/drive/MyDrive/AI_FALL24_Project/application_record.csv')
credit_record_df = pd.read_csv('/content/drive/MyDrive/AI_FALL24_Project/credit_record.csv')

Step 2: Preprocessing

In [None]:
# Check for Null Values
print(application_df.isnull().sum())
print(credit_record_df.isnull().sum())

# Drop Duplicates
application_df = application_df.drop_duplicates()
credit_record_df = credit_record_df.drop_duplicates()

# Drop Unnecessary Columns due to lack of impact
if 'FLAG_MOBIL' in application_df.columns:
    # Drop Unnecessary Columns due to lack of impact
    application_df = application_df.drop(columns=['FLAG_MOBIL'])
else:
    print("FLAG_MOBIL has been already dropped")

ID                          0
CODE_GENDER                 0
FLAG_OWN_CAR                0
FLAG_OWN_REALTY             0
CNT_CHILDREN                0
AMT_INCOME_TOTAL            0
NAME_INCOME_TYPE            0
NAME_EDUCATION_TYPE         0
NAME_FAMILY_STATUS          0
NAME_HOUSING_TYPE           0
DAYS_BIRTH                  0
DAYS_EMPLOYED               0
FLAG_WORK_PHONE             0
FLAG_PHONE                  0
FLAG_EMAIL                  0
OCCUPATION_TYPE        134203
CNT_FAM_MEMBERS             0
dtype: int64
ID                0
MONTHS_BALANCE    0
STATUS            0
dtype: int64
FLAG_MOBIL has been already dropped


Merging the datasets

In [None]:
# Merge Datasets
merged_df = pd.merge(application_df, credit_record_df, on='ID', how='inner')

Encoding the labels into binary

In [None]:
# Label Encoding
encoder = LabelEncoder()
for col in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE']:
    merged_df[col] = encoder.fit_transform(merged_df[col])

Feature Scaling

In [None]:
# Feature Scaling
scaler = StandardScaler()
numerical_cols = ['AMT_INCOME_TOTAL', 'AGE', 'YEARS_EMPLOYED']

In [None]:
# Convert DAYS_BIRTH to AGE (in years)
merged_df['AGE'] = -merged_df['DAYS_BIRTH'] // 365

# Convert DAYS_EMPLOYED to YEARS_EMPLOYED (in years)
# Handling anomalies in DAYS_EMPLOYED (e.g., 365243 used as a placeholder for missing values)
merged_df['DAYS_EMPLOYED'] = merged_df['DAYS_EMPLOYED'].replace(365243, np.nan)
merged_df['YEARS_EMPLOYED'] = -merged_df['DAYS_EMPLOYED'] // 365

# Drop the original DAYS_BIRTH and DAYS_EMPLOYED columns if not needed
merged_df.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

Check for Class Imbalance

In [None]:
print(merged_df['STATUS'].value_counts())

STATUS
C    329536
0    290654
X    145950
1      8747
5      1527
2       801
3       286
4       214
Name: count, dtype: int64


Step 3: Data Splitting

In [None]:
# Step 3: Data Splitting
X = merged_df.drop(columns=['STATUS'])
y = merged_df['STATUS']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

Step 4 Feature Selection with Genetic Algorithms

In [None]:
# Step 4: Feature Selection with Genetic Algorithms
def evaluate(individual):
    selected_features = [feature for feature, include in zip(X.columns, individual) if include]
    if len(selected_features) == 0:
        return 0,

    # Create a copy of X_train and X_val to avoid modifying the original DataFrames
    X_train_subset = X_train[selected_features].copy()
    X_val_subset = X_val[selected_features].copy()

    # Apply Label Encoding to categorical features in the subset
    for feature in selected_features:
        if X_train_subset[feature].dtype == 'object':  # Check if feature is categorical
            le = LabelEncoder()
            X_train_subset[feature] = le.fit_transform(X_train_subset[feature])
            X_val_subset[feature] = le.transform(X_val_subset[feature])  # Use the same encoder for validation

    model = DecisionTreeClassifier()
    model.fit(X_train_subset, y_train)
    y_pred = model.predict(X_val_subset)
    return accuracy_score(y_val, y_pred),

In [None]:
# Genetic Algorithm Setup
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("attr_bool", np.random.randint, 0, 2)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X.columns))
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

population = toolbox.population(n=50)
result = algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=40, verbose=False)



Step 5: Model Selection & Training

In [None]:
# Before fitting KNN, impute missing values in X_train, X_val, and X_test
imputer = SimpleImputer(strategy='most_frequent') # Create an imputer instance

# Fit and transform on training data, then transform validation and test data
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# K-Nearest Neighbors
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Decision Trees
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# MLP
mlp = MLPClassifier()
mlp.fit(X_train, y_train)

Step 6: Hyperparameter Tuning for Decision Trees

In [None]:
param_grid = {
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5, 10],
}
grid = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=3, scoring='accuracy')
grid.fit(X_train, y_train)
print("Best Parameters for Decision Tree:", grid.best_params_)

Best Parameters for Decision Tree: {'max_depth': 10, 'min_samples_split': 2}


Step 7: Model Evaluation

In [None]:
y_pred_dt = dt.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))

y_pred_knn = knn.predict(X_test)
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))

y_pred_mlp = mlp.predict(X_test)
print("MLP Accuracy:", accuracy_score(y_test, y_pred_mlp))


Decision Tree Accuracy: 0.856915085120609
KNN Accuracy: 0.7599135935812374
MLP Accuracy: 0.4256030447976135
