In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer, KNNImputer
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.inspection import permutation_importance
from sklearn.ensemble import VotingClassifier, BaggingClassifier
from sklearn.preprocessing import MinMaxScaler
import pygad

# Load datasets
application_data = pd.read_csv("application_record.csv")
credit_data = pd.read_csv("credit_record.csv")

# Data Exploration
print("Application Data Overview:\n", application_data.head())
print("Credit Data Overview:\n", credit_data.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called as a plain statement after the header; passing it to print() would
# emit the report first and then the header followed by a stray "None".
print("Application Data info:")
application_data.info()
print("Credit Data info:")
credit_data.info()

# Check for null values
print("\nNull Values in Application Data:\n", application_data.isnull().sum())
print("Null Values in Credit Data:\n", credit_data.isnull().sum())

# Drop the column with the highest percentage of missing values.
# idxmax() always returns a label, even when every null count is zero (it then
# returns the first column), so the drop is guarded to avoid discarding a
# fully populated column such as the ID used later as the merge key.
null_counts = application_data.isnull().sum()
max_null_column = null_counts.idxmax()
max_null_percentage = null_counts[max_null_column] / application_data.shape[0]
if max_null_percentage > 0:
    print(f"\nDropping column '{max_null_column}' with the highest null percentage: {max_null_percentage*100:.2f}%")
    application_data.drop(columns=[max_null_column], inplace=True)
else:
    print("\nNo column contains missing values; no column dropped.")

# Print remaining columns after dropping
print("\nColumns after dropping the column with the highest null percentage:")
print(application_data.columns)

# Application table: count fully duplicated rows, remove them, count again.
app_duplicates_before = application_data.duplicated().sum()
print(f"\nNumber of duplicate rows before dropping: {app_duplicates_before}")

application_data = application_data.drop_duplicates()

app_duplicates_after = application_data.duplicated().sum()
print(f"Number of duplicate rows after dropping: {app_duplicates_after}")

# Credit table: same count / remove / re-count sequence.
credit_duplicates_before = credit_data.duplicated().sum()
print(f"\nNumber of duplicate rows in Credit Data before dropping: {credit_duplicates_before}")

credit_data = credit_data.drop_duplicates()

credit_duplicates_after = credit_data.duplicated().sum()
print(f"Number of duplicate rows in Credit Data after dropping: {credit_duplicates_after}")


# Numerical columns whose missing entries are imputed with the column median
numerical_columns = [
    'AMT_INCOME_TOTAL',
    'CNT_CHILDREN',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'CNT_FAM_MEMBERS',
]

# Only columns that actually contain NaN are rewritten; the rest are left alone
columns_with_gaps = [
    name for name in numerical_columns
    if application_data[name].isnull().any()
]
for name in columns_with_gaps:
    application_data[name] = application_data[name].fillna(application_data[name].median())

# Show the remaining per-column NaN counts after imputation
print(application_data.isnull().sum())

def wisker(col):
    """Return the (lower, upper) 1.5 * IQR whisker bounds of ``col``.

    The bounds are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles computed with ``np.percentile``.
    """
    first_quartile, third_quartile = np.percentile(col, [25, 75])
    spread = third_quartile - first_quartile
    return first_quartile - 1.5 * spread, third_quartile + 1.5 * spread

# Cap every numerical column at its whisker bounds, then print a summary
for column_name in numerical_columns:
    lower_bound, upper_bound = wisker(application_data[column_name])
    values = application_data[column_name]
    # Values below the lower bound become the lower bound, values above the
    # upper bound become the upper bound, everything else is kept as is.
    application_data[column_name] = np.where(
        values < lower_bound,
        lower_bound,
        np.where(values > upper_bound, upper_bound, values),
    )

    print(f"Column: {column_name}")
    print(application_data[column_name].describe())  # summary statistics after capping
    print(application_data[column_name].head())

# Binarise STATUS: codes 'C' and 'X' map to 0, the digit codes '0'-'5' map to 1
status_mapping = {'C': 0, 'X': 0, **{str(code): 1 for code in range(6)}}
credit_data['STATUS'] = credit_data['STATUS'].map(status_mapping)

# One row per ID, keeping the maximum (worst) mapped STATUS seen for that ID
worst_status_by_id = credit_data.groupby('ID')['STATUS'].max()
credit_data = worst_status_by_id.reset_index()

# Keep only the IDs present in both tables
merged_data = application_data.merge(credit_data, on='ID', how='inner')

# Integer-encode every object-typed column (ID is never encoded)
categorical_cols = merged_data.select_dtypes(include=['object']).columns
le = LabelEncoder()
for cat_col in categorical_cols:
    if cat_col == 'ID':
        continue
    # The same encoder object is re-fitted for each column
    merged_data[cat_col] = le.fit_transform(merged_data[cat_col])

# Target is STATUS; features are all remaining columns except ID
y = merged_data['STATUS']
X = merged_data.drop(['ID', 'STATUS'], axis=1)

# Report how many rows fall into each class
print("\nClass Distribution:\n", y.value_counts())

# Class imbalance is not corrected here (SMOTE or undersampling could be
# applied if needed); the data is used as-is.

# 70% train, then the remaining 30% is halved into validation and test;
# both splits are stratified on the target and use the same seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Standardise features using statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val, X_test = (scaler.transform(part) for part in (X_val, X_test))

def fitness_func(ga_instance, solution, solution_idx):
    """Score a candidate feature mask by validation accuracy.

    Genes equal to 1 mark the feature columns to keep. A decision tree is
    fitted on those columns of the module-level ``X_train``/``y_train`` and
    scored on the same columns of ``X_val``/``y_val``.

    Returns 0 when no gene equals 1, or when any exception is raised while
    slicing, fitting or scoring (the error is printed).
    """
    chosen = [position for position, gene in enumerate(solution) if gene == 1]
    if not chosen:
        return 0  # nothing selected, nothing to evaluate

    try:
        train_subset = X_train[:, chosen]
        val_subset = X_val[:, chosen]

        tree = DecisionTreeClassifier(random_state=42)
        tree.fit(train_subset, y_train)

        # Mean accuracy on the validation split is the fitness value
        return tree.score(val_subset, y_val)
    except Exception as e:
        print(f"Error in fitness_func: {e}")
        return 0

# Genetic-algorithm feature selection: each chromosome is a 0/1 mask over the
# feature columns, scored by fitness_func. Any failure is reported and the
# script continues with whatever feature matrices are currently in place.
try:
    ga_instance = pygad.GA(
        num_generations=20,
        num_parents_mating=5,
        fitness_func=fitness_func,
        sol_per_pop=10,
        num_genes=X_train.shape[1],
        gene_type=int,
        # Restrict every gene to {0, 1}. Without gene_space, PyGAD's "random"
        # mutation perturbs a gene by a random offset (see
        # random_mutation_min_val / random_mutation_max_val), which can move
        # it outside {0, 1}; fitness_func treats such genes as "not selected".
        gene_space=[0, 1],
        init_range_low=0,  # Binary values
        init_range_high=2,  # Exclusive upper bound, so 2 yields 0 or 1

        # Explicit Crossover and Mutation Settings
        crossover_type="single_point",  # "uniform" is an alternative
        crossover_probability=0.9,  # Probability of applying crossover
        mutation_type="random",  # Mutation type
        mutation_probability=0.1,  # Probability of mutation
        mutation_percent_genes=10  # Percentage of genes to mutate
    )
    ga_instance.run()

    best_solution, best_solution_fitness, _ = ga_instance.best_solution()
    selected_features = [idx for idx, value in enumerate(best_solution) if value == 1]
    print("Selected Features:", selected_features)

    if selected_features:
        # Apply selected features
        X_train = X_train[:, selected_features]
        X_val = X_val[:, selected_features]
        X_test = X_test[:, selected_features]
    else:
        # An empty mask would leave zero-column matrices and break every
        # downstream model fit, so the full feature set is kept instead.
        print("Genetic Algorithm selected no features; keeping all features.")

except Exception as e:
    print(f"Error with Genetic Algorithm: {e}")





# Model Training
# Candidate estimators, keyed by display name
models = {
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'MLP': MLPClassifier(random_state=42, max_iter=50),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Hyperparameter grid for each estimator, keyed by the same display name
param_grids = {
    'KNN': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},
    'Decision Tree': {'max_depth': [5, 10, 15], 'min_samples_split': [2, 5, 10]},
    'MLP': {'hidden_layer_sizes': [(50,), (100,), (50, 50)], 'alpha': [0.0001, 0.001, 0.01]},
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 15]},
    'Gradient Boosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
    },
}

best_models = {}

for model_name, model in models.items():
    param_grid = param_grids[model_name]

    # 3-fold cross-validated grid search, selecting by accuracy
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Keep the winning estimator and report its parameters
    best_models[model_name] = grid_search.best_estimator_
    print(f"\nBest parameters for {model_name}: {grid_search.best_params_}")


# Evaluate Models
# Every entry in best_models is a GridSearchCV.best_estimator_. With the
# default refit=True, GridSearchCV has already refitted that estimator on the
# whole of X_train, so it is used directly instead of being trained again.
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)  # Predictions on the held-out test set
    print(f"\n{model_name} Performance:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


Application Data Overview:
         ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  CNT_CHILDREN  \
0  5008804           M            Y               Y             0   
1  5008805           M            Y               Y             0   
2  5008806           M            Y               Y             0   
3  5008808           F            N               Y             0   
4  5008809           F            N               Y             0   

   AMT_INCOME_TOTAL      NAME_INCOME_TYPE            NAME_EDUCATION_TYPE  \
0          427500.0               Working               Higher education   
1          427500.0               Working               Higher education   
2          112500.0               Working  Secondary / secondary special   
3          270000.0  Commercial associate  Secondary / secondary special   
4          270000.0  Commercial associate  Secondary / secondary special   

     NAME_FAMILY_STATUS  NAME_HOUSING_TYPE  DAYS_BIRTH  DAYS_EMPLOYED  \
0        Civil marriage   R