In [4]:
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier



In [5]:

# Load the dataset into a DataFrame; the path is resolved relative to the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='CCP.csv')

In [6]:
# Preprocess the data
#
# Builds the feature matrix X (one-hot encoded categoricals + min-max scaled
# numerics) and the integer target vector y from `df`. `df` itself is updated
# in place: the numeric columns are overwritten with their scaled values and
# the one-hot columns are appended.
cat_col = ['EDUCATION', 'MARRIAGE', 'SEX']
num_col = ['LIMIT_BAL', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
           'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
           'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']

input_col = cat_col + num_col
# NOTE(review): the trailing space is part of the column name as written here;
# presumably it matches the CSV header -- confirm before "cleaning" it up.
target_col = ['default ']

# NOTE(review): both scalers in this cell are fit on the full dataset, i.e.
# before the train/test split performed in the evaluation cell, so statistics
# of the held-out rows influence the features.
scaler = MinMaxScaler()
df[num_col] = scaler.fit_transform(df[num_col])

# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old spelling was later removed. Try the new name
# first and fall back so the cell runs on both old and new releases.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
encoded_cols = list(encoder.fit(df[cat_col]).get_feature_names_out(cat_col))
df[encoded_cols] = encoder.transform(df[cat_col])

X = df[encoded_cols + num_col]
# Map 'Y'/'N' labels to 1/0 and flatten the single-column frame to a 1-D array.
y = df[target_col].replace({'Y': 1, 'N': 0}).astype(int).values.ravel()

# Standardised copy of X. Note that `scaler` is rebound here, and that the
# evaluation cells below fit on X, not on X_scaled.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [7]:

# Evaluation Measures
# Names of the metrics reported for every model, in display order.
evaluation_measures = [
    'accuracy',
    'precision',
    'recall',
    'f1',
]


In [8]:

# Machine Learning Algorithms
#
# Maps a display name to an unfitted classifier. Every estimator that uses
# randomness during fitting gets the same fixed seed so that a
# Restart-and-Run-All reproduces the reported metrics.
RANDOM_STATE = 42

models = {
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    # GaussianNB and KNeighborsClassifier take no random_state parameter.
    'Gaussian Naive Bayes': GaussianNB(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    # SVC only uses its random_state when probability=True, which is not set here.
    'Support Vector Machines': SVC(),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Neural Network': MLPClassifier(random_state=RANDOM_STATE),
    'Extra Trees Classifier': ExtraTreesClassifier(random_state=RANDOM_STATE),
    'XGB Classifier': XGBClassifier(random_state=RANDOM_STATE),
}


In [9]:

# Train-Test Split Evaluation
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scoring function for each supported measure name. Names in
# `evaluation_measures` that are not listed here are skipped silently.
scorers = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}

# model name -> {measure name -> score on the held-out test set}
results_train_test = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    metrics = {
        measure: scorers[measure](y_test, y_pred)
        for measure in evaluation_measures
        if measure in scorers
    }
    print("DONE")
    results_train_test[model_name] = metrics

DONE


In [10]:
# 10-Fold Cross-Validation Evaluation
# model name -> mean accuracy over the 10 folds, computed on the full X / y.
results_cross_val = {
    model_name: cross_val_score(model, X, y, cv=10, scoring='accuracy').mean()
    for model_name, model in models.items()
}

In [11]:
# Print the results
# First the per-model metrics from the hold-out split, then the mean
# cross-validation accuracy per model; all values use four decimals.
print("Train-Test Split Evaluation:")
for model_name, metrics in results_train_test.items():
    report = [f"Metrics for {model_name}:"]
    report.extend(f"{measure}: {value:.4f}" for measure, value in metrics.items())
    report.append("")  # blank separator line after each model
    print("\n".join(report))

print("\n10-Fold Cross-Validation Evaluation:")
cv_lines = [f"{model_name}: {score:.4f}" for model_name, score in results_cross_val.items()]
if cv_lines:
    print("\n".join(cv_lines))

Train-Test Split Evaluation:
Metrics for Gradient Boosting:
accuracy: 0.8203
precision: 0.6667
recall: 0.3580
f1: 0.4658


10-Fold Cross-Validation Evaluation:
Gradient Boosting: 0.8212
