In [108]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Read the training and testing workbooks (paths are relative to the notebook's
# working directory) into two DataFrames.
train_data, test_data = (
    pd.read_excel(workbook)
    for workbook in (
        "merged_data_final_training.xlsx",
        "merged_data_final_testing.xlsx",
    )
)


# Define categorical and numerical columns
categorical_columns = ['bank_account_type', 'bank_name_clients', 'employment_status_clients', 'level_of_education_clients']
numerical_columns = ['loannumber', 'loanamount', 'totaldue', 'termdays', 'average_loannumber', 'average_loanamount', 'on_time_rate', 'referred_or_not', 'age']

# Fill missing categorical values with an explicit 'Unknown' level.
# The result is assigned back rather than calling fillna(..., inplace=True) on a
# column selection: that form is chained assignment, which pandas 2.x warns about
# and which does not modify the parent frame once Copy-on-Write is enabled.
train_data[categorical_columns] = train_data[categorical_columns].fillna('Unknown')
test_data[categorical_columns] = test_data[categorical_columns].fillna('Unknown')

# Fill missing numerical values with per-column medians.
# The medians are computed on the training data only and reused for the test
# data, so both frames are imputed with the same values and no statistic is
# derived from the test set.
train_medians = train_data[numerical_columns].median()
train_data[numerical_columns] = train_data[numerical_columns].fillna(train_medians)
test_data[numerical_columns] = test_data[numerical_columns].fillna(train_medians)

# Binary target taken from the training frame: 'Good' -> 1, 'Bad' -> 0
y_train = train_data['good_bad_flag'].map({'Good': 1, 'Bad': 0})

# Training features: every column except the customer identifier and the label
X_train = train_data.drop(['customerid', 'good_bad_flag'], axis='columns')

# Test features: every column except the customer identifier
X_test = test_data.drop(['customerid'], axis='columns')

# Column-wise preprocessing: standardise the numerical columns and one-hot encode
# the categorical ones (categories unseen during fit are ignored at transform time).
numeric_transformer = ('num', StandardScaler(), numerical_columns)
categorical_transformer = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
preprocessor = ColumnTransformer(transformers=[numeric_transformer, categorical_transformer])

# Full model: preprocessing followed by a decision tree with a fixed seed
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeClassifier(random_state=42)),
    ]
)

# Hyperparameter search space; the 'classifier__' prefix routes each entry to the
# pipeline step named 'classifier'.
param_grid = dict(
    classifier__max_depth=[3, 5, 7, 10, None],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
    classifier__criterion=['gini', 'entropy'],
)

# Exhaustive search over the grid: 5-fold cross-validation, F1 as the selection
# metric, using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)

# Run the search on the full training set
grid_search.fit(X_train, y_train)

# Retrieve the winning pipeline from the search. No training happens here: with
# GridSearchCV's default refit behaviour the estimator was already refitted on
# the data passed to grid_search.fit.
best_dt = grid_search.best_estimator_

# Report the hyperparameter combination that achieved the best mean CV score
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)


# Optional hold-out sanity check of the tuned configuration.
# Split the labelled training data 80/20 and fit a *clone* of the tuned pipeline
# on the 80% part. Fitting best_dt itself here would overwrite the final model
# with one trained on less data, and that weaker model would then be the one
# used for the test-set predictions.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
val_dt = clone(best_dt).fit(X_train_split, y_train_split)
y_val_pred = val_dt.predict(X_val_split)

# Evaluate the clone on the validation part.
# NOTE: these rows also took part in the grid search's cross-validation, so the
# scores below are optimistic; grid_search.best_score_ holds the cross-validated F1.
accuracy_dt = accuracy_score(y_val_split, y_val_pred)
precision_dt = precision_score(y_val_split, y_val_pred)
recall_dt = recall_score(y_val_split, y_val_pred)
f1_dt = f1_score(y_val_split, y_val_pred)

print(f"Validation Accuracy: {accuracy_dt}")
print(f"Validation Precision: {precision_dt}")
print(f"Validation Recall: {recall_dt}")
print(f"Validation F1-score: {f1_dt}")


Best parameters found:  {'classifier__criterion': 'gini', 'classifier__max_depth': 3, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 2}
Validation Accuracy: 0.7694656488549618
Validation Precision: 0.781055900621118
Validation Recall: 0.9805068226120858
Validation F1-score: 0.8694900605012965


In [112]:
# Predict the encoded class for every row of X_test with the tuned pipeline.
# The encoding follows the y_train mapping: 1 = 'Good', 0 = 'Bad'.
y_pred_dt = best_dt.predict(X_test)

In [114]:
# Pair each customer id with its predicted class, translate the 0/1 encoding
# back to the 'Bad'/'Good' labels, and write the table to CSV without the index.
predictions = pd.DataFrame(
    {
        'customerid': test_data['customerid'],
        'predicted_good_bad_flag': y_pred_dt,
    }
).assign(
    predicted_good_bad_flag=lambda frame: frame['predicted_good_bad_flag'].map({1: 'Good', 0: 'Bad'})
)
predictions.to_csv('predictions2.csv', index=False)