In [None]:
import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC, LinearSVC

# Read the training workbook into a DataFrame
data_path = 'train_data.xlsx'
data = pd.read_excel(data_path)

# 'loan_status' is the prediction target; it and 'customer_id' are both
# excluded from the feature matrix.
y = data['loan_status']
X = data.drop(['loan_status', 'customer_id'], axis=1)

# Extract time features from 'transaction_date' if it exists.
# The column is parsed once and removed from X in the same step; the derived
# year/month/day columns are appended in the same order as before.
if 'transaction_date' in X.columns:
    transaction_dates = pd.to_datetime(X.pop('transaction_date'))
    X['transaction_year'] = transaction_dates.dt.year
    X['transaction_month'] = transaction_dates.dt.month
    X['transaction_day'] = transaction_dates.dt.day

# Convert 'term' into a numerical feature by extracting the number of months.
# Going through str.extract keeps this safe for missing or already-numeric
# cells: anything without leading digits becomes NaN and is later filled by the
# numerical imputer instead of raising on `.split()`.
if 'term' in X.columns:
    X['term'] = pd.to_numeric(
        X['term'].astype(str).str.extract(r'(\d+)', expand=False),
        errors='coerce',
    )

# Identify numerical and categorical features.
# 'number' matches every numeric dtype, not just int64/float64. This matters
# because pandas 2.x returns int32 from `.dt.year/.month/.day`; with the
# narrower filter those columns matched neither list and were silently dropped
# by the ColumnTransformer (its default is remainder='drop').
numerical_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numeric columns: fill gaps with the column mean, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode into a dense array. Categories unseen during fit are ignored at
# transform time rather than raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each group of columns through its matching transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Split the data: 70% for training, 30% held out for evaluation.
# The fixed seed keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define models
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    # A kernel SVC's fit time grows at least quadratically with the number of
    # samples, which is why the original run never got past this model.
    # LinearSVC scales to this data size; dual=False is the recommended
    # setting when there are more samples than features.
    'Support Vector Machine': LinearSVC(dual=False, random_state=42),
}

# Evaluate models: fit each candidate behind the shared preprocessor, score it
# on the held-out split, and keep the metrics for the final comparison.
results = {}
for model_name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model),
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    results[model_name] = {
        'accuracy': accuracy,
        'classification_report': classification_report(y_test, y_pred),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
    }

    print(f"{model_name} Evaluation:")
    print(f"Accuracy: {accuracy}")
    print(f"Classification Report:\n{results[model_name]['classification_report']}")
    print(f"Confusion Matrix:\n{results[model_name]['confusion_matrix']}")
    print("="*50)

# Hyperparameter tuning for Random Forest.
# Keys use the '<step>__<param>' form so they reach the 'classifier' step of
# the pipeline below.
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10]
}

rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# 27 parameter combinations x 3 folds = 81 fits. n_jobs=-1 spreads them over
# all available cores; the selected parameters are unaffected because the
# forest's seed is fixed.
grid_search = GridSearchCV(
    rf_pipeline,
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best model from grid search (already refit on the full training split),
# scored on the held-out test split.
best_rf = grid_search.best_estimator_
best_rf_accuracy = accuracy_score(y_test, best_rf.predict(X_test))

print("Random Forest with Hyperparameter Tuning:")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {best_rf_accuracy}")

# Collect the test-set accuracy of every candidate: the baseline models first,
# followed by the tuned Random Forest.
model_accuracies = {
    **{name: result['accuracy'] for name, result in results.items()},
    'Random Forest with Hyperparameter Tuning': best_rf_accuracy,
}

# Pick the entry with the highest accuracy (the earliest entry wins a tie).
best_model_name, selected_model_accuracy = max(
    model_accuracies.items(), key=lambda item: item[1]
)

# Only the tuned Random Forest is persisted; if another model wins, nothing
# is written to disk.
if best_model_name == 'Random Forest with Hyperparameter Tuning':
    joblib.dump(best_rf, 'best_rf_model.pkl')

# Model Selection Summary
summary = (
    f"The {best_model_name} was chosen because it achieved the highest "
    f"accuracy ({selected_model_accuracy}) during evaluation."
)

print("\nMODEL SELECTION SUMMARY")
# NOTE: in the recorded run the script did not finish. Based on the accuracies
# printed before it stopped, Logistic Regression (0.7648) scored higher than
# the untuned Random Forest (0.7602) and is the better of those two models.
print(summary)

Logistic Regression Evaluation:
Accuracy: 0.7648334896810507
Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.27      0.38      8911
           1       0.78      0.94      0.86     25201

    accuracy                           0.76     34112
   macro avg       0.70      0.61      0.62     34112
weighted avg       0.74      0.76      0.73     34112

Confusion Matrix:
[[ 2418  6493]
 [ 1529 23672]]
Random Forest Evaluation:
Accuracy: 0.7602016885553471
Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.29      0.39      8911
           1       0.79      0.92      0.85     25201

    accuracy                           0.76     34112
   macro avg       0.68      0.61      0.62     34112
weighted avg       0.73      0.76      0.73     34112

Confusion Matrix:
[[ 2623  6288]
 [ 1892 23309]]
