# Step 3 - Model Selection


### Logistic Regression

In [3]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load the dataset
train_data = pd.read_csv('/content/drive/MyDrive/Dsw/train_data.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Dsw/test_data.csv')
target_column = 'loan_status'

# Prepare features and target
X_train = train_data.drop(columns=[target_column])
y_train = train_data[target_column]
X_test = test_data.drop(columns=[target_column])
y_test = test_data[target_column]

# Logistic Regression Model and Grid
model = LogisticRegression()
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear'],
    'max_iter': [100, 200, 300]
}

# Hyperparameter tuning
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# Evaluation
print("Best parameters for Logistic Regression:", grid_search.best_params_)
y_pred = best_model.predict(X_test)
print(f"Accuracy of Logistic Regression: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

# Save the model
joblib.dump(best_model, 'logistic_regression_best_model.pkl')


Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best parameters for Logistic Regression: {'C': 1, 'max_iter': 300, 'solver': 'lbfgs'}
Accuracy of Logistic Regression: 0.3225275933336265
              precision    recall  f1-score   support

           0       0.26      0.88      0.40     29689
           1       0.75      0.13      0.22     84016

    accuracy                           0.32    113705
   macro avg       0.50      0.50      0.31    113705
weighted avg       0.62      0.32      0.26    113705



['logistic_regression_best_model.pkl']

### Decision Tree

In [5]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load the dataset
train_data = pd.read_csv('/content/drive/MyDrive/Dsw/train_data.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Dsw/test_data.csv')
target_column = 'loan_status'

# Prepare features and target
X_train = train_data.drop(columns=[target_column])
y_train = train_data[target_column]
X_test = test_data.drop(columns=[target_column])
y_test = test_data[target_column]

# Decision Tree Model and Grid
model = DecisionTreeClassifier()
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Hyperparameter tuning
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# Evaluation
print("Best parameters for Decision Tree:", grid_search.best_params_)
y_pred = best_model.predict(X_test)
print(f"Accuracy of Decision Tree: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

# Save the model
joblib.dump(best_model, 'decision_tree_best_model.pkl')


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters for Decision Tree: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5}
Accuracy of Decision Tree: 0.5903258431907128
              precision    recall  f1-score   support

           0       0.30      0.44      0.36     29689
           1       0.77      0.64      0.70     84016

    accuracy                           0.59    113705
   macro avg       0.53      0.54      0.53    113705
weighted avg       0.64      0.59      0.61    113705



['decision_tree_best_model.pkl']

### XGBoost

In [2]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load the dataset
train_data = pd.read_csv('/content/drive/MyDrive/Dsw/train_data.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Dsw/test_data.csv')
target_column = 'loan_status'

# Prepare features and target
X_train = train_data.drop(columns=[target_column])
y_train = train_data[target_column]
X_test = test_data.drop(columns=[target_column])
y_test = test_data[target_column]

# XGBoost Model and Grid
model = xgb.XGBClassifier()
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0]
}

# Hyperparameter tuning
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# Evaluation
print("Best parameters for XGBoost:", grid_search.best_params_)
y_pred = best_model.predict(X_test)
print(f"Accuracy of XGBoost: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

# Save the model
joblib.dump(best_model, 'xgboost_best_model.pkl')


Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}
Accuracy of XGBoost: 0.745631238731806
              precision    recall  f1-score   support

           0       0.52      0.42      0.46     29689
           1       0.81      0.86      0.83     84016

    accuracy                           0.75    113705
   macro avg       0.66      0.64      0.65    113705
weighted avg       0.73      0.75      0.74    113705



['xgboost_best_model.pkl']

**Model Evaluation**

In [10]:
import pandas as pd
import joblib
import shap

# Load the pre-trained XGBoost model
model_path = 'xgboost_best_model.pkl'
try:
    xgboost_model = joblib.load(model_path)
    print("XGBoost model loaded successfully.")
except FileNotFoundError:
    print(f"Error: Model file '{model_path}' not found.")
    exit()

# Define expected feature names based on the model training
expected_features = [
    'customer_id', 'cibil_score', 'total_no_of_acc', 'annual_inc', 'loan_amnt',
    'installment', 'account_bal', 'emp_length', 'transaction_month',
    'transaction_year', 'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4',
    'sub_grade_C5', 'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3',
    'sub_grade_D4', 'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2',
    'sub_grade_E3', 'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1',
    'sub_grade_F2', 'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5',
    'sub_grade_G1', 'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4',
    'sub_grade_G5', 'term_ 60 months', 'home_ownership_OWN',
    'home_ownership_RENT', 'purpose_credit_card', 'purpose_debt_consolidation',
    'purpose_home_improvement', 'purpose_house', 'purpose_major_purchase',
    'purpose_medical', 'purpose_moving', 'purpose_other',
    'purpose_renewable_energy', 'purpose_small_business', 'purpose_vacation',
    'application_type_Joint App', 'verification_status_Source Verified',
    'verification_status_Verified', 'int_rate_binned_Medium',
    'int_rate_binned_High', 'int_rate_binned_Very High'
]

# Function to allow user input
def get_user_input():
    print("\nEnter feature values for prediction:")
    user_input = {
        'income': float(input("Enter income: ")),
        'loan_amount': float(input("Enter loan amount: ")),
        'credit_score': float(input("Enter credit score: ")),
        'loan_term': float(input("Enter loan term: ")),
        'employment_length': float(input("Enter employment length: "))
    }
    return user_input

# Map user inputs to the model's expected features
def prepare_features(user_input, expected_features):
    # Initialize a DataFrame with zeros for all expected features
    user_features = pd.DataFrame([0] * len(expected_features), index=expected_features).T

    # Map user inputs to appropriate features
    user_features['annual_inc'] = user_input['income']
    user_features['loan_amnt'] = user_input['loan_amount']
    user_features['cibil_score'] = user_input['credit_score']
    user_features['emp_length'] = user_input['employment_length']

    # Set default values for other features (you may update these as needed)
    user_features['term_ 60 months'] = 1 if user_input['loan_term'] > 36 else 0

    return user_features


user_input = get_user_input()


user_df = prepare_features(user_input, expected_features)


try:
    user_prediction = xgboost_model.predict(user_df)
    user_prob = xgboost_model.predict_proba(user_df)[:, 1]
    print("\nPrediction Results:")
    print("Prediction (0: Non-Defaulter, 1: Defaulter):", user_prediction[0])
    print(f"Default Probability: {user_prob[0]:.2f}")
except Exception as e:
    print(f"Error during prediction: {e}")
    exit()





XGBoost model loaded successfully.

Enter feature values for prediction:
Enter income: 450000
Enter loan amount: 100000
Enter credit score: 650
Enter loan term: 24
Enter employment length: 6

Prediction Results:
Prediction (0: Non-Defaulter, 1: Defaulter): 1
Default Probability: 0.94
