### import libraries

In [53]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

### Data Preprocessing and EDA and cleaning, and standardization/ feature selection

In [54]:


# Load the dataset
df = pd.read_csv('data/loan_train.csv')

# Handle missing values if any
df.dropna(inplace=True)

# Encode categorical variables
label_encoder = LabelEncoder()
df['loan_status'] = label_encoder.fit_transform(df['loan_status'])
df['education'] = label_encoder.fit_transform(df['education'])
df['Gender'] = label_encoder.fit_transform(df['Gender'])

# Feature selection
X = df[['Principal', 'terms', 'age', 'education', 'Gender']]
y = df['loan_status']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

### Model Training and initiation of Logistic Regression, K-Nearest Neighbors, Decision Tree, and Support Vector Machine

In [55]:


# Initialize the models
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

svc = SVC()
svc.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Model Evaluation for Classification using logistic regression, K-Nearest Neighbors, Decision Tree, and Support Vector Machine

In [56]:
# Logistic Regression
y_pred_logistic_regression = logistic_regression.predict(X_test)
print("Performance of Logistic Regression:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_logistic_regression))
print("Classification Report:")
print(classification_report(y_test, y_pred_logistic_regression))
print("Accuracy:", accuracy_score(y_test, y_pred_logistic_regression))
print("\n")

# K-Nearest Neighbors
y_pred_knn = knn.predict(X_test)
print("Performance of K-Nearest Neighbors:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))
print("Classification Report:")
print(classification_report(y_test, y_pred_knn))
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("\n")

# Decision Tree
y_pred_decision_tree = decision_tree.predict(X_test)
print("Performance of Decision Tree:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_decision_tree))
print("Classification Report:")
print(classification_report(y_test, y_pred_decision_tree))
print("Accuracy:", accuracy_score(y_test, y_pred_decision_tree))
print("\n")

# Support Vector Machine
y_pred_svc = svc.predict(X_test)
print("Performance of Support Vector Machine:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svc))
print("Classification Report:")
print(classification_report(y_test, y_pred_svc))
print("Accuracy:", accuracy_score(y_test, y_pred_svc))
print("\n")

Performance of Logistic Regression:
Confusion Matrix:
[[ 0 20]
 [ 0 50]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        20
           1       0.71      1.00      0.83        50

    accuracy                           0.71        70
   macro avg       0.36      0.50      0.42        70
weighted avg       0.51      0.71      0.60        70

Accuracy: 0.7142857142857143


Performance of K-Nearest Neighbors:
Confusion Matrix:
[[ 2 18]
 [ 4 46]]
Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.33      0.10      0.15        20
           1       0.72      0.92      0.81        50

    accuracy                           0.69        70
   macro avg       0.53      0.51      0.48        70
weighted avg       0.61      0.69      0.62        70

Accuracy: 0.6857142857142857


Performance of Decision Tree:
Confusion Matrix:
[[ 3 17]
 [15 35]]
Classification Report:
              precision    recall  f1-score   support

           0       0.17      0.15      0.16        20
           1       0.67      0.70      0.69        50

    accuracy                           0.54        70
   macro avg       0.42      0.42      0.42        70
weighted avg       0.53      0.54      0.54        70

Accuracy: 0.5428571428571428


Performance of Support Vector Machine:
Confusion Matrix:
[[ 0 20]
 [ 0 50]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Hyperparameter Tuning

In [57]:
# Logistic Regression
param_grid_log_reg = {'C': [0.1, 1, 10]}
grid_log_reg = GridSearchCV(LogisticRegression(), param_grid_log_reg, cv=5)
grid_log_reg.fit(X_train, y_train)
print("Logistic Regression Best Parameters:", grid_log_reg.best_params_)
print("Logistic Regression Best Score:", grid_log_reg.best_score_)

# K-Nearest Neighbors
param_grid_knn = {'n_neighbors': [3, 5, 7]}
grid_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=5)
grid_knn.fit(X_train, y_train)
print("K-Nearest Neighbors Best Parameters:", grid_knn.best_params_)
print("K-Nearest Neighbors Best Score:", grid_knn.best_score_)

# Decision Tree
param_grid_decision_tree = {'max_depth': [None, 10, 20, 30]}
grid_decision_tree = GridSearchCV(DecisionTreeClassifier(), param_grid_decision_tree, cv=5)
grid_decision_tree.fit(X_train, y_train)
print("Decision Tree Best Parameters:", grid_decision_tree.best_params_)
print("Decision Tree Best Score:", grid_decision_tree.best_score_)

# Support Vector Machine
param_grid_svc = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid_svc = GridSearchCV(SVC(), param_grid_svc, cv=5)
grid_svc.fit(X_train, y_train)
print("Support Vector Machine Best Parameters:", grid_svc.best_params_)
print("Support Vector Machine Best Score:", grid_svc.best_score_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression Best Parameters: {'C': 0.1}
Logistic Regression Best Score: 0.760909090909091
K-Nearest Neighbors Best Parameters: {'n_neighbors': 7}
K-Nearest Neighbors Best Score: 0.7538311688311689
Decision Tree Best Parameters: {'max_depth': 10}
Decision Tree Best Score: 0.6198701298701298
Support Vector Machine Best Parameters: {'C': 0.1, 'kernel': 'linear'}
Support Vector Machine Best Score: 0.760909090909091


### Ensemble Learning

In [58]:
# Initialize the models
bagging = BaggingClassifier(n_estimators=100, random_state=35)
random_forest = RandomForestClassifier(n_estimators=100, random_state=15)

# Train the models
bagging.fit(X_train, y_train)
random_forest.fit(X_train, y_train)

# Evaluate the models
y_pred_bagging = bagging.predict(X_test)
y_pred_rf = random_forest.predict(X_test)

print("Performance of Bagging Classifier:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_bagging))
print("Classification Report:")
print(classification_report(y_test, y_pred_bagging))
print("Accuracy:", accuracy_score(y_test, y_pred_bagging))
print("\n")

print("Performance of Random Forest Classifier:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Accuracy:", accuracy_score(y_test, y_pred_rf))

Performance of Bagging Classifier:
Confusion Matrix:
[[ 1 19]
 [ 7 43]]
Classification Report:
              precision    recall  f1-score   support

           0       0.12      0.05      0.07        20
           1       0.69      0.86      0.77        50

    accuracy                           0.63        70
   macro avg       0.41      0.46      0.42        70
weighted avg       0.53      0.63      0.57        70

Accuracy: 0.6285714285714286


Performance of Random Forest Classifier:
Confusion Matrix:
[[ 1 19]
 [ 8 42]]
Classification Report:
              precision    recall  f1-score   support

           0       0.11      0.05      0.07        20
           1       0.69      0.84      0.76        50

    accuracy                           0.61        70
   macro avg       0.40      0.45      0.41        70
weighted avg       0.52      0.61      0.56        70

Accuracy: 0.6142857142857143


### Making future prediction using SVM because it has the highest accuracy of 0.8 on our dataset.

In [59]:
# Load the dataset again for future observations
df = pd.read_csv('data/loan_train.csv')

# Handle missing values if any
df.dropna(inplace=True)

# Encode categorical variables
label_encoder_loan_status = LabelEncoder()
label_encoder_education = LabelEncoder()
label_encoder_gender = LabelEncoder()

df['loan_status'] = label_encoder_loan_status.fit_transform(df['loan_status'])
df['education'] = label_encoder_education.fit_transform(df['education'])
df['Gender'] = label_encoder_gender.fit_transform(df['Gender'])

# Feature selection
X = df[['Principal', 'terms', 'age', 'education', 'Gender']]
y = df['loan_status']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

# Train the SVM model
svm = SVC()
svm.fit(X_train, y_train)

# Make predictions on new data (example)
new_data = pd.DataFrame({
    'Principal': [1000, 800],
    'terms': [30, 15],
    'age': [45, 33],
    'education': ['High School or Below', 'Bechalor'],
    'Gender': ['male', 'female']
})

# Transform the new data
new_data = pd.DataFrame(new_data)

# Define mappings
education_mapping = {'High School or Below': 0, 'Bechalor': 1, 'Master or Above': 2}
gender_mapping = {'male': 0, 'female': 1}

# Apply mappings
new_data['education'] = new_data['education'].map(education_mapping)
new_data['Gender'] = new_data['Gender'].map(gender_mapping)

print(new_data)

# Make predictions
predictions = svm.predict(new_data)
print("Predictions for new data:", predictions)

   Principal  terms  age  education  Gender
0       1000     30   45          0       0
1        800     15   33          1       1
Predictions for new data: [1 1]
