### Problem Statement
##  PeerLoanKart is an NBFC (Non-Banking Financial Company) that facilitates peer-to-peer loans. It connects people who need money (borrowers) with people who
##  have money (investors). As an investor, you would want to invest in people whose profile shows a high probability of paying you back. You, as
## an "ML Expert", create a model that will help predict whether a borrower will repay the loan or not.
##  Objective: increase profit by up to 20%, as NPAs (non-performing assets) will be reduced by disbursing loans only to creditworthy borrowers.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split

In [None]:
# Load the loan dataset from the CSV file in the working directory into a DataFrame.
loan = pd.read_csv(filepath_or_buffer="loan_data.csv")

In [None]:
# Display descriptive statistics for the dataset's columns (cell output is the
# value of this trailing expression).
loan.describe()

In [None]:
# Print the DataFrame summary: column names, non-null counts and dtypes.
loan.info()



In [None]:
# Preview the first rows of the dataset.
loan.head()

In [None]:
# Number of missing values in each column (isna is the canonical name that
# isnull aliases).
loan.isna().sum()

## Create a histogram of the two FICO distributions on top of each other, one for each credit.policy outcome.

In [None]:
# Overlay the FICO score histograms of the two credit.policy groups on one figure.
plt.figure(figsize=(10, 6))
for policy_value, bar_color in ((1, 'blue'), (0, 'red')):
    # Select the FICO scores of the loans in this credit.policy group.
    fico_scores = loan.loc[loan['credit.policy'] == policy_value, 'fico']
    fico_scores.hist(alpha=0.5, color=bar_color, bins=30,
                     label=f'credit.policy={policy_value}')
plt.legend()
plt.xlabel('FICO')

## Create a similar figure, this time split by the not.fully.paid column.

In [None]:
import matplotlib.pyplot as plt

# FICO score distribution split by repayment outcome, both classes on one figure.
plt.figure(figsize=(10, 6))

# One semi-transparent histogram per not.fully.paid class, drawn on shared axes.
for outcome, shade in ((1, 'blue'), (0, 'red')):
    outcome_mask = loan['not.fully.paid'] == outcome
    loan.loc[outcome_mask, 'fico'].hist(alpha=0.5, color=shade, bins=30,
                                        label=f'not.fully.paid={outcome}')

# Legend, axis labels and title.
plt.legend()
plt.xlabel('FICO')
plt.ylabel('Count')
plt.title('FICO Score Distribution by Loan Repayment Status')

# Render the figure.
plt.show()


## Create a countplot using seaborn showing the count of loans by purpose, with the hue defined by not.fully.paid.

In [None]:
# Number of loans per purpose, with the bars split by repayment outcome.
plt.figure(figsize=(11, 7))
sns.countplot(data=loan, x='purpose', hue='not.fully.paid', palette='Set1')

In [None]:
# List the distinct values of the "purpose" column.
loan["purpose"].unique()


In [None]:
# One-hot encode the "purpose" column; drop_first=True omits the indicator
# column of the first category.
loan = pd.get_dummies(data=loan, columns=["purpose"], drop_first=True)


In [None]:
# Features are every column except the target; the target is not.fully.paid.
X = loan.drop(columns='not.fully.paid')
y = loan['not.fully.paid']

# Hold out 30% of the rows for testing. stratify=y keeps the class ratio of
# not.fully.paid the same in the train and test partitions, so the per-class
# metrics are measured on a representative test set; random_state fixes the
# shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=101, stratify=y
)

### Training Decision Tree Model

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default hyperparameters. random_state is pinned
# so the fitted tree and its metrics are reproducible between runs, matching
# the seed used for the tuned tree later in the notebook.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

## Evaluating Decision Tree: create predictions from the test set, then create a classification report and confusion matrix.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict labels for the held-out set and print the per-class metrics report.
predictions = dtree.predict(X_test)
dtree_report = classification_report(y_test, predictions)
print(dtree_report)

## Confusion Matrix

In [None]:
# Confusion matrix of the true test labels versus the current `predictions`.
dtree_confusion = confusion_matrix(y_test, predictions)
print(dtree_confusion)

## To improve the DecisionTreeClassifier, we tune its hyperparameters using GridSearchCV.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Hyperparameter grid searched exhaustively (5 * 3 * 4 * 2 = 120 combinations).
param_grid = {
    'max_depth': [3, 5, 10, 15, None],  # Limits tree depth
    'min_samples_split': [2, 5, 10],  # Minimum samples needed to split a node
    'min_samples_leaf': [1, 2, 5, 10],  # Minimum samples per leaf
    'criterion': ['gini', 'entropy'],  # Split method
}

# Template estimator; GridSearchCV fits clones of it, one per candidate and fold.
dtree = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validated search, ranking candidates by F1, using all cores.
grid_search = GridSearchCV(dtree, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters from GridSearchCV
print("Best Parameters:", grid_search.best_params_)

# With the default refit=True, GridSearchCV has already refit the winning
# configuration on all of X_train, so reuse that estimator instead of training
# an identical tree a second time.
best_dtree = grid_search.best_estimator_

# Predict using the tuned model
best_predictions = best_dtree.predict(X_test)

# Evaluate performance on the held-out set.
print("Confusion Matrix:\n", confusion_matrix(y_test, best_predictions))
print("\nClassification Report:\n", classification_report(y_test, best_predictions))


##  The model has not improved, hence we use the RandomForest classifier.

## Training Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 600-tree random forest. random_state pins the bootstrap/feature sampling so
# the reported metrics are reproducible; n_jobs=-1 builds the trees on all
# cores and does not change the fitted model.
rfc = RandomForestClassifier(n_estimators=600, random_state=42, n_jobs=-1)
rfc.fit(X_train, y_train)

## Evaluating Random Forest Model

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the random forest on the held-out set and print the per-class report.
predictions = rfc.predict(X_test)
rfc_report = classification_report(y_test, predictions)
print(rfc_report)

### Printing the confusion matrix 

In [None]:
# Confusion matrix of the true test labels versus the current `predictions`.
rfc_confusion = confusion_matrix(y_test, predictions)
print(rfc_confusion)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Random forest with class_weight="balanced", which weights classes inversely
# to their frequency in the training labels. The estimator must be fitted and
# scored for this cell to have any effect; random_state keeps it reproducible.
rfc = RandomForestClassifier(n_estimators=600, class_weight="balanced", random_state=42)
rfc.fit(X_train, y_train)

# Evaluate the class-weighted forest on the held-out set.
balanced_predictions = rfc.predict(X_test)
print(classification_report(y_test, balanced_predictions))
print(confusion_matrix(y_test, balanced_predictions))


In [None]:
!pip install --upgrade joblib


In [None]:
!pip install xgboost


In [None]:
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix


In [None]:
# Baseline XGBoost classifier: log-loss evaluation metric, fixed seed, all
# other settings left at the library defaults.
xgb_settings = dict(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model = xgb.XGBClassifier(**xgb_settings)
xgb_model.fit(X_train, y_train)


In [None]:
# Predict the held-out labels with the baseline XGBoost model.
xgb_predictions = xgb_model.predict(X_test)

# Compute both summaries first, then print them in the same order as before.
xgb_confusion = confusion_matrix(y_test, xgb_predictions)
xgb_report = classification_report(y_test, xgb_predictions)
print("Confusion Matrix:\n", xgb_confusion)
print("\nClassification Report:\n", xgb_report)


##  Hyperparameter Tuning for XGBoost

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Hyperparameter grid searched exhaustively (3 * 3 * 3 * 2 * 2 = 108 combinations).
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
}

# Template estimator; GridSearchCV fits clones of it, so this object itself
# stays unfitted after the search.
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# 5-fold cross-validated search, ranking candidates by F1, using all cores.
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# With the default refit=True, GridSearchCV has already refit the winning
# configuration on all of X_train, so reuse that estimator instead of training
# the same model a second time.
best_xgb = grid_search.best_estimator_

# Make predictions
best_xgb_predictions = best_xgb.predict(X_test)

# Evaluate on the held-out set.
print("Confusion Matrix:\n", confusion_matrix(y_test, best_xgb_predictions))
print("\nClassification Report:\n", classification_report(y_test, best_xgb_predictions))


In [None]:
import joblib

# Persist the tuned, fitted model. `xgb_model` is only the template that was
# handed to GridSearchCV, which fits clones and leaves the template unfitted,
# so dumping it would save a model that cannot predict.
joblib.dump(best_xgb, 'xgb_model.pkl')


In [None]:
import joblib

# Fit a fresh default-parameter XGBoost model on the training split (fit
# returns the estimator itself), then persist it.
# NOTE(review): this writes the untuned model to xgb_model.pkl rather than the
# grid-searched best_xgb — confirm which model is meant to be shipped.
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
).fit(X_train, y_train)
joblib.dump(xgb_model, 'xgb_model.pkl')


In [None]:
import joblib

# Persist the ordered training feature names as a plain list of column labels.
training_columns = list(X_train.columns)
joblib.dump(training_columns, 'training_columns.pkl')
