In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

___
# DATA PREPROCESSING
___

In [3]:
# Path of the CSV file read by the loading cell below.
DATA = "loan_approval_dataset.csv"

In [4]:
# Read the raw dataset into a DataFrame. No whitespace trimming is requested
# (skipinitialspace is left at its default), so headers and string cells keep
# whatever padding the file contains.
df = pd.read_csv(filepath_or_buffer=DATA)

In [6]:
# Display the per-column minimum as a quick sanity check of value ranges.
df.min()

In [6]:
# Display summary statistics of the frame's columns.
df.describe()

In [7]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

In [8]:
# Remove the row identifier so it is never used as a feature.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' makes
# this cell idempotent: re-running it after the column is already gone no
# longer raises KeyError.
df = df.drop(columns='loan_id', errors='ignore')

In [9]:
# Snapshot of the column index after the identifier column was dropped.
cols = df.columns

In [10]:
# Feature column names: every column except the target.
# Note the leading space in ' loan_status' -- the label is matched exactly.
x_cols = cols.drop(' loan_status')

In [11]:
# Names of all numeric-dtype columns; these feed the numeric branch of the
# preprocessor.
numerical_cols = df.select_dtypes(include='number').columns

In [12]:
# Display the numeric column names for inspection.
numerical_cols

In [13]:
# Names of all object-dtype columns (this still includes the target column,
# which is removed in the next cell).
categorical_cols = df.select_dtypes(include='object').columns
categorical_cols

In [14]:
# Categorical *feature* columns: the object columns minus the target label.
categorical_colss = categorical_cols.drop(labels=' loan_status')

In [15]:
# Display the categorical feature column names for inspection.
categorical_colss

Preparing the Encoding Preprocessor:

In [16]:
# --- Feature ("X") preprocessing ---------------------------------------------

# Numeric branch: median-impute missing values, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scalar', StandardScaler()),
])

# Categorical branch: impute with the most frequent value, then one-hot encode.
# handle_unknown='ignore' means a category not seen during fitting does not
# raise an error at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch; columns listed in neither group are
# passed through untouched.
preprocessor = ColumnTransformer(
    [
        ('cat', categorical_transformer, categorical_colss),
        ('num', numerical_transformer, numerical_cols),
    ],
    remainder='passthrough',
)

# --- Target ("y") preprocessing ----------------------------------------------
le = LabelEncoder()

In [17]:
# Split the frame into features and target by column *name* rather than by
# position, so a change in column order cannot silently swap in the wrong
# target. `x_cols` was built above as "all columns except ' loan_status'".
X = df[x_cols]
y = df[' loan_status']

___
# Train Test Split
___

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Encoding the Sets

In [19]:
# Fit the preprocessor on the training split only, then apply the already
# fitted transformation to the test split, so no test-set statistics feed
# into the imputers, scaler or encoder.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [20]:
# Encode the string target as integer codes. The encoder is fitted on the
# training labels and reused for the test labels; the fitted classes are
# available afterwards as `le.classes_`.
y_train_transformed = le.fit_transform(y_train)
y_test_transformed = le.transform(y_test)

___
# Model Training
___

In [21]:
# Baseline classifier: logistic regression with default settings and a fixed seed.
lgr_model = LogisticRegression(random_state=42)

In [22]:
# Train the baseline model on the preprocessed (not resampled) training data.
lgr_model.fit(X=X_train_transformed, y=y_train_transformed)

___
# Predictions and Evaluations
___

Evals:

In [23]:
# Predicted integer class codes for the held-out test split.
lgr_y_pred = lgr_model.predict(X=X_test_transformed)

In [24]:
# Print a heading followed by per-class precision / recall / F1 for the
# baseline model.
print(
    "Classification Report:",
    classification_report(y_test_transformed, lgr_y_pred),
    sep="\n",
)

preds:

In [25]:
# Display the feature column order expected when building a prediction row.
x_cols

2, Graduate, Yes,9100000,31500000,14,679,10800000,16600000,20900000,5000000, Approved

In [26]:
# One hand-written applicant row, in the same order as `x_cols`.
# The categorical strings must match the training data exactly. The CSV is
# read without whitespace stripping, and its string fields carry a leading
# space (see the ' loan_status' column name and the sample row above), so the
# values here are ' Graduate' / ' Yes' rather than 'Graduate' / 'Yes'.
# Without the space, OneHotEncoder(handle_unknown='ignore') would treat both
# as unseen categories and silently encode them as all zeros.
# NOTE(review): confirm against df[' education'].unique() and
# df[' self_employed'].unique().
prediction_data = [[
    2, ' Graduate', ' Yes', 100000, 31500000, 14, 500, 1008000, 100000, 20900000, 5000000
]]

In [27]:
# Wrap the raw row in a DataFrame labelled with the training feature names so
# the preprocessor can select columns by name.
new_df = pd.DataFrame(data=prediction_data, columns=x_cols)

In [28]:
# Apply the already fitted preprocessor (transform only, no re-fitting) to the new row.
transformed_new_df = preprocessor.transform(new_df)

In [29]:

# Map each integer code produced by the fitted LabelEncoder (`le`) back to a
# readable label. Building the mapping from `le.classes_` (position == code)
# instead of typing it by hand keeps it in sync with the encoder if the labels
# ever change. `.strip()` removes the padding whitespace carried by the raw
# labels (the target column itself is named ' loan_status').
# Requires the cell that fits `le` to have run first.
status_mapping = {
    code: str(label).strip()
    for code, label in enumerate(le.classes_)
}

In [30]:
# Predicted class code(s) from the baseline model for the hand-written row.
pred = lgr_model.predict(X=transformed_new_df)

In [31]:
# Display the raw prediction array (integer class codes).
pred

In [32]:
# Translate the first predicted code into its label via status_mapping.
status_mapping[pred[0]]

___
# Bonus - 1
___
Using SMOTE to Address Data Imbalance:
__

In [33]:
# Training-set dimensions prior to resampling, for comparison with the
# post-SMOTE shapes printed further down.
print(
    f"Shape of X_train Dataset before SMOTE: {X_train_transformed.shape}",
    f"Shape of y_train Dataset before SMOTE: {y_train_transformed.shape}",
    sep="\n",
)

In [34]:
# Resample the preprocessed training split only; the test split is never
# resampled. The seed keeps the synthetic samples reproducible.
smote = SMOTE(random_state=42)

(
    X_train_transformed_resampled,
    y_train_transformed_resampled,
) = smote.fit_resample(X_train_transformed, y_train_transformed)

In [35]:
# Training-set dimensions after SMOTE resampling.
print(
    f"Shape of X_train Dataset After SMOTE: {X_train_transformed_resampled.shape}",
    f"Shape of y_train Dataset After SMOTE: {y_train_transformed_resampled.shape}",
    sep="\n",
)

In [36]:
# Second logistic regression, identical configuration to the baseline but
# trained on the SMOTE-resampled training data.
smote_lgr_model = LogisticRegression(random_state=42)

smote_lgr_model.fit(
    X=X_train_transformed_resampled,
    y=y_train_transformed_resampled,
)

Getting Classification Reports of the Model

In [37]:
# Evaluate on the original (non-resampled) test split.
smote_lgr_y_pred = smote_lgr_model.predict(X=X_test_transformed)

In [38]:
# Per-class metrics for the SMOTE-trained logistic regression.
print(
    classification_report(
        y_true=y_test_transformed,
        y_pred=smote_lgr_y_pred,
    )
)

___
# Conclusion:

- As seen above, the Logistic Regression model trained on the original (non-resampled) data achieves an average F1 score of 0.90. The full report is as follows:
    - Normal LGR Classification Report:
        
```              precision    recall  f1-score   support

           0       0.92      0.93      0.92       536
           1       0.88      0.86      0.87       318
    accuracy                           0.91       854
   macro avg       0.90      0.90      0.90       854
weighted avg       0.90      0.91      0.90       854
```

- The Logistic Regression model trained on the SMOTE-resampled data achieves a weighted-average F1 score of 0.91. The full report is as follows:
    - SMOTE LGR CLASSIFICATION REPORT:

```              precision    recall  f1-score   support

           0       0.95      0.91      0.93       536
           1       0.85      0.91      0.88       318
    accuracy                           0.91       854
   macro avg       0.90      0.91      0.90       854
weighted avg       0.91      0.91      0.91       854  
```

    
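In a nutshell, the SMOTE technique gave slightly better results. The improvement is minimal (weighted-average F1 of 0.91 versus 0.90; overall accuracy is 0.91 in both reports), but since both approaches train in negligible time, we opt for the better-scoring SMOTE-trained model.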


___
# Bonus - 2
___
Training Decision Tree:
__

In [39]:
# Decision tree to be trained on the SMOTE-resampled data.
# Depth is capped at 7 and the seed is fixed for reproducibility.
smote_dt_model = DecisionTreeClassifier(
    max_depth=7,
    random_state=42,
)

In [40]:
# Fit the tree on the resampled training data, then confirm completion.
smote_dt_model.fit(
    X=X_train_transformed_resampled,
    y=y_train_transformed_resampled,
)
print("Model Trained!!")

___
Getting Classification Report

In [49]:
# Decision-tree predictions (integer class codes) for the held-out test split.
smote_dt_y_pred = smote_dt_model.predict(X=X_test_transformed)

In [48]:
# Print a heading followed by per-class metrics for the decision tree.
print(
    "Classification Reports: ",
    classification_report(y_test_transformed, smote_dt_y_pred),
    sep="\n",
)

___
Using the model to get a prediction on unseen data (encoded labels: Approved = 0, Rejected = 1)

In [60]:
# One hand-written applicant row, in the same order as `x_cols`.
# The categorical strings carry a leading space so they match the raw CSV
# values (the file is read without whitespace stripping -- see the
# ' loan_status' column name). Without the space,
# OneHotEncoder(handle_unknown='ignore') would encode them as all zeros and
# the model would silently ignore both categorical features.
# NOTE(review): confirm against df[' education'].unique() and
# df[' self_employed'].unique().
pred_data = [[
    1, " Not Graduate", " Yes", 8700000, 28300000, 36, 700,
    20400000, 13600000, 27900000, 10200000,
]]

In [61]:
# Label the raw row with the training feature names so the preprocessor can
# select columns by name.
new_df = pd.DataFrame(data=pred_data, columns=x_cols)

In [62]:
# Apply the already fitted preprocessor (transform only) to the new row.
transformed_new_df = preprocessor.transform(new_df)

In [63]:
# Predict with the decision tree and translate the first class code into its label.
status_mapping[smote_dt_model.predict(transformed_new_df)[0]]

___
# Final Verdict:

- As seen above, the Decision Tree classifier trained on the SMOTE-resampled data achieves an average F1 score of 0.96. The full report is as follows:
    - SMOTE Decision Tree Classification Report:
        
```              precision    recall  f1-score   support

           0       0.99      0.96      0.97       536
           1       0.93      0.98      0.95       318

    accuracy                           0.96       854
   macro avg       0.96      0.97      0.96       854
weighted avg       0.96      0.96      0.96       854
```

- The Logistic Regression model trained on the SMOTE-resampled data achieves a weighted-average F1 score of 0.91. The full report is as follows:
    - SMOTE LGR CLASSIFICATION REPORT:

```              precision    recall  f1-score   support

           0       0.95      0.91      0.93       536
           1       0.85      0.91      0.88       318
    accuracy                           0.91       854
   macro avg       0.90      0.91      0.90       854
weighted avg       0.91      0.91      0.91       854  
```

    
In a nutshell, the Decision Tree gives better scores across the classification report and is therefore the better choice of model for a real-world application.
___

In [47]:
# NOTE(review): leftover scratch cell with no role in the analysis; safe to delete.
print("Hello")

In [50]:
# Display the feature column names the saved pipeline expects as input.
x_cols

In [53]:
# Display the object-dtype column names (this index still includes the target column).
categorical_cols

In [57]:
# Bundle the fitted preprocessor and the fitted decision tree into a single
# object so raw feature rows can be scored with one call. Nothing is re-fitted
# here. The label encoder is not part of the pipeline, so its predictions are
# the integer class codes the tree was trained on.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('dt', smote_dt_model),
])

In [59]:
# Persist the bundled pipeline to disk for reuse outside this notebook.
joblib.dump(
    value=full_pipeline,
    filename='Loan_approval_fullpipeline.joblib',
)