In [1]:
# Import the required modules
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score

In [2]:
# Read in the transaction_fraud_data.csv file into a PandasDataFrame.
lending_data = pd.read_csv(
    Path("lending_data.csv"), 
)

# Review the DataFrame
lending_data.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [3]:
# The  column 'loan_status' is the thing you want to predict. 
# Class 0 indicates loan is healthy and class 1 indicates loan is high risk of defaulting
# Using value_counts, how many loan_status are in this dataset?
lending_data["loan_status"].value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

In [4]:
# The target column should be the binary `loan_status` column.
target=lending_data["loan_status"]


# The features column should be all of the features. 
features= lending_data.drop(columns="loan_status")

In [5]:
# Split the dataset using the train_test_split function
X_train, X_test, y_train, y_test = train_test_split(features, target)

In [6]:
# Declare a logistic regression model.
# Apply a random_state of 7 to the model
logistic_regression_model = LogisticRegression(random_state=7)

In [7]:
# Fit and save the logistic regression model using the training data
lr_model = logistic_regression_model.fit(X_train, y_train)

In [8]:
#Generate training predictions
training_predictions = lr_model.predict(X_train)

#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test)

In [9]:
# Display the accuracy score for the test dataset.
accuracy_score(y_test, testing_predictions)

0.9918489475856377

In [10]:
# Import the model for sklearn's confusion matrix
from sklearn.metrics import confusion_matrix

# Create and save the confustion matrix for the training data
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the training data
print(training_matrix)

[[55974   285]
 [  173  1720]]


In [11]:
# Create and save the confustion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[18683    94]
 [   64   543]]


In [12]:
# Create and save the training classifiction report
training_report = classification_report(y_train, training_predictions)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     56259
           1       0.86      0.91      0.88      1893

    accuracy                           0.99     58152
   macro avg       0.93      0.95      0.94     58152
weighted avg       0.99      0.99      0.99     58152



In [13]:
# Create and save the testing classifiction report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18777
           1       0.85      0.89      0.87       607

    accuracy                           0.99     19384
   macro avg       0.92      0.94      0.93     19384
weighted avg       0.99      0.99      0.99     19384



In [14]:
#How well does the logistic regression model predict both the 0 (healthy loan) and 1 (high-risk loan) labels? 
#the logistic regression model predict based on the score of accuracy incorporating our precion and recall,this is the good indication that the model does well

In [15]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

In [16]:
# Import the RandomForestClassifier from sklearn
from sklearn.ensemble import RandomForestClassifier

# Instantiate a RandomForestClassifier instance
model = RandomForestClassifier()

# Fit the traning data to the model
model.fit(X_train_scaled, y_train)

In [17]:
# Predict labels for original scaled testing features
y_pred = model.predict(X_test_scaled)

In [18]:
# Import RandomUnderSampler from imblearn
from imblearn.under_sampling import RandomUnderSampler

# Instantiate a RandomUnderSampler instance
rus = RandomUnderSampler(random_state=1)

In [19]:
# Fit the training data to the random undersampler model
X_undersampled, y_undersampled = rus.fit_resample(X_train_scaled, y_train)


In [20]:
# Count distinct values for the resampled target data
y_undersampled.value_counts()

0    1893
1    1893
Name: loan_status, dtype: int64

In [21]:
# Instantiate a new RandomForestClassier model
model_undersampled = RandomForestClassifier()

# Fit the undersampled data the new model
model_undersampled.fit(X_undersampled, y_undersampled)

In [22]:
# Predict labels for oversampled testing features
y_pred_undersampled = model_undersampled.predict(X_test_scaled)

In [23]:
# Print classification reports
print(f"Classifiction Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print(f"Classifiction Report - Undersampled Data")
print(classification_report(y_test, y_pred_undersampled))

Classifiction Report - Original Data
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18777
           1       0.85      0.89      0.87       607

    accuracy                           0.99     19384
   macro avg       0.92      0.94      0.93     19384
weighted avg       0.99      0.99      0.99     19384

---------
Classifiction Report - Undersampled Data
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18777
           1       0.81      0.99      0.89       607

    accuracy                           0.99     19384
   macro avg       0.91      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



In [24]:
# Import RandomOverSampler from imblearn
from imblearn.over_sampling import RandomOverSampler

# Instantiate a RandomOversampler instance
ros = RandomOverSampler(random_state=1)

In [25]:
# Fit the model to the training data
X_oversampled, y_oversampled = ros.fit_resample(X_train_scaled, y_train)

In [26]:
# Count distinct values
y_oversampled.value_counts()

0    56259
1    56259
Name: loan_status, dtype: int64

In [27]:
# Instantiate a new RandomForestClassier model
model_oversampled = RandomForestClassifier()

# Fit the oversampled data the new model
model_oversampled.fit(X_oversampled, y_oversampled)

In [28]:
# Predict labels for oversampled testing features
y_pred_oversampled = model_oversampled.predict(X_test_scaled)

In [29]:
# Print classification reports
print(f"Classifiction Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print(f"Classifiction Report - Undersampled Data")
print(classification_report(y_test, y_pred_undersampled))
print("---------")
print(f"Classifiction Report - Oversampled Data")
print(classification_report(y_test, y_pred_oversampled))

Classifiction Report - Original Data
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18777
           1       0.85      0.89      0.87       607

    accuracy                           0.99     19384
   macro avg       0.92      0.94      0.93     19384
weighted avg       0.99      0.99      0.99     19384

---------
Classifiction Report - Undersampled Data
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18777
           1       0.81      0.99      0.89       607

    accuracy                           0.99     19384
   macro avg       0.91      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384

---------
Classifiction Report - Oversampled Data
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18777
           1       0.83      0.91      0.87       607

    accuracy                           0.99 

In [53]:

# Create a Logisticregression
lr_models = LogisticRegression(random_state=1)

# Fitting the model
lr_oversampled = lr_models.fit(X_oversampled, y_oversampled)

# Making predictions using the resampler data
lr_oversampled_predictions = lr_models.predict(X_test_scaled)

In [54]:
# Display the accuracy score for the resample dataset.
basr = balanced_accuracy_score(y_test,lr_oversampled_predictions)
basr

0.9953968098129797

In [55]:
# Print the confusion matrix on the resampler data
confusion_matrix(y_test, lr_oversampled_predictions)

array([[18666,   111],
       [    2,   605]], dtype=int64)

In [56]:
# Print the classification report for the original data
print(classification_report_imbalanced(y_test, lr_oversampled_predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      1.00      1.00      1.00      0.99     18777
          1       0.84      1.00      0.99      0.91      1.00      0.99       607

avg / total       1.00      0.99      1.00      0.99      1.00      0.99     19384



In [None]:
#random over sampler modele perform very based on the indice of accuracy which is so close of one 