In [None]:
# Import the required modules
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, RocCurveDisplay
from sklearn.preprocessing import StandardScaler


# suppress warnings
import warnings
warnings.filterwarnings('ignore')

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [None]:
# Load the lending dataset from the Resources folder into a pandas DataFrame.
# The path is relative, so the notebook must be run from the directory that contains `Resources/`.
df = pd.read_csv("Resources/lending_data.csv")
# Print the (rows, columns) tuple as a quick size check
print(df.shape)


# Preview the first rows of the DataFrame
df.head()

In [None]:
# Summary statistics for the DataFrame's columns
df.describe()

In [None]:
# Count the rows per target class to see how balanced `loan_status` is
df["loan_status"].value_counts()

### Step 2: Create the labels set (`y`) from the `loan_status` column, and then create the features (`X`) DataFrame from the remaining columns.

In [None]:
# Pairwise correlation matrix of the DataFrame's columns (includes the loan_status target)
corrs=df.corr()
# Display the matrix
corrs

In [None]:
# Draw the correlation matrix as an annotated heatmap (each cell shows its coefficient)
sns.heatmap(data=corrs, annot=True)
plt.show()

In [None]:
# Rank every column by the strength (absolute value) of its correlation with loan_status
corrs["loan_status"].abs().sort_values(ascending=False)

In [None]:
# List the column labels (used to pick the feature names below)
df.columns

In [None]:
# Names of the predictor columns used for modelling (every column except the target)
features = [
    'loan_size',
    'interest_rate',
    'borrower_income',
    'debt_to_income',
    'num_of_accounts',
    'derogatory_marks',
    'total_debt',
]

In [None]:
# Standardize the numeric feature columns
# NOTE(review): the scaler below is fitted on every row of `df`, i.e. before any train/test split.

# Keep only the predictor columns
df_sub = df[features]

# Learn the per-column scaling parameters, then apply them to the same rows
scaler = StandardScaler().fit(df_sub)
scaled_data = scaler.transform(df_sub)

# Wrap the scaled array back into a DataFrame with the original column names
df_scaled = pd.DataFrame(scaled_data, columns=features)

# Preview the scaled features
df_scaled.head()


In [None]:
# Summary statistics of the scaled features (sanity check on the scaling step)
df_scaled.describe()

In [None]:
# Step 1: Get the data
# Start from the UNSCALED feature columns: the scaling parameters are learned after the
# split so that no statistics from the test rows leak into the training data.
X = df.loc[:, features]
y = df.loan_status

# stratify=y keeps the loan_status class proportions the same in the train and test splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Fit the scaler on the training rows only, then apply the same transformation to both splits.
# The original row index is preserved so X_* stay aligned with y_*.
split_scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(split_scaler.transform(X_train), columns=features, index=X_train.index)
X_test = pd.DataFrame(split_scaler.transform(X_test), columns=features, index=X_test.index)

In [None]:
# Helpers for Classification
def _print_split_metrics(split_name, y_true, preds, proba):
    """Print the confusion matrix, ROC AUC and classification report of one data split."""
    cf = confusion_matrix(y_true, preds)
    cr = classification_report(y_true, preds)
    auc = roc_auc_score(y_true, proba)

    # The indented continuation lines are part of the printed report layout.
    results = f"""{split_name} METRICS
    Confusion Matrix: {cf}
    AUC: {auc}
    {cr}
    """

    print(results)


# Function for Classification
def doClassification(model, X_train, X_test, y_train, y_test):
    """Fit a binary classifier and report its performance on the train and test splits.

    The model is fitted on the training split. For each split the confusion matrix,
    ROC AUC and classification report are printed, and the ROC curve of the test
    split is plotted.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``. It is fitted in place.
    X_train, X_test : array-like
        Feature matrices of the training and testing splits.
    y_train, y_test : array-like
        Labels of the training and testing splits.

    Returns
    -------
    None
        All results are printed / plotted.
    """
    # Step 3: Fit the model
    model.fit(X_train, y_train)

    # Step 4: Evaluate the model
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    # Column 1 of predict_proba is used as the score of the positive class
    train_proba = model.predict_proba(X_train)[:, 1]
    test_proba = model.predict_proba(X_test)[:, 1]

    # Train metrics first, then test metrics, so over-fitting is easy to spot
    _print_split_metrics("TRAIN", y_train, train_preds, train_proba)
    _print_split_metrics("TEST", y_test, test_preds, test_proba)

    # VISUALIZE TEST RESULTS
    fpr, tpr, _ = roc_curve(y_test, test_proba)
    RocCurveDisplay(fpr=fpr, tpr=tpr).plot()
    # Render the figure right away so it appears directly under this model's metrics
    plt.show()

In [None]:
 # Step 2: Init the Model
lr = LogisticRegression()

# Fit the logistic regression and report its train/test metrics
doClassification(model=lr, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Decision tree classifier with a fixed seed
dt = DecisionTreeClassifier(random_state=42)

# Fit the tree and report its train/test metrics
doClassification(model=dt, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Random forest classifier with a fixed seed
rf = RandomForestClassifier(random_state=42)

# Fit the forest and report its train/test metrics
doClassification(model=rf, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Support vector classifier; probability=True is needed because doClassification calls predict_proba
svc = SVC(probability=True)

# Fit the SVC and report its train/test metrics
doClassification(model=svc, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)


In [None]:
# K-nearest-neighbours classifier voting over the 15 closest training rows
knn = KNeighborsClassifier(n_neighbors=15)

# Fit the KNN model and report its train/test metrics
doClassification(model=knn, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Extra-trees classifier with a fixed seed
et = ExtraTreesClassifier(random_state=42)

# Fit the extra-trees model and report its train/test metrics
doClassification(model=et, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# AdaBoost classifier with a fixed seed
ada = AdaBoostClassifier(random_state=42)

# Fit the AdaBoost model and report its train/test metrics
doClassification(model=ada, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Gradient boosting classifier with a fixed seed
gb = GradientBoostingClassifier(random_state=42)

# Fit the gradient boosting model and report its train/test metrics
doClassification(model=gb, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [None]:
# Initiate the model
# Seed it through `random_state` (the scikit-learn wrapper's seed parameter) so runs are reproducible
xgb = XGBClassifier(random_state=42)
# Fit the XGBoost model and report its train/test metrics
doClassification(xgb, X_train, X_test, y_train, y_test)

In [None]:
# Initiate the model
# Seed it through `random_state` so runs are reproducible; verbose=-1 silences LightGBM's training log
lgbm = LGBMClassifier(random_state=42, verbose=-1)
# Fit the LightGBM model and report its train/test metrics
doClassification(lgbm, X_train, X_test, y_train, y_test)

In [None]:
 # Step 2: Init the Model
# Same setup as the first logistic regression cell: a default-parameter model, rebound to `lr`
lr = LogisticRegression()

# Fit the logistic regression and report its train/test metrics
doClassification(model=lr, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Pair each feature name with its AdaBoost importance score
fi = pd.DataFrame(
    list(zip(X.columns, ada.feature_importances_)),
    columns=["Feature", "Importance"],
)
# Show the most important features first
fi.sort_values("Importance", ascending=False)

In [None]:
# Pair each feature name with its XGBoost importance score
fi = pd.DataFrame(
    list(zip(X.columns, xgb.feature_importances_)),
    columns=["Feature", "Importance"],
)
# Show the most important features first
fi.sort_values("Importance", ascending=False)

In [None]:
# Pair each feature name with its LightGBM importance score
fi = pd.DataFrame(
    list(zip(X.columns, lgbm.feature_importances_)),
    columns=["Feature", "Importance"],
)
# Show the most important features first
fi.sort_values("Importance", ascending=False)

In [None]:
# Split the DataFrame into the target (labels) and the predictors (features)

# Labels: the loan_status column
y = df.loc[:, 'loan_status']
# Features: every column except loan_status (left unscaled here)
X = df.drop(columns='loan_status')

In [None]:
# Preview the first rows of the labels Series (y)
y.head()

In [None]:
# Preview the first rows of the features DataFrame (X)
X.head()

### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [None]:
# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function so the split is reproducible.
# This rebinds X_train, X_test, y_train and y_test; no test_size or stratify is passed,
# so the function's defaults apply.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [None]:
# Import the LogisticRegression class from scikit-learn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using training data
# NOTE(review): X_train holds the unscaled columns here, and all warnings are suppressed at the
# top of the notebook, so a convergence warning from the solver would not be shown -- confirm.
lr_model = logistic_regression_model.fit(X_train, y_train)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [None]:
# Predict the labels of the testing feature data with the fitted model
test_predictions = logistic_regression_model.predict(X_test)
# Show the predicted labels next to the actual labels
pd.DataFrame({'Predictions': test_predictions, 'Actual': y_test})

### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [None]:
# Build the confusion matrix of actual vs. predicted test labels
cf_test_matrix = confusion_matrix(y_true=y_test, y_pred=test_predictions)
cf_test_matrix

In [None]:
# Build the per-class precision / recall / f1 report for the test predictions and print it
testing_report = classification_report(y_true=y_test, y_pred=test_predictions)
print(testing_report)

### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The logistic regression model was 95% accurate at predicting the healthy vs. high-risk loan labels.

---

In [None]:
# We have severe class imbalance, which I thought would affect model performance:
# the positive loan_status class is small.
# Logistic regression looks really good.
# The SVC model is really good -- this is likely because the relationships in the data are
# largely linear, which its more complex mathematical model captures well.
# The KNN model is also good: similar borrowers are likely to default together.
# Random forest / extra trees / decision tree all seem to be finding patterns in the
# non-default class and missing patterns in the positive class,
# likely due to the small sample.
# Trees are very dependent on sample size.
# AdaBoost looks good.
# Boosted trees also look really good, so they seem to have minimized the influence of the
# non-default class patterns when predicting the positive class.