In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
# Read the training and testing loan CSV files into DataFrames
train_path = Path('Resources/2019loans.csv')
test_path = Path('Resources/2020Q1loans.csv')
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Preview the first five rows of the training data
train_df.head(n=5)

In [None]:
# Split the training frame into its feature columns and its "target" label column
X_train = train_df.drop("target", axis=1)
y_train = train_df["target"]
X_train.head()

In [None]:
# Split the testing frame into its feature columns and its "target" label column
X_test = test_df.drop("target", axis=1)
y_test = test_df["target"]
X_test.head()

In [None]:
# Expand the categorical columns of both feature sets into indicator (dummy) columns
X_dummies_train, X_dummies_test = (
    pd.get_dummies(features) for features in (X_train, X_test)
)
print(f"Train: {X_dummies_train.shape}, Test: {X_dummies_test.shape}")

In [None]:
# Convert output labels to integer codes.
# A single encoder is fitted on the training labels and reused for the test
# labels so both sets share exactly one label -> code mapping. Fitting a second
# encoder on the test labels could assign different codes if the two sets do
# not contain the same classes; with a shared encoder an unseen test label
# raises a ValueError instead of being silently mis-encoded.
label_encoder = LabelEncoder().fit(train_df['target'])
y_label_train = label_encoder.transform(train_df['target'])
y_label_test = label_encoder.transform(test_df['target'])

# Show which original label received which code (classes_ is listed in code order)
print({str(label): code for code, label in enumerate(label_encoder.classes_)})

# Display both encoded label arrays (only the last expression of a cell is shown)
y_label_train, y_label_test

In [None]:
# Align the testing dummy columns with the training dummy columns.
# reindex() does three things in one step:
#   * adds any dummy column present in training but absent from testing, filled with 0
#   * drops any dummy column that only exists in testing (the model never saw it)
#   * puts the columns in the same order as training, which estimators fitted on
#     the training frame expect at predict/score time
# It also returns a new frame, so re-running this cell is idempotent.
X_dummies_test = X_dummies_test.reindex(columns=X_dummies_train.columns, fill_value=0)

In [None]:
# Re-check both shapes after the dummy-column step
shapes_message = f"Train: {X_dummies_train.shape}, Test: {X_dummies_test.shape}"
print(shapes_message)

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score.
# The features are deliberately left unscaled here; this is the baseline that the
# scaled model further down is compared against.
clf_logistic = LogisticRegression(solver='lbfgs')
clf_logistic.fit(X_dummies_train, y_label_train)

# Mean accuracy on the data the model was fitted on vs. the held-out test data
print(f"Training Data Score: {clf_logistic.score(X_dummies_train, y_label_train)}")
print(f"Testing Data Score: {clf_logistic.score(X_dummies_test, y_label_test)}")

### Random Forest Classifier Model - Unscaled

In [None]:
# Train a Random Forest Classifier model on the unscaled data and print the model score.
# random_state is fixed so the forest (and therefore the scores) is reproducible.
clf_forest = RandomForestClassifier(random_state=42, n_estimators=200)
clf_forest.fit(X_dummies_train, y_label_train)

# Mean accuracy on the data the model was fitted on vs. the held-out test data
print(f'Training Score: {clf_forest.score(X_dummies_train, y_label_train)}')
print(f'Testing Score: {clf_forest.score(X_dummies_test, y_label_test)}')

### Results - Unscaled
LR Unscaled: Training Data Score: 0.6516420361247948, Testing Data Score: 0.5091450446618461

RF Unscaled: Training Score: 1.0, Testing Score: 0.6482347937048064

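On unscaled data, the Random Forest Classifier performed better, with a testing score of about 0.65 compared with about 0.51 for Logistic Regression. However, its perfect training score of 1.0 indicates that it is overfitting the training dataset, which suggests that the complexity of the Random Forest may need to be reduced.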

In [None]:
### Scale Data

In [None]:
# Standardise the features with StandardScaler(): the scaling statistics are
# learned from the training set only and then applied unchanged to the test set
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_dummies_train)
X_test_scaled = scaler.transform(X_dummies_test)

In [None]:
### Prediction - Scaled
Prediction: The score for Logistic Regression will improve due to scaling, whereas the score for Random Forest will
remain the same. Tree-based classifiers like Random Forest are invariant to feature scaling.

In [None]:
### Logistic Regression Model - Scaled

In [None]:
# Train the Logistic Regression model on the scaled data and print the model score
clf_lr = LogisticRegression().fit(X_train_scaled, y_label_train)

# Report the training score alongside the testing score, as the unscaled cells do,
# so over- or under-fitting can be judged for the scaled model as well
print(f'Training Score: {clf_lr.score(X_train_scaled, y_label_train)}')
print(f'Testing Score: {clf_lr.score(X_test_scaled, y_label_test)}')

In [None]:
# Confusion matrix for the scaled Logistic Regression model.
# Rows are the true encoded labels and columns the predicted encoded labels,
# both in sorted label order.
y_true = y_label_test
y_pred = clf_lr.predict(X_test_scaled)
confusion_matrix(y_true, y_pred)

In [None]:
# Row-normalise the confusion matrix so each row shows the fraction of that
# true class that was assigned to each predicted class
matrix = confusion_matrix(y_true, y_pred).astype('float')
matrix = matrix / matrix.sum(axis=1, keepdims=True)

# Take the display names from the label encoding instead of hard-coding them:
# LabelEncoder numbers classes in the order of classes_, and confusion_matrix
# orders its rows/columns by sorted label value, so position i is classes_[i].
class_names = [
    str(label).replace('_', ' ').title()
    for label in LabelEncoder().fit(train_df['target']).classes_
]

# Build the plot (the theme is set before the axes are created so it applies to them)
sns.set(font_scale=1.4)
fig, ax = plt.subplots(figsize=(16, 7))

# Passing the labels to heatmap() lets seaborn centre them on the cells for
# both axes; manual ticks at 0 and 1 would sit on the cell edges instead.
sns.heatmap(matrix, annot=True, annot_kws={'size': 10},
            cmap=plt.cm.Greens, linewidths=0.2,
            xticklabels=class_names, yticklabels=class_names, ax=ax)

# Add labels to the plot
ax.tick_params(axis='both', labelrotation=0)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion Matrix for Logistic Regression Model')
plt.show()

In [None]:
The Confusion Matrix shows that the Logistic Regression classifier struggled a bit more at predicting
the Low Risk label, but overall predicted both labels quite well.

In [None]:
# Classification Report
# target_names must be listed in encoded-label order (0, 1, ...). LabelEncoder
# assigns codes in the order of classes_, so derive the names from it rather
# than hard-coding an order that may not match the encoding.
target_names = [
    str(label).replace('_', ' ')
    for label in LabelEncoder().fit(train_df['target']).classes_
]
print(classification_report(y_true, y_pred, target_names=target_names))

In [None]:
The classification report shows that precision is high, meaning that the model was careful to avoid labeling things “low risk” that aren’t low risk. On the other hand, recall is a bit lower for low risk, which means that the classifier is missing some 'low risks' because it is being too careful. Because precision and recall are both high, F1 is also high.

### Random Forest Model - Scaled

In [None]:
# Train a Random Forest Classifier model on the scaled data and print the model score
# NOTE(review): this forest uses n_estimators=500 while the unscaled forest uses
# n_estimators=200, so any difference between the two scores reflects the extra
# trees as well as the scaling. Use the same value in both cells for a clean
# scaled-vs-unscaled comparison (the reported results would need re-running).
clf_rf = RandomForestClassifier(random_state=42, n_estimators=500).fit(X_train_scaled, y_label_train)

# Report the training score alongside the testing score, as the unscaled cells do,
# so overfitting can be judged for the scaled model as well
print(f'Training Score: {clf_rf.score(X_train_scaled, y_label_train)}')
print(f'Testing Score: {clf_rf.score(X_test_scaled, y_label_test)}')

In [None]:
# Predict the test labels with the scaled Random Forest model and tabulate
# them against the true labels
y_pred1 = clf_rf.predict(X_test_scaled)
y_true1 = y_label_test
confusion_matrix(y_true=y_true1, y_pred=y_pred1)

In [None]:
# Row-normalise the confusion matrix so each row shows the fraction of that
# true class that was assigned to each predicted class
matrix = confusion_matrix(y_true1, y_pred1).astype('float')
matrix = matrix / matrix.sum(axis=1, keepdims=True)

# Take the display names from the label encoding instead of hard-coding them:
# LabelEncoder numbers classes in the order of classes_, and confusion_matrix
# orders its rows/columns by sorted label value, so position i is classes_[i].
class_names = [
    str(label).replace('_', ' ').title()
    for label in LabelEncoder().fit(train_df['target']).classes_
]

# Build the plot (the theme is set before the axes are created so it applies to them)
sns.set(font_scale=1.4)
fig, ax = plt.subplots(figsize=(16, 7))

# Passing the labels to heatmap() lets seaborn centre them on the cells for
# both axes; manual ticks at 0 and 1 would sit on the cell edges instead.
sns.heatmap(matrix, annot=True, annot_kws={'size': 10},
            cmap=plt.cm.Blues, linewidths=0.2,
            xticklabels=class_names, yticklabels=class_names, ax=ax)

# Add labels to the plot
ax.tick_params(axis='both', labelrotation=0)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion Matrix for Random Forest Model')
plt.show()

In [None]:

# Per-class precision, recall and F1 for the scaled Random Forest model
rf_report = classification_report(y_true1, y_pred1, target_names=target_names)
print(rf_report)

The classification report shows that precision is high for high risk, meaning that the model was careful to avoid labeling things “high risk” that aren’t high risk. On the other hand, recall is low for high risk, which means that the classifier is missing some 'high risks' because it is being too careful. The F1-score reflects this imbalance between precision and recall.

### Results - Scaled
LR Scaled: Testing Score: 0.7598894087622289

RF Scaled: Testing Score: 0.6456826882177796

Overall, scaling greatly improved the testing score of the Logistic Regression model, from about 0.51 to about 0.76, so that it outperformed the Random Forest model (about 0.65). This shows that sometimes a simple model with scaled data can be a better fit than one with more complexity.