In [None]:
#Import warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# General imports
import numpy as np
import pandas as pd
import os
from pathlib import Path
import datetime
from collections import Counter

# For visualization
import plotly as pl
import hvplot.pandas
import plotly.express as px
import matplotlib.pyplot as plt

# For preprocessing ahead of running ML Models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import sklearn as skl 
# Resamplers from imbalanced-learn used in the sampling sections further down.
# The under-sampling class is spelled ClusterCentroids (plural), matching its later use.
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
from imblearn.combine import SMOTEENN

#For ML models
from sklearn.datasets import make_blobs, make_classification
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf

# For model evaluation
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, classification_report  
from imblearn.metrics import classification_report_imbalanced


In [None]:
# Load the data
# The CSV location can be overridden with the LOANS_CSV environment variable so the
# notebook can run on other machines; the original location is kept as the fallback.
file_path = Path(os.environ.get("LOANS_CSV", 'C:/Users/esobieski/Documents/Berkeley/TeamPySpark/loans.csv'))
loans_df = pd.read_csv(file_path)
loans_df.head(5)

# Data Preprocessing: Exploration steps

In [None]:
# Get all column names
loans_df.columns

# Value Counts to Explore Columns for keeping/removal of columns

In [None]:
# OK to keep DISTRIBUTION MODEL
# Explored on the raw loans_df: clean_loans_df is only created later, after the
# elapsed-time cleanup, so it does not exist yet when this cell runs top to bottom.
loans_df["DISTRIBUTION_MODEL"].value_counts()

In [None]:
# OK to keep REPAYMENT INTERVAL (explored on the raw loans_df)
loans_df["REPAYMENT_INTERVAL"].value_counts()

In [None]:
# STATUS: used further down to keep only "funded" loans (explored on the raw loans_df)
loans_df["STATUS"].value_counts()

In [None]:
# OK to keep ORIGINAL LANGUAGE (explored on the raw loans_df)
loans_df["ORIGINAL_LANGUAGE"].value_counts()

In [None]:
# OK to REMOVE NUMBER OF BULK ENTRIES - assume not useful (explored on the raw loans_df)
loans_df["NUM_BULK_ENTRIES"].value_counts()

In [None]:
# OK to remove CURRENCY, keep COUNTRY instead (explored on the raw loans_df)
loans_df["CURRENCY"].value_counts()

In [None]:
# OK to remove LOAN USE - too specific, not good for ML model, will do NLP on tags instead
# (explored on the raw loans_df)
loans_df["LOAN_USE"].value_counts()

# Calculate time elapsed and clean up time elapsed outliers

In [None]:
# Time from when a request was posted until it was fully funded:
# parse both timestamp columns, then subtract posted from raised.
posted_time = pd.to_datetime(loans_df["POSTED_TIME"])
raised_time = pd.to_datetime(loans_df["RAISED_TIME"])
elapsed_time_df = raised_time.sub(posted_time)

In [None]:
# Histogram of elapsed time in whole days
# .dt.days floors each timedelta to whole days and works on pandas 2.x, where casting
# to the unsupported "timedelta64[D]" resolution raises.
elapsed_time_df.dt.days.hist(range=[-5, 25])
# 12 days is our cutoff for logistic regression

In [None]:
# Summary stats for elapsed time
elapsed_time_df.describe()

In [None]:
# Count of rows whose elapsed time is negative (203 of them) - they are removed below
is_negative_elapsed = elapsed_time_df.lt(pd.Timedelta(0))
is_negative_elapsed.sum()

In [None]:
# Display df
elapsed_time_df.head(5)

In [None]:
# Keep only strictly positive elapsed times (negative, zero and missing values are dropped)
positive_elapsed_mask = elapsed_time_df.gt(pd.Timedelta(0))
clean_elapsed_time_df = elapsed_time_df.loc[positive_elapsed_mask]

In [None]:
# Remove bad data from original dataframe using the same mask, so clean_loans_df has the same
# rows (and index) as clean_elapsed_time_df and the two can be merged back together.
# .copy() makes clean_loans_df an independent frame, so the columns assigned to it later
# are written to a real frame rather than to a slice of loans_df.
clean_loans_df = loans_df[elapsed_time_df > pd.to_timedelta(0)].copy()

# Transform gender column into Male and Female columns with numbers not words

In [None]:
# Checking that column is still in string
loans_df["BORROWER_GENDERS"]

In [None]:
# Setting up a function for converting strings to number objects
# But then group of all males and one female comes up as 1, looks like 1 single female
# KEEP IN MIND WHEN INTERPRETING RESULTS

clean_loans_df.BORROWER_GENDERS.astype(str)

test_string = "male, female, male"

def female(txt):
    """Count the female borrowers listed in a comma-separated gender string.

    Parameters
    ----------
    txt : str or missing value
        Borrower genders such as "male, female, male". Anything that is not a
        string (None, NaN) is treated as "no borrowers listed".

    Returns
    -------
    int
        Number of entries equal to "female" after trimming surrounding whitespace.
    """
    # Missing values reach here as float NaN / None and have no .split
    if not isinstance(txt, str):
        return 0
    # Split on the bare comma and strip, so "female,female" and "female , male"
    # are counted the same way as the usual "female, female" spacing.
    return sum(1 for entry in txt.split(",") if entry.strip() == "female")

def male(txt):
    """Count the male borrowers listed in a comma-separated gender string.

    Parameters
    ----------
    txt : str or missing value
        Borrower genders such as "male, female, male". Anything that is not a
        string (None, NaN) is treated as "no borrowers listed".

    Returns
    -------
    int
        Number of entries equal to "male" after trimming surrounding whitespace.
        Entries are compared whole, so "female" is never counted as "male".
    """
    # Missing values reach here as float NaN / None and have no .split
    if not isinstance(txt, str):
        return 0
    # Split on the bare comma and strip, so inconsistent spacing does not hide entries
    return sum(1 for entry in txt.split(",") if entry.strip() == "male")

female(test_string)

In [None]:
# Make a working copy of the loans_df dataframe
working_loans_df = clean_loans_df.copy()
working_loans_df.head(5)

In [None]:
# Convert the datatypes for gender to string
working_loans_df.astype(str)

In [None]:
# Look at the gender column in working_loans_df
working_loans_df = clean_loans_df['BORROWER_GENDERS'].dropna()
working_loans_df

In [None]:
# Apply the male and female gender functions to the dataset, yields a number value for each field. 
male_df = working_loans_df.apply(male)
female_df = working_loans_df.apply(female)
male_df.head(5)

# Exploration of gender columns male and female

In [None]:
# Gender distrubtion histogram of females
female_df.hist(range=[-2, 50])

In [None]:
# Gender distribution histogram of males 
male_df.hist(range=[-2, 50])

In [None]:
# Check value for number of males (1.3 million borrowers have no men in the group or solo business)
# Counted from male_df: the MALE column is only added to clean_loans_df in the merge step below.
male_df.value_counts()

In [None]:
# Check value for number of females (362 thousand borrowers have no women in the group or solo business)
# Counted from female_df: the FEMALE column is only added to clean_loans_df in the merge step below.
female_df.value_counts()

# Merge Male, Female, and Elapsed Time dataframes back into clean_loans_df

In [None]:
# Merge male into clean_loans_df 
clean_loans_df["MALE"]=male_df
# Check dataframe
clean_loans_df.head(5)

In [None]:
# Merge Female column back into DF
clean_loans_df["FEMALE"]=female_df
# Check dataframe
clean_loans_df.head(5)

In [None]:
# Merge Elapsed Time back into df
clean_loans_df["FUNDING_TIME"]= clean_elapsed_time_df
# Check dataframe
clean_loans_df.head(5)

# Removing unnecessary and repetitive columns from dataframe

In [None]:
# Removing refunded and expired out of status.  This removes all rows that are not "funded" from the dataset.
# Thus we are only looking at the speed at which loans are funded.
# .copy() detaches the filtered rows so later column drops act on an independent frame.
clean_loans_df = clean_loans_df.loc[clean_loans_df["STATUS"]=="funded"].copy()
clean_loans_df.head(5)

In [None]:
# Dropping all obviously unneeded columns.
# drop() returns a new frame instead of mutating in place, and errors="ignore" lets this
# cell be re-run after the columns are already gone instead of raising KeyError.
unneeded_columns = [
    'LOAN_ID', 'LOAN_NAME', 'LOAN_AMOUNT', 'STATUS', 'DESCRIPTION', 'DESCRIPTION_TRANSLATED',
    'IMAGE_ID', 'VIDEO_ID', 'LOAN_USE', 'COUNTRY_CODE', 'TOWN_NAME', 'CURRENCY_POLICY',
    'CURRENCY_EXCHANGE_COVERAGE_RATE', 'CURRENCY', 'POSTED_TIME', 'PLANNED_EXPIRATION_TIME',
    'DISBURSE_TIME', 'RAISED_TIME', 'LENDER_TERM', 'NUM_JOURNAL_ENTRIES', 'NUM_BULK_ENTRIES',
    'BORROWER_NAMES', 'BORROWER_GENDERS', 'BORROWER_PICTURED',
]
clean_loans_df = clean_loans_df.drop(columns=unneeded_columns, errors="ignore")
clean_loans_df.head(5)

# Creating a manual binary classification of time into model DF, SUCCESS is when speed of borrowing is under 12 days, 12 days or more is not successful in terms of the speed - how fast does the loan fund?

In [None]:
# make a copy of clean_loans_df
bc_model_df = clean_loans_df.copy()

In [None]:
# BINARY CLASSIFICATION TARGET
# A loan counts as a success (1) when FUNDING_TIME is strictly under the 12-day cutoff
# (12 days being the mean funding time); otherwise it is 0.
twelve_day_cutoff = pd.Timedelta("12 days")
Under_12days = bc_model_df["FUNDING_TIME"].lt(twelve_day_cutoff)
bc_model_df["SUCCESS"] = Under_12days.astype(int)

In [None]:
# REMOVE FUNDING TIME from model_df
bc_model_df.drop(['FUNDING_TIME'], axis=1, inplace=True)
bc_model_df.head()

# Creating mc_reg_df for multiple classification (MC) models (time buckets where measure of success is speed of borrowing).  bc_model_df will be for binary classification.

In [None]:
# make a copy of clean_loans_df
mc_reg_df = clean_loans_df.copy()

In [None]:
# Create buckets with time delta intervals.
# pd.cut needs exactly one label per interval: 6 bin edges define 5 intervals, so there
# are 5 labels (week 1 .. week 5). Loans that took longer than 5 weeks fall outside the
# bins and get NaN in FUNDING_WEEKS.

bins = [
    pd.Timedelta(weeks = 0),
    pd.Timedelta(weeks = 1),
    pd.Timedelta(weeks = 2),
    pd.Timedelta(weeks = 3),
    pd.Timedelta(weeks = 4),
    pd.Timedelta(weeks = 5)
]
labels = list(range(1, len(bins)))
mc_reg_df["FUNDING_WEEKS"] = pd.cut(mc_reg_df["FUNDING_TIME"], bins, labels=labels)
mc_reg_df.head(5)

In [None]:
# REMOVE FUNDING_TIME column from mc_reg_df
mc_reg_df.drop(['FUNDING_TIME'], axis=1, inplace=True)
mc_reg_df.head(5)

# Preparing a separate TAGS-only DataFrame for NLP on Colab via exported CSV

In [None]:
# Make a copy of bc_model_df (clean_loans_df plus the SUCCESS flag, without FUNDING_TIME)
# as the starting point for the TAGS export to the NLP project
tags_df = bc_model_df.copy()
tags_df.head(5)

In [None]:
# Get NLP column names
tags_df.columns

In [None]:
# Drop all columns except TAGS
tags_df.drop(['ORIGINAL_LANGUAGE', 'FUNDED_AMOUNT', 'ACTIVITY_NAME','SECTOR_NAME', 'COUNTRY_NAME', 'PARTNER_ID', 'NUM_LENDERS_TOTAL','REPAYMENT_INTERVAL', 'DISTRIBUTION_MODEL', 'MALE', 'FEMALE', 'SUCCESS'], axis=1, inplace=True)
tags_df.head(5)

In [None]:
# Drop rows with null values in bc_df
tags_df.dropna(axis=0, how="any", inplace=True)
tags_df.head(5)

In [None]:
# Save to csv file
tags_df.to_csv("tags.csv")

# Preparing separate Dataframe for Binary Classification NLP on CoLab via exported CSV

In [None]:
# Make a copy of clean_loans_df for export to NLP projec
bc_nlp_df = bc_model_df.copy()
bc_nlp_df.head(5)

In [None]:
# Get NLP column names
bc_nlp_df.columns

In [None]:
bc_nlp_df.drop(['ORIGINAL_LANGUAGE', 'FUNDED_AMOUNT', 'ACTIVITY_NAME','SECTOR_NAME', 'COUNTRY_NAME', 'PARTNER_ID', 'NUM_LENDERS_TOTAL','REPAYMENT_INTERVAL', 'DISTRIBUTION_MODEL', 'MALE', 'FEMALE'], axis=1, inplace=True)
bc_nlp_df.head(5)

In [None]:
# Save to csv file
bc_nlp_df.to_csv("bc_nlp.csv")

# Preparing separate Dataframe for Multiple Classification NLP on CoLab via exported CSV

In [None]:
# Make a copy of mc_reg_df for export to NLP project
mc_nlp_df = mc_reg_df.copy()
mc_nlp_df.head(5)

In [None]:
# Get NLP column names
mc_nlp_df.columns

In [None]:
mc_nlp_df.drop(['ORIGINAL_LANGUAGE', 'FUNDED_AMOUNT', 'ACTIVITY_NAME','SECTOR_NAME', 'COUNTRY_NAME', 'PARTNER_ID', 'NUM_LENDERS_TOTAL','REPAYMENT_INTERVAL', 'DISTRIBUTION_MODEL', 'MALE', 'FEMALE'], axis=1, inplace=True)
mc_nlp_df.head(5)

In [None]:
# Save to csv file
mc_nlp_df.to_csv("mc_nlp.csv")

In [None]:
mc_nlp_df.head(5)

# Remove Tags after creating both NLP CSVs

In [None]:
#dropping TAGS column from BINARY classification
bc_model_df.drop(['TAGS'], axis=1, inplace=True)
bc_model_df.head(5)

In [None]:
#dropping TAGS column from MULTIPLE classification
mc_reg_df.drop(['TAGS'], axis=1, inplace=True)
mc_reg_df.head(5)

# SAMPLING TECHNIQUES: should all 3 or last 2 be after train/test split???

# Prepare both BC and MC dataframes for ML through sampling

In [None]:
# Pick a 5% sample of the data, as using all of it did not work for get_dummies
# (less is needed here because this is only the binary classification set).
# random_state pins the sample so a rerun selects the same rows.
bc_df = bc_model_df.sample(frac=.05, random_state=1)

In [None]:
# Pick a 15% sample of the data, as using all of it did not work for get_dummies.
# random_state pins the sample so a rerun selects the same rows.
mc_df = mc_reg_df.sample(frac=.15, random_state=1)

# Remove null values from sampled BC and MC dataframes

In [None]:
# See how many rows of bc_df contain at least one NaN
bc_df.isnull().any(axis=1).sum()

In [None]:
# See how many rows of mc_df contain at least one NaN
mc_df.isnull().any(axis=1).sum()

In [None]:
# Drop rows with null values in bc_df
bc_df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Drop rows with null values in mc_df
mc_df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Show final bc_df
bc_df.head(5)

In [None]:
# Show final mc_df
mc_df.head(5)

# Prepare data for machine learning models that will use binary classification

For logistic regression, our binary classification defines a successful borrowing event as one that is fully funded in under 12 days.  An unsuccessful event is one where funding takes 12 days or more, which we read as a sign of less lender enthusiasm to fund the loan.  This avoids the problem that 99%+ of loans get funded, which would make the data very unbalanced if we only looked at funded vs. not funded.  

In [None]:
# Features: every column of the sampled binary-classification frame except the label
X = bc_df.drop(columns=['SUCCESS'])

# Target: the SUCCESS label, kept as a one-column DataFrame
y = bc_df.loc[:, ['SUCCESS']]

In [None]:
# Describe X  -- MOST BORROWERS ARE SOLO FEMALES

X.describe()

In [None]:
# List out y

y

In [None]:
# Check the balance of our target values  
# Used a calculated value of TIME TO FULL FUNDING using date stamps in prep for LOGISTIC REGRESSION
# SUCCESS is funding in 12 days or less

y['SUCCESS'].value_counts()

In [None]:
# Encode Labels (DO WE USE THIS OR ONE HOT ENCODER?)

X = pd.get_dummies(X)
X.head()

In [None]:
# ADD TRAIN-TEST SPLIT AFTER GETTING DUMMIES AND BEFORE SCALING, SO RIGHT HERE
# Dataset is split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1, stratify=y)

In [None]:
# SCALING X_TRAIN STEP
# Fit the scaler on the training split and keep it as X_scaler so the same
# training mean/std can be applied to any other data later on.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
print(X_train_scaled[0:5])

In [None]:
# SCALING X_TEST STEP HERE
# The test split is transformed with statistics learned from the TRAINING split only.
# Fitting a scaler on the test data would put train and test on different scales and
# leak test-set information into preprocessing.
X_test_scaled = StandardScaler().fit(X_train).transform(X_test)
print(X_test_scaled[0:5])

# Prepare and scale data for machine learning models that will use multiple classification

For linear regression, Random Forest and Neural nets, we use buckets of time, 1, 2, 3, 4, and 5 weeks, so that it can both classify and be continuous.  This is our mc_df.  We notate all these with mc in front so we can run either pre-prepared through multiple models below.

In [None]:
# Create our features  NOT SURE ABOUT THIS VS THE TRAINING AND TESTING A FEW CELLS DOWN

mcX = mc_df.copy()
mcX = mcX.drop('FUNDING_WEEKS', axis=1)

# Create our target
mcy = mc_df[['FUNDING_WEEKS']]

In [None]:
# Describe X  -- MOST BORROWERS ARE SOLO FEMALES

mcX.describe()

In [None]:
# List out y

mcy

In [None]:
# Check the balance of our target values  
# Used a calculated value of TIME TO FULL FUNDING using date stamps in prep for LOGISTIC REGRESSION
# SUCCESS is funding in 12 days or less

mcy['FUNDING_WEEKS'].value_counts()

In [None]:
# Encode Labels (DO WE USE THIS OR ONE HOT ENCODER?)

mcX = pd.get_dummies(mcX)
mcX.head()

# Train/Test split

In [None]:
# ADD TRAIN-TEST SPLIT AFTER GETTING DUMMIES AND BEFORE SCALING
# Dataset is split into training and testing sets
mcX_train, mcX_test, mcy_train, mcy_test = train_test_split(mcX,
    mcy, random_state=1, stratify=mcy)

In [None]:
# SCALING X_TRAIN STEP
# Fit the scaler on the training split and keep it as mcX_scaler so the same
# training mean/std can be applied to any other data later on.
mcX_scaler = StandardScaler().fit(mcX_train)
mcX_train_scaled = mcX_scaler.transform(mcX_train)
print(mcX_train_scaled[0:5])

In [None]:
# SCALING X_TEST STEP HERE
# The test split is transformed with statistics learned from the TRAINING split only.
# Fitting a scaler on the test data would put train and test on different scales and
# leak test-set information into preprocessing.
mcX_test_scaled = StandardScaler().fit(mcX_train).transform(mcX_test)
print(mcX_test_scaled[0:5])

Why resample: the target "y" is strongly right-skewed, so about 50% of the data falls into a single one of the one-week buckets and the multi-class model's accuracy is about 50%.  With 2 buckets split at the mean of 12 days, accuracy is 69%, which equals the share of "y" in class 1.  In both cases the model only reproduces the distribution of "y" (it does no better than always predicting the majority class), so it is not effective.

# Random Oversampling on binary classification model

In [None]:
# Check imbalance in y
# Count the SUCCESS column itself: iterating a DataFrame yields its column labels,
# so Counter(y_train) would only report {'SUCCESS': 1}.
Counter(y_train["SUCCESS"])

In [None]:
# Random oversampling to rebalance y ???? SHOULD I DO THIS FOR X AND Y or just Y?
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [None]:
# Check on how much y was rebalanced by random oversampling
# (count the SUCCESS column; Counter on the DataFrame would count column labels)
Counter(y_resampled["SUCCESS"])

# Synthetic Minority Oversampling (SMOTE) on binary classification model

In [None]:
# Synthetic Minority Oversampling
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

In [None]:
# See results of Synthetic Minority Oversampling (SMOTE)
# (count the SUCCESS column; Counter on the DataFrame would count column labels)
Counter(y_resampled["SUCCESS"])

# Random Undersampling on binary classification model

In [None]:
# Randomly under-sample the majority class of the training split.
# NOTE: `ros` holds an under-sampler here despite its name; the name is left unchanged
# so that anything referring to `ros` afterwards keeps resolving to this object.
ros = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
# Count the SUCCESS column; Counter on the DataFrame would count column labels
Counter(y_resampled["SUCCESS"])

In [None]:
# See results of random undersampling
# (count the SUCCESS column; Counter on the DataFrame would count column labels)
Counter(y_resampled["SUCCESS"])

# Random Undersampling on binary classification model

In [None]:
# Randomly under-sample the majority class of the training split
rus = RandomUnderSampler(random_state=1)
# Resample with the sampler created on the line above
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
# Count the SUCCESS column; Counter on the DataFrame would count column labels
Counter(y_resampled["SUCCESS"])

# Cluster Centroid Undersampling on binary classification model

In [None]:
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

# Resampling: combination oversampling and undersampling with SMOTEEN on binary classification model

In [None]:
# Combined over- and under-sampling (SMOTE followed by Edited Nearest Neighbours).
# Only the training split is resampled, like the other samplers in this section, so
# that test rows never influence the resampled training data.
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

# Random Oversampling on multiple classification model

In [None]:
# Check imbalance in y
# Count the FUNDING_WEEKS column itself: iterating a DataFrame yields its column
# labels, so Counter(mcy_train) would only report {'FUNDING_WEEKS': 1}.
Counter(mcy_train["FUNDING_WEEKS"])

In [None]:
# Random oversampling to rebalance y ???? SHOULD I DO THIS FOR X AND Y or just Y?
ros = RandomOverSampler(random_state=1)
mcX_resampled, mcy_resampled = ros.fit_resample(mcX_train, mcy_train)

In [None]:
# Check on how much y was rebalanced by random oversampling
# (count the FUNDING_WEEKS column; Counter on the DataFrame would count column labels)
Counter(mcy_resampled["FUNDING_WEEKS"])

# Synthetic Minority Oversampling (SMOTE) on multiple classification model

In [None]:
# Synthetic Minority Oversampling
mcX_resampled, mcy_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   mcX_train, mcy_train)

In [None]:
# See results of Synthetic Minority Oversampling (SMOTE)
# (count the FUNDING_WEEKS column; Counter on the DataFrame would count column labels)
Counter(mcy_resampled["FUNDING_WEEKS"])

# Random Undersampling on multiple classification model

In [None]:
# Random Undersampling on multiple classification model
rus = RandomUnderSampler(random_state=1)
mcX_resampled, mcy_resampled = rus.fit_resample(mcX_train, mcy_train)
# Count the FUNDING_WEEKS column; Counter on the DataFrame would count column labels
Counter(mcy_resampled["FUNDING_WEEKS"])

# Cluster Centroid Undersampling of multiple classification model

In [None]:
cc = ClusterCentroids(random_state=1)
mcX_resampled, mcy_resampled = cc.fit_resample(mcX_train, mcy_train)

# Resampling: combination oversampling and undersampling with SMOTEENN on multiple classification model

In [None]:
# Combined over- and under-sampling (SMOTE followed by Edited Nearest Neighbours).
# Only the training split is resampled, like the other samplers in this section, so
# that test rows never influence the resampled training data.
smote_enn = SMOTEENN(random_state=0)
mcX_resampled, mcy_resampled = smote_enn.fit_resample(mcX_train, mcy_train)

# Run PCA on Binary Classification model

In [None]:
# PCA model intialization 277 columns, reducing complexity
pca = PCA(n_components=32)

In [None]:
# PCA fit and transform for training
train_loans_pca = pca.fit_transform(X_train_scaled)

In [None]:
# Transform testing data using PCA to a DataFrame 
test_loans_pca = pca.transform(X_test_scaled)
X_test_pca_df = pd.DataFrame(data=test_loans_pca)
X_test_pca_df.head(5)

In [None]:
# transform PCA data to a DataFrame 
X_train_pca_df = pd.DataFrame(data=train_loans_pca)
X_train_pca_df.head(5)

In [None]:
# See explained variance ratio sum - Optimized to explain as much as possible - 225 components is ideal at 95%, no one feature explains 
# All features are equally important  - 10 features = 9%  - 100 features 50%  -
# Mostly linear relationship number of features and explainabilty
# PCA DID NOT HELP, WHEN IT REDUCES THE NUMBER OF FEATURES IT ALSO REDUCES EXPLAINABILITY IN A NEARLY LINEAR RELATIONSHIP
# THIS ALSO SHOWED UP IN THE ML MODELS, WHERE USING X-SCALED WAS BETTER THAN THE PCA VERSION.
sum(pca.explained_variance_ratio_)

# Run PCA on Multiple Classification model

In [None]:
# PCA model intialization 277 variables, reducing complexity
mc_pca = PCA(n_components=32)

In [None]:
# PCA fit and transform for training
mc_train_loans_pca = mc_pca.fit_transform(mcX_train_scaled)

In [None]:
# Transform testing data using PCA to a DataFrame 
mc_test_loans_pca = mc_pca.transform(mcX_test_scaled)
mc_X_test_pca_df = pd.DataFrame(data=mc_test_loans_pca)
mc_X_test_pca_df.head(5)

In [None]:
# transform PCA data to a DataFrame 
mcX_train_pca_df = pd.DataFrame(data=mc_train_loans_pca)
mcX_train_pca_df.head(5)

In [None]:
# See explained variance ratio sum - Optimized to explain as much as possible - 225 components is ideal at 95%, no one feature explains 
# All features are equally important  - 10 features = 9%  - 100 features 50%  -
# Mostly linear relationship number of features and explainabilty
# PCA DID NOT HELP, WHEN IT REDUCES THE NUMBER OF FEATURES IT ALSO REDUCES EXPLAINABILITY IN A NEARLY LINEAR RELATIONSHIP
# THIS ALSO SHOWED UP IN THE ML MODELS, WHERE USING X-SCALED WAS BETTER THAN THE PCA VERSION.
sum(mc_pca.explained_variance_ratio_)

# Linear Regression using Multiple Classification buckets as a continuous series

# QUESTIONS HERE TOO

In [None]:
# ????? DO I DO THIS WITHOUT THE SCALED DATA?  Unspecified number of rows, non-scaled data has 
# mcX = mc_df..values.reshape(-1, 1)  # NEED TO CHANGE SECOND NUMBER TO NUMBER OF COLUMNS, scaled, dummies, pca???

In [None]:
# Look at first 5 entries in X
mcX_test_scaled[:5]

In [None]:
# Look at shape of x, ie number of rows and columns
mcX_test.shape

In [None]:
# Look at shape of x, ie number of rows and columns
mcy_train.shape

In [None]:
# Set the target, the dependent variable, to FUNDING_WEEKS (pre-dummies????  OTHERWISE TOO MANY Ys, but pre-dummies) ????????
mcy_train.head()

In [None]:
model = LinearRegression()

In [None]:
model.fit(mcX_train_scaled, mcy_train)

In [None]:
model.score(mcX_test_scaled, mcy_test)   # for LinearRegression, score() is R^2 (coefficient of determination) on the test split, not mean squared error; TODO: add a customized accuracy function

In [None]:
y_pred = model.predict(mcX_test_scaled)
print(y_pred.shape)

In [None]:
# WHAT OTHER STATISTICS CAN I RUN AND PRINT?????  CORRELATIONS?  PLOT?
print(model.coef_)
print(model.intercept_)

# Run Logistic Regression on SUCCESS and Analyze Results

In [None]:
# A synthetic dataset is generated with Scikit-learn’s make_blobs module
X, y = make_blobs(centers=2, random_state=42)

print(f"Labels: {y[:10]}")
print(f"Data: {X[:10]}")

In [None]:
# dataset is visualized
plt.scatter(X[:, 0], X[:, 1], c=y)

In [None]:
# Instantiate a Logistic Regression Model, Step 1 of 2
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

In [None]:
# Instantiate a Logistic Regression Model, Step 2 of 2
# Show the hyper-parameters actually set on `classifier`, rather than constructing a
# second, unused estimator from a pasted repr.
classifier.get_params()

In [None]:
# Train the Logistic Regression Model  - Using scaled data takes prediction accuracy from 66% to 69%
classifier.fit(X_train_scaled, y_train)

In [None]:
# Validate the Logistic Regression Model
y_pred = classifier.predict(X_test_scaled)

In [None]:
# Step 2 run predictions
pd.DataFrame({"Prediction": y_pred, "Actual": y_test["SUCCESS"]})

In [None]:
# Evaluate Model Performance
accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix of actual vs. predicted SUCCESS on the test split
# (it summarizes prediction errors per class; it does not rank feature importance)
matrix = confusion_matrix(y_test,  y_pred)
print(matrix)

In [None]:
# Run Classification Report
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# TRY OUT NEW DATA -- DO WE NEED THIS STEP??? ????????????????????????????????????????
# predictions = classifier.predict(new_data)
# print("Classes are either 0 (Fast) or 1 (Slow)")
# print(f"The new point was classified as: {predictions}")

# Run SVMs using a linear kernel as well as a Radial Basis Function (RBF) kernel SVM and analyze results.  Tried incorporating PCA but results worsened. 

In [None]:
# Import the SVC module from Scikit-learn, and instantiate it using linear for the orientation of the hyperplane 68% accuracy
model = SVC(kernel='linear')  

In [None]:
# Use Radical Basis Kernel SVM *** THIS STEP TAKES 20 MINUTES ***  64.5% accuracy, commented out as Linear SVC above is more accurate

# C_2d_range = [1e-2, 1, 1e2]
# gamma_2d_range = [1e-1, 1, 1e1]
# classifiers = []
# for C in C_2d_range:
#     for gamma in gamma_2d_range:
#         clf = SVC(C=C, gamma=gamma)
#         clf.fit(X_train_scaled, y_train)
#         classifiers.append((C, gamma, clf))

In [None]:
# Fit the model
model.fit(X_train_scaled, y_train)

In [None]:
y_pred = model.predict(X_test_scaled)
results = pd.DataFrame({"Prediction": y_pred,"Actual": y_test["SUCCESS"]}).reset_index(drop=True)
results.head(5)

In [None]:
# Assess the Accuracy Score
accuracy_score(y_test, y_pred)

In [None]:
# Run the Confusion Matrix and the classification report
# Both are printed: only the last expression of a cell is displayed automatically,
# so a bare confusion_matrix(...) call before another statement shows nothing.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Random Forest Model on Binary Classification

# QUESTIONS on how to tune

In [None]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=100, random_state=78, max_depth=5) # goal is to add more here...

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, 
# min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, 
# min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, 
# random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None

# >>> X, y = make_classification(n_samples=1000, n_features=4,
# ...                            n_informative=2, n_redundant=0,
# ...                            random_state=0, shuffle=False)
# >>> clf = RandomForestClassifier(max_depth=2, random_state=0)
# >>> clf.fit(X, y)
# RandomForestClassifier(...)
# >>> print(clf.predict([[0, 0, 0, 0]]))

# see https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
# Making predictions using the testing data.
y_pred = rf_model.predict(X_test_scaled)

In [None]:
# Look at Importance of Each Parameter  ****NEED TO DO THIS*****
# rf_getparams

In [None]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, y_pred)
acc_score

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

In [None]:
# Displaying results  ********ECHO************ HELP ME MAKE ALL THESE THE SAME FOR ALL MODELS ************
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

# Random Forest Model on Multiple Classification

# QUESTIONS on how to tune, also on inputs

Random Forest may work better looking at buckets of time and what impacts speed of funding a loan

In [None]:
# Create a random forest classifier.
mc_rf_model = RandomForestClassifier(n_estimators=100, random_state=78, max_depth=5) # goal is to add more here...

# Fitting the model
mc_rf_model = mc_rf_model.fit(mcX_train_scaled, mcy_train)

# RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, 
# min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, 
# min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, 
# random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None

# >>> X, y = make_classification(n_samples=1000, n_features=4,
# ...                            n_informative=2, n_redundant=0,
# ...                            random_state=0, shuffle=False)
# >>> clf = RandomForestClassifier(max_depth=2, random_state=0)
# >>> clf.fit(X, y)
# RandomForestClassifier(...)
# >>> print(clf.predict([[0, 0, 0, 0]]))

# see https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
# Class counts of the multi-class training target (Counter is imported in the imports cell).
# Count the FUNDING_WEEKS column; Counter on the DataFrame would count column labels.
Counter(mcy_train["FUNDING_WEEKS"])

In [None]:
mcy_train['FUNDING_WEEKS'].value_counts()

In [None]:
y_train['SUCCESS'].value_counts()

In [None]:
# Making predictions using the testing data.
mcy_pred = mc_rf_model.predict(mcX_test_scaled)

In [None]:
# Look at Importance of Each Parameter  ****NEED TO DO THIS*****
# rf_getparams

In [None]:
# Calculating the accuracy score. 
acc_score = accuracy_score(mcy_test, mcy_pred)
acc_score

In [None]:
# BALANCED ACCURACY SCORE
# Average of per-class recall, so the dominant funding-week bucket cannot inflate the score
balanced_accuracy_score(mcy_test, mcy_pred)

In [None]:
mcy_pred[5]

In [None]:
# Calculating the confusion matrix
# One row/column per funding-week class the forest was trained on; passing labels=
# fixes the order so the axis names below line up with the matrix.
week_classes = list(mc_rf_model.classes_)
cm = confusion_matrix(mcy_test, mcy_pred, labels=week_classes)

# Create a DataFrame from the confusion matrix, labelled per class
# (a fixed 2x2 labelling only fits a binary target).
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {week}" for week in week_classes],
    columns=[f"Predicted {week}" for week in week_classes])

cm_df

In [None]:
# Displaying results  ********ECHO************ HELP ME MAKE ALL THESE THE SAME FOR ALL MODELS ************
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(mcy_test, mcy_pred))

# Run Neural nets using mc_df, multiple classification

# QUESTIONS ABOUND!

In [None]:
# Generate a synthetic two-feature demo dataset (illustration only, not the loans data).
# It is stored under blob_* names so the real mcX / mcy are not overwritten.
blob_X, blob_y = make_blobs(n_samples=1000, centers=2, n_features=2, random_state=78)

# Creating a DataFrame with the demo data (two feature columns plus the blob label)
nn_df = pd.DataFrame(blob_X, columns=["Feature 1", "Feature 2"])
nn_df["Target"] = blob_y

# Plotting the demo data
nn_df.plot.scatter(x="Feature 1", y="Feature 2", c="Target", colormap="winter")

In [None]:
# Use sklearn to split dataset    --  COMMENTED OUT BECAUSE ALREADY SPLIT ONCE ABOVE IN DATA PREPROCESSING

# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
# ALSO ALREADY SCALED DATA ABOVE

In [None]:
# Create the Keras Sequential model. Sequential groups a linear stack of layers
nn_model = tf.keras.models.Sequential()

In [None]:
# Add our first and only Dense layer, including the input layer  (relu between zero and infinity.  
# signmoid activation for S curve b/t 0-1 - could use with bc), or Linear function
nn_model.add(tf.keras.layers.Dense(units=1, activation="relu", input_dim=2))

In [None]:
# Add the output layer that uses a probability activation function
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

In [None]:
# Check the structure of the Sequential model
nn_model.summary()

In [None]:
# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Fit the model to the training data
fit_model = nn_model.fit(mcX_train_scaled, mcy_train, epochs=100)

In [None]:
# Create a DataFrame containing training history
history_df = pd.DataFrame(fit_model.history, index=range(1,len(fit_model.history["loss"])+1))

# Plot the loss
history_df.plot(y="loss")

In [None]:
# Plot the accuracy
history_df.plot(y="accuracy")

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Predict the classification of a new set of blob data
new_X, new_Y = make_blobs(n_samples=10, centers=2, n_features=2, random_state=78)
new_X_scaled = X_scaler.transform(new_X)
nn_model.predict_classes(new_X_scaled)