<a href="https://colab.research.google.com/github/NoahOshana17/Loan-Default-Predictor/blob/main/Loan_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Logistic Regression

In [None]:
!pip install pandas

In [None]:
!pip install -U scikit-learn

## **Dependencies**

In [None]:
import pandas as pd
import math
from matplotlib import pyplot as plt
import seaborn as sns
from statistics import mode
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

## **Setup Dataset**

In [None]:
# Load the raw train and test CSVs. The absolute /content/... paths are
# hard-coded; NOTE(review): they look Colab-specific - adjust when running elsewhere.
train_ds_original = pd.read_csv('/content/sample_data/loan_predictor_train.csv')
test_ds_original = pd.read_csv('/content/sample_data/loan_predictor_test.csv')

In [None]:
# Work on independent copies. A plain assignment would only create a second
# name for the same DataFrame, so every in-place edit made to train_ds/test_ds
# further down would silently modify the "*_original" frames as well.
train_ds = train_ds_original.copy()
test_ds = test_ds_original.copy()

In [None]:
# Load the submission CSV; its Loan_ID / Loan_Status columns are overwritten
# with the model's predictions near the end of the notebook.
submission = pd.read_csv('/content/sample_data/submission.csv')

## **View Dataset**

In [None]:
# Display the training frame (pandas truncates the rendering of long frames).
train_ds

In [None]:
# Column dtypes of the training frame.
train_ds.dtypes

In [None]:
# First 15 rows of the test frame.
test_ds.head(15)

In [None]:
# Column dtypes of the test frame, for comparison with the training dtypes.
test_ds.dtypes

In [None]:
# Column names of the training frame as a plain Python list.
train_ds.columns.tolist()

In [None]:
# Column names of the test frame as a plain Python list.
test_ds.columns.tolist()

## **Viewing Target Variable**

In [None]:
# Relative frequency of each Loan_Status value, drawn as a bar chart.
loan_status_share = train_ds_original['Loan_Status'].value_counts(normalize=True)
loan_status_share.plot(kind='bar', figsize=(20,10), title='Loan Status')

## **Viewing Independent Variables Separately**

### Gender

In [None]:
# Relative frequency of each Gender value.
train_ds['Gender'].value_counts(normalize=True).plot(kind='bar', figsize=(20,10), title='Gender')

### Married

In [None]:
# Relative frequency of each Married value.
train_ds['Married'].value_counts(normalize=True).plot(kind='bar', figsize=(20,10), title='Married')

### Dependents

In [None]:
# Relative frequency of each Dependents value.
train_ds['Dependents'].value_counts(normalize=True).plot(kind='bar', figsize=(20,10), title='Dependents')

### Education

In [None]:
# Relative frequency of each Education value.
train_ds['Education'].value_counts(normalize=True).plot(kind='bar', figsize=(20,10), title='Education')

### Self-Employed

In [None]:
# Relative frequency of each Self_Employed value.
train_ds['Self_Employed'].value_counts(normalize=True).plot(kind='bar', figsize=(20,10), title='Self-Employed')

### Income

In [None]:
# Distribution of the applicant income in the training data.
sns.displot(data=train_ds['ApplicantIncome'])

In [None]:
# Box plot of applicant income. The series is passed explicitly as `x`: the
# meaning of the first positional argument differs between seaborn releases
# (`x` in older versions, `data` in newer ones), so the keyword keeps the plot
# horizontal and warning-free regardless of the installed version.
sns.boxplot(x=train_ds['ApplicantIncome'])

### CoApplicant Income

In [None]:
# Distribution of the co-applicant income in the training data.
sns.displot(data=train_ds['CoapplicantIncome'])

### Loan Amount

In [None]:
# Distribution of the requested loan amount in the training data.
sns.displot(data=train_ds['LoanAmount'])

## Correlation matrix with heatmap

In [None]:
# Pairwise correlation of the numeric columns only. The frame still contains
# string columns at this point (Loan_ID, Gender, ...), and DataFrame.corr()
# raises on those in pandas >= 2.0, so they are filtered out explicitly.
matrix = train_ds.select_dtypes(include='number').corr()

# Draw the annotated heatmap on an explicitly created axes.
f, ax = plt.subplots(figsize=(9,6))
sns.heatmap(matrix, vmax=.75, square=True, cmap="BuPu", annot=True, ax=ax)

## Change dependents 3+ to 3 and Loan status to 0 and 1

In [None]:
# Collapse the open-ended '3+' category to 3 in both frames. The result is
# assigned back to the column: calling .replace(..., inplace=True) on the
# intermediate `df[col]` object is a chained in-place mutation that pandas
# deprecates and that has no effect on the frame under Copy-on-Write.
train_ds['Dependents'] = train_ds['Dependents'].replace('3+', 3)
test_ds['Dependents'] = test_ds['Dependents'].replace('3+', 3)

# Encode the target as 0 (rejected, 'N') / 1 (approved, 'Y'). The identity
# entries for 0 and 1 make the cell safe to re-run on already encoded labels.
train_ds['Loan_Status'] = train_ds['Loan_Status'].map({'N': 0, 'Y': 1, 0: 0, 1: 1})

## **NaN Handling Method 1:** Getting Methods to Replace 'NaN' Values in Dataset

In [None]:
# Drop rows with NaN values
train_ds = train_ds.dropna()
test_ds = test_ds.dropna()

### Check to see if any NaN values exist

In [None]:
# Number of missing values per training column.
print(train_ds.isna().sum())

In [None]:
# Number of missing values per test column.
print(test_ds.isna().sum())

### Replacing LoanAmount NaN values using the median

In [None]:
# Median training LoanAmount, rounded down; used as the fill value below.
median_loanamount = math.floor(train_ds['LoanAmount'].median())
median_loanamount

In [None]:
# Fill missing LoanAmount in both frames with the training median.
train_ds['LoanAmount'] = train_ds['LoanAmount'].fillna(median_loanamount)
test_ds['LoanAmount'] = test_ds['LoanAmount'].fillna(median_loanamount)

### Replacing Gender NaN values using the mode (most frequent value)

In [None]:
# Most frequent Gender value in the training data.
mode_gender = mode(train_ds['Gender'])
mode_gender

In [None]:
# Fill missing Gender in both frames with the training mode.
train_ds['Gender'] = train_ds['Gender'].fillna(mode_gender)
test_ds['Gender'] = test_ds['Gender'].fillna(mode_gender)

### Replacing Marriage Status NaN values using the mode (most frequent value)

In [None]:
# Most frequent Married value in the training data.
mode_married = mode(train_ds['Married'])
mode_married

In [None]:
# Fill missing Married in both frames with the training mode.
train_ds['Married'] = train_ds['Married'].fillna(mode_married)
test_ds['Married'] = test_ds['Married'].fillna(mode_married)

### Replacing Dependents NaN values using the mode (most frequent value)

In [None]:
# Most frequent Dependents value in the training data.
mode_dependents = mode(train_ds['Dependents'])
mode_dependents

In [None]:
# Fill missing Dependents in both frames with the training mode.
train_ds['Dependents'] = train_ds['Dependents'].fillna(mode_dependents)
test_ds['Dependents'] = test_ds['Dependents'].fillna(mode_dependents)

### Replacing Self-Employed NaN values using the mode (most frequent value)

In [None]:
# Most frequent Self_Employed value in the training data.
mode_self_employed = mode(train_ds['Self_Employed'])
mode_self_employed

In [None]:
# Fill missing Self_Employed in both frames with the training mode.
train_ds['Self_Employed'] = train_ds['Self_Employed'].fillna(mode_self_employed)
test_ds['Self_Employed'] = test_ds['Self_Employed'].fillna(mode_self_employed)

### Replacing Loan Amount Term NaN values using the mode (most frequent value)

In [None]:
# Most frequent Loan_Amount_Term in the training data, truncated to an integer.
mode_loan_amount_term = math.trunc(mode(train_ds['Loan_Amount_Term']))
mode_loan_amount_term

In [None]:
# Fill missing Loan_Amount_Term in both frames with the training mode.
train_ds['Loan_Amount_Term'] = train_ds['Loan_Amount_Term'].fillna(mode_loan_amount_term)
test_ds['Loan_Amount_Term'] = test_ds['Loan_Amount_Term'].fillna(mode_loan_amount_term)

### Replacing Credit History NaN values using the mode (most frequent value)

In [None]:
# Most frequent Credit_History in the training data, truncated to an integer.
mode_credit_history = math.trunc(mode(train_ds['Credit_History']))
mode_credit_history

In [None]:
# Fill missing Credit_History in both frames with the training mode.
train_ds['Credit_History'] = train_ds['Credit_History'].fillna(mode_credit_history)
test_ds['Credit_History'] = test_ds['Credit_History'].fillna(mode_credit_history)

### Changing datatypes for variable to match train set

In [None]:
# Normalise Dependents to integers first (the column mixes values of different
# types after the '3+' replacement), then store it as an object column and
# make CoapplicantIncome a float64 column.
test_ds = (
    test_ds
    .astype({'Dependents': int})
    .astype({'Dependents': 'object', 'CoapplicantIncome': 'float64'})
)

## Outlier Treatment

### Applicant Income

In [None]:
# Replace ApplicantIncome by its natural log in both frames to compress the
# long right tail. An income of 0 becomes -inf here; the next cell cleans
# that up. Re-running this cell applies the log a second time.
for frame in (train_ds, test_ds):
    frame['ApplicantIncome'] = np.log(frame['ApplicantIncome'])

In [None]:
# log(0) produced -inf above; map any infinite value back to 0. The cleaned
# series is assigned back to the column, because a chained
# `df[col].replace(..., inplace=True)` is deprecated and does not update the
# frame when pandas Copy-on-Write is active.
train_ds['ApplicantIncome'] = train_ds['ApplicantIncome'].replace([np.inf, -np.inf], 0)
test_ds['ApplicantIncome'] = test_ds['ApplicantIncome'].replace([np.inf, -np.inf], 0)

In [None]:
# Distribution of the log-transformed applicant income.
sns.displot(data=train_ds['ApplicantIncome'])

### Loan Amount

In [None]:
# Replace LoanAmount by its natural log in both frames.
# Re-running this cell applies the log a second time.
for frame in (train_ds, test_ds):
    frame['LoanAmount'] = np.log(frame['LoanAmount'])

In [None]:
# Distribution of the log-transformed loan amount.
sns.displot(data=train_ds['LoanAmount'])

## Model Building

### Drop Loan_ID from dataset as this has no effect on target variable

In [None]:
# Remove the Loan_ID identifier column from both frames.
train_ds = train_ds.drop(columns='Loan_ID')
test_ds = test_ds.drop(columns='Loan_ID')

### Need target variable in separate dataset

In [None]:
# Feature matrix: every training column except the target. `drop` no longer
# accepts the axis as a second positional argument (TypeError in pandas >= 2.0),
# so the column is named through the `columns` keyword.
X = train_ds.drop(columns='Loan_Status')

In [None]:
# Target vector: the Loan_Status column of the training frame.
y = train_ds['Loan_Status']

### Dummy Variables for Categorical Values

In [None]:
# One-hot encode the categorical columns. Each frame is encoded independently,
# so the resulting column sets are only identical if every category occurs in
# each frame - NOTE(review): the test columns are aligned to X.columns later.
X = pd.get_dummies(X)
train_ds = pd.get_dummies(train_ds)
test_ds = pd.get_dummies(test_ds)

### Train-Test Split

In [None]:
# Hold out 30% of the labelled rows for validation. A fixed random_state makes
# the split - and therefore the reported validation accuracy - reproducible
# across Restart & Run All.
x_train, x_cv, y_train, y_cv = train_test_split(X, y, test_size=0.3, random_state=42)

## Use Logistic Regression and Fit the Model

In [None]:
# Fit the classifier on the training split. The features are not scaled, so
# the iteration budget is raised above the default of 100 to give the solver
# room to converge. The stray trailing `LogisticRegression()` expression was
# removed: it built and displayed an unrelated, unfitted estimator.
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

### Test on validation

In [None]:
# Predict on the held-out validation split and show the fraction of correct labels.
pred_cv = model.predict(x_cv)
accuracy_score(y_cv, pred_cv)

### Make sure features in test align with features used to fit the model

In [None]:
# Select the model's feature columns from the test frame (raises KeyError if
# any is missing) and list them in training order.
test_ds[X.columns].columns.tolist()

In [None]:
# Keep exactly the training feature columns, in training order.
test_ds = test_ds.loc[:, X.columns]

### Test on test data

In [None]:
# Predict the encoded loan status for every row of the prepared test frame.
pred_test = model.predict(test_ds)

### Prepare submission format needed for hackathon

In [None]:
# Peek at the submission frame to see the expected layout.
submission.head()

In [None]:
# Write the predictions and the matching identifiers into the submission frame.
# NOTE(review): assumes pred_test has exactly one entry per submission row, in
# the same order as test_ds_original - confirm no test rows were dropped upstream.
submission['Loan_Status'] = pred_test
submission['Loan_ID'] = test_ds_original['Loan_ID']

In [None]:
# Convert the numeric predictions back to the 'N' / 'Y' labels. The result is
# assigned back to the column: a chained `df[col].replace(..., inplace=True)`
# is deprecated and leaves the frame untouched under pandas Copy-on-Write.
submission['Loan_Status'] = submission['Loan_Status'].replace({0: 'N', 1: 'Y'})

In [None]:
# Write only the two submission columns, without the index, to logistic.csv.
submission_out = submission.reindex(columns=['Loan_ID', 'Loan_Status'])
submission_out.to_csv('logistic.csv', index=False)

#### Submission 1: 76.38% test accuracy on Analytics Vidhya Loan Prediction Hackathon. **Did NOT include outlier treatment.**

#### Submission 2: 78.47% test accuracy on Analytics Vidhya Hackathon. Adding log strategy for handling outliers.

## Feature Engineering strat #1: Combine Applicant and Coapplicant incomes

In [None]:
# ApplicantIncome in train_ds/test_ds was log-transformed during outlier
# treatment, while CoapplicantIncome is still on the raw scale. Adding the two
# directly would mix log units with raw units, so the raw applicant income is
# read from the *_original frames and aligned on the working frames' row index
# (relies on the row index being preserved by the steps above).
train_ds['TotalIncome'] = (
    train_ds_original.loc[train_ds.index, 'ApplicantIncome'] + train_ds['CoapplicantIncome']
)
test_ds['TotalIncome'] = (
    test_ds_original.loc[test_ds.index, 'ApplicantIncome'] + test_ds['CoapplicantIncome']
)

In [None]:
# First 15 training rows, now including the TotalIncome column.
train_ds.head(15)

In [None]:
# First 10 test rows, now including the TotalIncome column.
test_ds.head(10)

In [None]:
# Replace TotalIncome by its natural log in both frames.
# Re-running this cell applies the log a second time.
for frame in (train_ds, test_ds):
    frame['TotalIncome'] = np.log(frame['TotalIncome'])

In [None]:
# Distribution of the log-transformed TotalIncome in the training frame.
sns.displot(data=train_ds['TotalIncome'])

In [None]:
# Distribution of the log-transformed TotalIncome in the test frame.
sns.displot(data=test_ds['TotalIncome'])

In [None]:
# The two separate income columns are superseded by TotalIncome; remove them.
income_columns = ['ApplicantIncome', 'CoapplicantIncome']
train_ds = train_ds.drop(columns=income_columns)
test_ds = test_ds.drop(columns=income_columns)