# MIS 637 - Final Project (Group 3)

In [1]:
################################
# Import the libraries
################################

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import f_oneway
from sklearn.metrics import classification_report,precision_score, recall_score,f1_score



# Data Understanding

In [None]:
# Loan_ID					Unique Loan ID
# Gender					Male/ Female
# Married					Applicant married (Y/N)
# Dependents				Number of dependents
# Education					Applicant Education (Graduate/ Under Graduate)
# Self_Employed				Self employed (Y/N)
# ApplicantIncome			Applicant income
# CoapplicantIncome			Coapplicant income
# LoanAmount				Loan amount in thousands
# Loan_Amount_Term			Term of loan in months
# Credit_History			(1- has all debts paid, 0- not paid)
# Property_Area				Urban/ Semi Urban/ Rural
# Loan_Status				(Target) Loan approved (Y/N)


In [2]:
# Read the raw loan application records from the CSV file
main_loan_df = pd.read_csv('Loan_Data.csv')

# Preview the first five rows to confirm the file parsed as expected
main_loan_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
# copying data to new variables 
loan_df = main_loan_df.copy()

In [4]:
loan_df.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
loan_df.shape

(614, 13)

In [6]:
loan_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [None]:
# check the loan approval status 
loan_df['Loan_Status'].value_counts()

In [None]:
# get a % of approved vs non-approved
loan_df['Loan_Status'].value_counts(normalize=True)

Looks like 422 people (68.73%) out of 614 people got approved for a loan

In [None]:
# Plot how many applications fall into each Loan_Status class (Y / N).
# A histogram on a categorical column counts the rows per category, which is
# the distribution the title promises; px.bar on the raw column would instead
# draw one bar per row.
fig = px.histogram(loan_df, x="Loan_Status", title ="Distribution of Loan Status")
fig.show()

In [None]:
loan_df['Married'].value_counts(normalize=True)

In [None]:
loan_df['Self_Employed'].value_counts(normalize=True)

In [None]:
loan_df['Credit_History'].value_counts(normalize=True)

In [None]:
loan_df['Gender'].value_counts(normalize=True)

In [None]:
loan_df['Dependents'].value_counts(normalize=True)

In [None]:
loan_df['Education'].value_counts(normalize=True)

In [None]:
loan_df['Property_Area'].value_counts(normalize=True)

In [None]:
loan_df['Loan_Amount_Term'].value_counts(normalize=True)

## Exploratory Data Analysis

### Heat Map 

In [7]:

# Build the correlation inputs on a separate frame so that `loan_df` itself is
# not modified here: the identifier and target columns are excluded and rows
# with missing values are dropped only for the purpose of the heatmap.
heatmap_df = loan_df.drop(columns=["Loan_ID", "Loan_Status"]).dropna()

# One-hot encode the categorical columns so every feature is numeric
X = pd.get_dummies(heatmap_df, drop_first=True)

In [8]:
cm = X.corr()

In [9]:
# Correlation heatmap of the encoded feature matrix.
# An explicit, larger figure and two-decimal annotations keep the cell labels
# readable when every pairwise correlation is annotated.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)

# title
ax.set_title("Correlation Heatmap")

# plot
plt.show()

KeyboardInterrupt: 

In [None]:
### Exploring relation between applicant income and coapplicant 

In [None]:
# Explore the relationship between applicant income and coapplicant income
# via a scatter plot with an OLS trendline
fig = px.scatter(loan_df, x="ApplicantIncome", y="CoapplicantIncome",trendline = 'ols')
fig.show()

#### "ApplicantIncome" and "CoapplicantIncome" show only a slight, minimal correlation per the analysis above

### Exploring relationship between gender and loan status

In [None]:
#Explore relationship between gender and loan status
ctgl = pd.crosstab(loan_df["Gender"],loan_df["Loan_Status"], normalize = 'index')
ctgl.plot.bar()

In [None]:
pd.crosstab(loan_df["Gender"],loan_df["Loan_Status"], normalize = 'index')

##### Analysis above shows that gender has no effect on approval status as the rejection and approval rates are the same for both genders

### Exploring relationship between property area and loan status

In [None]:
#Explore relationship between property area and loan status
ctgl = pd.crosstab(loan_df["Property_Area"],loan_df["Loan_Status"], normalize = 'index')
ctgl.plot.bar()

In [None]:
pd.crosstab(loan_df["Property_Area"],loan_df["Loan_Status"], normalize = 'index')

#### Rural and urban settings show a similar effect on loan status
#### However, Semiurban does make a difference. Change this variable to a binary "semiurban or not" indicator

### Exploring relationship between married and loan status

In [None]:
#Explore relationship between married and loan status
ctgl = pd.crosstab(loan_df["Married"],loan_df["Loan_Status"], normalize = 'index')
ctgl.plot.bar()

In [None]:
pd.crosstab(loan_df["Married"],loan_df["Loan_Status"], normalize = 'index')

#### Married or not does have an effect on loan status. keeping married

### Explore relationship between dependents and loan status

In [None]:
#Explore relationship between dependents and loan status
ctgl = pd.crosstab(loan_df["Dependents"],loan_df["Loan_Status"], normalize = 'index')
ctgl.plot.bar()

In [None]:
pd.crosstab(loan_df["Dependents"],loan_df["Loan_Status"], normalize = 'index')

#### Dependents seem to have some but not significant impact on approval.

### Explore relationship between education and loan status

In [None]:
#Explore relationship between education and loan status
ctgl = pd.crosstab(loan_df["Education"],loan_df["Loan_Status"], normalize = 'index')
ctgl.plot.bar()

In [None]:
pd.crosstab(loan_df["Education"],loan_df["Loan_Status"], normalize = 'index')

#### Education does have an impact on loan status. Leave as it is.

### Explore relationship between self-employed and loan status

In [None]:
#Explore relationship between self-employed and loan status
ctgl = pd.crosstab(loan_df["Self_Employed"],loan_df["Loan_Status"], normalize = 'index')
ctgl.plot.bar()

In [None]:
pd.crosstab(loan_df["Self_Employed"],loan_df["Loan_Status"], normalize = 'index')

#### Self_employed does not appear to impact loan status. remove self-employed

### Explore relationship between loan term and loan status

In [None]:
#Explore relationship between loan term and loan status
ctgl = pd.crosstab(loan_df["Loan_Amount_Term"],loan_df["Loan_Status"], normalize = 'index')
ctgl.plot.bar()

In [None]:
pd.crosstab(loan_df["Loan_Amount_Term"],loan_df["Loan_Status"], normalize = 'index')

#### No clear evidence that loan term has no impact on loan status, so keep loan term

### Explore relationship between credit history and loan status

In [None]:
#Explore relationship between credit history and loan status
ctgl = pd.crosstab(loan_df["Credit_History"],loan_df["Loan_Status"], normalize = 'index')
ctgl.plot.bar()

In [None]:
pd.crosstab(loan_df["Credit_History"],loan_df["Loan_Status"], normalize = 'index')

#### Strong evidence showing that Credit history impacts loan status. keep credit history

# Data Preparation: Data Quality and Preprocessing

In [None]:
# Drop the identifier column, which carries no predictive information.
# errors="ignore" keeps this cell safe to run when Loan_ID has already been
# removed (for example by an earlier cell or when the cell is re-run).
loan_df = loan_df.drop(columns=['Loan_ID'], errors="ignore")

#### Drop "gender" column since "Gender" proven to have no effect on loan status

In [None]:
# Drop "gender" column since "Gender" proven to have no effect on loan status
loan_df = loan_df.drop(columns = ['Gender'], axis=1)

#### Drop "Self_Employed" column since "Self_Employed" proven to have no effect on loan status

In [None]:
# Drop "Self_Employed" column since "Self_Employed" was found to have no effect on loan status
loan_df = loan_df.drop(columns = ['Self_Employed'], axis=1)

#### Drop "Dependents" column since "Dependents" proven to have no effect on loan status

In [None]:
loan_df = loan_df.drop(columns = ['Dependents'], axis=1)

### Treating "Property_Area" column

In [None]:
# Convert Property_Area into a new binary variable "Semiurban":
# 1 when the property area is Semiurban, 0 otherwise (Rural / Urban)
positive = ['Semiurban']
loan_df["Semiurban"] = loan_df["Property_Area"].isin(positive).astype(int)

In [None]:
loan_df['Property_Area'].value_counts(normalize=True)

In [None]:
loan_df['Semiurban'].value_counts(normalize=True)

In [None]:
# Drop the "Property_Area" column since the binary "Semiurban" variable replaces it
loan_df = loan_df.drop(columns = ['Property_Area'], axis=1)

### Treating coapplicant income columns

In [None]:
# Drop "ApplicantIncome" column since new variable is created
#loan_df = loan_df.drop(columns = ['CoapplicantIncome'], axis=1)

In [None]:
loan_df.info()

## Handling Missing Data

#### Drop all applicants with missing values

In [None]:
# Checking for null values in the columns
loan_df.isnull().sum()

In [None]:
#loan_df.Married.value_counts()

In [None]:
# get the number of missing values in the Gender column
#num_missing = loan_df["Married"].isnull().sum()

# generate an array of random values with the same length as the number of missing values
#random_values = np.random.choice(["Yes", "No"], size=num_missing)

# fill in the missing values with the random values
#loan_df.loc[loan_df["Married"].isnull(), "Married"] = random_values

#replace null with mode
#loan_df.Married.dropna()

In [None]:
#loan_df.Married.value_counts()

In [None]:
#loan_df.Dependents.value_counts()

In [None]:
# get the number of missing values in the Gender column
#num_missing = loan_df["Dependents"].isnull().sum()

# generate an array of random values with the same length as the number of missing values
#random_values = np.random.choice(["0", "1", "2", "3+"], size=num_missing)

# fill in the missing values with the random values
#loan_df.loc[loan_df["Dependents"].isnull(), "Dependents"] = random_values

#replace null with mode
#loan_df.Dependents.fillna(loan_df.Dependents.mode()[0],inplace=True)

#drop null values


In [None]:
#loan_df.Dependents.value_counts()

In [None]:
#loan_df.Self_Employed.value_counts()

In [None]:
# get the number of missing values in the Gender column
#num_missing = loan_df["Self_Employed"].isnull().sum()

# generate an array of random values with the same length as the number of missing values
#random_values = np.random.choice(["Yes", "No"], size=num_missing)

# fill in the missing values with the random values
#loan_df.loc[loan_df["Self_Employed"].isnull(), "Self_Employed"] = random_values

#replace null with mode
#loan_df.Self_Employed.fillna(loan_df.Self_Employed.mode()[0],inplace=True)

In [None]:
#loan_df.Self_Employed.value_counts()

In [None]:
#loan_df.LoanAmount.value_counts()

In [None]:
#replace null with median
#loan_df.LoanAmount.fillna(loan_df.LoanAmount.median(),inplace=True)

#replace null with mode
#loan_df.LoanAmount.fillna(loan_df.LoanAmount.mode()[0],inplace=True)

In [None]:
#loan_df.LoanAmount.value_counts()

In [None]:
#loan_df.Loan_Amount_Term.value_counts()

In [None]:
#replace null with median
#loan_df.Loan_Amount_Term.fillna(loan_df.Loan_Amount_Term.median(),inplace=True)

#replace null with mode
#loan_df.Loan_Amount_Term.fillna(loan_df.Loan_Amount_Term.mode()[0],inplace=True)

In [None]:
#loan_df.Loan_Amount_Term.value_counts()

In [None]:
#loan_df.Credit_History.value_counts()

In [None]:
# get the number of missing values in the Gender column
#num_missing = loan_df["Credit_History"].isnull().sum()

# generate an array of random values with the same length as the number of missing values
#random_values = np.random.choice([1, 0], size=num_missing)

# fill in the missing values with the random values
#loan_df.loc[loan_df["Credit_History"].isnull(), "Credit_History"] = random_values

#replace null with mode
#loan_df.Credit_History.fillna(loan_df.Credit_History.mode()[0],inplace=True)

In [None]:
#loan_df.Credit_History.value_counts()

In [None]:
#drop all rows with null values
loan_df = loan_df.dropna()

In [None]:
# Checking for null values in the columns
loan_df.isnull().sum()

# Identifying and Treating Outliers
### Option 1: Leave outliers as it is (best result so far)
### Option 2: Outliers capped with IQR bounds 
### Treating outliers seems to skew the result, so leave outliers as they are

In [None]:
# create a function to find outliers via IQR
def find_outliers_IQR(loan_df):
    """Return the values lying outside the 1.5 * IQR fences.

    The first and third quartiles of the input are computed, and every value
    strictly below ``q1 - 1.5 * IQR`` or strictly above ``q3 + 1.5 * IQR`` is
    selected by boolean indexing on the input.
    """
    lower_quartile, upper_quartile = loan_df.quantile(0.25), loan_df.quantile(0.75)
    spread = upper_quartile - lower_quartile
    below_fence = loan_df < (lower_quartile - 1.5 * spread)
    above_fence = loan_df > upper_quartile + 1.5 * spread
    return loan_df[below_fence | above_fence]

In [None]:
# create a function to replace outliers with median
#def impute_outliers_IQR_mode(loan_df):
   # q1 = loan_df.quantile(0.25)
   # q3 = loan_df.quantile(0.75)
   #IQR = q3-q1
    #lower = loan_df[(q1-1.5*IQR)]
    #upper = loan_df[(q3+1.5*IQR)]
    
   # loan_df = np.where(loan_df > upper, loan_df.mode()[0], np.where(loan_df < lower, loan_df.median(),loan_df))
    
    #return loan_df

### Check and treat outliers of Loan amount

In [None]:
# Check outliers of loan amount via histogram
fig = px.histogram(loan_df.LoanAmount, x="LoanAmount", title ="Distribution of Loan Amount")
fig.show()

In [None]:
# Check outliers of loan amount via box plot
fig = px.box(loan_df.LoanAmount, x="LoanAmount", title ="Distribution of Loan Amount")
fig.show()

In [None]:
# find outliers of LoanAmount via IQR
outliers = find_outliers_IQR(loan_df["LoanAmount"])
print("Number of outliers under 'LoanAmount': "+ str(len(outliers)))
print("Max Outlier Value: " + str(outliers.max()))
print("Min Outlier Value: " + str(outliers.min()))
#outliers

In [None]:
# Cap LoanAmount outliers at the most extreme observed values that still lie
# inside the IQR fences [q1 - 1.5*IQR, q3 + 1.5*IQR].
q1 = loan_df["LoanAmount"].quantile(0.25)
q3 = loan_df["LoanAmount"].quantile(0.75)
IQR = q3-q1

# Rows whose value is within both fences; the limits are taken from these rows
# only, so the lower limit is the smallest non-outlier (not the smallest outlier).
within_fences = loan_df["LoanAmount"].between(q1-1.5*IQR, q3+1.5*IQR)
upper_limit = loan_df["LoanAmount"][within_fences].max()
lower_limit = loan_df["LoanAmount"][within_fences].min()

# Values above/below the limits are replaced by the limits; others are unchanged
loan_df["LoanAmount"] = loan_df["LoanAmount"].clip(lower=lower_limit, upper=upper_limit)

In [None]:
#check for outliers of LoanAmount again via histagram 
fig = px.histogram(loan_df.LoanAmount, x="LoanAmount", title ="Distribution of Loan Amount")
fig.show()

In [None]:
#check for outliers of LoanAmount again via boxplot 
fig = px.box(loan_df.LoanAmount, x="LoanAmount", title ="Distribution of Loan Amount")
fig.show()

In [None]:
# find outliers of LoanAmount via IQR
outliers = find_outliers_IQR(loan_df["LoanAmount"])
print("Number of outliers: "+ str(len(outliers)))
print("Max Outlier Value: " + str(outliers.max()))
print("Min Outlier Value: " + str(outliers.min()))
outliers

In [None]:
#loan_df["LoanAmount"] = impute_outliers_IQR_mode(loan_df["LoanAmount"])

### Check and treat outliers of coapplicant income

In [None]:
# Check outliers of coapplicant income via histogram
fig = px.histogram(loan_df.CoapplicantIncome, x="CoapplicantIncome", title ="Distribution of Coapplicant Income")
fig.show()

In [None]:
# Check outliers of coapplicant income via box plot
fig = px.box(loan_df.CoapplicantIncome, x="CoapplicantIncome", title ="Distribution of Coapplicant Income")
fig.show()

In [None]:
# find outliers of coapplicant Income via IQR
outliers = find_outliers_IQR(loan_df["CoapplicantIncome"])
print("Number of outliers under 'CoapplicantIncome': "+ str(len(outliers)))
print("Max Outlier Value: " + str(outliers.max()))
print("Min Outlier Value: " + str(outliers.min()))
#outliers

In [None]:
# Cap CoapplicantIncome outliers at the most extreme observed values that still
# lie inside the IQR fences [q1 - 1.5*IQR, q3 + 1.5*IQR].
q1 = loan_df["CoapplicantIncome"].quantile(0.25)
q3 = loan_df["CoapplicantIncome"].quantile(0.75)
IQR = q3-q1

# Rows whose value is within both fences; the limits are taken from these rows
# only, so the lower limit is the smallest non-outlier (not the smallest outlier).
within_fences = loan_df["CoapplicantIncome"].between(q1-1.5*IQR, q3+1.5*IQR)
upper_limit = loan_df["CoapplicantIncome"][within_fences].max()
lower_limit = loan_df["CoapplicantIncome"][within_fences].min()

# Values above/below the limits are replaced by the limits; others are unchanged
loan_df["CoapplicantIncome"] = loan_df["CoapplicantIncome"].clip(lower=lower_limit, upper=upper_limit)


In [None]:
# Check outliers of coapplicant income again via histogram
fig = px.histogram(loan_df.CoapplicantIncome, x="CoapplicantIncome", title ="Distribution of Coapplicant Income")
fig.show()

In [None]:
# Check outliers of coapplicant income again via box plot
fig = px.box(loan_df.CoapplicantIncome, x="CoapplicantIncome", title ="Distribution of Coapplicant Income")
fig.show()

In [None]:
# find outliers of Coapplicant Income again via IQR
outliers = find_outliers_IQR(loan_df["CoapplicantIncome"])
print("Number of outliers: "+ str(len(outliers)))
print("Max Outlier Value: " + str(outliers.max()))
print("Min Outlier Value: " + str(outliers.min()))
outliers

In [None]:
#loan_df["ApplicantIncome"] = impute_outliers_IQR_mode(loan_df["ApplicantIncome"])


In [None]:
#loan_df["ApplicantIncome"] = impute_outliers_IQR(loan_df["ApplicantIncome"])

In [None]:
# Check outliers of applicant income again via histogram
#fig = px.box(loan_df.ApplicantIncome, x="ApplicantIncome", title ="Distribution of Applicant Income")
#fig.show()

### Check and treat outliers of applicant income

In [None]:
# Check outliers of applicant income via histogram
fig = px.histogram(loan_df.ApplicantIncome, x="ApplicantIncome", title ="Distribution of Applicant Income")
fig.show()

In [None]:
fig = px.box(loan_df.ApplicantIncome, x="ApplicantIncome", title ="Distribution of Applicant Income")
fig.show()

In [None]:
# Find outliers of ApplicantIncome via IQR and report their count and range
outliers = find_outliers_IQR(loan_df["ApplicantIncome"])
# Label now closes the quote around the column name, matching the other reports
print("Number of outliers under 'ApplicantIncome': "+ str(len(outliers)))
print("Max Outlier Value: " + str(outliers.max()))
print("Min Outlier Value: " + str(outliers.min()))
#outliers

In [None]:
# Cap ApplicantIncome outliers at the most extreme observed values that still
# lie inside the IQR fences [q1 - 1.5*IQR, q3 + 1.5*IQR].
q1 = loan_df["ApplicantIncome"].quantile(0.25)
q3 = loan_df["ApplicantIncome"].quantile(0.75)
IQR = q3-q1

# Rows whose value is within both fences; the limits are taken from these rows
# only, so the lower limit is the smallest non-outlier (not the smallest outlier).
within_fences = loan_df["ApplicantIncome"].between(q1-1.5*IQR, q3+1.5*IQR)
upper_limit = loan_df["ApplicantIncome"][within_fences].max()
lower_limit = loan_df["ApplicantIncome"][within_fences].min()

# Values above/below the limits are replaced by the limits; others are unchanged
loan_df["ApplicantIncome"] = loan_df["ApplicantIncome"].clip(lower=lower_limit, upper=upper_limit)


In [None]:
fig = px.box(loan_df.ApplicantIncome, x="ApplicantIncome", title ="Distribution of Applicant Income")
fig.show()

In [None]:
fig = px.histogram(loan_df.ApplicantIncome, x="ApplicantIncome", title ="Distribution of Applicant Income")
fig.show()

In [None]:
# find outliers of Applicant Income again via IQR
outliers = find_outliers_IQR(loan_df["ApplicantIncome"])
print("Number of outliers: "+ str(len(outliers)))
print("Max Outlier Value: " + str(outliers.max()))
print("Min Outlier Value: " + str(outliers.min()))
outliers

# Modeling Phase
### Logistic Regression

In [None]:
loan_df.shape

In [None]:
# Separate the target label (y) from the predictor columns (X)
y = loan_df["Loan_Status"]
X = loan_df.drop(columns=["Loan_Status"])

In [None]:
X.head(5)

In [None]:
loan_df.info()

In [None]:
# The different categories

# Categorical features: These features have categories 
# => Gender, Married, Self_Employed, Credit_History   (This is the final reasult = Loan_Status)

# Ordinal features: Variables in categorical features having some order involved 
# => Dependents, Education, Property_Area

# Numerical features: These features have numerical values 
# => ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term

In [None]:
X = pd.get_dummies(X, drop_first=True)

In [None]:
X.head(5)

In [None]:
# Split the dataset: training (60%) and validation (40%), with a fixed seed for reproducibility
    # TODO: confirm that 60/40 is the split we want to report
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [None]:
# Logistic Regression model
model = LogisticRegression()

# Fit it on the training data
model.fit(X_train, y_train)

# Predict loan status for the held-out data
y_pred = model.predict(X_test)

# Evaluation metrics; 'Y' (loan approved) is treated as the positive class
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label = 'Y')
recall = recall_score(y_test, y_pred, pos_label = 'Y')
f1 = f1_score(y_test, y_pred, pos_label = 'Y')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
# Print the computed score; interpolating `f1_score` would print the function object
print(f"f1: {f1}")

print(classification_report(y_test, y_pred))


# with null values filled with random (Option 1 for null)
# Accuracy: 0.7357723577235772

# with dropping the null values (Option 2 for null)
# Accuracy: 0.8020833333333334

#With null values filled with random and replace outliers with median 
#accuracy: 0.7723

#with null values filled with mode (Option 3 for null)
#0.7804878048780488

#with both null values and outliers filled with mode
#0.7845528455284553

#Null values removed and outliers capped via IQR boundaries
#0.8020304568527918

#Null values removed
#outliers capped IQR boundaries
#gender removed 
#property_area variable changed to binary of semiurban or not
# accuracy: 0.817258883248731

#Null values removed
#outliers capped IQR boundaries
#gender removed 
#property_area variable changed to binary of semiurban or not
#self-employed dropped
# accuracy:0.8260869565217391

#Null values removed
#gender removed 
#property_area variable changed to binary of semiurban or not
#self-employed dropped
#dependents dropped
#random seed = 42
# accuracy: 0.8436018957345972


In [None]:
# .score() takes the input and output as arguments and returns the ratio 
# of the number of correct predictions to the number of observations.
model.score(X_train, y_train)

# with null values filled with random (Option 1 for null)
# 0.7907608695652174

# with dropping the null values (Option 2 for null)
# 0.8263888888888888

#with replace outliers with median
#0.8098

#with null values filled with mode (option 3 for null)
#0.8288043478260869

#with both null values and outliers filled with mode
#0.8260869565217391

#Null values removed and outliers capped via IQR boundaries
#0.8067796610169492

#Null values removed
#outliers capped IQR boundaries
#gender removed 
#property_area variable changed to binary of semiurban or not
#training score: 0.8169491525423729

#Null values removed
#outliers capped IQR boundaries
#gender removed 
#property_area variable changed to binary of semiurban or not
#self-employed dropped
#training score: 0.8064516129032258

#Null values removed
#outliers capped IQR boundaries
#gender removed 
#property_area variable changed to binary of semiurban or not
#self-employed dropped
#applicant and coapplicant incomes dropped
#training score: 0.8032258064516129

#Null values removed
#gender removed 
#property_area variable changed to binary of semiurban or not
#self-employed dropped
#dependents dropped 
#random seed = 42
#training score: 0.8006329113924051


In [None]:
# FYI things for final project
# Create a ppt
    # Mention whats the prob
    # what are we working on
    # graphs
    # normal presentation (voice over)
# no need to submit the code
