# Bank Loans Data - Hypothesis Testing

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import re

import scipy.stats as stats

# set the graphs to show in the jupyter notebook
%matplotlib inline

# set seaborn graphs to a better style
sns.set(style="ticks")

from dateutil.relativedelta import relativedelta

In [None]:
# Read the loans dataset from the CSV file in the working directory and preview it
loans_csv = 'LoansData.csv'
bankloans = pd.read_csv(loans_csv)
bankloans

In [None]:
# Tidy the column names: swap every '.' for '_' so each column can be used as an attribute

bankloans.columns = bankloans.columns.map(lambda name: name.replace('.', '_'))

In [None]:
# Data type conversions: strip the unit text from string columns and cast them to numbers.
# Every replacement passes regex=False so the pattern is always treated as a literal;
# '+' is a regex metacharacter and the default of `regex` differs between pandas versions.

# Percentages: drop the '%' sign and store as float
bankloans['Interest_Rate'] = bankloans['Interest_Rate'].str.replace('%', '', regex=False).astype('float')

bankloans['Debt_To_Income_Ratio'] = bankloans['Debt_To_Income_Ratio'].str.replace('%', '', regex=False).astype('float')

# Loan length: drop the 'months' suffix and store as int
bankloans['Loan_Length'] = bankloans['Loan_Length'].str.replace('months', '', regex=False).astype('int')

# Employment length: drop 'years'/'year' and the '<' / '+' qualifiers, then store as float
# (so a value such as '10+ years' would become 10.0)
bankloans['Employment_Length'] = (
    bankloans['Employment_Length']
    .str.replace('years', '', regex=False)
    .str.replace('year', '', regex=False)
    .str.replace('<', '', regex=False)
    .str.replace('+', '', regex=False)
    .astype('float')
)

# FICO range: keep only the part before the '-' (the lower bound of the range) as float
bankloans['FICO_Range'] = bankloans['FICO_Range'].str.split('-', expand=True)[0].astype('float')

In [None]:
# Show the dtype of every column to check the conversions
bankloans.dtypes

In [None]:
# Share of rows that are exact duplicates of an earlier row
# (the mean of the boolean mask is the same as sum / count)

bankloans.duplicated().mean()  # no duplicate values found

# Missing values detection and treatment

def missing_treatment(x):
    """Fill the missing values of a single column.

    Object-dtype columns are filled with their most frequent value (mode);
    columns of any other dtype are filled with their median.

    Parameters
    ----------
    x : pandas.Series
        One column, as passed by ``DataFrame.apply``.

    Returns
    -------
    pandas.Series
        A new Series with missing values filled. An object column that holds
        no non-missing values is returned unchanged.
    """
    if x.dtype == 'object':
        modes = x.mode()
        # An all-missing column has no mode, so modes[0] would raise; leave it as is.
        if modes.empty:
            return x
        return x.fillna(modes[0])
    return x.fillna(x.median())
 
# Outliers Detection and Treatment

def outlier_treatment(x):
    """Cap the extreme values of a single numeric column.

    Integer and float columns are clipped to their 1st and 99th percentiles;
    columns of any other dtype are returned unchanged.

    Parameters
    ----------
    x : pandas.Series
        One column, as passed by ``DataFrame.apply``.

    Returns
    -------
    pandas.Series
        The clipped column (a new Series; the input is not modified), or the
        input itself for non-numeric columns.
    """
    # dtype.kind is 'i' / 'u' / 'f' for signed int / unsigned int / float of any width
    if x.dtype.kind in 'iuf':
        # clip() returns a new Series, so its result must be returned to take effect
        return x.clip(lower=x.quantile(0.01), upper=x.quantile(0.99))
    return x

In [None]:
# Run the missing-value treatment on every column, then the outlier treatment
bankloans = (
    bankloans
    .apply(missing_treatment)
    .apply(outlier_treatment)
)

In [None]:
# Count the missing values remaining in each column
bankloans.isna().sum()

# Interest rate is varied for different loan amounts.

In [None]:
# Keep only the two columns needed for this test,
# then preview the first ten rows ordered by requested amount
Loan_vs_rate = bankloans.loc[:, ['Amount_Requested', 'Interest_Rate']]
Loan_vs_rate.head(10).sort_values('Amount_Requested')

**Null hypothesis (Ho): r = 0**
    
**Alternative hypothesis (Ha): r ≠ 0**
    
**Confidence level: 95%; significance level (α): 0.05**

In [None]:
# Pearson correlation coefficient and p-value for requested amount vs. interest rate
requested_amount = Loan_vs_rate['Amount_Requested']
interest_rate = Loan_vs_rate['Interest_Rate']
relate_1 = stats.pearsonr(requested_amount, interest_rate)
relate_1

***Correlation coefficient r = 0.332: there is a weak positive linear relationship between the loan amounts and the interest rates. Ho is rejected if the p-value returned alongside r is below 0.05.***

# Loan length impact on interest rate.

In [None]:
# Display the DataFrame again before the loan-length analysis
bankloans

In [None]:
# Number of distinct loan lengths in the data
bankloans.Loan_Length.nunique()

In [None]:
# Keep only loan length and interest rate for this test
Time_vs_rate = bankloans.loc[:, ['Loan_Length', 'Interest_Rate']]
Time_vs_rate

In [None]:
# Mean interest rate for each loan length
Time_vs_rate['Interest_Rate'].groupby(Time_vs_rate['Loan_Length']).mean()

**Null hypothesis (Ho): There is no relation between loan length and the interest rate.**
    
**Alternative hypothesis (Ha): There is a relation between loan length and interest rate.**
    
**Confidence level: 95%; significance level (α): 0.05**

In [None]:
# Interest rates of the two loan-length groups (36 and 60), selected with a mask on bankloans
s1, s2 = (
    Time_vs_rate.loc[bankloans.Loan_Length == length, 'Interest_Rate']
    for length in (36, 60)
)

In [None]:
# One-way ANOVA F-test comparing the interest rates of the two loan-length groups
relate_2 = stats.f_oneway(s1,s2)
relate_2

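***p-value <<< 0.05, so we reject Ho: the mean interest rate differs significantly between the loan lengths. (The test shows an association between loan length and interest rate; it does not by itself prove a direct causal effect.)***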

# Interest rate varies for different purpose of loans.

In [None]:
# Number of distinct loan purposes in the data
bankloans.Loan_Purpose.nunique()

In [None]:
# Number of loans recorded for each purpose
bankloans.Loan_Purpose.value_counts()

In [None]:
# Keep only loan purpose and interest rate for this test
Purpose_vs_rate = bankloans.loc[:, ['Loan_Purpose', 'Interest_Rate']]
Purpose_vs_rate

In [None]:
# Mean interest rate per loan purpose (rounded to 2 decimals), highest rate first
Purpose_vs_rates = (
    Purpose_vs_rate.groupby('Loan_Purpose')['Interest_Rate']
    .mean()
    .round(2)
    .to_frame()
    .reset_index()
    .sort_values(by='Interest_Rate', ascending=False)
)
Purpose_vs_rates.reset_index()

**Null hypothesis (Ho): Interest rates do not vary across the different purposes of loans.**
    
**Alternative hypothesis (Ha): Interest rates vary across the different purposes of loans.**
    
**Confidence level: 95%; significance level (α): 0.05**

In [None]:
# Interest rates for each loan purpose: a1 ... a14 follow the order of loan_purposes
loan_purposes = (
    'moving', 'debt_consolidation', 'house', 'other', 'credit_card',
    'small_business', 'wedding', 'vacation', 'medical', 'home_improvement',
    'car', 'educational', 'major_purchase', 'renewable_energy',
)
(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14) = (
    bankloans.loc[bankloans.Loan_Purpose == purpose, 'Interest_Rate']
    for purpose in loan_purposes
)

In [None]:
# One-way ANOVA F-test across the fourteen loan-purpose groups
purpose_samples = (a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14)
relate_3 = stats.f_oneway(*purpose_samples)
relate_3

***p-value << 0.05, hence we reject Ho and conclude that interest rates vary across the different purposes of loans.***

# There is relationship between FICO scores and Home Ownership.

**Null hypothesis (Ho): There is no relationship between FICO score and the Home Ownership.**
    
**Alternative hypothesis (Ha): There is a relationship between FICO score and the Home Ownership.**
    
**Confidence level: 95%; significance level (α): 0.05**

In [None]:
# Number of distinct home-ownership categories in the data
bankloans.Home_Ownership.nunique()

In [None]:
# Number of distinct FICO_Range values in the data
bankloans.FICO_Range.nunique()

In [None]:
# Observed frequency table: home-ownership categories (rows) by FICO_Range values (columns)
observe = pd.crosstab(bankloans['Home_Ownership'], bankloans['FICO_Range'])
observe

In [None]:
# Chi-square test of independence on the observed contingency table
stats.chi2_contingency(observe)

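***p-value <<< 0.05, therefore we reject Ho and conclude that there is a relationship between FICO scores and Home Ownership. Note that the chi-square test only shows that the two variables are associated; the claim that people who own a home have higher FICO scores 
would need a separate comparison of FICO scores across the ownership groups.***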