#### import all necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### The data is stored as a CSV file, so load it with the `read_csv` command

In [None]:
df=pd.read_csv('../input/bank-personal-loan-modelling/Bank_Personal_Loan_Modelling.csv')
df.head()

### Perform Exploratory Data Analysis

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
# ID and Zipcode might be removed as they may not be useful for our analysis

df.drop(['ID','ZIP Code'],axis=1,inplace=True)

In [None]:
df.columns

In [None]:
import plotly.express as px

In [None]:
fig=px.box(df,y=['Age', 'Experience', 'Income', 'Family', 'Education'])
fig.show()

#### The five-point summary shows that Experience contains negative values (this should be fixed).
    We can see the min, max, mean and standard deviation for all key attributes of the dataset.
    Income has a lot of noise and is slightly right-skewed; Age and Experience are roughly evenly distributed.

#### check if there is skewness in data or not!!

In [None]:
df.skew()

In [None]:
df.dtypes

#### now visualise Skewness by distribution

In [None]:
df.hist(figsize=(20,20))

#### INFERENCE from Histogram
    1.Age & Experience are to an extent equally distributed
    2.Income & Credit card spending are skewed to the right (positively skewed)
    3.We have more Undergraduates than Graduate and Advanced & Professional
    4.60% of customers have enabled online banking and went digital

In [None]:
import seaborn as sns

In [None]:
sns.distplot(df['Experience'])

In [None]:
df['Experience'].mean()

In [None]:
Negative_exp=df[df['Experience']<0]
Negative_exp.head()

In [None]:
sns.distplot(Negative_exp['Age'])

In [None]:
Negative_exp['Experience'].mean()

In [None]:
Negative_exp.size

In [None]:
# DataFrame.size is rows * columns (the number of cells), so it overstates the
# number of records; len() returns the row count, which is what the message reports.
print('There are {} records which has negative values for experience, approx {} %'.format(len(Negative_exp), (len(Negative_exp) / len(df)) * 100))

In [None]:
data=df.copy()

In [None]:
data.head()

#### use numpy where function to change the negative values to mean value derived from data with the same age group

In [None]:
# Treat negative experience as missing so it does not distort the averages.
valid_experience = data['Experience'].where(data['Experience'] >= 0)
# Mean of the valid experience values among customers of the same Age.
age_group_mean = valid_experience.groupby(data['Age']).transform('mean')
# Fill from the same-age mean first; if an age has no valid value at all,
# fall back to the overall mean of the valid values.
data['Experience'] = valid_experience.fillna(age_group_mean).fillna(valid_experience.mean())

In [None]:
data[data['Experience']<0]

In [None]:
plt.figure(figsize=(10,6))
sns.heatmap(df.corr(),annot=True)

#### We can see that Age & Experience are very strongly correlated.
#### Hence it is fine for us to keep Age and drop Experience to avoid a multicollinearity issue.

In [None]:
data=data.drop(['Experience'],axis=1)

In [None]:
data.head()

In [None]:
data['Education'].unique()

In [None]:
def mark(x):
    """Translate a numeric education code into a readable label.

    A value equal to 1 maps to 'Undergrad', a value equal to 2 maps to
    'Graduate', and every other value maps to 'Advanced/Professional'.
    """
    known_levels = ((1, 'Undergrad'), (2, 'Graduate'))
    for code, label in known_levels:
        if x == code:
            return label
    # Anything that is neither 1 nor 2 falls into the last category.
    return 'Advanced/Professional'

In [None]:
data['Edu_mark']=data['Education'].apply(mark)

In [None]:
data.head()

In [None]:
EDU_dis=data.groupby('Edu_mark')['Age'].count()

In [None]:
EDU_dis

In [None]:
fig=px.pie(data,values=EDU_dis, names=EDU_dis.index,title='Pie CHart')
fig.show()

#### Inference: We have more Undergraduates (41.92%) than Graduates (28.06%) and Advanced/Professional (30.02%)

In [None]:
data.columns

#### Lets Explore the account holder's distribution

In [None]:
def Security_CD(row):
    """Label a row by its 'Securities Account' and 'CD Account' flags.

    Returns one of four category strings for the 0/1 combinations of the
    two flags, or None when the flags match none of those combinations.
    """
    securities = row['Securities Account']
    deposit = row['CD Account']
    # (securities flag, deposit flag, label) checked in the original order.
    categories = (
        (1, 1, 'Holds Securites & Deposit'),
        (0, 0, 'Does not Holds Securites or Deposit'),
        (1, 0, ' Holds only Securites '),
        (0, 1, ' Holds only Deposit'),
    )
    for securities_flag, deposit_flag, label in categories:
        if securities == securities_flag and deposit == deposit_flag:
            return label
    return None
    

In [None]:
data['Account_holder_category']=data.apply(Security_CD,axis=1)

In [None]:
data.head()

In [None]:
values=data['Account_holder_category'].value_counts()
values.index

In [None]:
fig=px.pie(data,values=values, names=values.index,title='Pie CHart')
fig.show()

#### We can see that almost 87% of customers do not hold any securities or deposit account, and 3% hold both a securities and a deposit account. It would be good to encourage those 87% to open one of these accounts, as it will improve the assets of the bank

In [None]:
data.columns

In [None]:
px.box(data,x='Education',y='Income',facet_col='Personal Loan')

#### Inference: From the above plot we can say that the income of customers who availed a personal loan is almost the same irrespective of their Education

In [None]:
plt.figure(figsize=(12,8))
# sns.distplot is deprecated; with hist=False it only drew the KDE curve,
# which sns.kdeplot produces directly.
sns.kdeplot(data[data['Personal Loan']==0]['Income'],label='Income with no personal loan')
sns.kdeplot(data[data['Personal Loan']==1]['Income'],label='Income with personal loan')
plt.legend()

#### Conclusion: Customers Who have availed personal loan seem to have higher income than those who do not have personal loan

### automate above stuffs

In [None]:
def plot(col1,col2,label1,label2,title):
    """Overlay the density of ``col1`` for the two classes of a 0/1 column.

    Reads the module-level ``data`` frame and draws on a new 12x8 figure.

    Parameters
    ----------
    col1 : column whose distribution is drawn.
    col2 : column used to split the rows; rows where it equals 0 and rows
        where it equals 1 are plotted as separate curves.
    label1 : legend label for the ``col2 == 0`` curve.
    label2 : legend label for the ``col2 == 1`` curve.
    title : title of the figure.
    """
    plt.figure(figsize=(12,8))
    # sns.distplot is deprecated; with hist=False it only drew the KDE curve,
    # which sns.kdeplot produces directly.
    sns.kdeplot(data[data[col2]==0][col1],label=label1)
    sns.kdeplot(data[data[col2]==1][col1],label=label2)
    plt.legend()
    plt.title(title)

In [None]:
plot('Income','Personal Loan','Income with no personal loan','Income with personal loan','Income Distribution')

In [None]:
plot('CCAvg','Personal Loan','Credit card avg with no personal loan','Credit card avg with personal loan','Credit card avg Distribution')

In [None]:
plot('Mortgage','Personal Loan','Mortgage of customers with no personal loan','Mortgage of customers  with personal loan','Mortgage of customers  Distribution')

#### People with high mortgage value, i.e more than 400K have availed personal Loan

In [None]:
data.columns

In [None]:
col_names=['Securities Account','Online','Account_holder_category','CreditCard']


In [None]:
# One count plot per categorical column, with bars split by 'Personal Loan'.
for column in col_names:
    plt.figure(figsize=(10, 5))
    sns.countplot(data=data, x=column, hue='Personal Loan')

#### From the above graphs we can infer that customers who hold a deposit account, and customers who hold neither a securities account nor a deposit account, have availed a personal loan

#### Perform Hypothesis Testing


### Q. Is a person's age a factor in availing a loan? Does a person's income have an impact on availing a loan? Does family size lead them to avail a loan?

In [None]:
# x and y are keyword-only in current seaborn, so passing them positionally
# raises a TypeError; column names with data= work across versions.
sns.scatterplot(x='Age',y='Personal Loan',hue='Family',data=data)

In [None]:
import scipy.stats as stats

In [None]:
Ho='Age does not have impact on availing personal loan'
Ha='Age does  have impact on availing personal loan'

In [None]:
Age_no=np.array(data[data['Personal Loan']==0]['Age'])
Age_yes=np.array(data[data['Personal Loan']==1]['Age'])

In [None]:
# Two-sample t-test on Age between customers without and with a personal loan.
t, p_value = stats.ttest_ind(Age_no, Age_yes, axis=0)
significant = p_value < 0.05
if not significant:
    print(Ho,' as the p_value is greater than 0.05 with a value of {}'.format(p_value))
else:
    print(Ha,' as the p_value is less than 0.05 with a value of {}'.format(p_value))

#### automate above stuffs

In [None]:
def Hypothesis(col1,col2,HO,Ha,alpha=0.05,frame=None):
    """Run a two-sample t-test on ``col2`` between the two classes of ``col1``.

    The rows where ``col1 == 0`` and the rows where ``col1 == 1`` form the
    two samples. The matching hypothesis statement is printed together with
    the p-value; nothing is returned.

    Parameters
    ----------
    col1 : name of the 0/1 column used to split the rows.
    col2 : name of the numeric column whose values are compared.
    HO : text printed when the p-value is not below ``alpha``.
    Ha : text printed when the p-value is below ``alpha``.
    alpha : significance level, 0.05 by default.
    frame : DataFrame to test; when None the module-level ``data`` is used.
    """
    if frame is None:
        frame = data
    sample_zero = frame.loc[frame[col1] == 0, col2].to_numpy()
    sample_one = frame.loc[frame[col1] == 1, col2].to_numpy()
    t, p_value = stats.ttest_ind(sample_zero, sample_one, axis=0)
    # A NaN p-value (e.g. an empty sample) compares False against alpha and
    # would otherwise be reported as evidence for the null hypothesis.
    if np.isnan(p_value):
        print('The t-test could not be computed for {} grouped by {} (p_value is nan)'.format(col2, col1))
        return
    if p_value < alpha:
        print('{}, as the p_value is less than {} with a value of {}'.format(Ha, alpha, p_value))
    else:
        print('{} as the p_value is greater than {} with a value of {}'.format(HO, alpha, p_value))

In [None]:
Hypothesis('Personal Loan','Age',HO='Age does not have impact on availing personal loan',Ha='Age does  have impact on availing personal loan')

### Q..Income of a person has significant impact on availing Personal Loan or not?

In [None]:
Hypothesis(col1='Personal Loan',col2='Income',HO='Income does not have impact on availing personal loan',Ha='Income does  have impact on availing personal loan')

#### Income has a highly significant effect on availing a personal loan, as the p-value is less than 0.05 (reported value: 0.0)

### Q..Number of persons in the family has significant impact on availing Personal Loan or not?

In [None]:
# The null-hypothesis text previously read 'AgFamily ...', a leftover from the Age test.
Hypothesis('Personal Loan','Family',HO='Family does not have impact on availing personal loan',Ha='Family does  have impact on availing personal loan')

#### Family size has a significant effect on availing a personal loan, as the p-value is less than 0.05 (value: 1.4099040685673807e-05)