In [3]:
import pandas as pd
from numpy import mean
from numpy import std
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

In [4]:
df = pd.read_json('loan_data.json')
df.head()

Unnamed: 0,Application_ID,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Income,Application_Status
0,LP001002,Male,No,0,Graduate,No,1,Urban,medium,Y
1,LP001003,Male,Yes,1,Graduate,No,1,Rural,medium,N
2,LP001005,Male,Yes,0,Graduate,Yes,1,Urban,low,Y
3,LP001006,Male,Yes,0,Not Graduate,No,1,Urban,low,Y
4,LP001008,Male,No,0,Graduate,No,1,Urban,medium,Y


1. Find % of total applicants for each unique value of dependents
2. Find the average number of dependents per applicant
3. Find the %of applications approved for self-employed applicants
4. What is the % of rejections for married male applicants
5. Which property area has the maximum approval ratio
6. Find average dependents per income group
7. Create a simple predictive model to assess whether a loan application will be approved or rejected and provide the accuracy score

#### 1. Find % of total applicants for each unique value of dependents

In [5]:
#unique dependent values
df.Dependents.unique()

array(['0', '1', '2', '3+'], dtype=object)

In [6]:
print(' % of total applicants for each unique value of dependents :')
df.groupby('Dependents').size()/len(df.index) * 100

 % of total applicants for each unique value of dependents :


Dependents
0     57.534247
1     16.634051
2     17.221135
3+     8.610568
dtype: float64

#### 2. Find the average number of dependents per applicant


In [7]:
#assuming 3 for every 3+ dependents
total_applicants = len(df.index)
(df.groupby('Dependents').size()).values 

array([294,  85,  88,  44], dtype=int64)

In [8]:
print('average number of dependents per applicant: ',sum((df.groupby('Dependents').size())*[0,1,2,3])/total_applicants)


average number of dependents per applicant:  0.7690802348336595


#### 3. Find the %of applications approved for self-employed applicants

In [9]:
approved_selfemp = df.groupby(['Self_Employed', 'Application_Status']).size()[3]/total_applicants
print('%of applications approved for self-employed applicants :{} % '.format(approved_selfemp))

%of applications approved for self-employed applicants :0.09001956947162426 % 


#### 4. What is the % of rejections for married male applicants

In [10]:
print('% of rejections for married male applicants : ', end =" ")
print(df.groupby(['Gender', 'Married', 'Application_Status']).size()[6]/total_applicants, ' %')

% of rejections for married male applicants :  0.17025440313111545  %


#### 5. Which property area has the maximum approval ratio

In [11]:
property_area = df.groupby(['Property_Area']).size()
status = df.groupby(['Property_Area', 'Application_Status']).size()
approval_ratio = status/property_area

In [12]:
# filtering by property area for approved application
arr_rt = pd.DataFrame(approval_ratio)
arr_rt = arr_rt[arr_rt.index.get_level_values('Application_Status').isin(['Y'])]
# arr_rt = pd.DataFrame(arr_rt)
print(arr_rt, '\n\n')
max_apr_property = arr_rt.idxmax()
print('Property area has the maximum approval ratio is ',max_apr_property)

                                         0
Property_Area Application_Status          
Rural         Y                   0.604027
Semiurban     Y                   0.776650
Urban         Y                   0.630303 


Property area has the maximum approval ratio is  0    (Semiurban, Y)
dtype: object


#### 6. Find average dependents per income group

In [13]:
def convert_int(string):
    if string == '3+':
        return(3)
    else:
        return(int(string))

In [14]:
df['Dependents'] = df['Dependents'].apply(lambda x: convert_int(x) )
df2 = pd.DataFrame({'count':df.groupby(['Income','Dependents']).size()}).reset_index()
df2['aggr'] = df2['Dependents'] * df2['count']
df2.groupby(['Income'])['aggr'].sum()/df2.groupby('Income')['count'].sum()

Income
high      0.888889
low       0.641026
medium    0.922280
dtype: float64


#### 7. Create a simple predictive model to assess whether a loan application will be approved or rejected and provide the accuracy score

In [15]:
y = df.Application_Status
X = df.drop(['Application_Status', 'Application_ID'],1)

In [16]:
# split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# ordinal encode input variables
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X_train)
X_train = ordinal_encoder.transform(X_train)
X_test = ordinal_encoder.transform(X_test)
# ordinal encode target variable
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)
# define the model
model = LogisticRegression()
# fit on the training set
model.fit(X_train, y_train)
# predict on test set
yhat = model.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 80.47


In [17]:
# split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# one-hot encode input variables
onehot_encoder = OneHotEncoder()
onehot_encoder.fit(X_train)
X_train = onehot_encoder.transform(X_train)
X_test = onehot_encoder.transform(X_test)
# ordinal encode target variable
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)
# define the model
model = LogisticRegression()
# fit on the training set
model.fit(X_train, y_train)
# predict on test set
yhat = model.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 79.29
