In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pickle

### 1.	Import required libraries and read the dataset. (2)
### 2.	Check the first few samples, shape, info of the data and try to familiarize yourself with different features. (2)
### 3.	Check for missing values in the dataset; if any are present, handle them with appropriate methods and drop redundant features. (2)

In [None]:
# Load the loan-approval dataset and preview the first five rows.
# Raw string: the backslashes of the Windows path are taken literally instead
# of being parsed as (invalid) escape sequences such as "\P" and "\l", which
# newer Python versions warn about. The resulting path value is unchanged.
# NOTE(review): this is a machine-specific absolute path.
df=pd.read_csv(r'F:\Python graded project 3\loan_approval_data.csv')
df.head(5)

In [None]:
# Dataset dimensions as (rows, columns)
df.shape

In [None]:
# Column dtypes and non-null counts (a first look at where values are missing)
df.info()

In [None]:
# Summary statistics of the dataset's columns
df.describe()

In [None]:
# Distinct values present in the property_area column
df['property_area'].unique()

In [None]:
# Distinct values present in the education column
df['education'].unique()

In [None]:
# Distinct values of the target column loan_status
df['loan_status'].unique()

In [None]:
# Treat empty strings as missing values so the null checks below count them
df = df.replace(to_replace='', value=np.nan)

In [None]:
# Percentage of missing values in each column
null_counts = df.isnull().sum()
null_counts / len(df) * 100


In [None]:
# Number of fully duplicated rows
int(df.duplicated().sum())


In [None]:
# missing value treatment
# majority of credit history has value 1 as per df.describe(), so impute 1.
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the in-place form acts on an intermediate object, is
# deprecated in recent pandas and does not update df under Copy-on-Write
# (the warning is hidden here because warnings are globally ignored).
df["credit_history"] = df["credit_history"].fillna(1)

In [None]:
# loanamount and loan_amount_term: impute missing values with the column median.
# The result is assigned back to df rather than using fillna(inplace=True) on a
# column selection, which is deprecated in recent pandas and does not update df
# under Copy-on-Write.
for numeric_col in ["loanamount", "loan_amount_term"]:
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].median())

In [None]:
# Categoricals gender, dependents, self_employed, married: impute missing
# values with the most frequent value (mode) of each column.
# The result is assigned back to df rather than using fillna(inplace=True) on a
# column selection, which is deprecated in recent pandas and does not update df
# under Copy-on-Write.
for categorical_col in ["gender", "dependents", "self_employed", "married"]:
    # mode() returns a Series (ties possible); take its first entry
    df[categorical_col] = df[categorical_col].fillna(df[categorical_col].mode()[0])

In [None]:
# Re-check the percentage of missing values per column after imputation
remaining_nulls = df.isnull().sum()
remaining_nulls / len(df) * 100

In [None]:
## loan_id only identifies a row and carries no signal for model building
df = df.drop(columns=['loan_id'])


## 4.	Visualize the distribution of the target column 'loan_status' with respect to various categorical features and write your observations. (2)

In [None]:
# Object-dtype (categorical) columns. The `np.object` alias was removed in
# NumPy 1.24, so the dtype is selected by name instead.
categorical_features=df.select_dtypes(include=['object'])
# Count plot of every categorical column, split by loan_status.
# The target column itself is object-typed here, so it gets a panel too.
plt.figure(figsize=(30,25))
for i,feature in enumerate(categorical_features.columns, start=1):
    plt.subplot(6,3,i)  # 6x3 grid, panels are numbered from 1
    sns.countplot(x=feature,hue='loan_status',data=df)

## Observations
## More males as compared to females have their loan approved
## More married people have their loans approved
## Most people with no dependents have their loans approved
## Most graduates have their loans approved
## Most Non self employed have their loans approved
## Most people in semi urban have their loans approved

## 5.	Encode the categorical data. (2)

In [None]:
# Multi-category columns that are converted to integer label codes
label_type_variables = df[['dependents', 'property_area']].columns.tolist()
le = LabelEncoder()
def encoder(df):
    """Label-encode the columns listed in ``label_type_variables`` in place.

    Each column is cast to ``str``, run through the shared module-level ``le``
    encoder and written back to ``df`` as integers. ``le`` is refit for every
    column, so after the call it only holds the classes of the last column
    processed. Returns ``None``.
    """
    for column in label_type_variables:
        codes = le.fit_transform(df[column].astype('str'))
        df[column] = codes.astype('int')
# Apply the in-place label encoding, then preview the first five rows
encoder(df)
df.head()

In [None]:
# One-hot encode every remaining categorical column, dropping the first level
# of each so the dummy columns are not redundant
df = pd.get_dummies(data=df, drop_first=True)
df.head()

In [None]:
# Column names after one-hot encoding
df.columns

In [None]:
# Give the one-hot education column a name without a space
df = df.rename(columns={"education_not graduate": 'education_notGrad'})

In [None]:
# Confirm the renamed column is present
df.columns

## 6.	Separate the target and independent features and split the data into train and test. 


In [None]:
# Target is the one-hot encoded loan status; every other column is a feature
target_col = 'loan_status_y'
y = df[target_col]
X = df.drop(columns=[target_col])

In [None]:
# Class balance of the target
y.value_counts()

In [None]:
# split 70:30
# random_state pins the shuffle so the split, and every accuracy reported
# below, is the same on each Restart & Run All.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)

In [None]:
# Peek at the first two training rows
X_train.head(2)

In [None]:
# Peek at the first two test rows
X_test.head(2)

## 7.	Build any classification model to predict the loan status of the customer and save your model using pickle. 

In [None]:
# Helper: fit a model and score it on the held-out split
def fit_n_print(model,X_train,X_test,y_train,y_test):
    """Fit ``model`` on the training data and return its test-set accuracy.

    ``model`` is fitted in place on ``(X_train, y_train)``; predictions for
    ``X_test`` are compared with ``y_test`` via ``accuracy_score``. Despite the
    name, nothing is printed; the accuracy is only returned.
    """
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)
    return accuracy_score(y_test, test_predictions)

In [None]:
# Candidate classifiers. The tree and ensemble models are randomised, so their
# random_state is pinned to make the reported accuracies reproducible.
lr=LogisticRegression()
nb=GaussianNB()
knn=KNeighborsClassifier()
dt=DecisionTreeClassifier(random_state=42)
rf=RandomForestClassifier(random_state=42)
adb=AdaBoostClassifier(random_state=42)
gb=GradientBoostingClassifier(random_state=42)

# Stacked ensemble: four base learners combined by a random-forest meta-learner
estimators=[('rf',rf),('knn',knn),('gb',gb),('adb',adb)]
sc=StackingClassifier(estimators=estimators, final_estimator=rf)

In [None]:
# Fit every candidate and collect its test accuracy, one row per model
candidates = {
    'Logistic Regression': lr,
    'Naive Bayes': nb,
    'KNN': knn,
    'Decision tree': dt,
    'Random Forest': rf,
    'Ada Boost': adb,
    'Gradient Boost': gb,
    'Stacking': sc,
}
result = pd.DataFrame(columns=['Accuracy'])
for model_name, model in candidates.items():
    result.loc[model_name] = fit_n_print(model, X_train, X_test, y_train, y_test)

In [None]:
# Accuracy table, one row per model
result

## Conclusion
## Gradient Boost gives the highest accuracy (81.62%) amongst all the classifier models
## and hence can possibly be used as the chosen classifier for predicting loan approval status.

In [None]:
## let us also check the F1 score for the gradient boosting model (the model refit and scored in the cell below)

In [None]:
from sklearn.metrics import  f1_score

In [None]:
model=gb.fit(X_train,y_train)
pred=model.predict(X_test)
f1_score=f1_score(y_test,pred)

In [None]:
f1_score

In [None]:
## pickle the model
# NOTE(review): this is a machine-specific absolute path.
filename = '/Users/moni/Documents/loan_prediction/loan_approval_predictor_final.pkl'
# The context manager closes (and flushes) the file even if dump() raises;
# the original bare open() never closed its handle.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Test-set predictions from the fitted gradient boosting model
pred

In [None]:
# Feature columns of the test split
X_test.columns

In [None]:
# Feature columns of the training split
X_train.columns