In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file that Kaggle mounted under the read-only input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**<span style="color:crimson;">Kindly upvote if you like the pipeline 😃</span>** 

### **Let's dive into finding the best model for production**

**Short description of the different steps followed**
   

    1. Understanding dataset.

    2. Exploratory data analysis.
      * Most importantly, plot a countplot of the target variable, this will reveal if the dataset is imbalanced.
      *  Since this is classification problem, we need to check if there is imbalance in the dataset 
      *  if the dataset is imbalanced we need to perform either undersampling or oversampling 
    3. Creating Baseline model 
    4. Gridsearch and Pipeline
      * We define a series of 5 models and a pipeline that runs each of them through grid search
      * The model with the highest accuracy is noted, along with its hyper-parameters

    5. Define production model with the found best parameter
    

**Summary of interesting findings from the exploratory data analysis**

    *  The loan acceptance and rejection rate is balanced
    *  The dataset is balanced dataset so we don't need undersampling or oversampling
    *  Male applicants are obviously more numerous than female applicants
    *  More graduates are given a loan than non-graduates
    *  Male's median loan amount is more than Females
    *  Males tend to ask more loan than females
    *  Graduates tend to ask more loan than non graduates



## Import necessary libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import TransformerMixin, BaseEstimator

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split

from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier


from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

import seaborn as sns
!pip install miceforest
import miceforest as mf

In [None]:
# Read the labelled training split and the unlabelled competition test split.
train_csv = '/kaggle/input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv'
test_csv = '/kaggle/input/loan-prediction-problem-dataset/test_Y3wMUE5_7gLdaTN.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

# Report how many rows each split contains.
print("Train has {} rows".format(df_train.shape[0]))
print("Test has {} rows".format(df_test.shape[0]))

In [None]:
from IPython.display import display

# Remove the column cap so every feature is visible in the previews.
pd.set_option('display.max_columns', None)

# Show the first five rows of the train split, then of the test split.
for frame in (df_train, df_test):
    display(frame.head(5))

### Helper function

In [None]:
def drop_unnecessary_columns(df, column_name):
    """
    Return a copy of the dataframe without the given columns.

    The input dataframe is left untouched; pandas raises ``KeyError`` when a
    requested column is not present.

    Parameters
    ----------
    df : dataframe
            full dataframe to trim
    column_name : list
            labels of the columns to remove
    ----------
    Returns: Dataframe without the listed columns
    """
    return df.drop(columns=column_name)

def print_unique_values(df):
    """
    Print the distinct values of every object-dtype column.

    Columns holding more than 5 distinct values are only flagged as such
    instead of being listed in full. Columns of any other dtype are skipped.

    Parameters
    ----------
    df : dataframe
            full dataframe to inspect
    ----------
    Returns: None
    """
    print("unique values\n")
    object_columns = (name for name in df.columns if df[name].dtypes == 'object')
    for name in object_columns:
        distinct = df[name].unique()
        if len(distinct) > 5:
            # Too many levels to be worth printing individually.
            print('{:>15s} \t more than 5 unique'.format(name))
            continue
        print('{:>15s} \t {}'.format(name, distinct))

def split_categ_numer(df):
    """
    Split a dataframe column-wise into a categorical part and a numerical part.

    A column counts as categorical when its dtype is ``object``; every other
    column is treated as numerical. Column order is preserved within each part.

    Parameters
    ----------
    df : dataframe
            full dataframe to split
    ----------
    Returns:
        dataframe with categorical columns
        dataframe with numerical columns
    """
    is_categorical = {name: df[name].dtype == 'object' for name in df.columns}
    categorical_col = [name for name in df.columns if is_categorical[name]]
    numerical_col = [name for name in df.columns if not is_categorical[name]]
    return df[categorical_col], df[numerical_col]


## Preliminary data analysis

* we see from the below preliminary analysis that 

    * There are no duplicate row
    * There are some nan's across columns in both train and test
    * The dataset is balanced
    

In [None]:
# Count rows that are exact duplicates of an earlier row, separately per split.
train_duplicates = df_train.duplicated().sum()
test_duplicates = df_test.duplicated().sum()
print("number of duplicate records in train - {}".format(train_duplicates))
print("number of duplicate records in test - {}".format(test_duplicates))

In [None]:
# Missing values per column in the training split.
df_train.isna().sum()

In [None]:
# Missing values per column in the test split.
df_test.isna().sum()

In [None]:
# Column dtypes and non-null counts of the training split.
df_train.info()

* We see below that the categorical variables have only a few distinct values each, except for Loan_ID, which will be removed later

In [None]:
# List the distinct values of each object-dtype column (columns with more than 5 are only flagged).
print_unique_values(df_train)

* The data set is reasonably balanced: the count of "N" values is almost 45% of the count of "Y" values (a ratio, which corresponds to roughly 31% "N" versus 69% "Y" overall)

In [None]:
# Class balance of the target.
# `approved` / `rejected` keep their meaning as raw row counts per label.
status_counts = df_train['Loan_Status'].value_counts()
approved = int(status_counts.get('Y', 0))
rejected = int(status_counts.get('N', 0))

# Report each label's share of the labelled rows. The previous output divided
# rejected by approved, which is a ratio between the classes and overstates the
# minority share when read as a proportion (and fails when there are no 'Y' rows).
labelled = approved + rejected
if labelled:
    print("share of 'No' {:>3.2f}% vs share of 'Yes' {:>3.2f}%".format(
        rejected / labelled * 100, approved / labelled * 100))
else:
    print("no 'Y'/'N' labels found in Loan_Status")

* From experience, we can say that loan_id is not a useful attribute for our classification problem
* hence we remove the "loan_id" from both train and test 

In [None]:
# preparing the train set
df_train = drop_unnecessary_columns(df_train, ['Loan_ID'])
# # preparing the test set
x_test = drop_unnecessary_columns(df_test, ['Loan_ID'])

In [None]:
# Separate the target label from the predictor columns.
target_column = 'Loan_Status'
y = df_train[target_column]
x = df_train.drop(columns=[target_column])

## Exploratory analysis

In [None]:
# Quick look at the first two training rows now that Loan_ID has been dropped.
df_train.head(2)

* The loan acceptance and rejection rate is balanced
* The dataset is balanced dataset so we don't need undersampling or oversampling

In [None]:
# Number of training rows per loan decision.
sns.countplot(data=df_train, x='Loan_Status')

* Male applicants are obviously more numerous than female applicants

In [None]:
# Loan decision counts, split by applicant gender.
sns.countplot(data=df_train, x='Loan_Status', hue='Gender')

* More graduates are given a loan than non-graduates

In [None]:
# Loan decision counts, split by education level.
sns.countplot(data=df_train, x='Loan_Status', hue='Education')

* Male's median loan amount is more than Females

In [None]:
# Distribution of the requested loan amount per gender.
sns.boxplot(data=df_train, x='Gender', y='LoanAmount')

* Males tend to ask more loan than females

In [None]:
# Loan amount per loan decision, one panel per gender.
# FacetGrid.map draws every panel from its own subset of the data, so a
# categorical plot without an explicit `order` may place the Loan_Status bars
# in a different order from one panel to the next. Pin the order once.
loan_status_order = sorted(df_train['Loan_Status'].dropna().unique())
fg = sns.FacetGrid(df_train, col='Gender')
fg.map(sns.barplot, 'Loan_Status', 'LoanAmount', order=loan_status_order)

* Graduates tend to ask more loan than non graduates

In [None]:
# Loan amount per loan decision, one panel per education level.
# FacetGrid.map draws every panel from its own subset of the data, so a
# categorical plot without an explicit `order` may place the Loan_Status bars
# in a different order from one panel to the next. Pin the order once.
loan_status_order = sorted(df_train['Loan_Status'].dropna().unique())
fg = sns.FacetGrid(df_train, col='Education')
fg.map(sns.barplot, 'Loan_Status', 'LoanAmount', order=loan_status_order)

## Data preprocessing

> Defining pipelines

In [None]:
# One-hot encoder producing a dense array.
# * `handle_unknown='ignore'` encodes a level that was not seen during fit as
#   all zeros instead of raising, which matters when a rare level is missing
#   from a training fold but present in the validation fold or the test set.
# * The dense-output keyword was renamed from `sparse` to `sparse_output`;
#   each scikit-learn release accepts only some of the two spellings, so try
#   the current one first and fall back to the old one.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Categorical columns: fill gaps with the most frequent level, then one-hot encode.
categorical_transformer = Pipeline(steps=[('simple_imputer', SimpleImputer(strategy='most_frequent')),
                                          ('one_hot_encodr', one_hot_encoder)
                                          ])

# Numerical columns: model-based imputation of missing values.
numerical_transformer = Pipeline(steps=[('iterative_imputer', IterativeImputer())])



In [None]:
# creating categorical train dataset and numerical train dataset
cat_train_df, numeri_train_df = split_categ_numer(x)

# extracting the categroical column names and numerical column names
cat_train_features = cat_train_df.columns
num_train_features = numeri_train_df.columns

In [None]:
# Route each column group through its own preprocessing pipeline.
column_pipelines = [
    ('num', numerical_transformer, num_train_features),
    ('cat', categorical_transformer, cat_train_features),
]
preprocessor = ColumnTransformer(transformers=column_pipelines)

In [None]:
# Hold out 30% of the labelled rows for validation.
# train_size must be the float 0.7: an integer is read as an absolute number of
# rows, so `train_size=70` trained on only 70 samples.
# NOTE: this rebinds `x_test`, which earlier held the Loan_ID-free competition
# test frame; from here on `x_test` / `y_test` are the validation hold-out.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=42)

# Encode the string labels as integers; the encoder is fitted on the training
# labels only and then reused for the hold-out labels.
lb = LabelEncoder()
y_train = pd.Series(lb.fit_transform(y_train))
y_test = pd.Series(lb.transform(y_test))

## Baseline model

* we take random forest as our baseline model,
* The accuracy of the model is 72.97%
* our objective is to find a model that give better accuracy than this baseline model
* hence we try other models with and without grid search

In [None]:
# Baseline: shared preprocessing followed by a random forest with default settings.
baseline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier()),
]
rf = Pipeline(steps=baseline_steps)

In [None]:
# Fit the baseline and predict the hold-out rows.
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

# Accuracy on the rows the model was fitted on versus the held-out rows.
train_accuracy = rf.score(x_train, y_train)
test_accuracy = accuracy_score(y_test, y_pred)
print('Train set score : ', train_accuracy)
print('Test set score : ', test_accuracy)

## Modelling 

### without Grid_search

* Here we find that the Gradient boosting is performing better with accuracy of 73.71%

In [None]:
# Candidate models, all with default hyper-parameters.
classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
]

# Fit each candidate behind the shared preprocessor and record its hold-out
# accuracy; `scores` follows the order of `classifiers`.
scores = []
for classifier in classifiers:
    candidate_steps = [('preprocessor', preprocessor), ('classifier', classifier)]
    pipe = Pipeline(steps=candidate_steps)
    pipe.fit(x_train, y_train)
    holdout_accuracy = pipe.score(x_test, y_test)
    scores.append(holdout_accuracy)

In [None]:
# Display names, in the same order as `classifiers` / `scores`.
ml_model = ['knn','decision tree', 'random forest', 'ada boost', 'gradient boost']
model_score_pairs = list(zip(ml_model, scores))
df_x = pd.DataFrame(model_score_pairs, columns=['models','scores'])
print(df_x)

# Bar chart of the scores, lowest to highest.
ranked_scores = df_x.sort_values(by='scores')
sns.barplot(data=ranked_scores, x='models', y='scores')
plt.xticks(rotation=90)
plt.show()

### With grid search

* However, using grid search we find that the random forest performs much better, with an accuracy of 78.57%, than the baseline model with an accuracy of 72.97%

In [None]:
# Candidate models; `grids` below lists one parameter grid per model in the same order.
classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier()
    ]

# Grid keys carry the `classifier__` prefix because each parameter belongs to
# the pipeline step named 'classifier'.
knn_param_grid = {
                'classifier__n_neighbors':[2,3,4,5]
                }

dt_param_grid = {
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__max_depth' : [4,5,6,7,8],
        }

# 'auto' is no longer listed for max_features: for a random forest classifier
# it was an alias of 'sqrt' (so it only duplicated work) and newer scikit-learn
# releases reject it, which makes every fit using it fail.
rf_param_grid = {
    'classifier__n_estimators': [200, 500],
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__max_depth' : [4,5,6,7,8],
    'classifier__criterion' :['gini', 'entropy']
    }

ada_param_grid = {
         'classifier__n_estimators':[200, 500]
        }

gbc_param_grid = {
              "classifier__learning_rate": [0.1,0.01,0.001]
        }

grids = [knn_param_grid, dt_param_grid, rf_param_grid, ada_param_grid, gbc_param_grid]

# Cross-validated grid search per model on the training rows only.
# `scores` and `best_params` are filled in the order of `classifiers`.
scores = []
best_params= []
for i, model in enumerate(classifiers):
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model)])
    CV = GridSearchCV(pipe, grids[i], n_jobs= 1)
    CV.fit(x_train, y_train)
    best_params.append(CV.best_params_)
    scores.append(CV.best_score_)

In [None]:
# Display names, in the same order as `classifiers` / `scores`.
ml_model = ['knn','decision tree', 'random forest', 'ada boost', 'gradient boost']
model_score_pairs = list(zip(ml_model, scores))
df_x = pd.DataFrame(model_score_pairs, columns=['models','scores'])
print(df_x)

# Bar chart of the best cross-validated scores, lowest to highest.
ranked_scores = df_x.sort_values(by='scores')
sns.barplot(data=ranked_scores, x='models', y='scores')
plt.xticks(rotation=90)
plt.show()

## Production model

* Since the acc of tuned Random forest model is better than any model, we will use it to estimate the loan_status of the test set

In [None]:
# `best_params` was filled in the order knn, decision tree, random forest,
# ada boost, gradient boost, so the random forest entry sits at index 2
# (index 3 holds the AdaBoost parameters).
random_forest_param = best_params[2]
print(random_forest_param)

In [None]:
# Production model: shared preprocessing followed by a random forest.
rf = Pipeline(steps = [('preprocessor',preprocessor),
                      ('classifier', RandomForestClassifier(n_estimators= 200))])

# Apply the hyper-parameters stored in `random_forest_param`. Its keys carry the
# `classifier__` prefix used by the grid search, so they can be set directly on
# the pipeline; previously the selected values were printed but never used.
rf.set_params(**random_forest_param)

# Refit on every labelled row, then predict the competition test set. The labels
# in `y` are the original strings, so the predictions are strings as well.
rf.fit(x, y)
y_pred = rf.predict(df_test)

In [None]:
# Place the predicted labels next to the test rows (joined on the row index);
# the prediction column keeps the default label 0.
y_pred_series = pd.Series(y_pred)
prediction_column = y_pred_series.to_frame()
d = pd.concat([df_test, prediction_column], axis=1)
d