# Loan Defaulters Prediction



## Problem Statement:
   
   
   **An organization wants to predict which customers are likely to default on its consumer loan product.
   It has data about historic customer behavior, based on what it has observed.
   Hence, when it acquires new customers, it wants to predict who is risky and who is not.**

In [1]:
!pip install scikit-learn --upgrade --quiet

##  Downloading and Exploring the  Data

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 
%matplotlib inline
import seaborn as sns
plt.style.use('dark_background')


In [3]:
train_df=pd.read_csv("../input/loan-prediction-based-on-customer-behavior/Training Data.csv")
train_df

In [4]:
test_df=pd.read_csv("../input/loan-prediction-based-on-customer-behavior/Test Data.csv")
test_df

In [5]:
train_df.columns

In [6]:
train_df.info()

#### There are no null values in our dataset

In [7]:
train_df.isnull().sum()

In [8]:
train_df.describe()

## Data Visualization

In [9]:
# Bar chart of Income grouped by the Married/Single column.
plt.figure(figsize=(20, 14))
sns.barplot(x='Married/Single', y='Income', data=train_df)
# The style switch happens after this figure already exists; it remains the
# active global matplotlib style for the cells that follow.
plt.style.use('ggplot')
# Same tick-label size on both axes, then the enlarged axis titles.
plt.tick_params(axis='both', which='major', labelsize=15)
plt.xlabel('Married/Single', fontsize=25)
plt.ylabel('Income', fontsize=25);

In [10]:
# Pairwise relationships between the numeric columns of the training data.
# sns.pairplot is a figure-level function: it builds its own figure and grid,
# so a preceding plt.figure(...) only leaves an empty extra figure behind and
# its figsize is never applied. Use pairplot's own `height` / `aspect`
# arguments if a different panel size is wanted.
sns.pairplot(train_df);

In [None]:
# Bar chart of Income grouped by years of Experience.
plt.figure(figsize=(20, 12))
sns.barplot(x='Experience', y='Income', data=train_df)
# Re-selects the ggplot style globally (already active if the earlier
# bar-chart cell has been run).
plt.style.use('ggplot')
# Same tick-label size on both axes, then the enlarged axis titles.
plt.tick_params(axis='both', which='major', labelsize=15)
plt.xlabel('Experience', fontsize=25)
plt.ylabel('Income', fontsize=25);

In [None]:
# Point plot of Income against Age.
plt.figure(figsize=(20, 12))
sns.pointplot(x='Age', y='Income', data=train_df)
# Same tick-label size on both axes, then the enlarged axis titles.
plt.tick_params(axis='both', which='major', labelsize=15)
plt.xlabel('Age', fontsize=25)
plt.ylabel('Income', fontsize=25);


In [None]:
# Point plot of Income per Profession, drawn in green.
plt.figure(figsize=(20, 12))
sns.pointplot(x='Profession', y='Income', data=train_df, color='green')
# Enlarged axis titles.
plt.xlabel('Profession', fontsize=25)
plt.ylabel('Income', fontsize=25)
# Profession names are long, so the x tick labels are rotated to vertical.
plt.tick_params(axis='x', which='major', labelsize=15, rotation=90)
plt.tick_params(axis='y', which='major', labelsize=15)

In [None]:
# Bar chart of Experience per Profession.
plt.figure(figsize=(20, 12))
sns.barplot(x='Profession', y='Experience', data=train_df)
# Enlarged axis titles.
plt.xlabel('Profession', fontsize=25)
plt.ylabel('Experience', fontsize=25)
# Vertical, larger x tick labels for the profession names; smaller y labels.
plt.tick_params(axis='x', which='major', labelsize=20, rotation=90)
plt.tick_params(axis='y', which='major', labelsize=15)

In [None]:
# Point plot of Experience against Age.
plt.figure(figsize=(20, 12))
sns.pointplot(x='Age', y='Experience', data=train_df)
# Same tick-label size on both axes, then the enlarged axis titles.
plt.tick_params(axis='both', which='major', labelsize=15)
plt.xlabel('Age', fontsize=25)
plt.ylabel('Experience', fontsize=25);


## Step 2: Preparing the Data for training

In [None]:
# Feature columns: everything except the first and the last column of train_df.
# NOTE(review): presumably the first column is a row identifier and the last
# one is the target (Risk_Flag) -- confirm against train_df.columns.
input_cols = train_df.columns[1:-1]
input_cols.tolist()

In [None]:
## target Columns
target_col="Risk_Flag"
print(target_col)

In [None]:
## Input df
input_df=train_df[input_cols].copy()
input_df

In [None]:
## target df
target=train_df[target_col].copy()
target

In [None]:
train_df["CITY"].nunique()

In [None]:
train_df["Profession"].nunique()

In [None]:
train_df.STATE.nunique()

In [None]:
# Remove the CITY and STATE columns from the feature frame.
# `columns=` already names the axis, so no separate `axis` argument is needed.
input_df = input_df.drop(columns=['CITY', 'STATE'])
input_df

In [None]:
# Remove the ID column from the test frame.
# `columns=` already names the axis, so no separate `axis` argument is needed.
test_df = test_df.drop(columns='ID')
test_df

In [None]:
target

## Identifying Numeric and Categorical Columns

In [None]:
input_df.info()

In [None]:
# Names of the int64-typed feature columns; these are the ones that get scaled.
numeric_cols = list(input_df.select_dtypes(include='int64').columns)
numeric_cols

In [None]:
# Names of the object-typed feature columns; these are the ones that get
# one-hot encoded.
categorical_cols = list(input_df.select_dtypes(include=['object']).columns)
categorical_cols

In [None]:
input_df['House_Ownership'].unique()

In [None]:
input_df['Car_Ownership'].unique()

In [None]:
input_df['Married/Single'].unique()

## Scaling Numeric Columns

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler=MinMaxScaler()

In [None]:
# Fit the MinMaxScaler on the numeric feature columns.
# NOTE(review): the scaler is fit on every row of input_df, i.e. before the
# train/validation split made later in this notebook, so validation rows
# influence the scaling. Consider fitting on the training split only.
scaler.fit(input_df[numeric_cols])

In [None]:
input_df[numeric_cols]=scaler.transform(input_df[numeric_cols])

In [None]:
input_df[numeric_cols].describe()

## Encode Categorical Columns

In [None]:
input_df[categorical_cols]

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encoder for the categorical columns.
# Dense output is required because the encoded matrix is later assigned
# directly into DataFrame columns. Categories not seen during fit are encoded
# as all-zero rows instead of raising.
# `sparse_output` replaces the former `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and later removed; this notebook upgrades scikit-learn in
# its first cell, so the old keyword raises a TypeError.
encoder=OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [None]:
encoder.fit(input_df[categorical_cols])

In [None]:
# Names of the one-hot columns produced by the fitted encoder, in output order.
# get_feature_names() has been removed from scikit-learn (this notebook
# upgrades scikit-learn in its first cell); get_feature_names_out() is its
# replacement and yields the same "<column>_<category>" names.
encoded_cols=list(encoder.get_feature_names_out(categorical_cols))
len(encoded_cols)

In [None]:
input_df[encoded_cols]=encoder.transform(input_df[categorical_cols])

In [None]:
input_df[encoded_cols]

In [None]:
input_df

In [None]:
input_df.shape

## Splitting Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows for validation; the fixed random_state keeps the
# split reproducible between runs.
train_inputs, val_inputs, train_target, val_target = train_test_split(
    input_df[numeric_cols + encoded_cols],
    target,
    random_state=20,
    test_size=0.25,
)

In [None]:
train_inputs.shape

In [None]:
val_inputs

In [None]:
train_target.shape

In [None]:
val_target.shape

In [None]:
test_df

## Training Logistic Regression model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model=LogisticRegression(solver='liblinear',random_state=25,max_iter=150)

In [None]:
model.fit(train_inputs[numeric_cols+encoded_cols],train_target)

In [None]:
model.coef_.tolist()

In [None]:
model.intercept_

## Making Predictions and Evaluating the Model


In [None]:
train_preds=model.predict(train_inputs[numeric_cols+encoded_cols])

In [None]:
train_preds

In [None]:
# Count how many training rows were predicted as each class.
# The top-level pd.value_counts() is deprecated in recent pandas releases;
# Series.value_counts() is the supported equivalent.
pd.Series(train_preds).value_counts()

In [None]:
train_target

In [None]:
# Class probabilities for the training rows (one column per model class).
# Computed on train_inputs so the rows line up with train_preds/train_target;
# the full input_df would also include the validation rows.
train_probs=model.predict_proba(train_inputs[numeric_cols+encoded_cols])
train_probs

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(train_target,train_preds)

In [None]:
val_preds=model.predict(val_inputs[numeric_cols+encoded_cols])

In [None]:
val_preds

In [None]:
accuracy_score(val_target,val_preds)

## Making Predictions on Test Data

In [None]:
def predict_input(single_input):
    """Score one raw customer record with the current global ``model``.

    The record is scaled with the fitted ``scaler`` and one-hot encoded with
    the fitted ``encoder`` before prediction. The work is done on a copy, so
    the caller's data is never modified and the function can be called
    repeatedly on the same frame (the previous version scaled the global
    ``test_df`` in place, so every further call scored already-scaled data).

    Parameters
    ----------
    single_input : pandas.DataFrame
        Raw (unscaled, unencoded) data holding every column listed in
        ``numeric_cols`` and ``categorical_cols``. If it has several rows,
        only the first one is scored.

    Returns
    -------
    tuple
        ``(pred, prob)``: the predicted class for the first row and the
        probability the model assigns to that class.
    """
    # Only the first row is ever reported, so transform just that row; the
    # copy keeps the column assignments below from touching the caller's data.
    row = single_input.iloc[:1].copy()

    row[numeric_cols] = scaler.transform(row[numeric_cols])
    row[encoded_cols] = encoder.transform(row[categorical_cols])
    X_input = row[numeric_cols + encoded_cols]

    pred = model.predict(X_input)[0]
    # predict_proba columns follow model.classes_, so look up the column that
    # belongs to the predicted class.
    prob = model.predict_proba(X_input)[0][list(model.classes_).index(pred)]
    return pred, prob


predict_input(test_df)

## Training Decision Tree Model

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
model=DecisionTreeClassifier(random_state=25,max_depth=5)

In [None]:
model.fit(train_inputs[numeric_cols+encoded_cols],train_target)

In [None]:
model.score(train_inputs[numeric_cols+encoded_cols],train_target)

In [None]:
train_preds = model.predict(train_inputs[numeric_cols+encoded_cols])

In [None]:
train_preds

In [None]:
# Count how many training rows the decision tree predicted as each class.
# The top-level pd.value_counts() is deprecated in recent pandas releases;
# Series.value_counts() is the supported equivalent.
pd.Series(train_preds).value_counts()

In [None]:
# Training accuracy of the decision tree.
# Arguments are given as (y_true, y_pred), matching the function's signature
# and the other accuracy_score calls in this notebook.
accuracy_score(train_target,train_preds)

In [None]:
model.score(val_inputs[numeric_cols+encoded_cols], val_target)

In [None]:
from sklearn.tree import plot_tree, export_text

In [None]:
# Draw the top two levels of the fitted decision tree on a very wide canvas.
plt.figure(figsize=(80,20))
# feature_names is passed as a plain list, as in the export_text call in this
# notebook; some scikit-learn releases validate this parameter strictly and
# reject a pandas Index.
plot_tree(model, feature_names=list(train_inputs[numeric_cols+encoded_cols].columns), max_depth=2, filled=True);

In [None]:
# Text rendering of the fitted tree's decision rules, labelled with the
# feature column names; only the first 5000 characters are printed.
tree_text = export_text(
    model,
    feature_names=train_inputs[numeric_cols+encoded_cols].columns.tolist(),
    max_depth=10,
)
print(tree_text[:5000])

In [None]:
# Pair every feature column with the importance the fitted model assigns to
# it, most important first.
importance_df = pd.DataFrame(
    dict(
        feature=train_inputs[numeric_cols+encoded_cols].columns,
        importance=model.feature_importances_,
    )
).sort_values(by='importance', ascending=False)

In [None]:
importance_df.head(10)

In [None]:
# Horizontal bar chart of the ten most important features.
plt.title('Feature Importance')
sns.barplot(x='importance', y='feature', data=importance_df.iloc[:10]);

In [None]:
def predict_input(single_input):
    """Score one raw customer record with the current global ``model``.

    The record is scaled with the fitted ``scaler`` and one-hot encoded with
    the fitted ``encoder`` before prediction. The work is done on a copy, so
    the caller's data is never modified and the function can be called
    repeatedly on the same frame (the previous version scaled the global
    ``test_df`` in place, so every further call scored already-scaled data).

    Parameters
    ----------
    single_input : pandas.DataFrame
        Raw (unscaled, unencoded) data holding every column listed in
        ``numeric_cols`` and ``categorical_cols``. If it has several rows,
        only the first one is scored.

    Returns
    -------
    tuple
        ``(pred, prob)``: the predicted class for the first row and the
        probability the model assigns to that class.
    """
    # Only the first row is ever reported, so transform just that row; the
    # copy keeps the column assignments below from touching the caller's data.
    row = single_input.iloc[:1].copy()

    row[numeric_cols] = scaler.transform(row[numeric_cols])
    row[encoded_cols] = encoder.transform(row[categorical_cols])
    X_input = row[numeric_cols + encoded_cols]

    pred = model.predict(X_input)[0]
    # predict_proba columns follow model.classes_, so look up the column that
    # belongs to the predicted class.
    prob = model.predict_proba(X_input)[0][list(model.classes_).index(pred)]
    return pred, prob


predict_input(test_df)

## Using Logistic Regression, the accuracy score on the validation data is 0.8775; using DecisionTreeClassifier, it increases to 0.8780.