In [None]:
import numpy as np
import seaborn as sbs
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import plotly.express as px
import plotly.figure_factory as ff
from sklearn.metrics import accuracy_score,f1_score,precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')


In [None]:
#loading data
def load_csv(file):
    """Read a CSV source into a pandas DataFrame.

    Parameters
    ----------
    file : str, path object or file-like
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using ``pd.read_csv`` defaults.
    """
    frame = pd.read_csv(file)
    return frame
    
# Load the loan-modelling CSV; the bare file name is resolved against the
# kernel's current working directory.
data = load_csv('Visa_For_Lisa_Loan_Modelling.csv')


In [None]:
#viewing data info
def view_data_info(data):
    """Print a quick textual overview of a DataFrame.

    Emits, in order: the shape, the per-column dtypes, the first 20 rows
    and the output of ``describe()``. Nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.
    """
    preview = data.head(20)
    summary = data.describe()

    print("Data Dimension: ", data.shape)
    print("Data Types: ", data.dtypes)

    # Each section is a heading, a blank spacer, then the table itself.
    for heading, table in (("first 20 rows: ", preview), ("Statistics", summary)):
        print(heading)
        print("\n")
        print(table)
# Print the shape, dtypes, first 20 rows and summary statistics of the loaded frame.
view_data_info(data)

In [None]:
def cleaning_data(data):
    """Drop incomplete rows and move the target column to the end.

    Rows containing any missing value are removed, then the columns are
    reordered so that ``Personal Loan`` is the last one. The share of
    missing values per column and whether ``ID`` is unique are printed as
    sanity checks. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame; must contain the ``Personal Loan`` and ``ID`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without missing values and with ``Personal Loan`` last.
        ``ID`` stays an ordinary column because later cells filter on it by
        column name.
    """
    target = 'Personal Loan'

    # dropna is a no-op when nothing is missing, so both the "has NaNs" and
    # the "already clean" case go through the same path and always return a
    # frame (previously the NaN branch fell off the end and returned None).
    new_data = data.dropna(axis=0)

    # Put the target last regardless of how many columns the frame has,
    # instead of inserting at a hard-coded position.
    ordered = [col for col in new_data.columns if col != target] + [target]
    new_data = new_data[ordered]

    print(new_data.isnull().mean())  # expected to be all zeros after dropna
    print(new_data['ID'].is_unique)
    return new_data
    
# Rebind `data` to whatever cleaning_data returns; every later cell reads this
# name. The bare `data` on the last line displays the frame as the cell output.
data = cleaning_data(data)
data

In [None]:
def print_histograms(data):
    """Plot one histogram per column, skipping ``ZIP Code`` and ``ID``.

    The panels are arranged row by row on a grid four panels wide with as
    many rows as needed. The figure gets a super-title, is written to
    ``Visa_For_Lisa_Histogram.png`` and is then shown.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose remaining columns are plotted, one per panel.
    """
    # removing irrelevant columns
    columns = [col for col in data.columns if col not in ("ZIP Code", "ID")]

    n_cols = 4
    # Ceiling division so a partially filled last row still gets a grid row.
    n_rows = max(1, -(-len(columns) // n_cols))

    # squeeze=False keeps `axes` two-dimensional even for a single row;
    # the height scales so that three rows give the original 11x11 figure.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(11, 11 * n_rows / 3), squeeze=False)
    flat_axes = axes.ravel()  # row-major: fills each row left to right

    for ax, col in zip(flat_axes, columns):
        sbs.histplot(data[col], ax=ax)

    # Hide panels that have no column to show.
    for ax in flat_axes[len(columns):]:
        ax.set_visible(False)

    # suptitle() must be called; assigning to an attribute draws nothing.
    fig.suptitle("Histograms Of Columns in the dataset")
    # Save before showing so the file is written from the fully drawn figure.
    fig.savefig("Visa_For_Lisa_Histogram.png")
    plt.show()
print_histograms(data)

In [None]:
def print_scatter_matrix(dataset):
    """Draw an annotated correlation heatmap of ``dataset``.

    ``ZIP Code`` and ``ID`` are left out. The heatmap of ``DataFrame.corr()``
    is saved to ``Scatter_Matrix.png`` and then shown.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose pairwise column correlations are plotted.
    """
    # Select from the frame that was passed in, not from the notebook-level
    # `data`, so the function works for any frame.
    columns = [col for col in dataset.columns if col not in ('ZIP Code', 'ID')]
    new_data = dataset[columns]

    plt.figure(figsize=(10,12))
    sbs.heatmap(new_data.corr(), cmap='YlGnBu', annot=True)
    plt.title("A plot of the Correlation of the Numerical Columns in the Dataset", fontdict={'fontsize':10,'fontweight':'extra bold','horizontalalignment': 'center'}, y=1.0)
    # Written before show() so the file is produced from the drawn figure.
    plt.savefig("Scatter_Matrix.png")
    plt.show()
print_scatter_matrix(data)

In [None]:
# GETTING DISTRIBUTION OF AGES

# Number of customers per distinct age, ordered by age.
# The index is named explicitly before reset_index so the resulting column is
# always called 'AGE'; relying on the implicit 'index' column name fails on
# pandas >= 2.0, where value_counts() keeps the original column name instead.
Age = (
    data['Age']
    .value_counts()
    .rename_axis('AGE')
    .reset_index(name='Count')
    .sort_values(by='AGE')
    .reset_index(drop=True)
)
Age.head()

In [None]:
# Grouping the ages

# Bin edges: four fixed cut points plus the largest age present as the top edge.
# pd.cut uses right-closed intervals by default, so an age equal to the first
# edge (22) would not fall into any group.
bn = np.array([22, 34, 44, 64, Age['AGE'].max()])
Age['AGE_group'] = pd.cut(
    x=Age['AGE'],
    bins=bn,
    labels=["Early Adulthood", "Early Middles", "Late Middles", "Late Adulthood"],
)
Age

In [None]:
# Number of customers per distinct value of Experience, ordered by that value.
# Naming the index up front yields the 'Exp_years' column on every pandas
# version; the implicit 'index' column no longer exists on pandas >= 2.0.
Exp = (
    data['Experience']
    .value_counts()
    .rename_axis('Exp_years')
    .reset_index(name='Number')
    .sort_values(by='Exp_years')
    .reset_index(drop=True)
)
Exp.head()

In [None]:
# Five experience bands; the last band ends at the largest value present.
labels = ["Entry_level", "Intermediate_level", "Experienced_level", "Advanced_level", "Expert_level"]
max_exp = Exp['Exp_years'].max()
bins = np.array([-1, 2, 5, 8, 10, max_exp])

# NOTE(review): intervals are right-closed by default, so values <= -1 get no
# Job_level - confirm whether Experience can be negative in this dataset.
Exp['Job_level'] = pd.cut(x=Exp['Exp_years'], bins=bins, labels=labels)
Exp.head()

In [None]:
# Total number of customers in each Job_level band.
exp_by_level = Exp.groupby('Job_level', as_index=False)
group_exp = exp_by_level[['Number']].sum()
group_exp

In [None]:
# Row count for every (Age, Personal Loan) combination, as a flat table.
# size() equals count() of the key column here because rows with a missing
# grouping key are dropped by groupby in both cases.
data1 = (
    data.groupby(['Age', 'Personal Loan'])
    .size()
    .reset_index(name='values_count')
)
data1.head()

In [None]:
# Same cut points as the age table above, applied to the per-age loan counts.
bins = np.array([22, 34, 44, 64, Age['AGE'].max()])
data1['Age_group'] = pd.cut(
    x=data1['Age'],
    bins=bins,
    labels=["Early Adulthood", "Early Middle Age", "Late Middle Age", "Late Adulthood"],
)

In [None]:
# Show the first rows of data1.
data1.head()

In [None]:
# Total count per (age group, loan outcome) pair. observed=False spells out
# the behaviour the categorical Age_group key already had by default.
age = data1.groupby(['Age_group', 'Personal Loan'], observed=False).agg({'values_count': 'sum'})

# Share of each outcome within its age group. transform('sum') returns the
# group total aligned to the original rows, which avoids assigning the frame
# produced by groupby.apply (its index layout differs between pandas versions).
age['%'] = (
    100 * age['values_count']
    / age.groupby(level=0, observed=False)['values_count'].transform('sum')
).round(2)
age = age.reset_index(level=1).reset_index(level=0)

age


In [None]:
pip install kaleido

In [None]:
# Bar labels: the within-group percentage belonging to each bar.
text = [f"{pct}%" for pct in age['%']]

# Grouped bars: one bar per Personal Loan value inside every age group.
fig = px.bar(
    age,
    x='Age_group',
    y='values_count',
    color='Personal Loan',
    barmode='group',
    text=text,
    color_discrete_sequence=px.colors.sequential.Viridis,
)
fig.update_layout(
    plot_bgcolor='rgba(0, 175, 200, 0)',
    title=dict(
        text='Distribution of people who accepted and rejected the Loan by Age Groups',
        x=0.9,
        y=0.95,
        font_size=15,
    ),
)
fig.show()
# Static export of the same figure.
fig.write_image('dist_age.png')

## Distribution By Income

In [None]:
# Row count for every (Income, Personal Loan) combination, as a flat table.
# size() equals count() of the key column here because rows with a missing
# grouping key are dropped by groupby in both cases.
data2 = (
    data.groupby(['Income', 'Personal Loan'])
    .size()
    .reset_index(name='values_count')
)
data2.head()

In [None]:
# Three income classes; the top class ends at the largest income present.
bins = np.array([0, 52.2, 156.6, data2['Income'].max()])
data2['income_group'] = pd.cut(
    x=data2['Income'],
    bins=bins,
    labels=["Low Income Class", "Middle Income Class", "Upper Income Class"],
)


In [None]:
# Show the first rows of data2.
data2.head()

In [None]:
# Total count per (income class, loan outcome) pair. observed=False spells out
# the behaviour the categorical income_group key already had by default.
income = data2.groupby(['income_group', 'Personal Loan'], observed=False).agg({'values_count': 'sum'})

# Share of each outcome within its income class. transform('sum') keeps the
# totals aligned to the rows and avoids float() on a one-element Series,
# which newer pandas versions deprecate.
income['%'] = (
    100 * income['values_count']
    / income.groupby(level=0, observed=False)['values_count'].transform('sum')
).round(2)
income = income.reset_index(level=1).reset_index(level=0)
income

In [None]:
# Bar labels: the within-class percentage belonging to each bar.
text = [f"{pct}%" for pct in income['%']]

# Grouped bars: one bar per Personal Loan value inside every income class.
fig = px.bar(
    income,
    x='income_group',
    y='values_count',
    color='Personal Loan',
    barmode='group',
    text=text,
    color_discrete_sequence=px.colors.sequential.Viridis,
)
fig.update_layout(
    {
        'plot_bgcolor': 'rgba(0,175,200,0)',
        'title': {
            'text': 'Distribution of People who accepted and rejected the Loans by income group',
            'x': 0.2,
            'y': 0.95,
            'font_size': 15,
        },
    }
)
fig.show()

## Distribution by Other Features

In [None]:
def distribution(cols, dataset=None):
    """Print and plot how ``Personal Loan`` values are spread over one column.

    Rows are counted per (``cols`` value, ``Personal Loan`` value) pair and
    each count is expressed as a percentage of the total for that ``cols``
    value. The resulting table is printed and shown as a grouped bar chart.

    Parameters
    ----------
    cols : str
        Name of the column to break the loan outcomes down by.
    dataset : pandas.DataFrame, optional
        Frame to analyse. Defaults to the notebook-level ``data`` frame,
        which is what the function read before this parameter existed.
    """
    if dataset is None:
        dataset = data

    print('Distribution By ' + cols + ':')

    # One grouping pass is enough: the counts are already unique per
    # (cols, Personal Loan) pair, so re-grouping and summing them changes nothing.
    group = (
        dataset.groupby([cols, 'Personal Loan'])[['Personal Loan']]
        .count()
        .rename(columns={'Personal Loan': 'values_count'})
    )

    # transform('sum') returns each group's total aligned to the rows; this
    # replaces groupby.apply with float() on a one-element Series, which is
    # deprecated in newer pandas versions.
    totals = group.groupby(level=0)['values_count'].transform('sum')
    group['%'] = (100 * group['values_count'] / totals).round(2)
    group = group.reset_index(level=1).reset_index(level=0)
    print(group)

    text = [str(i) + '%' for i in group['%']]
    fig = px.bar(group, x=cols, y='values_count', color='Personal Loan', barmode='group', text=text,
                 color_discrete_sequence=px.colors.sequential.Viridis)
    fig.update_layout({'plot_bgcolor': 'rgba(255,99,71,0.05)'})
    fig.update_layout({'title': {'text': 'Distribution of People who accepted and rejected the Loans by {cols}'.format(cols=cols),
                                 'x': 0.2, 'y': 0.95,
                                 'font_size': 12
                                 }})
    fig.show()

# Columns to break the loan outcomes down by, one chart each.
columns = ['Family', 'Education', 'Securities Account','CD Account', 'Online', 'CreditCard']
for cols in columns:
    distribution(cols)
    

In [None]:
# CCAvg values split by Personal Loan value. Dedicated names are used so the
# `data1` DataFrame built in the age-distribution cells is not overwritten by
# a plain list, which would break those cells when they are re-run.
ccavg_loan_0 = data.loc[data['Personal Loan'] == 0, 'CCAvg'].tolist()
ccavg_loan_1 = data.loc[data['Personal Loan'] == 1, 'CCAvg'].tolist()
hist_data = [ccavg_loan_0, ccavg_loan_1]
group_labels = ['0','1']
colors = ['#333F44', '#37AA9C']

# Create distplot with curve_type set to 'normal'
fig = ff.create_distplot(hist_data, group_labels,curve_type='normal',colors=colors,histnorm='probability density', show_rug=False)

# Add title
fig.update_layout(title_text='CCAvg for Customers')
fig.update_layout({'plot_bgcolor':'rgba(255,99,71,0.05)'})
fig.show()

## Initialising Models

In [None]:
# Logistic regression with library defaults
log_reg = LogisticRegression()

# Gaussian naive Bayes with library defaults
naive_bayes = GaussianNB()

# k-nearest neighbours voting over the 15 closest training points
knn = KNeighborsClassifier(n_neighbors=15)

# Random forest: 70 trees, every feature considered at each split,
# at least 30 samples per leaf, out-of-bag scoring enabled
random = RandomForestClassifier(
    n_estimators=70,
    oob_score=True,
    n_jobs=-1,
    random_state=101,
    max_features=None,
    min_samples_leaf=30,
)

# Linear-kernel support vector machine
SVM = SVC(
    kernel="linear",
    C=0.025,
    random_state=101,
)

# Decision tree limited to depth 10 with at least 15 samples per leaf
dtree = DecisionTreeClassifier(
    max_depth=10,
    random_state=101,
    max_features=None,
    min_samples_leaf=15,
)

## Splitting the data into testing and training data

In [None]:
# Features are every column except the target and the ID column. ID is
# treated as an irrelevant identifier by the plotting cells of this notebook,
# so it is kept out of the predictors as well; errors='ignore' keeps the cell
# working if ID is not present as a column.
X = data.drop(columns=['Personal Loan', 'ID'], errors='ignore')
y = data['Personal Loan']

# 80/20 split with a fixed seed so the same rows land in each part on re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Display name -> estimator; the names double as row labels of the results table.
clf = {
    'Logistic Regression': log_reg,
    'Naive Bayes': naive_bayes,
    'KNN': knn,
    'Random Forests': random,
    'Support Vector Machine': SVM,
    'Decision Trees': dtree,
}

# Fit each model on the training split and score its test-split predictions,
# treating class 1 as the positive label.
evaluations = []
for classifier_name, classifier in clf.items():
    y_pred = classifier.fit(X_train, y_train).predict(X_test)
    evals = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, pos_label=1),
        'Recall': recall_score(y_test, y_pred, pos_label=1),
        'F1': f1_score(y_test, y_pred, pos_label=1),
    }
    evaluations.append(evals)

# Dict order is insertion order, so the keys line up with the rows collected above.
metrics = pd.DataFrame(evaluations, index=list(clf))
metrics

From the results above we can see that the Random Forests and Decision Trees models produced the best results.
We will tune their hyperparameters to see whether the results can be improved.

## IMPROVING THE RANDOM FOREST MODEL WITH TUNING

In [None]:
# Candidate values for the exhaustive search over random-forest settings.
n_estimators = [20, 50, 100]
min_samples_leaf = [2, 5, 10]
max_depth = [3, 5, 7]
# 'auto' is left out: current scikit-learn releases reject it, and for a
# classifier it was only an alias of 'sqrt', so no candidate is lost.
max_features = ['sqrt', 'log2', None]

# A fixed random_state makes the selected parameters reproducible between runs.
RandomForest_grid_search = GridSearchCV(
    RandomForestClassifier(random_state=101),
    param_grid={
        'n_estimators': n_estimators,
        'min_samples_leaf': min_samples_leaf,
        'max_depth': max_depth,
        'max_features': max_features,
    },
)

RandomForest_grid_search.fit(X_train, y_train)
print(RandomForest_grid_search.best_params_)

## IMPROVING DECISION TREE WITH TUNING

In [None]:
# Candidate values for the exhaustive search over decision-tree settings.
max_depth = [2, 5, 10, None]
# None is not an accepted min_samples_split value (it must be an int >= 2 or
# a fraction), so every fit using it would fail; only valid values are searched.
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 5, 7]
max_features = [1, 2, 3]
criterion = ['gini', 'entropy']

param_grid = {
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'max_features': max_features,
    'criterion': criterion,
}

# With max_features below the number of columns the tree samples features at
# random, so a fixed random_state is needed for a reproducible search result.
DecisionTree_grid_search = GridSearchCV(DecisionTreeClassifier(random_state=101), param_grid=param_grid)
DecisionTree_grid_search.fit(X_train, y_train)
print(DecisionTree_grid_search.best_params_)

After tuning the parameters of the models, we will evaluate them again.

In [None]:
# Re-evaluate the two tree-based models with tuned settings.
# NOTE(review): the hyperparameters below are hard-coded, presumably copied
# from an earlier grid-search run - confirm they still match best_params_.
# random_state is fixed so the comparison is reproducible between runs.
clf_2 = {
    'Random Forest': RandomForestClassifier(max_depth=7, max_features='sqrt', min_samples_leaf=2,
                                            n_estimators=100, random_state=101),
    'Decision Trees': DecisionTreeClassifier(criterion='gini', max_depth=None, max_features=3,
                                             min_samples_leaf=1, min_samples_split=2, random_state=101),
}

# Fit each model on the training split and score its test-split predictions,
# treating class 1 as the positive label.
evaluations2 = []
for classifier_name, classifier in clf_2.items():
    y_pred = classifier.fit(X_train, y_train).predict(X_test)
    evals_2 = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, pos_label=1),
        'Recall': recall_score(y_test, y_pred, pos_label=1),
        'F1': f1_score(y_test, y_pred, pos_label=1),
    }
    evaluations2.append(evals_2)

# Row labels come from the dict keys so they cannot drift from the models evaluated.
metrics2 = pd.DataFrame(evaluations2, index=list(clf_2))
metrics2

Based on the above, the Random Forest is the best model for this task.