In [66]:
import pandas as pd
import numpy as np

In [67]:
cc_apps = pd.read_csv("datasets/cc_approvals.csv")
cc_apps.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriverLicence,Citizen,ZipCode,Income,Approved
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


## Inspecting the applications

The columns of datasets lokks confusing but but <a href="http://rstudio-pubs-static.s3.amazonaws.com/73039_9946de135c0a49daa7a0a9eda4a67a72.html">this blog</a> gives us a good overview of the probable features. The probable features in a typical credit card application are <code>Gender</code>, <code>Age</code>, <code>Debt</code>, <code>Married</code>, <code>BankCustomer</code>, <code>EducationLevel</code>, <code>Ethnicity</code>, <code>YearsEmployed</code>, <code>PriorDefault</code>, <code>Employed</code>, <code>CreditScore</code>, <code>DriversLicense</code>, <code>Citizen</code>, <code>ZipCode</code>, <code>Income</code> and finally the <code>ApprovalStatus</code>. This gives us a pretty good starting point, and we can map these features with respect to the columns in the output.   </p>

In [68]:
cc_apps_description = cc_apps.describe()
print(cc_apps_description)

print("\n")

cc_apps_info = cc_apps.info()
print(cc_apps_info)

print("\n")

cc_apps.tail(20)

             Debt  YearsEmployed  CreditScore         Income
count  690.000000     690.000000    690.00000     690.000000
mean     4.758725       2.223406      2.40000    1017.385507
std      4.978163       3.346513      4.86294    5210.102598
min      0.000000       0.000000      0.00000       0.000000
25%      1.000000       0.165000      0.00000       0.000000
50%      2.750000       1.000000      0.00000       5.000000
75%      7.207500       2.625000      3.00000     395.500000
max     28.000000      28.500000     67.00000  100000.000000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          690 non-null    object 
 1   Age             690 non-null    object 
 2   Debt            690 non-null    float64
 3   Married         690 non-null    object 
 4   BankCustomer    690 non-null    object 
 5   EducationLevel  690 non-

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriverLicence,Citizen,ZipCode,Income,Approved
670,b,47.17,5.835,u,g,w,v,5.5,f,f,0,f,g,465,150,-
671,b,25.83,12.835,u,g,cc,v,0.5,f,f,0,f,g,0,2,-
672,a,50.25,0.835,u,g,aa,v,0.5,f,f,0,t,g,240,117,-
673,?,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-


## Handling the missing values 

By looking at the dataset we can see that datasets has missing values and datasets has a mixture of numerical and non numerical features. Now we will fix this.

In [69]:
cc_apps = cc_apps.replace("?", np.nan)

cc_apps.tail(20)

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriverLicence,Citizen,ZipCode,Income,Approved
670,b,47.17,5.835,u,g,w,v,5.5,f,f,0,f,g,465,150,-
671,b,25.83,12.835,u,g,cc,v,0.5,f,f,0,f,g,0,2,-
672,a,50.25,0.835,u,g,aa,v,0.5,f,f,0,t,g,240,117,-
673,,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-


In [70]:
# Impute yhe missing values with mean imputation
cc_apps.fillna(cc_apps.mean(), inplace = True)

cc_apps.isna().sum()

Gender            12
Age               12
Debt               0
Married            6
BankCustomer       6
EducationLevel     9
Ethnicity          9
YearsEmployed      0
PriorDefault       0
Employed           0
CreditScore        0
DriverLicence      0
Citizen            0
ZipCode           13
Income             0
Approved           0
dtype: int64

In [71]:
for col in cc_apps:
    if cc_apps[col].dtypes == "object":
        cc_apps = cc_apps.fillna(cc_apps[col].value_counts().index[0])
        
cc_apps.isna().sum()

Gender            0
Age               0
Debt              0
Married           0
BankCustomer      0
EducationLevel    0
Ethnicity         0
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
DriverLicence     0
Citizen           0
ZipCode           0
Income            0
Approved          0
dtype: int64

## Preprocessing the data

Now we will convert all the non-numeric values into numeric ones because it results in faster computation and also many machine learning models (especially the ones developed using scikit-learn) require the data to be in a strictly numeric format.We will do this by usong Label Encoding technique 

In [72]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in cc_apps:
    if cc_apps[col].dtypes == "object":
        cc_apps[col] = le.fit_transform(cc_apps[col])

# Splitting the datasets into train and test sets

In [73]:
from sklearn.model_selection import train_test_split

In [74]:
# Dropping unwanted columns (DriversLicense), (ZipCode)
cc_apps = cc_apps.drop(['DriverLicence', 'ZipCode'], axis = 1)

In [75]:
# Create arrays for the features and the response variable
y = cc_apps['Approved'].values
X = cc_apps.drop('Approved', axis = 1).values

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)
    

## Preprocessing the data

In [77]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range = (0,1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.fit_transform(X_test)

# Fitting a logistic regression model to train set   

In [78]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

logreg.fit(rescaledX_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Making Predictions and evaluating performance

evaluating model on the test set with respect to **classification accuracy** and also checking model's **confusion matrix**.

In [81]:
from sklearn.metrics import confusion_matrix

# Using logreg to predict instances from the test set and store it
y_pred = logreg.predict(rescaledX_test)

print(f"Accuracy of logistic regression classifier: ".format(logreg.score(rescaledX_test, y_test)))

# Print the confusion matrix of the logreg model
      
confusion_matrix(y_test, y_pred)

Accuracy of logistic regression classifier: 


array([[92, 11],
       [26, 99]], dtype=int64)

## Grid Searching and making the model perform better

In [83]:
from sklearn.model_selection import GridSearchCV

# Define the grid of values for tol and max_iter
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

# Create a dictionary where tol and max_iter are keys and the lists of their values are corresponding values
param_grid = dict(tol = tol, max_iter = max_iter)

## Finding the best performing model

In [85]:
# Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(estimator = logreg, param_grid = param_grid, cv = 5)

# Use scaler to rescale X and assign it to rescaledX
rescaledX = scaler.fit_transform(X)

# Fit data to grid_model
grid_model_result = grid_model.fit(rescaledX, y)

# Summarize results
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

Best: 0.850725 using {'max_iter': 100, 'tol': 0.01}
