### Overview

Building automatic credit card approval predictor using machine learning techniques: logisitc regression model and Xgboost classification. 

Dataset: https://archive.ics.uci.edu/dataset/27/credit+approval


In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.3-py3-none-any.whl (7.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
credit_approval = fetch_ucirepo(id=27) 
  
# data (as pandas dataframes) 
X = credit_approval.data.features 
y = credit_approval.data.targets 
  
# metadata 
print(credit_approval.metadata) 
  
# variable information 
print(credit_approval.variables) 


{'uci_id': 27, 'name': 'Credit Approval', 'repository_url': 'https://archive.ics.uci.edu/dataset/27/credit+approval', 'data_url': 'https://archive.ics.uci.edu/static/public/27/data.csv', 'abstract': 'This data concerns credit card applications; good mix of attributes', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 690, 'num_features': 15, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': [], 'target_col': ['A16'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1987, 'last_updated': 'Wed Aug 23 2023', 'dataset_doi': '10.24432/C5FS30', 'creators': ['J. R. Quinlan'], 'intro_paper': None, 'additional_info': {'summary': 'This file concerns credit card applications.  All attribute names and values have been changed to meaningless symbols to protect confidentiality of the data.\r\n  \r\nThis dataset is interesting because there is a good mix of attributes --

In [47]:
#Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [9]:
print(X.head())
print(y.head())

   A15    A14 A13 A12  A11 A10 A9    A8 A7 A6 A5 A4     A3     A2 A1
0    0  202.0   g   f    1   t  t  1.25  v  w  g  u  0.000  30.83  b
1  560   43.0   g   f    6   t  t  3.04  h  q  g  u  4.460  58.67  a
2  824  280.0   g   f    0   f  t  1.50  h  q  g  u  0.500  24.50  a
3    3  100.0   g   t    5   t  t  3.75  v  w  g  u  1.540  27.83  b
4    0  120.0   s   f    0   f  t  1.71  v  w  g  u  5.625  20.17  b
  A16
0   +
1   +
2   +
3   +
4   +


In [8]:
# Print summary statistics
cc_apps_description = X.describe()
print(cc_apps_description)

print('\n')

# Print DataFrame information
cc_apps_info = X.info()
print(cc_apps_info)

print('\n')

                 A15          A14        A11          A8          A3  \
count     690.000000   677.000000  690.00000  690.000000  690.000000   
mean     1017.385507   184.014771    2.40000    2.223406    4.758725   
std      5210.102598   173.806768    4.86294    3.346513    4.978163   
min         0.000000     0.000000    0.00000    0.000000    0.000000   
25%         0.000000    75.000000    0.00000    0.165000    1.000000   
50%         5.000000   160.000000    0.00000    1.000000    2.750000   
75%       395.500000   276.000000    3.00000    2.625000    7.207500   
max    100000.000000  2000.000000   67.00000   28.500000   28.000000   

               A2  
count  678.000000  
mean    31.568171  
std     11.957862  
min     13.750000  
25%     22.602500  
50%     28.460000  
75%     38.230000  
max     80.250000  


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------

In [44]:
#Dropping DriversLicense and ZipCode column as they are not as significant while predicitng credit card approvals.
X = X.drop(columns=['A11','A13'])

In [122]:
all_columns = X.join(y)

# Split into train and test sets
all_train, all_test = train_test_split(all_columns, test_size=0.50, random_state=42)

In [123]:
# There were few ? in the dataset. Replace the '?'s with NaN in the train and test sets
all_train = all_train.replace('?', np.nan)
all_test = all_test.replace('?', np.nan)

In [124]:
#Checking for NAN values
count_all_train = all_train.isna().sum()
count_all_test = all_test.isna().sum()

print(count_all_train)
print(count_all_test)

A15     0
A14    10
A12     0
A10     0
A9      0
A8      0
A7      6
A6      6
A5      5
A4      5
A3      0
A2      4
A1      6
A16     0
dtype: int64
A15    0
A14    3
A12    0
A10    0
A9     0
A8     0
A7     3
A6     3
A5     1
A4     1
A3     0
A2     8
A1     6
A16    0
dtype: int64


In [125]:
#Removing Nan Values in numeric columns by using mean imputation
all_train.fillna(all_train.mean(), inplace=True)
all_test.fillna(all_test.mean(), inplace=True)

# Count the number of NaNs in the datasets and print the counts to verify
# ... YOUR CODE FOR TASK 5 ...
count_na_train = all_train.isna().sum()
count_na_test = all_test.isna().sum()

print(count_na_train)
print(count_na_test)

A15    0
A14    0
A12    0
A10    0
A9     0
A8     0
A7     6
A6     6
A5     5
A4     5
A3     0
A2     0
A1     6
A16    0
dtype: int64
A15    0
A14    0
A12    0
A10    0
A9     0
A8     0
A7     3
A6     3
A5     1
A4     1
A3     0
A2     0
A1     6
A16    0
dtype: int64


In [126]:
#Filling missing values for categorical columns with most frequently occuring value
# Iterate over each column of X
for col in all_train:
    # Check if the column is of object type
    if all_train[col].dtypes == 'object':
        # Impute with the most frequent value
        all_train[col] = all_train[col].fillna(all_train[col].value_counts().idxmax())
        all_test[col] = all_test[col].fillna(all_train[col].value_counts().idxmax())

# Count the number of NaNs in the dataset and print the counts to verify
count_all_train = all_train.isna().sum()
count_all_test = all_test.isna().sum()

print(count_all_train)
print(count_all_test)

A15    0
A14    0
A12    0
A10    0
A9     0
A8     0
A7     0
A6     0
A5     0
A4     0
A3     0
A2     0
A1     0
A16    0
dtype: int64
A15    0
A14    0
A12    0
A10    0
A9     0
A8     0
A7     0
A6     0
A5     0
A4     0
A3     0
A2     0
A1     0
A16    0
dtype: int64


In [127]:
#Pre processing
# Convert the categorical features in the train and test sets independently
cc_apps_train = pd.get_dummies(all_train)
cc_apps_test = pd.get_dummies(all_test)

# Reindex the columns of the test set aligning with the train set
cc_apps_test = cc_apps_test.reindex(columns=cc_apps_train.columns, fill_value=0)

In [128]:
# Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Segregate features and labels into separate variables
X_train, y_train = cc_apps_train.iloc[:, :-1].values, cc_apps_train.iloc[:, [-1]].values
X_test, y_test = cc_apps_test.iloc[:, :-1].values, cc_apps_test.iloc[:, [-1]].values

y_train = y_train.ravel()
y_test = y_test.ravel()

# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)

In [129]:
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression

# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression()

# Fit logreg to the train set
logreg.fit(rescaledX_train,y_train)
print(logreg)

LogisticRegression()


In [130]:
# Import confusion_matrix
from sklearn.metrics import confusion_matrix

# Use logreg to predict instances from the test set and store it
y_pred = logreg.predict(rescaledX_test)

# Get the accuracy score of logreg model and print it
print("Accuracy of logistic regression classifier: ", logreg.score(rescaledX_test,y_test))

# Print the confusion matrix of the logreg model
confusion_matrix(y_test,y_pred)

Accuracy of logistic regression classifier:  1.0


array([[154,   0],
       [  0, 191]], dtype=int64)

In [131]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Define the grid of values for tol and max_iter
tol = [0.01, 0.001 ,0.0001]
max_iter = [100, 150, 200]

# Create a dictionary where tol and max_iter are keys and the lists of their values are the corresponding values
param_grid = dict(tol=tol, max_iter=max_iter)

In [132]:
# Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

# Fit grid_model to the data
grid_model_result = grid_model.fit(rescaledX_train, y_train)

# Summarize results
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

# Extract the best model and evaluate it on the test set
best_model = grid_model_result.best_estimator_
print("Accuracy of logistic regression classifier: ", best_model.score(rescaledX_test,y_test))

Best: 1.000000 using {'max_iter': 100, 'tol': 0.01}
Accuracy of logistic regression classifier:  1.0


In [133]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [134]:
# Create and fit an XGBoost model
model = xgb.XGBClassifier(
    objective='binary:logistic',
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
    reg_alpha=1.0,   # L1 regularization strength
    reg_lambda=1.0   # L2 regularization strength
)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0
