# **Predicting Credit Card Approval**

## Loading Data

In [5]:
# Import pandas
import pandas as pd
import numpy as np

# Load dataset, treating '?' as NaN
cc_apps = pd.read_csv("cc_approvals.data", header=None, na_values='?')

# Quick look at the data
print("First 5 rows:\n", cc_apps.head(), "\n")
print("Dataset shape:", cc_apps.shape, "\n")

# Dataset info and summary statistics
cc_apps.info()
print("\nSummary statistics:\n", cc_apps.describe())

# Check missing values
print("\nMissing values per column:\n", cc_apps.isna().sum())


First 5 rows:
   0      1      2  3  4  5  6     7  8  9   10 11 12     13   14 15
0  b  30.83  0.000  u  g  w  v  1.25  t  t   1  f  g  202.0    0  +
1  a  58.67  4.460  u  g  q  h  3.04  t  t   6  f  g   43.0  560  +
2  a  24.50  0.500  u  g  q  h  1.50  t  f   0  f  g  280.0  824  +
3  b  27.83  1.540  u  g  w  v  3.75  t  t   5  t  g  100.0    3  +
4  b  20.17  5.625  u  g  w  v  1.71  t  f   0  f  s  120.0    0  + 

Dataset shape: (690, 16) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       678 non-null    object 
 1   1       678 non-null    float64
 2   2       690 non-null    float64
 3   3       684 non-null    object 
 4   4       684 non-null    object 
 5   5       681 non-null    object 
 6   6       681 non-null    object 
 7   7       690 non-null    float64
 8   8       690 non-null    object 
 9   9       690 non-null    object 

## Inspecting Applications

In [6]:
# Quick look at the data
print("First 5 rows:\n", cc_apps.head(), "\n")

# Summary statistics for numeric columns
print("Summary statistics:\n", cc_apps.describe(), "\n")

# DataFrame information
print("Dataset info:")
cc_apps.info()

# Check missing values
print("\nMissing values per column:\n", cc_apps.isna().sum())


First 5 rows:
   0      1      2  3  4  5  6     7  8  9   10 11 12     13   14 15
0  b  30.83  0.000  u  g  w  v  1.25  t  t   1  f  g  202.0    0  +
1  a  58.67  4.460  u  g  q  h  3.04  t  t   6  f  g   43.0  560  +
2  a  24.50  0.500  u  g  q  h  1.50  t  f   0  f  g  280.0  824  +
3  b  27.83  1.540  u  g  w  v  3.75  t  t   5  t  g  100.0    3  +
4  b  20.17  5.625  u  g  w  v  1.71  t  f   0  f  s  120.0    0  + 

Summary statistics:
                1           2           7          10           13  \
count  678.000000  690.000000  690.000000  690.00000   677.000000   
mean    31.568171    4.758725    2.223406    2.40000   184.014771   
std     11.957862    4.978163    3.346513    4.86294   173.806768   
min     13.750000    0.000000    0.000000    0.00000     0.000000   
25%     22.602500    1.000000    0.165000    0.00000    75.000000   
50%     28.460000    2.750000    1.000000    0.00000   160.000000   
75%     38.230000    7.207500    2.625000    3.00000   276.000000   
ma

## Handling Missing Values - (I)

In [7]:
# Inspect last few rows to see missing values
print(cc_apps.tail(17))


      0      1       2  3  4   5   6      7  8  9   10 11 12     13   14 15
673  NaN  29.50   2.000  y  p   e   h  2.000  f  f   0  f  g  256.0   17  -
674    a  37.33   2.500  u  g   i   h  0.210  f  f   0  f  g  260.0  246  -
675    a  41.58   1.040  u  g  aa   v  0.665  f  f   0  f  g  240.0  237  -
676    a  30.58  10.665  u  g   q   h  0.085  f  t  12  t  g  129.0    3  -
677    b  19.42   7.250  u  g   m   v  0.040  f  t   1  f  g  100.0    1  -
678    a  17.92  10.210  u  g  ff  ff  0.000  f  f   0  f  g    0.0   50  -
679    a  20.08   1.250  u  g   c   v  0.000  f  f   0  f  g    0.0    0  -
680    b  19.50   0.290  u  g   k   v  0.290  f  f   0  f  g  280.0  364  -
681    b  27.83   1.000  y  p   d   h  3.000  f  f   0  f  g  176.0  537  -
682    b  17.08   3.290  u  g   i   v  0.335  f  f   0  t  g  140.0    2  -
683    b  36.42   0.750  y  p   d   v  0.585  f  f   0  f  g  240.0    3  -
684    b  40.58   3.290  u  g   m   v  3.500  f  f   0  t  s  400.0    0  -
685    b  21

## Handling Missing Values - (II)

In [8]:
# Impute numeric missing values with mean
cc_apps.fillna(cc_apps.mean(numeric_only=True), inplace=True)

# Verify missing values
print(cc_apps.isnull().sum())


0     12
1      0
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
dtype: int64


## Handling Missing Values - (III)

In [9]:
# Impute categorical missing values with most frequent value
for col in cc_apps.select_dtypes(include=['object']).columns:
    cc_apps[col] = cc_apps[col].fillna(cc_apps[col].mode()[0])

# Verify no missing values remain
print(cc_apps.isnull().sum())


0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64


## Preprocessing Data - (I)

In [10]:
# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Instantiate LabelEncoder
le = LabelEncoder()

# Encode all object-type columns
for col in cc_apps.columns:
    if cc_apps[col].dtype == 'object':
        cc_apps[col] = le.fit_transform(cc_apps[col])

# Check first few rows
print(cc_apps.head())


   0      1      2   3   4   5   6     7   8   9   10  11  12     13   14  15
0   1  30.83  0.000   1   0  12   7  1.25   1   1   1   0   0  202.0    0   0
1   0  58.67  4.460   1   0  10   3  3.04   1   1   6   0   0   43.0  560   0
2   0  24.50  0.500   1   0  10   3  1.50   1   0   0   0   0  280.0  824   0
3   1  27.83  1.540   1   0  12   7  3.75   1   1   5   1   0  100.0    3   0
4   1  20.17  5.625   1   0  12   7  1.71   1   0   0   0   2  120.0    0   0


## Splitting Data into train and test sets

In [11]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Drop features 11 and 13
cc_apps = cc_apps.drop([11, 13], axis=1)
cc_apps = cc_apps.values

# Separate features and labels
X, y = cc_apps[:, 0:13], cc_apps[:, 13]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)


## Preprocessing Data - (II)

In [12]:
# Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Scale features
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)  # Use transform on test set


## Fitting Logistic Regression Model to the train set

In [13]:
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression

# Instantiate and fit logistic regression
logreg = LogisticRegression()
logreg.fit(rescaledX_train, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


## Making Predictions and Evaluating Performance

In [14]:
# Import confusion_matrix
from sklearn.metrics import confusion_matrix

# Predict on test set
y_pred = logreg.predict(rescaledX_test)

# Accuracy and confusion matrix
print("Accuracy of logistic regression classifier: {:.2f}%".format(logreg.score(rescaledX_test, y_test)*100))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy of logistic regression classifier: 84.21%
Confusion Matrix:
 [[94  9]
 [27 98]]


## Grid Searching and making model perform better

In [15]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Define grid of parameters
param_grid = {'tol': [0.01, 0.001, 0.0001], 'max_iter': [100, 150, 200]}

# Instantiate GridSearchCV
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

# Scale full dataset
rescaledX = scaler.fit_transform(X)

# Fit grid search
grid_model_result = grid_model.fit(rescaledX, y)


## Finding Best performing Model

In [16]:
# Best score and parameters
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print("Best: {:.6f} using {}".format(best_score, best_params))


Best: 0.850725 using {'max_iter': 100, 'tol': 0.01}


**Outcome:** Based on cross-validation, the logistic regression model achieves its best performance with a maximum of 100 iterations and a tolerance of 0.01, resulting in an average accuracy of approximately 85.1%. These hyperparameter values provide the most reliable predictive performance on the dataset while maintaining computational efficiency.

# END