In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
# Load the dataset
cc_apps = pd.read_csv("cc_approvals.data", header=None) 
cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,g,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,g,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,g,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,g,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,s,0,+


In [2]:
#Convert the column name to strings
cc_apps.columns = cc_apps.columns.astype(str)

In [3]:
#Check each of the categorical columns to see how many unique variables there are and check which ones to drop and which to keep
zero,three,four,five,six,eight,nine,eleven = cc_apps['0'].nunique(),cc_apps['3'].nunique(),cc_apps['4'].nunique(),cc_apps['5'].nunique(),cc_apps['6'].nunique(),cc_apps['8'].nunique(),cc_apps['9'].nunique(),cc_apps['11'].nunique()
print(f'The unique variables are: \nZero:{zero} \nThree:{three} \nFour:{four} \nFive:{five} \nSix:{six} \nEight:{eight} \nNine:{nine} \nEleven:{eleven}')

The unique variables are: 
Zero:3 
Three:4 
Four:4 
Five:15 
Six:10 
Eight:2 
Nine:2 
Eleven:3


In [4]:
#Keep 0,3,4,6,8,9,11 and drop the 5 column
cc_apps_drop_5_and_6 = cc_apps.drop(['5'],axis=1)

#Create boolean columns for the categorical columns
cc_boolean = pd.get_dummies(data=cc_apps_drop_5_and_6,columns=['0','3','4','6','8','9','11'],drop_first=True)

#Convert the boolean input values into binary values
cc_binary = cc_boolean.drop(['1','2','7','10','12','13'],axis=1).astype(int)

#Concatenate the binary and the boolean dataframe
cc_binary = pd.concat([cc_boolean[['1','2','7','10','12','13']],cc_binary],axis=1)

In [5]:
#Replace the '+' and '-' values in column 13 with True and False
cc_binary['13'] = cc_binary['13'].replace({'+': True, '-': False})

#Convert these True and False values into integers 1 and 0
cc_binary['13'] = cc_binary['13'].astype(int)

#Move this column to be the last column in your dataframe for better viewing
column_to_move = cc_binary.pop('13')

cc_binary = pd.concat([cc_binary, column_to_move], axis=1)

  cc_binary['13'] = cc_binary['13'].replace({'+': True, '-': False})


In [6]:
# Replace non-numeric values with NaN
cc_binary = cc_binary.apply(pd.to_numeric, errors='coerce')

#Remove any rows with NaN values
cc_binary = cc_binary.dropna()

In [7]:
#Split the data into the input and the target value
x = cc_binary.drop('13',axis=1)
y = cc_binary['13']

#Split the data into your training and test set
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.4,random_state=101)

In [8]:
# Create a pipeline with StandardScaler and LogisticRegression
pipeline = Pipeline([('scaler', StandardScaler()),
        ('classifier', LogisticRegression())
])

In [15]:
# Define the parameter grid
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'classifier__penalty': ['l1', 'l2'],  # Penalty type
    'classifier__solver': ['liblinear'],  # Solver
}

In [16]:
# Instantiate GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy')

In [17]:
# Fit GridSearchCV to the training data
grid_search.fit(x_train, y_train)

In [18]:
# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print("Best Parameters:", best_params)
print("Best Estimator:", best_estimator)

Best Parameters: {'classifier__C': 10, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Best Estimator: Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier', LogisticRegression(C=10, solver='liblinear'))])


In [19]:
# Predict on the test set and evaluate the model
y_Pred = best_estimator.predict(x_test)

In [20]:
#Print out the confusion matrix and classification and report
print(confusion_matrix(y_test,y_Pred))
print(classification_report(y_test,y_Pred))

[[126  24]
 [  9 113]]
              precision    recall  f1-score   support

           0       0.93      0.84      0.88       150
           1       0.82      0.93      0.87       122

    accuracy                           0.88       272
   macro avg       0.88      0.88      0.88       272
weighted avg       0.88      0.88      0.88       272

