In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV


In [3]:
# load dataset
data = pd.read_csv('cc_approvals.data', header=None, na_values='?')

In [7]:
# inspect data
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [13]:
# print summary statistics
data.describe()




Unnamed: 0,1,2,7,10,13,14
count,678.0,690.0,690.0,690.0,677.0,690.0
mean,31.568171,4.758725,2.223406,2.4,184.014771,1017.385507
std,11.957862,4.978163,3.346513,4.86294,173.806768,5210.102598
min,13.75,0.0,0.0,0.0,0.0,0.0
25%,22.6025,1.0,0.165,0.0,75.0,0.0
50%,28.46,2.75,1.0,0.0,160.0,5.0
75%,38.23,7.2075,2.625,3.0,276.0,395.5
max,80.25,28.0,28.5,67.0,2000.0,100000.0


In [15]:
# print DataFrame information
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       678 non-null    object 
 1   1       678 non-null    float64
 2   2       690 non-null    float64
 3   3       684 non-null    object 
 4   4       684 non-null    object 
 5   5       681 non-null    object 
 6   6       681 non-null    object 
 7   7       690 non-null    float64
 8   8       690 non-null    object 
 9   9       690 non-null    object 
 10  10      690 non-null    int64  
 11  11      690 non-null    object 
 12  12      690 non-null    object 
 13  13      677 non-null    float64
 14  14      690 non-null    int64  
 15  15      690 non-null    object 
dtypes: float64(4), int64(2), object(10)
memory usage: 86.4+ KB


In [16]:
# inspect missing values in ths dataset
data.tail(17)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
673,,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256.0,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260.0,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240.0,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129.0,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100.0,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0.0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0.0,0,-
680,b,19.5,0.29,u,g,k,v,0.29,f,f,0,f,g,280.0,364,-
681,b,27.83,1.0,y,p,d,h,3.0,f,f,0,f,g,176.0,537,-
682,b,17.08,3.29,u,g,i,v,0.335,f,f,0,t,g,140.0,2,-


In [26]:
# Import numpy
import numpy as np
# Inspect missing values in the dataset
data.tail(17)

# Replace the '?'s with Na
data.replace('?',np.nan)
# Inspect the missing values again
data.tail(17)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
673,,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256.0,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260.0,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240.0,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129.0,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100.0,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0.0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0.0,0,-
680,b,19.5,0.29,u,g,k,v,0.29,f,f,0,f,g,280.0,364,-
681,b,27.83,1.0,y,p,d,h,3.0,f,f,0,f,g,176.0,537,-
682,b,17.08,3.29,u,g,i,v,0.335,f,f,0,t,g,140.0,2,-


In [38]:
# Impute the missing values with mean imputation 
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')




# Count the number of NaNs in the dataset to verify imputation
data.isnull().sum()



0     12
1      0
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
dtype: int64

In [42]:
# Iterate over each column of the DataFrame
for column in data.columns:
    # Check if the column is of object type
    if data[column].dtype == 'object':
        # Impute with the most frequent value
        data.fillna(data[column].value_counts().index[0])


        
# Count the number of NaNs in the dataset and print the counts to verify.
data.isnull().sum()



0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64

In [52]:
# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder
# Instantiate LabelEncoder
le = LabelEncoder()

# Iterate over all the values of each column and extract their dtypes
for column in data.columns:
    # Compare if the dtype is object
    if data[column].dtype == 'object':
        # use  LabelEncoder to do the numeric transformation
        data[column] = le.fit_transform(data[column])
        
data.info
       


<bound method DataFrame.info of      0      1       2   3   4   5   6   ...  9   10  11  12     13   14  15
0     1  30.83   0.000   2   0  13   8  ...   1   1   0   0  202.0    0   0
1     0  58.67   4.460   2   0  11   3  ...   1   6   0   0   43.0  560   0
2     0  24.50   0.500   2   0  11   3  ...   0   0   0   0  280.0  824   0
3     1  27.83   1.540   2   0  13   8  ...   1   5   1   0  100.0    3   0
4     1  20.17   5.625   2   0  13   8  ...   0   0   0   2  120.0    0   0
..   ..    ...     ...  ..  ..  ..  ..  ...  ..  ..  ..  ..    ...  ...  ..
685   1  21.08  10.085   3   3   4   3  ...   0   0   0   0  260.0    0   1
686   0  22.67   0.750   2   0   1   8  ...   1   2   1   0  200.0  394   1
687   0  25.25  13.500   3   3   5   2  ...   1   1   1   0  200.0    1   1
688   1  17.92   0.205   2   0   0   8  ...   0   0   0   0  280.0  750   1
689   1  35.00   3.375   2   0   1   3  ...   0   0   1   0    0.0    0   1

[690 rows x 16 columns]>

In [53]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split the dataset into features and target variable
X = data.iloc[:,:-1]
y = data.iloc[:,-1] 

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [54]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)




In [55]:
# Import LogisticRegression

from sklearn.linear_model import LogisticRegression

# Instantiate a LogisticRegression model and fit it to the training set
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)



0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [57]:
from sklearn.metrics import confusion_matrix

# Using logreg to predict instances from the test set and store it
y_pred = log_reg.predict(X_test)
print(f"Accuracy of logistic regression classifier: ".format(log_reg.score(X_test, y_test)))

# print confusion matrix of  the logreg model
confusion_matrix(y_test, y_pred)



Accuracy of logistic regression classifier: 


array([[61,  9],
       [15, 53]])

In [59]:
from sklearn.model_selection import GridSearchCV

# Define the grid of values for tol and max_iter
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]


# Create a dictionary where tol and max_iter are keys and the lists of their values are corresponding values
grid = dict(tol = tol, max_iter = max_iter)




In [60]:
# Instantiate GridSearchCV with the required parameters
grid_search = GridSearchCV(estimator=log_reg, param_grid=grid, cv=5, n_jobs=-1, scoring='accuracy')

# Use scaler to rescale X and assign it to rescaledX
rescaledX = scaler.fit_transform(X)

# Fit data to grid_model
grid_model = grid_search.fit(rescaledX, y)

# Summarize results
print(f"Best: {grid_model.best_score_} using {grid_model.best_params_}")




Best: 0.8521739130434781 using {'max_iter': 100, 'tol': 0.0001}
