In [50]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

# Load the dataset
cc_apps = pd.read_csv("/Users/karolk/Python_Work/DataCamp/Datasets/Credit_card_approvals/cc_approvals.data", header=None)
cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [51]:
# drop features 11 and 13 from the dataset
cc_apps = cc_apps.drop([11, 13], axis=1)



In [52]:

# splitting the data into train and test sets (test set is 33% of the data, random_state is 42)
cc_apps_train, cc_apps_test = train_test_split(cc_apps, test_size=0.33, random_state=42)

# replace '?' with NaN
cc_apps_train_nans_replaced = cc_apps_train.replace('?', np.nan)
cc_apps_test_nans_replaced = cc_apps_test.replace('?', np.nan)

# create a table of missing values by column
cc_apps_train_nans_replaced.isnull().sum()

# convert datatype 1 to float in the train and test sets
cc_apps_train_nans_replaced[1] = cc_apps_train_nans_replaced[1].astype(float)
cc_apps_test_nans_replaced[1] = cc_apps_test_nans_replaced[1].astype(float)


# get the data types of the columns
cc_apps_train_nans_replaced.dtypes


0      object
1     float64
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
12     object
14      int64
15     object
dtype: object

In [53]:
# Impute the missing values with mean imputation for columns 2, 7, 10, 14 for the train set
# List of columns you want to impute
columns_to_impute = [1, 2, 7, 10, 14]

# Create a new DataFrame for the imputed data
cc_apps_train_imputed = cc_apps_train_nans_replaced.copy()

# Loop over the columns and impute them
for column in columns_to_impute:
    mean_value = cc_apps_train_imputed[column].mean()
    cc_apps_train_imputed[column] = cc_apps_train_imputed[column].fillna(mean_value)

# imputing the missing values with mean imputation for columns 2, 7, 10, 14 for the test set
cc_apps_test_imputed = cc_apps_test_nans_replaced.copy()

# Loop over the columns and impute them
for column in columns_to_impute:
    mean_value = cc_apps_test_imputed[column].mean()
    cc_apps_test_imputed[column] = cc_apps_test_imputed[column].fillna(mean_value)


In [54]:
# iterate through cc_apps_train_imputed columns with a for loop, checking for object data type and return the columns with object data type
for column in cc_apps_train_imputed.columns:
    if cc_apps_train_imputed[column].dtypes == 'object':
        print(column)

0
3
4
5
6
8
9
12
15


In [55]:
# impute the missing values for cc_apps_train_imputed and cc_apps_test_imputed using fillna method and the most frequent value using value_counts method
for column in cc_apps_train_imputed.columns:
    if cc_apps_train_imputed[column].dtypes == 'object':
        cc_apps_train_imputed[column] = cc_apps_train_imputed[column].fillna(cc_apps_train_imputed[column].value_counts().index[0])
        cc_apps_test_imputed[column] = cc_apps_test_imputed[column].fillna(cc_apps_test_imputed[column].value_counts().index[0])
        

In [56]:
# Convert the categorical features in the train and test sets independently
cc_apps_train_cat_encoding = pd.get_dummies(cc_apps_train_imputed)
cc_apps_test_cat_encoding = pd.get_dummies(cc_apps_test_imputed)

# Reindex the columns of the test set aligning with the train set
cc_apps_test_cat_encoding = cc_apps_test_cat_encoding.reindex(
    columns=cc_apps_train_cat_encoding.columns, fill_value=0
)

In [57]:
# view the first five rows of the encoded train set
cc_apps_train_cat_encoding.head()

Unnamed: 0,1,2,7,10,14,0_a,0_b,3_l,3_u,3_y,...,6_z,8_f,8_t,9_f,9_t,12_g,12_p,12_s,15_+,15_-
382,24.33,2.5,4.5,0,456,True,False,False,False,True,...,False,True,False,True,False,True,False,False,False,True
137,33.58,2.75,4.25,6,0,False,True,False,True,False,...,False,False,True,False,True,True,False,False,True,False
346,32.25,1.5,0.25,0,122,False,True,False,True,False,...,False,True,False,True,False,True,False,False,False,True
326,30.17,1.085,0.04,0,179,False,True,False,False,True,...,False,True,False,True,False,True,False,False,False,True
33,36.75,5.125,5.0,0,4000,True,False,False,True,False,...,False,False,True,True,False,True,False,False,True,False


In [59]:
# create the X_train, X_test, y_train, y_test with the train and test sets (last column is target feature)
X_train, y_train = (
    cc_apps_train_cat_encoding.iloc[:, :-1].values,
    cc_apps_train_cat_encoding.iloc[:, [-1]].values,
)
X_test, y_test = (
    cc_apps_test_cat_encoding.iloc[:, :-1].values,
    cc_apps_test_cat_encoding.iloc[:, [-1]].values,
)


In [62]:
# using minmax scaler to scale the train and test sets
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)

# initiate the logistic regression model
logreg = LogisticRegression()

# fit the model to the train set
logreg.fit(rescaledX_train, y_train)

#use logreg to predict the test set
y_pred = logreg.predict(rescaledX_test)

#print the confusion matrix
print(confusion_matrix(y_test, y_pred))

[[103   0]
 [  0 125]]


  y = column_or_1d(y, warn=True)
