In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

# Read the raw credit card approvals file. It ships without a header row, so
# pandas labels the columns with integers (0, 1, 2, ...).
CC_APPROVALS_PATH = "Datasets/Credit_card_approvals/cc_approvals.data"
cc_apps = pd.read_csv(CC_APPROVALS_PATH, header=None)

# Preview the first five rows
cc_apps.head(n=5)

In [None]:
# Remove the columns labelled 11 and 13; they are not used in the rest of the analysis
cc_apps = cc_apps.drop(columns=[11, 13])



In [None]:

# Hold out 33% of the rows as a test set; the fixed random_state keeps the split reproducible
cc_apps_train, cc_apps_test = train_test_split(
    cc_apps,
    test_size=0.33,
    random_state=42,
)

# Swap the '?' placeholder for NaN and, in the same chain, cast column 1 to
# float in both partitions
cc_apps_train_nans_replaced = cc_apps_train.replace('?', np.nan).astype({1: float})
cc_apps_test_nans_replaced = cc_apps_test.replace('?', np.nan).astype({1: float})

# Per-column count of missing values in the train set
# (not rendered: only a cell's last expression is displayed)
cc_apps_train_nans_replaced.isnull().sum()

# Column dtypes after the conversion -- this is the cell's displayed output
cc_apps_train_nans_replaced.dtypes


In [None]:
# Mean-impute the numeric columns 1, 2, 7, 10 and 14 in both the train and the
# test set.
#
# The fill value for each column is the mean of the TRAIN set only, and that
# same value is reused for the test set. Computing a separate mean on the test
# set would let test-set statistics influence preprocessing (data leakage) and
# would be inconsistent with the scaler, which is also fitted on train only.
columns_to_impute = [1, 2, 7, 10, 14]

# Work on copies so the NaN-marked frames stay available for inspection
cc_apps_train_imputed = cc_apps_train_nans_replaced.copy()
cc_apps_test_imputed = cc_apps_test_nans_replaced.copy()

for column in columns_to_impute:
    # Statistic learned from the training data only
    mean_value = cc_apps_train_nans_replaced[column].mean()
    cc_apps_train_imputed[column] = cc_apps_train_imputed[column].fillna(mean_value)
    cc_apps_test_imputed[column] = cc_apps_test_imputed[column].fillna(mean_value)


In [None]:
# Collect the labels of every column in the imputed train set whose dtype is
# object, then print them one per line
object_columns = [
    label
    for label, col_dtype in cc_apps_train_imputed.dtypes.items()
    if col_dtype == 'object'
]
for label in object_columns:
    print(label)

In [None]:
# Fill missing values in the object-dtype columns with the most frequent value.
#
# The most frequent value is determined on the TRAIN set (value_counts sorts by
# descending frequency and ignores NaN) and is reused for the test set, so no
# test-set statistic leaks into preprocessing. This also avoids an IndexError
# when a test-set column happens to contain no non-missing values.
for column in cc_apps_train_imputed.columns:
    if cc_apps_train_imputed[column].dtypes == 'object':
        most_frequent = cc_apps_train_imputed[column].value_counts().index[0]
        cc_apps_train_imputed[column] = cc_apps_train_imputed[column].fillna(most_frequent)
        cc_apps_test_imputed[column] = cc_apps_test_imputed[column].fillna(most_frequent)
        

In [None]:
# One-hot encode the train set; its resulting columns define the reference layout
cc_apps_train_cat_encoding = pd.get_dummies(cc_apps_train_imputed)
train_encoded_columns = cc_apps_train_cat_encoding.columns

# Encode the test set on its own, then force it onto the train layout:
# indicator columns absent from the test set are added and filled with 0,
# and columns that only the test set produced are dropped
cc_apps_test_cat_encoding = pd.get_dummies(cc_apps_test_imputed).reindex(
    columns=train_encoded_columns,
    fill_value=0,
)

In [None]:
# Preview the top five rows of the one-hot encoded train set
cc_apps_train_cat_encoding.head(n=5)

In [None]:
# Build X_train, X_test, y_train, y_test from the encoded train and test sets.
#
# The target is the last encoded column. pd.get_dummies expands a categorical
# source column into one indicator per category, all named "<source>_<value>".
# If the label column was expanded this way, its other indicator(s) are the
# exact complement of the target; leaving them in X hands the model the answer
# (target leakage). Every column that shares the target's source prefix is
# therefore excluded from the features.
encoded_columns = cc_apps_train_cat_encoding.columns
target_column = encoded_columns[-1]

# Source columns are labelled with integers (no underscores), so everything
# before the first "_" identifies the column the indicator was derived from
target_prefix = str(target_column).split("_", 1)[0] + "_"

# Skip the target itself (last position) and any sibling indicator columns
feature_columns = [
    col for col in encoded_columns[:-1]
    if not str(col).startswith(target_prefix)
]

# y keeps its original (n_samples, 1) column shape
X_train, y_train = (
    cc_apps_train_cat_encoding.loc[:, feature_columns].values,
    cc_apps_train_cat_encoding.iloc[:, [-1]].values,
)
X_test, y_test = (
    cc_apps_test_cat_encoding.loc[:, feature_columns].values,
    cc_apps_test_cat_encoding.iloc[:, [-1]].values,
)


In [None]:
# Scale the features to the [0, 1] range. The scaler is fitted on the train
# set only and then applied unchanged to the test set.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)

# Instantiate the logistic regression model with its default settings
logreg = LogisticRegression()

# Fit the model to the train set. y_train is stored as an (n_samples, 1) column
# vector; scikit-learn expects a 1-D target and emits a DataConversionWarning
# otherwise, so it is flattened here.
logreg.fit(rescaledX_train, np.ravel(y_train))

# Use the fitted model to predict the test set
y_pred = logreg.predict(rescaledX_test)

# Print the confusion matrix (true labels first, predictions second)
print(confusion_matrix(np.ravel(y_test), y_pred))