In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd /content/drive/My Drive/Colab Notebooks/Data

/content/drive/My Drive/Colab Notebooks/Data


In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [5]:
# Load dataset
cc_apps = pd.read_csv('cc_approvals.data', header=None)

In [6]:
# 1. Missing Value Imputation Replace '?' with NaN
cc_apps = cc_apps.replace('?', np.nan)

In [11]:
# 2. Feature Engineering

In [7]:
# Impute numerical features with the median (robust to outliers)
for col in cc_apps.columns:
    if cc_apps[col].dtype != 'object':
        cc_apps[col] = cc_apps[col].fillna(cc_apps[col].median())

In [8]:
# Impute categorical features with the most frequent value
for col in cc_apps.columns:
    if cc_apps[col].dtype == 'object':
        cc_apps[col] = cc_apps[col].fillna(cc_apps[col].mode()[0])

In [9]:
# 3. Label Encoding
le = LabelEncoder()
for col in cc_apps.columns:
    if cc_apps[col].dtype == 'object':
        cc_apps[col] = le.fit_transform(cc_apps[col])

In [12]:
# 4. Feature Scaling
scaler = MinMaxScaler(feature_range=(0, 1))
cc_apps = scaler.fit_transform(cc_apps)  # Scale all features

In [13]:
# 5. Data Splitting
X = cc_apps[:, :-1]  # All columns except the last (target)
y = cc_apps[:, -1]   # The last column (target)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [26]:
# 6. Model Selection and Hyperparameter Tuning

# Option 1: Logistic Regression with GridSearchCV
logreg = LogisticRegression()
param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}  # Expanded grid
grid_model = GridSearchCV(logreg, param_grid, cv=5, scoring='accuracy')
grid_model.fit(X_train, y_train)
y_pred = grid_model.predict(X_test)

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 63, in _check_solver
    

In [27]:
# 7. Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8421052631578947
Confusion Matrix:
 [[93 10]
 [26 99]]


In [28]:
# Print the best parameters found by GridSearchCV
print("Best parameters:", grid_model.best_params_)


Best parameters: {'C': 0.1, 'penalty': 'l2'}


In [29]:
#Option 2: RandomForestClassifier (Alternative Model)
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {'n_estimators': [100, 200, 300], 'max_depth': [None, 5, 10]}
grid_model_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_model_rf.fit(X_train, y_train)
y_pred = grid_model_rf.predict(X_test)

In [30]:
#  Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8596491228070176
Confusion Matrix:
 [[ 87  16]
 [ 16 109]]


In [31]:
print("Best parameters:", grid_model_rf.best_params_) # For RandomForestClassifier

Best parameters: {'max_depth': 10, 'n_estimators': 200}
