In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

dataset = pd.read_csv("dataset.csv")
print(dataset.head(4))

input_data = dataset.iloc[:, :-1]
output_data = dataset.iloc[:, -1]

ss = StandardScaler()
input_data = pd.DataFrame(ss.fit_transform(input_data), columns=input_data.columns)
print(input_data.shape)

x_train, x_test, y_train, y_test = train_test_split(input_data, output_data, test_size=0.2, random_state=39)

log_reg = LogisticRegression(max_iter=1000)

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['saga', 'liblinear']  
}

param_grid['penalty'] = [p for p in param_grid['penalty'] if p != 'elasticnet' or 'saga' in param_grid['solver']]

grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

log_reg = LogisticRegression(**best_params, max_iter=1000)
log_reg.fit(x_train, y_train)

y_pred_test = log_reg.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f'Test accuracy: {test_accuracy * 100:.2f}%')

y_pred_train = log_reg.predict(x_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
print(f'Train accuracy: {train_accuracy * 100:.2f}%')

data_point = np.array([[0, 29, 0, 0, 0, 0, 0, 2, 1]])
data_point_scaled = ss.transform(data_point)
prediction = log_reg.predict(data_point_scaled)
print(f'Prediction: {prediction[0]}')


   Gender  Age  NS1  IgG  IgM  Area  AreaType  HouseType  District  Outcome
0       0   45    0    0    0     1         0          1         1        0
1       1   17    0    0    1     0         1          1         1        0
2       0   29    0    0    0     0         0          2         1        0
3       0   63    1    1    0     0         1          2         1        1
(1000, 9)
Best parameters: {'C': 0.01, 'penalty': 'l1', 'solver': 'saga'}
Test accuracy: 100.00%
Train accuracy: 100.00%
Prediction: 0


50 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1291, in fit
    fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer=prefer)(
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/utils/parallel.py", line 63, in __call__
    return super().__call__(iterable_