# Logistic Regression

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [3]:
# Load and preprocess data
from sklearn.model_selection import KFold


health_data = pd.read_csv("../data/data_train.csv", delimiter=",", index_col="Id")
health_data = health_data.drop_duplicates()
selected_features = ['HighBP', 'HighChol', 'BMI',
       'HeartDiseaseorAttack', 'PhysActivity', 'GenHlth', 
       'PhysHlth', 'DiffWalk', 'Age', 'Education', 'Income',
       'ExtraMedTest', 'ExtraAlcoholTest']
X = health_data[selected_features].copy()
y = health_data.Status
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
kf = KFold(n_splits=10, shuffle=True, random_state=0)

# Define the pipeline with polynomial feature expansion
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('classifier', LogisticRegression())
])

# Define the parameter grid
param_grid = {
    'classifier__C': [0.1, 1, 10, 100],  # Regularization parameter for Logistic Regression
    'classifier__class_weight': [None, 'balanced'],  # Class weight for Logistic Regression
    'classifier__max_iter': [200, 300, 400]   # Maximum number of iterations for Logistic Regression
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=kf, n_jobs=-1, scoring='f1')

# Train the grid search
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best parameters:", grid_search.best_params_)

# Evaluate the best model on the validation set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_valid)
f1 = f1_score(y_valid, y_pred)
print("F1 Score:", f1)


Best parameters: {'classifier__C': 100, 'classifier__class_weight': None, 'classifier__max_iter': 200}
F1 Score: 0.797236084452975


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import optuna

# Define the objective function for Optuna
def objective(trial):
    # Define hyperparameters to search
    params = {
        'C': trial.suggest_categorical('C', [0.1, 1, 10, 100]),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced']),
        'max_iter': trial.suggest_categorical('max_iter', [200, 300])
    }

    # Instantiate the logistic regression model with current hyperparameters
    logis_model = LogisticRegression(**params)

    # Create polynomial features up to degree 2
    poly = PolynomialFeatures(degree=2, include_bias=False)
    X_poly = poly.fit_transform(X_train)

    return np.mean(cross_val_score(logis_model, X_poly, y_train, cv=kf, scoring='f1'))

# Perform hyperparameter optimization with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Retrieve the best hyperparameters
best_params = study.best_params
best_logis_model = LogisticRegression(**best_params)

# Create polynomial features up to degree 2 for the entire dataset
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_train)

logis_scores = cross_val_score(best_logis_model, X_poly, y_train, cv=kf, scoring='f1')

# Print the cross-validation scores and mean score
print("Best params:", best_params)
print("Cross-validation F1 scores:", logis_scores)
print("Mean CV F1 Score:", np.mean(logis_scores))

[I 2024-04-11 00:13:54,755] A new study created in memory with name: no-name-ebeed77f-d942-4813-b918-784765279630
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://sc

KeyboardInterrupt: 