## Logistic Regression Implementation on (scaled) non-reduced dataset

Imports

In [23]:
# Utilities
import pandas as pd
import warnings
import matplotlib as plt

# Models
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.exceptions import ConvergenceWarning
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

Set random seed for reproductibility

In [2]:
seed = 42

Read clean (scaled data)

In [3]:
df_scaled = pd.read_csv('CleanedData/dataset_preprocessed.csv')

Take sample for (initial) tests

In [4]:
df_scaled_sample = df_scaled.sample(n=1000, random_state=seed)

Train-Test Split

In [5]:
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    df_scaled_sample.drop('averageRating', axis=1), 
    df_scaled_sample['averageRating'], test_size=test_size, 
    random_state=seed
)

Model fit

In [7]:
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

In [8]:
predictions = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, predictions))
print("F1 score:", f1_score(y_test, predictions, average=None))

Accuracy: 0.765
F1 score: [0.         0.81853282 0.69117647]


In [9]:
# Apar warning-uri ptc nu converge daca max_iter e mic (<100)
# Maximu pe care l-am pus e 10k si acolo nu apare niciun warning dar am lasat ca lista
# Ptc modelele cele mai bune sunt la la max_iter <500
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

## Grid Search with Cross-Validation

In [10]:
# Define the hyperparameters to tune
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'max_iter': [10, 100, 200, 500, 100, 1000, 5000],
    'solver': ['liblinear', 'saga']
}

# Perform grid search cross-validation
grid_search_cv = GridSearchCV(estimator=LogisticRegression(random_state=seed),
                               param_grid=param_grid, cv=5, scoring='accuracy')
grid_search_cv.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params_cv = grid_search_cv.best_params_
best_model_cv = grid_search_cv.best_estimator_

Results of the best model 

In [11]:
accuracy_cv = best_model_cv.score(X_test, y_test)

print("Best Hyperparameters:", best_params_cv)
print("Accuracy:", accuracy_cv)

Best Hyperparameters: {'C': 100, 'max_iter': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.78


Results of best models Grid Search CV

In [12]:
df_cv = pd.DataFrame(grid_search_cv.cv_results_)

# Show top parameters for LR with Cross Validation Grid Serach
df_results_cv = df_cv[['params', 'mean_test_score']]
df_results_cv_sorted = df_results_cv.sort_values(by='mean_test_score', ascending=False)

print(df_results_cv_sorted.head(5))

                                                params  mean_test_score
154  {'C': 100, 'max_iter': 500, 'penalty': 'l2', '...          0.75625
166  {'C': 100, 'max_iter': 5000, 'penalty': 'l2', ...          0.75625
142  {'C': 100, 'max_iter': 10, 'penalty': 'l2', 's...          0.75625
164  {'C': 100, 'max_iter': 5000, 'penalty': 'l1', ...          0.75625
144  {'C': 100, 'max_iter': 100, 'penalty': 'l1', '...          0.75625


In [13]:
# Use best parameters to make predictions
predictions_cv = best_model_cv.predict(X_test)

print("Accuracy achieved for best parameters:", accuracy_score(y_test, predictions))
print("F1 score for best parameters:", f1_score(y_test, predictions, average=None))
# Aici (rezultatu) e 0. (pentru clasa 0) ptc sunt ft putine date pentru clasa 0 (maybe unbalanced)

Accuracy achieved for best parameters: 0.765
F1 score for best parameters: [0.         0.81853282 0.69117647]


## Grid Search Stratified K Fold Validation

In [14]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

grid_search_skf = GridSearchCV(estimator=LogisticRegression(random_state=seed), 
                           param_grid=param_grid, cv=skf, scoring='accuracy')
grid_search_skf.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params_skf = grid_search_skf.best_params_
best_model_skf = grid_search_skf.best_estimator_

# Evaluate the best model on the test set
accuracy = best_model_skf.score(X_test, y_test)

print("Best Hyperparameters with Stratified K-Fold:", best_params_skf)
print("Accuracy with Stratified K-Fold:", accuracy)


Best Hyperparameters with Stratified K-Fold: {'C': 1, 'max_iter': 10, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy with Stratified K-Fold: 0.79


Results of the best model 

In [15]:
accuracy_skf = best_model_skf.score(X_test, y_test)

print("Best Hyperparameters:", best_params_skf)
print("Accuracy:", accuracy_skf)

Best Hyperparameters: {'C': 1, 'max_iter': 10, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy: 0.79


Results of best models Grid Search CV

In [17]:
df_skf = pd.DataFrame(grid_search_skf.cv_results_)

# Show top parameters for LR with Cross Validation Grid Serach
df_results_skf = df_skf[['params', 'mean_test_score']]
df_results_skf_sorted = df_results_skf.sort_values(by='mean_test_score', ascending=False)

print(df_results_skf_sorted.head(5))

                                                params  mean_test_score
84   {'C': 1, 'max_iter': 10, 'penalty': 'l1', 'sol...          0.74250
108  {'C': 1, 'max_iter': 5000, 'penalty': 'l1', 's...          0.74125
92   {'C': 1, 'max_iter': 200, 'penalty': 'l1', 'so...          0.74125
96   {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'so...          0.74125
100  {'C': 1, 'max_iter': 100, 'penalty': 'l1', 'so...          0.74125


In [19]:
# Use best parameters to make predictions
predictions_skf = best_model_skf.predict(X_test)

print("Accuracy achieved for best parameters:", accuracy_score(y_test, predictions_skf))
print("F1 score for best parameters:", f1_score(y_test, predictions_skf, average=None))

Accuracy achieved for best parameters: 0.79
F1 score for best parameters: [0.         0.83464567 0.73758865]


Next maybe add elastic search as penalty? only works with saga solver

# Tune with Optuna

In [17]:
def objective(trial):
    # Define the hyperparameters to optimize
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    C = trial.suggest_loguniform('C', 0.01, 10.0)
    max_iter = trial.suggest_int('max_iter', 100, 1000)
    solver= trial.suggest_categorical('solver', ['liblinear', 'saga'])
    
    # Create the logistic regression model with the hyperparameters
    model = LogisticRegression(penalty=penalty, C=C, max_iter=max_iter,
     solver=solver, random_state=seed)
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Calculate the accuracy score
    accuracy = accuracy_score(y_test, y_pred)
    #score_cross_val = cross_val_score(model, X_train, y_train, cv=5, scoring='precision')

    
    return accuracy

In [21]:
# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=seed))
study.optimize(objective, n_trials=200)

# Get the best hyperparameters and accuracy score
best_params = study.best_params
best_accuracy = study.best_value

print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_accuracy)

[I 2024-06-04 14:25:44,220] A new study created in memory with name: no-name-61aaa6cc-3370-4b94-9a1b-853b5a8d2a53
  C = trial.suggest_loguniform('C', 0.01, 10.0)
[I 2024-06-04 14:25:44,286] Trial 0 finished with value: 0.76 and parameters: {'penalty': 'l2', 'C': 1.5702970884055387, 'max_iter': 639, 'solver': 'liblinear'}. Best is trial 0 with value: 0.76.
  C = trial.suggest_loguniform('C', 0.01, 10.0)
[I 2024-06-04 14:25:44,434] Trial 1 finished with value: 0.745 and parameters: {'penalty': 'l2', 'C': 0.6358358856676253, 'max_iter': 737, 'solver': 'saga'}. Best is trial 0 with value: 0.76.
  C = trial.suggest_loguniform('C', 0.01, 10.0)
[I 2024-06-04 14:25:44,508] Trial 2 finished with value: 0.57 and parameters: {'penalty': 'l1', 'C': 0.035113563139704075, 'max_iter': 265, 'solver': 'saga'}. Best is trial 0 with value: 0.76.
  C = trial.suggest_loguniform('C', 0.01, 10.0)
[I 2024-06-04 14:25:45,097] Trial 3 finished with value: 0.77 and parameters: {'penalty': 'l1', 'C': 0.6847920095

Best Hyperparameters: {'penalty': 'l1', 'C': 3.8842777547031417, 'max_iter': 661, 'solver': 'liblinear'}
Best Accuracy: 0.805


In [22]:
study.best_params

{'penalty': 'l1',
 'C': 3.8842777547031417,
 'max_iter': 661,
 'solver': 'liblinear'}

In [24]:
optuna.visualization.plot_optimization_history(study)

In [25]:
optuna.visualization.plot_parallel_coordinate(study)

In [27]:
optuna.visualization.plot_slice(study, params=['penalty', 'C', 'max_iter', 'solver'])

In [28]:
optuna.visualization.plot_param_importances(study)