<a href="https://colab.research.google.com/github/ViktorBakken/SM-r/blob/main/models/Logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn.linear_model as skl_lm
from sklearn.model_selection import StratifiedKFold, train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, train_test_split, RandomizedSearchCV

from scipy.stats import uniform


# Single source of randomness for the notebook: passed as `random_state` to
# train_test_split, LogisticRegression, StratifiedKFold and RandomizedSearchCV below.
seed = 42

In [36]:
# Load data
df = pd.read_csv(os.path.abspath("normalized_labeled_training_data.csv"))

# Select label column
y = df['increase_stock']

# Select training data.
# Take an explicit copy: the new feature columns below are assigned onto X, and
# assigning onto a plain column selection of `df` raises pandas'
# SettingWithCopyWarning (and is ambiguous about whether `df` is modified).
X = df[['hour_of_day', 'day_of_week', 'month', 'holiday', 'weekday', 'summertime',
        'temp', 'dew', 'humidity', 'precip', 'snow', 'snowdepth', 'windspeed',
        'cloudcover', 'visibility']].copy()

# Create new features (vectorized comparisons instead of row-wise .apply).
# NOTE(review): the thresholds are compared against the values exactly as stored in
# the CSV. The file name says "normalized" -- confirm that hour_of_day and temp are
# still in the units these thresholds (8, 21, 8) assume.
# 1 when 8 <= hour_of_day < 21, else 0
X['day_or_night'] = ((X['hour_of_day'] >= 8) & (X['hour_of_day'] < 21)).astype(int)
# 1 when the row is neither summertime, nor a holiday, nor a non-weekday
X['normal_day'] = (~((X['summertime'] == 1) | (X['holiday'] == 1) | (X['weekday'] == 0))).astype(int)
# 1 when temp <= 8, else 0
X['cold'] = (X['temp'] <= 8).astype(int)

# Remove bad features ('holiday' is dropped only after 'normal_day' has used it)
X = X.drop(['snow', 'snowdepth', 'holiday', 'visibility', 'precip', 'dew'], axis=1)

X.info()

# Stratified 80/20 hold-out split so both parts keep the label proportions of y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, stratify=y)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   hour_of_day   1600 non-null   float64
 1   day_of_week   1600 non-null   float64
 2   month         1600 non-null   float64
 3   weekday       1600 non-null   float64
 4   summertime    1600 non-null   float64
 5   temp          1600 non-null   float64
 6   humidity      1600 non-null   float64
 7   windspeed     1600 non-null   float64
 8   cloudcover    1600 non-null   float64
 9   day_or_night  1600 non-null   int64  
 10  normal_day    1600 non-null   int64  
 11  cold          1600 non-null   int64  
dtypes: float64(9), int64(3)
memory usage: 150.1 KB


In [37]:

# Model: standardize the features, then fit a logistic regression.
# Scaling lives inside the pipeline, so it is re-fitted on each CV training fold.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('log_reg', skl_lm.LogisticRegression(max_iter=1000, random_state=seed)),
])

# Search space, keyed by "<step name>__<parameter>".
param_distributions = dict(
    # C is the inverse of regularization strength; scipy's uniform(loc, scale)
    # draws from [loc, loc + scale], i.e. [0.01, 10.01] here.
    log_reg__C=uniform(0.01, 10),
    log_reg__penalty=['l2'],
    log_reg__solver=['lbfgs', 'saga'],
)

# Shuffled, stratified 10-fold cross-validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

# Randomized hyperparameter search: 20 sampled candidates, ranked by F1,
# evaluated in parallel on all available cores.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=20,
    scoring='f1',
    cv=cv,
    random_state=seed,
    n_jobs=-1,
)

# Run the search on the training split only
random_search.fit(X_train, y_train)

# Take the best candidate found by the search and predict the held-out test split
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

In [38]:
# Report the selected hyperparameters and the hold-out test metrics
print(f"Best Hyperparameters: {random_search.best_params_}")

# Label-based metrics, printed in a fixed order from the hard predictions
for label, metric in (
    ("F1", f1_score),
    ("Accuracy", accuracy_score),
    ("Recall", recall_score),
    ("Precision", precision_score),
):
    print(f"{label} Score: {metric(y_test, y_pred):.4f}")

# ROC AUC is computed from the predicted probability of the positive class
# (column 1 of predict_proba); a ValueError falls back to printing nan.
try:
    y_proba = best_model.predict_proba(X_test)[:, 1]
    roc_auc_text = f"{roc_auc_score(y_test, y_proba):.4f}"
except ValueError:
    roc_auc_text = f"{np.nan}"
print(f"ROC AUC Score: {roc_auc_text}")


Best Hyperparameters: {'log_reg__C': 9.74755518841459, 'log_reg__penalty': 'l2', 'log_reg__solver': 'lbfgs'}
F1 Score: 0.5591
Accuracy Score: 0.8719
Recall Score: 0.4483
Precision Score: 0.7429
ROC AUC Score: 0.8891
