In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV

# Logistic regression

In [2]:
# Load the prepared dataset; the path is relative to the notebook's working directory.
data_path = 'df_alt2.csv'
df = pd.read_csv(data_path)

In [3]:
# Replace every missing value in the frame with 0.
# NOTE(review): this also zero-fills non-numeric columns, if any contain NaN — confirm that is intended.
df = df.fillna(value=0)


In [4]:
# Parse the timestamp columns once; values that do not match the format become NaT.
datetime_format = '%Y-%m-%d %H:%M:%S'
time_format = '%H:%M:%S'

observed_at = pd.to_datetime(df['datetime'], format=datetime_format, errors='coerce')
sunrise_hour = pd.to_datetime(df['day_sunrise'], format=time_format, errors='coerce').dt.hour
sunset_hour = pd.to_datetime(df['day_sunset'], format=time_format, errors='coerce').dt.hour
observed_hour = observed_at.dt.hour

# The raw sunrise/sunset strings are only needed to derive the hour features below.
df = df.drop(columns=['day_sunrise', 'day_sunset'])
df['datetime'] = observed_at
df['sunrise_hour'] = sunrise_hour
df['sunset_hour'] = sunset_hour

# Daylight flag: 1 when the observation hour lies within [sunrise hour, sunset hour], else 0.
# The comparison works at whole-hour resolution, so minutes are ignored.
df['sun'] = ((sunrise_hour <= observed_hour) & (observed_hour <= sunset_hour)).astype(int)

In [5]:
# Remove the column display limit so that wide frames render every column.
pd.set_option('display.max_columns', None)

In [6]:
# Inspect the frame after feature engineering (pandas renders a truncated head/tail view).
df

Unnamed: 0,city_address,day_precip,day_precipcover,day_snow,day_windspeed,day_winddir,day_uvindex,hour_temp,hour_humidity,hour_precip,hour_precipprob,hour_snow,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_uvindex,datetime,Clear,Ice,Snow,Overcast,Rain,Fog,Partially cloudy,Freezing Drizzle/Freezing Rain,anomaly,alarms,date,text_vector,sunrise_hour,sunset_hour,sun
0,Lutsk,0.118,4.17,0.1,15.5,252.7,1.0,2.4,89.18,0.0,0.0,0.1,15.5,275.6,1020.0,0.0,91.5,0.0,2022-02-24 00:00:00,0,0,0,1,0,0,0,0,False,0,2022-02-24,0.000000,7,17,0
1,Lutsk,0.118,4.17,0.1,15.5,252.7,1.0,2.4,87.90,0.0,0.0,0.0,14.8,280.3,1021.0,0.2,88.2,0.0,2022-02-24 01:00:00,0,0,0,0,0,0,1,0,False,0,2022-02-24,0.000000,7,17,0
2,Lutsk,0.118,4.17,0.1,15.5,252.7,1.0,2.9,88.58,0.0,0.0,0.0,14.4,310.0,1022.0,10.0,100.0,0.0,2022-02-24 02:00:00,0,0,0,1,0,0,0,0,False,0,2022-02-24,0.000000,7,17,0
3,Lutsk,0.118,4.17,0.1,15.5,252.7,1.0,2.3,86.63,0.0,0.0,0.0,13.3,295.1,1021.0,0.1,92.0,0.0,2022-02-24 03:00:00,0,0,0,1,0,0,0,0,False,0,2022-02-24,0.000000,7,17,0
4,Lutsk,0.118,4.17,0.1,15.5,252.7,1.0,1.9,87.85,0.0,0.0,0.0,13.3,305.8,1021.0,0.0,93.8,0.0,2022-02-24 04:00:00,0,0,0,1,0,0,0,0,False,0,2022-02-24,0.000000,7,17,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
608230,Poltava,0.000,0.00,0.0,12.2,164.8,4.0,-1.3,73.09,0.0,0.0,0.0,5.8,174.9,1030.0,10.0,100.0,0.0,2025-03-01 19:00:00,0,0,0,1,0,0,0,0,False,0,2025-03-01,0.010219,6,17,0
608231,Poltava,0.000,0.00,0.0,12.2,164.8,4.0,-1.8,81.17,0.0,0.0,0.0,0.0,170.5,1029.6,10.0,100.0,0.0,2025-03-01 20:00:00,0,0,0,1,0,0,0,0,False,1,2025-03-01,0.010219,6,17,0
608232,Poltava,0.000,0.00,0.0,12.2,164.8,4.0,-1.0,68.31,0.0,0.0,0.0,6.8,168.7,1029.0,10.0,99.6,0.0,2025-03-01 21:00:00,0,0,0,1,0,0,0,0,False,1,2025-03-01,0.010219,6,17,0
608233,Poltava,0.000,0.00,0.0,12.2,164.8,4.0,-1.7,71.36,0.0,0.0,0.0,7.2,173.4,1029.0,10.0,98.2,0.0,2025-03-01 22:00:00,0,0,0,1,0,0,0,0,False,1,2025-03-01,0.010219,6,17,0


In [7]:
# Remove the columns that are not passed on to the model.
unused_columns = ['anomaly', 'hour_precipprob', 'datetime', 'date']
df = df.drop(columns=unused_columns)

In [8]:
# Binarise the target: any non-zero value becomes 1, zero stays 0.
# A vectorised comparison replaces the per-row Python lambda; the result is the same int64 0/1 column.
df['alarms'] = (df['alarms'] != 0).astype('int64')


In [9]:
# Standardise every numeric column except the target.
# ('city_address' is listed for safety; it is dropped from the selection only if it is numeric.)
exclude_cols = ['city_address', 'alarms']
numeric_cols = df.select_dtypes(include='number').columns
cols_to_scale = numeric_cols.difference(exclude_cols)

# NOTE(review): the scaler is fitted on the full dataset before any train/test split,
# so test-set statistics influence the training features. Consider fitting on the
# training portion only (e.g. inside a Pipeline).
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[cols_to_scale])
df[cols_to_scale] = scaled_values
df

Unnamed: 0,city_address,day_precip,day_precipcover,day_snow,day_windspeed,day_winddir,day_uvindex,hour_temp,hour_humidity,hour_precip,hour_snow,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_uvindex,Clear,Ice,Snow,Overcast,Rain,Fog,Partially cloudy,Freezing Drizzle/Freezing Rain,alarms,text_vector,sunrise_hour,sunset_hour,sun
0,Lutsk,-0.185081,-0.196903,-0.031899,-0.442215,0.579504,-1.43407,-0.849978,0.837254,-0.040789,1.965870,0.629076,0.806240,0.354982,-1.506271,0.715180,-0.632421,-0.494998,-0.001282,-0.131364,1.214477,-0.251926,-0.001813,-0.813725,-0.010015,0,-4.949748,0.972614,-0.647629,-1.109194
1,Lutsk,-0.185081,-0.196903,-0.031899,-0.442215,0.579504,-1.43407,-0.849978,0.771467,-0.040789,-0.104457,0.521676,0.850419,0.470171,-1.485003,0.626751,-0.632421,-0.494998,-0.001282,-0.131364,-0.823400,-0.251926,-0.001813,1.228916,-0.010015,0,-4.949748,0.972614,-0.647629,-1.109194
2,Lutsk,-0.185081,-0.196903,-0.031899,-0.442215,0.579504,-1.43407,-0.798895,0.806416,-0.040789,-0.104457,0.460305,1.129587,0.585360,-0.442869,0.942951,-0.632421,-0.494998,-0.001282,-0.131364,1.214477,-0.251926,-0.001813,-0.813725,-0.010015,0,-4.949748,0.972614,-0.647629,-1.109194
3,Lutsk,-0.185081,-0.196903,-0.031899,-0.442215,0.579504,-1.43407,-0.860195,0.706194,-0.040789,-0.104457,0.291534,0.989533,0.470171,-1.495637,0.728578,-0.632421,-0.494998,-0.001282,-0.131364,1.214477,-0.251926,-0.001813,-0.813725,-0.010015,0,-4.949748,0.972614,-0.647629,-1.109194
4,Lutsk,-0.185081,-0.196903,-0.031899,-0.442215,0.579504,-1.43407,-0.901061,0.768897,-0.040789,-0.104457,0.291534,1.090109,0.470171,-1.506271,0.776812,-0.632421,-0.494998,-0.001282,-0.131364,1.214477,-0.251926,-0.001813,-0.813725,-0.010015,0,-4.949748,0.972614,-0.647629,-1.109194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
608230,Poltava,-0.196124,-0.541428,-0.182755,-0.921328,-0.248141,-0.30329,-1.227992,0.010293,-0.040789,-0.104457,-0.859175,-0.140301,1.506873,-0.442869,0.942951,-0.632421,-0.494998,-0.001282,-0.131364,1.214477,-0.251926,-0.001813,-0.813725,-0.010015,0,0.259337,0.035206,-0.647629,-1.109194
608231,Poltava,-0.196124,-0.541428,-0.182755,-0.921328,-0.248141,-0.30329,-1.279075,0.425572,-0.040789,-0.104457,-1.749057,-0.181660,1.460798,-0.442869,0.942951,-0.632421,-0.494998,-0.001282,-0.131364,1.214477,-0.251926,-0.001813,-0.813725,-0.010015,1,0.259337,0.035206,-0.647629,-1.109194
608232,Poltava,-0.196124,-0.541428,-0.182755,-0.921328,-0.248141,-0.30329,-1.197343,-0.235379,-0.040789,-0.104457,-0.705747,-0.198579,1.391684,-0.442869,0.932232,-0.632421,-0.494998,-0.001282,-0.131364,1.214477,-0.251926,-0.001813,-0.813725,-0.010015,1,0.259337,0.035206,-0.647629,-1.109194
608233,Poltava,-0.196124,-0.541428,-0.182755,-0.921328,-0.248141,-0.30329,-1.268859,-0.078622,-0.040789,-0.104457,-0.644376,-0.154401,1.391684,-0.442869,0.894717,-0.632421,-0.494998,-0.001282,-0.131364,1.214477,-0.251926,-0.001813,-0.813725,-0.010015,1,0.259337,0.035206,-0.647629,-1.109194


In [10]:
# Balance the classes by oversampling the alarm rows (with replacement) up to the
# number of no-alarm rows, then shuffle the combined frame.
# NOTE(review): oversampling and shuffling happen before the train/test split below,
# so copies of the same minority row can land in both train and test and the
# original row order is lost — reported test scores are likely optimistic.
alarm_flag = df['alarms']
df_majority = df[alarm_flag == 0]
df_minority = df[alarm_flag == 1]

df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42,
)

df_balanced = (
    pd.concat([df_majority, df_minority_upsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df_balanced['alarms'].value_counts()

alarms
1    502391
0    502391
Name: count, dtype: int64

In [11]:
# One-hot encode the city; the first category is dropped and acts as the baseline.
df_balanced = pd.get_dummies(
    df_balanced,
    columns=['city_address'],
    drop_first=True,
)

In [12]:
# Features are every column except the target; the target is the binary 'alarms' column.
is_feature_column = df_balanced.columns != 'alarms'
x = df_balanced.loc[:, is_feature_column]
y = df_balanced['alarms']

In [13]:
tscv = TimeSeriesSplit(n_splits=4)

# Only the last fold is used downstream, so unpack it directly instead of
# overwriting the train/test variables on every iteration.
# NOTE(review): the rows were shuffled when df_balanced was built, so this
# positional split is not chronological.
*_, (train_index, test_index) = tscv.split(x)

x_train, x_test = x.iloc[train_index], x.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [14]:
def estimate_clf(clf, x_train, y_train, x_test, y_test):
    """Fit ``clf`` on the training data and print train and test metrics.

    Prints accuracy and the confusion matrix for both splits, then the
    binary precision, recall and F1 score and the full classification
    report for the test split.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is (re)fitted
        in place on ``x_train`` / ``y_train``.
    x_train, y_train : array-like
        Training features and labels.
    x_test, y_test : array-like
        Evaluation features and labels.

    Returns
    -------
    None
        All results are printed.
    """
    clf.fit(x_train, y_train)

    y_pred_train = clf.predict(x_train)
    print('Train accuracy:', accuracy_score(y_train, y_pred_train))
    print('Train confusion matrix:\n', confusion_matrix(y_train, y_pred_train))

    y_pred_test = clf.predict(x_test)
    print('\nTest accuracy:', accuracy_score(y_test, y_pred_test))
    print('Test confusion matrix:\n', confusion_matrix(y_test, y_pred_test))

    # Take the positive class from the fitted model rather than from the labels
    # that happen to be present in y_test: indexing np.unique(y_test)[1] raises
    # IndexError whenever the test fold contains a single class.
    known_classes = getattr(clf, 'classes_', None)
    if known_classes is None:
        # Fallback for estimators that do not expose ``classes_``.
        known_classes = np.unique(
            np.concatenate([np.asarray(y_train), np.asarray(y_test)])
        )
    pos_label = known_classes[-1]

    binary_kwargs = {'average': 'binary', 'pos_label': pos_label}
    print('Test Precision:', precision_score(y_test, y_pred_test, **binary_kwargs))
    print('Test Recall:', recall_score(y_test, y_pred_test, **binary_kwargs))
    print('Test F1 Score:', f1_score(y_test, y_pred_test, **binary_kwargs))

    print('\nDetailed classification report:\n')
    print(classification_report(y_test, y_pred_test))

In [15]:
# Inspect the balanced, one-hot encoded frame (pandas renders a truncated head/tail view).
df_balanced

Unnamed: 0,day_precip,day_precipcover,day_snow,day_windspeed,day_winddir,day_uvindex,hour_temp,hour_humidity,hour_precip,hour_snow,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_uvindex,Clear,Ice,Snow,Overcast,Rain,Fog,Partially cloudy,Freezing Drizzle/Freezing Rain,alarms,text_vector,sunrise_hour,sunset_hour,sun,city_address_Chernihiv,city_address_Chernivtsi,city_address_Dnipro,city_address_Donetsk,city_address_Ivano-Frankivsk,city_address_Kharkiv,city_address_Kherson,city_address_Khmelnytskyi,city_address_Kropyvnytskyi,city_address_Kyiv,city_address_Lutsk,city_address_Lviv,city_address_Mykolaiv,city_address_Odesa,city_address_Poltava,city_address_Rivne,city_address_Sumy,city_address_Ternopil,city_address_Uzhgorod,city_address_Vinnytsia,city_address_Zaporozhye,city_address_Zhytomyr
0,-0.196124,-0.541428,-0.182755,0.443416,-1.430759,1.204416,0.376013,-1.548545,-0.040789,-0.104457,1.288816,-1.340633,-0.220964,-0.336529,-1.685798,2.518295,2.020209,-0.001282,-0.131364,-0.823400,-0.251926,-0.001813,-0.813725,-0.010015,1,1.056348,-0.902203,0.971354,0.901555,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
1,-0.111903,1.524070,-0.182755,2.272754,0.425086,0.827490,0.529262,-0.990899,-0.040789,-0.104457,1.733757,0.852299,-1.142477,1.056528,-1.736712,-0.632421,2.020209,-0.001282,-0.131364,-0.823400,-0.251926,-0.001813,-0.813725,-0.010015,0,-0.084350,0.035206,-0.107968,-1.109194,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,-0.186766,-0.196903,-0.182755,-0.964883,-1.608717,-1.057143,-0.839762,1.005319,0.005903,-0.104457,-0.475605,-1.581263,0.354982,-1.312041,0.942951,-0.632421,-0.494998,-0.001282,-0.131364,1.214477,3.969414,-0.001813,-0.813725,-0.010015,1,-0.171740,-0.902203,-0.647629,-1.109194,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,-0.196124,-0.541428,-0.182755,-1.124587,-0.182230,1.581343,1.857420,-1.876965,-0.040789,-0.104457,-0.813147,-0.829294,-0.451342,1.056528,-1.736712,2.968398,2.020209,-0.001282,-0.131364,-0.823400,-0.251926,-0.001813,-0.813725,-0.010015,0,-0.557220,-0.902203,0.971354,0.901555,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
4,-0.196124,-0.541428,-0.182755,0.341786,0.523951,0.827490,1.367023,-1.044865,-0.040789,-0.104457,0.843875,0.642687,-0.451342,-0.442869,0.894717,2.518295,-0.494998,-0.001282,-0.131364,1.214477,-0.251926,-0.001813,-0.813725,-0.010015,0,0.254233,-1.839612,1.511015,0.901555,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1004777,-0.196124,-0.541428,-0.182755,0.022378,0.193458,-1.057143,-1.820555,0.654284,-0.040789,-0.104457,-0.030664,0.534591,-0.681721,-0.442869,-1.736712,-0.632421,2.020209,-0.001282,-0.131364,-0.823400,-0.251926,-0.001813,-0.813725,-0.010015,0,0.658896,0.972614,-1.187290,-1.109194,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
1004778,-0.196124,-0.541428,-0.182755,0.443416,-0.418566,0.450563,1.336373,-1.427251,-0.040789,-0.104457,-0.199435,-0.522866,-0.336153,0.266066,-1.683119,-0.182318,2.020209,-0.001282,-0.131364,-0.823400,-0.251926,-0.001813,-0.813725,-0.010015,0,0.698884,0.972614,0.431693,0.901555,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
1004779,-0.196124,-0.541428,-0.182755,-0.601920,1.077597,1.581343,0.559912,-0.791483,-0.040789,-0.104457,-1.196716,0.422736,0.470171,1.056528,0.058662,-0.632421,-0.494998,-0.001282,-0.131364,-0.823400,-0.251926,-0.001813,1.228916,-0.010015,0,-1.804741,-0.902203,1.511015,-1.109194,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1004780,-0.196124,-0.541428,-0.182755,-0.601920,-1.369557,0.827490,1.857420,-1.724320,-0.040789,-0.104457,-0.751775,-1.158280,-0.451342,-0.442869,-0.367404,-0.182318,-0.494998,-0.001282,-0.131364,-0.823400,-0.251926,-0.001813,1.228916,-0.010015,1,0.048552,-0.902203,0.971354,0.901555,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False


In [18]:
# These hyperparameters equal the best combination printed by the grid-search cell further down.
liblinear_params = {
    'penalty': 'l2',
    'solver': 'liblinear',
    'C': 0.5,
    'class_weight': 'balanced',
}
clf_liblinear = LogisticRegression(**liblinear_params)

print("\nEvaluating model with solver: liblinear")
estimate_clf(clf_liblinear, x_train, y_train, x_test, y_test)



Evaluating model with solver: liblinear
Train accuracy: 0.712053354830523
Train confusion matrix:
 [[262894 139090]
 [ 92369 309473]]

Test accuracy: 0.7114094627679691
Test confusion matrix:
 [[65544 34863]
 [23131 77418]]
Test Precision: 0.6895022310096989
Test Recall: 0.7699529582591572
Test F1 Score: 0.7275102194239534

Detailed classification report:

              precision    recall  f1-score   support

           0       0.74      0.65      0.69    100407
           1       0.69      0.77      0.73    100549

    accuracy                           0.71    200956
   macro avg       0.71      0.71      0.71    200956
weighted avg       0.71      0.71      0.71    200956



## The best model for our situation is the one with the "liblinear" solver: it achieves the highest F1-score (0.73), which matters most in our case because both precision and recall are important here.

In [16]:
from sklearn.linear_model import LogisticRegression


def tune_logistic_regression(x_train, y_train, x_test, y_test,
                             scoring='accuracy', cv=3, random_state=42):
    """Grid-search liblinear logistic regression and evaluate the best model.

    Searches over penalty type, regularisation strength ``C`` and class
    weighting with cross-validation on the training data, prints the best
    combination and its cross-validation score, then passes the best
    estimator to ``estimate_clf`` for train/test reporting.

    Parameters
    ----------
    x_train, y_train : array-like
        Data used for the cross-validated search.
    x_test, y_test : array-like
        Held-out data used only for the final report.
    scoring : str, default 'accuracy'
        Metric used to rank candidates (e.g. ``'f1'``). The default keeps
        the previous behaviour.
    cv : int or cross-validation splitter, default 3
        Cross-validation strategy forwarded to ``GridSearchCV``.
    random_state : int or None, default 42
        Seed for the liblinear solver so that repeated runs select the same
        candidate; matches the seed used elsewhere in this notebook.

    Returns
    -------
    None
        All results are printed.
    """
    param_grid = {
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1, 0.5, 0.8, 1, 3, 5, 8, 10],
        'class_weight': [None, 'balanced'],
    }
    clf = LogisticRegression(solver='liblinear', max_iter=1000, random_state=random_state)

    grid_search = GridSearchCV(clf, param_grid, cv=cv, n_jobs=-1, scoring=scoring, verbose=1)
    grid_search.fit(x_train, y_train)

    print("\nBest hyperparameters found:")
    print(grid_search.best_params_)
    print(f"Best cross-validation {scoring}: {grid_search.best_score_:.4f}")

    print("\nEvaluating best model on test set:")
    best_clf = grid_search.best_estimator_
    # estimate_clf fits the estimator again on x_train before reporting.
    estimate_clf(best_clf, x_train, y_train, x_test, y_test)


In [17]:
# Run the hyperparameter search on the training split and report on the held-out split.
tune_logistic_regression(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

Fitting 3 folds for each of 36 candidates, totalling 108 fits

Best hyperparameters found:
{'C': 0.5, 'class_weight': 'balanced', 'penalty': 'l2'}
Best cross-validation accuracy: 0.7120

Evaluating best model on test set:
Train accuracy: 0.712053354830523
Train confusion matrix:
 [[262894 139090]
 [ 92369 309473]]

Test accuracy: 0.7114094627679691
Test confusion matrix:
 [[65544 34863]
 [23131 77418]]
Test Precision: 0.6895022310096989
Test Recall: 0.7699529582591572
Test F1 Score: 0.7275102194239534

Detailed classification report:

              precision    recall  f1-score   support

           0       0.74      0.65      0.69    100407
           1       0.69      0.77      0.73    100549

    accuracy                           0.71    200956
   macro avg       0.71      0.71      0.71    200956
weighted avg       0.71      0.71      0.71    200956

