In [1]:
!pip install catboost



In [2]:
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils import resample
from catboost import CatBoostClassifier, Pool
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from joblib import dump, load

## **Data preparation**
---

In [3]:
# Read the prepared dataset from a CSV file in the working directory.
dataset_path = 'df+dist+hol.csv'
df = pd.read_csv(dataset_path)

In [4]:
# Display the column labels of the loaded frame.
df.columns

Index(['city_address', 'day_precip', 'day_precipcover', 'day_snow',
       'day_windspeed', 'day_winddir', 'day_uvindex', 'hour_temp',
       'hour_humidity', 'hour_precip', 'hour_precipprob', 'hour_snow',
       'hour_windspeed', 'hour_winddir', 'hour_pressure', 'hour_visibility',
       'hour_cloudcover', 'hour_uvindex', 'datetime', 'Clear', 'Ice', 'Snow',
       'Overcast', 'Rain', 'Fog', 'Partially cloudy',
       'Freezing Drizzle/Freezing Rain', 'anomaly', 'alarms', 'date',
       'text_vector', 'tg_vector', 'sun', 'Engels2', 'Baltimore', 'Saki',
       'Belbek', 'Olenya', 'Mozdok', 'Savasleyka', 'hol_risk'],
      dtype='object')

In [5]:
# Give the 'text_vector' column the name 'isw_vector'; rebinding `df` instead
# of mutating it in place.
df = df.rename(columns={'text_vector': 'isw_vector'})

In [6]:
# Collapse 'alarms' to a binary target: 0 stays 0, every other value becomes 1.
# A vectorised comparison gives the same result as mapping each row through a
# Python lambda.
df['alarms'] = (df['alarms'] != 0).astype('int64')

In [7]:
# Show how many rows fall into each value of the 'alarms' column.
df['alarms'].value_counts()

alarms
0    502391
1    105844
Name: count, dtype: int64

In [8]:
# Count the missing values in every column.
df.isna().sum()

city_address                          0
day_precip                            0
day_precipcover                       0
day_snow                              0
day_windspeed                         0
day_winddir                           0
day_uvindex                           0
hour_temp                             0
hour_humidity                         0
hour_precip                           0
hour_precipprob                       0
hour_snow                             0
hour_windspeed                        0
hour_winddir                          0
hour_pressure                         0
hour_visibility                       0
hour_cloudcover                       0
hour_uvindex                          0
datetime                              0
Clear                                 0
Ice                                   0
Snow                                  0
Overcast                              0
Rain                                  0
Fog                                   0


In [9]:
# Replace missing 'isw_vector' entries with 0; all other columns are untouched.
df = df.assign(isw_vector=df['isw_vector'].fillna(0))

In [10]:
# Parse 'datetime' and expand it into one integer column per calendar/clock
# component. The tuple order fixes the order in which the columns are appended.
df['datetime'] = pd.to_datetime(df['datetime'])

timestamp_parts = df['datetime'].dt
for part_name in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    df[part_name] = getattr(timestamp_parts, part_name)

In [11]:
# Remove the columns that are not passed on to the models, in a single call.
df = df.drop(columns=['datetime', 'date', 'anomaly', 'city_address'])

In [12]:
# Balance the two 'alarms' classes by sampling the minority class (alarms == 1)
# with replacement until it has as many rows as the majority class
# (alarms == 0), then shuffle the combined frame.
#
# NOTE(review): the upsampling and the shuffle run on the whole dataset, before
# the train/test split in the following cells. Copies of the same minority row
# can therefore end up on both sides of the split, and the shuffled row order
# means TimeSeriesSplit no longer sees the rows in their original order.
# Consider splitting first and resampling only the training part.
df_majority = df.loc[df['alarms'].eq(0)]
df_minority = df.loc[df['alarms'].eq(1)]

df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=df_majority.shape[0],
    random_state=42,
)

df_balanced = (
    pd.concat([df_majority, df_minority_upsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df_balanced['alarms'].value_counts()

alarms
1    502391
0    502391
Name: count, dtype: int64

***MODEL: CatBoost***

---

In [13]:
# Separate the features from the target.
X = df_balanced.drop(columns='alarms')
y = df_balanced['alarms']

tscv = TimeSeriesSplit(n_splits=4)

# Only the last of the four folds is kept: its (largest) training window and
# the test block that follows it.
fold_indices = list(tscv.split(X))
train_index, test_index = fold_indices[-1]

X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]


In [14]:
# Base estimator whose hyperparameters are searched.
model = CatBoostClassifier(verbose=0, random_state=42)

# Hyperparameters for RandomizedSearchCV
params = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'iterations': [100, 300, 500],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'border_count': [32, 64, 128],
    'random_strength': [1, 2, 5, 10]
}

# RandomizedSearchCV: 20 sampled candidates, scored by F1.
# random_state pins which candidates are sampled, so a re-run evaluates the
# same 20 combinations.
# NOTE(review): cv=3 is an integer, so scikit-learn builds its own k-fold
# splitter here rather than using the `tscv` TimeSeriesSplit defined earlier --
# confirm this is intended.
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_iter=20,
    scoring='f1',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)


*Saving a held-out test sample for evaluating the future ensemble, so that it is tested on data the models have not encountered during training.*

---

In [15]:
# Export the held-out sample used to evaluate the ensemble.
#
# TimeSeriesSplit uses expanding training windows: the test block of every fold
# except the last is contained in the training window of the last fold, and the
# models below are fitted on that last training window (X_train / y_train).
# Concatenating the test blocks of all folds would therefore export mostly rows
# the models were trained on. Only the final fold's test block is disjoint from
# X_train, so only that block is exported.
train_index, test_index = list(tscv.split(X))[-1]

X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Kept as (single-element) lists so these names stay available to other cells.
all_X_test = [X_test]
all_y_test = [y_test]

X_test_full = pd.concat(all_X_test).reset_index(drop=True)
y_test_full = pd.concat(all_y_test).reset_index(drop=True)

X_test_full.to_csv('X_test_full.csv', index=False)
y_test_full.to_csv('y_test_full.csv', index=False)


***CatBoost:***

In [None]:
# Fixed CatBoost configuration used for the final model.
catboost_params = {
    'iterations': 700,
    'depth': 10,
    'random_strength': 2,
    'learning_rate': 0.2,
    'l2_leaf_reg': 1,
    'random_state': 42,
    'border_count': 64,
    'verbose': 0,
}
model = CatBoostClassifier(**catboost_params)

# Fit on the training part of the split and score the held-out part.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

report_text = classification_report(y_test, y_pred, digits=4)
confusion = confusion_matrix(y_test, y_pred)
print("Classification Report:\n", report_text)
print("Confusion Matrix:\n", confusion)

# NOTE(review): save_model is called without a `format` argument, so the file
# is presumably written in CatBoost's own binary format rather than as a
# pickle, despite the '.pkl' extension -- confirm how the file is loaded.
model.save_model('catboost.pkl')

Classification Report:
               precision    recall  f1-score   support

           0     0.9677    0.8833    0.9236    100407
           1     0.8928    0.9706    0.9301    100549

    accuracy                         0.9270    200956
   macro avg     0.9303    0.9269    0.9268    200956
weighted avg     0.9302    0.9270    0.9268    200956

Confusion Matrix:
 [[88687 11720]
 [ 2958 97591]]


In [None]:
# Components of the versioned artifact name: <command_id>__<model_name>__<version>.pkl
command_id, model_name, version = 1, "catboost_classifier", "v1"

file_name = f"{command_id}__{model_name}__{version}.pkl"

# Persist the current `model` under the versioned name.
model.save_model(fname=file_name)

***MODEL: Random Forest***

---

In [14]:
# Fixed Random Forest configuration.
# Note: `model` is rebound here, replacing the estimator assigned in earlier cells.
rf_params = {
    'n_estimators': 200,
    'max_depth': 20,
    'min_samples_split': 5,
    'min_samples_leaf': 1,
    'max_features': None,
    'bootstrap': True,
    'n_jobs': -1,
    'random_state': 42,
    'class_weight': 'balanced',
}
model = RandomForestClassifier(**rf_params)

# Fit on the training part of the split and score the held-out part.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

report_text = classification_report(y_test, y_pred, digits=4)
confusion = confusion_matrix(y_test, y_pred)
print("Classification Report:\n", report_text)
print("Confusion Matrix:\n", confusion)


Classification Report:
               precision    recall  f1-score   support

           0     0.9854    0.8742    0.9265    100407
           1     0.8871    0.9870    0.9344    100549

    accuracy                         0.9307    200956
   macro avg     0.9362    0.9306    0.9304    200956
weighted avg     0.9362    0.9307    0.9304    200956

Confusion Matrix:
 [[87779 12628]
 [ 1305 99244]]


In [15]:
# Serialise the fitted Random Forest to disk with joblib.
dump(value=model, filename='random_forest_classifier.pkl')

['random_forest_classifier.pkl']