**Imports**

In [27]:
!pip3 install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.6 MB[0m [31m?[0m eta [36m-:--:--[0mDownloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m4.0 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m4.0 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: lightgbm
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Successfully installed lightgbm-4.6.0


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, log_loss, f1_score, confusion_matrix


In [21]:
weather_df = pd.read_csv("weather.csv")

# Verifică structura datelor
print("Structura finală a dataset-ului:")
print(f"Shape: {weather_df.shape}")
print(f"Coloane: {list(weather_df.columns)}")

# Preview date
weather_df.head()

Structura finală a dataset-ului:
Shape: (1054120, 10)
Coloane: ['data', 'latitudine', 'longitudine', 'altitude', 'air_temp', 'precip_mm_day', 'wind_mps', 'pressure', 'relative_umidity', 'category']


Unnamed: 0,data,latitudine,longitudine,altitude,air_temp,precip_mm_day,wind_mps,pressure,relative_umidity,category
0,2021-01-01,-81.85,-169.649994,155.524704,-33.752155,0.002975,3.118733,978.691629,2.029997,Arctic Extreme Cold ❄️
1,2021-01-01,-81.85,-166.449997,155.524704,-34.250574,0.003527,2.681528,978.291502,2.051618,Arctic Extreme Cold ❄️
2,2021-01-01,-81.85,-165.050003,155.524704,-34.321038,0.004053,2.589519,978.066435,2.062059,Arctic Extreme Cold ❄️
3,2021-01-01,-81.85,-155.050003,155.524704,-32.517866,0.003713,3.84711,976.913246,2.120131,Arctic Extreme Cold ❄️
4,2021-01-01,-81.85,-148.75,155.524704,-31.465957,0.000945,4.786155,975.317713,1.929487,Arctic Extreme Cold ❄️


**Encoding Categories**

In [None]:
X = weather_df.drop(columns=['category'], axis=1)
# print (X)
y = weather_df['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numerical_features = X_train.select_dtypes(include='int64').columns.tolist()
categorical_features = X_train.select_dtypes(include='object').columns.tolist()

numerical_pipeline = Pipeline(steps=[
	('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
	('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
	('num', numerical_pipeline, numerical_features),
	('cat', categorical_pipeline, categorical_features)
])

rdf_pipeline = Pipeline(steps=[
	('preprocessor', preprocessor),
	('rdf_model', RandomForestClassifier(random_state=42))
])

lgb_pipeline = Pipeline(steps=[
	('preprocessor', preprocessor),
	('lgb_model', lgb.LGBMClassifier(random_state=42))
])

rdf_param_grid = {
	'rdf_model__n_estimators': [50, 100],
	'rdf_model__max_depth': [10, 15],
	'rdf_model__min_samples_split': [5, 10]
}

lgb_param_grid = {
	'lgb_model__objective': ['multiclass'],
	'lgb_model__num_class': [len(np.unique(y))],
	'lgb_model__learning_rate': [0.01, 0.1, 0.3],
	'lgb_model__max_depth': [-1],
	'lgb_model__n_estimators': [100, 500]
}

cross_validation = StratifiedShuffleSplit(n_splits=3, random_state=42)

rdf_model = GridSearchCV(estimator=rdf_pipeline, param_grid=rdf_param_grid, cv=cross_validation, n_jobs=-1, verbose=2, scoring='neg_log_loss')
rdf_model.fit(X_train, y_train)

lgb_model = GridSearchCV(estimator=lgb_pipeline, param_grid=lgb_param_grid, cv=cross_validation, n_jobs=-1, verbose=2, scoring='neg_log_loss')
lgb_model.fit(X_train, y_train)

stacking_model = StackingClassifier(
	estimators=[
     	('lgb', lgb_model.best_estimator_),
		('rdf', rdf_model.best_estimator_)
	],
	final_estimator=LogisticRegression(random_state=42, multi_class='multinomial', max_iter=1000),
	cv=3
)
stacking_model.fit(X_train, y_train)
y_pred = stacking_model.predict(X_test)
y_pred_proba = stacking_model.predict_proba(X_test)

print("LogLoss: \n", log_loss(y_test, y_pred_proba))
print("Accuracy score: \n", accuracy_score(y_test, y_pred))
print("F1 Score: \n", f1_score(y_pred, y_test, average="macro"))
print("Classification Report: \n", classification_report(y_test, y_pred))
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))



Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END rdf_model__max_depth=10, rdf_model__min_samples_split=5, rdf_model__n_estimators=50; total time=  38.8s
[CV] END rdf_model__max_depth=10, rdf_model__min_samples_split=5, rdf_model__n_estimators=50; total time=  38.8s
[CV] END rdf_model__max_depth=10, rdf_model__min_samples_split=10, rdf_model__n_estimators=50; total time=  39.4s
[CV] END rdf_model__max_depth=10, rdf_model__min_samples_split=5, rdf_model__n_estimators=50; total time=  39.9s
[CV] END rdf_model__max_depth=10, rdf_model__min_samples_split=10, rdf_model__n_estimators=50; total time=  39.4s
[CV] END rdf_model__max_depth=10, rdf_model__min_samples_split=5, rdf_model__n_estimators=50; total time=  39.9s
[CV] END rdf_model__max_depth=10, rdf_model__min_samples_split=10, rdf_model__n_estimators=50; total time=  40.0s
[CV] END rdf_model__max_depth=10, rdf_model__min_samples_split=10, rdf_model__n_estimators=50; total time=  40.0s
[CV] END rdf_model__max_depth=10



[CV] END lgb_model__learning_rate=0.3, lgb_model__max_depth=-1, lgb_model__n_estimators=100, lgb_model__num_class=22, lgb_model__objective=multiclass; total time=22.5min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.199028 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1460
[LightGBM] [Info] Number of data points in the train set: 758966, number of used features: 730
[LightGBM] [Info] Start training from score -3.254404
[LightGBM] [Info] Start training from score -3.121935
[LightGBM] [Info] Start training from score -3.687571
[LightGBM] [Info] Start training from score -4.564588
[LightGBM] [Info] Start training from score -4.463246
[LightGBM] [Info] Start training from score -3.078811
[LightGBM] [Info] Start training from score -8.042544
[LightGBM] [Info] Start training from score -5.730577
[LightGBM] [Info] Start training from score -10.595273
[LightGBM] [Info] Start training from score -8.22650



[CV] END lgb_model__learning_rate=0.3, lgb_model__max_depth=-1, lgb_model__n_estimators=100, lgb_model__num_class=22, lgb_model__objective=multiclass; total time=23.1min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.161068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1460
[LightGBM] [Info] Number of data points in the train set: 758966, number of used features: 730
[LightGBM] [Info] Start training from score -3.254404
[LightGBM] [Info] Start training from score -3.121935
[LightGBM] [Info] Start training from score -3.687571
[LightGBM] [Info] Start training from score -4.564588
[LightGBM] [Info] Start training from score -4.463246
[LightGBM] [Info] Start training from score -3.078811
[LightGBM] [Info] Start training from score -8.042544
[LightGBM] [Info] Start training from score -5.730577
[LightGBM] [Info] Start training from score -10.595273
[LightGBM] [Info] Start training from score -8.22650



[CV] END lgb_model__learning_rate=0.3, lgb_model__max_depth=-1, lgb_model__n_estimators=100, lgb_model__num_class=22, lgb_model__objective=multiclass; total time=30.0min










[CV] END lgb_model__learning_rate=0.1, lgb_model__max_depth=-1, lgb_model__n_estimators=100, lgb_model__num_class=22, lgb_model__objective=multiclass; total time=33.7min
