**Imports**

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_classif
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

**Geographic Feature Engineering**

In [5]:
# Load the weather observations from 'weather.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
weather_df = pd.read_csv('weather.csv')

def add_climate_zones(df):
    """Return a copy of ``df`` extended with climate, altitude and extreme-weather features.

    The input frame is left untouched. All features are computed with
    vectorised column operations (no row-wise ``DataFrame.apply``), which
    matters on frames with many rows because no Python function is called
    per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``latitudine``, ``altitude``, ``air_temp``,
        ``relative_umidity``, ``precip_mm_day``, ``pressure`` and ``month``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with these columns appended, in this order:
        ``climate_zone``, ``altitude_cat``, ``is_very_hot``, ``is_very_cold``,
        ``is_very_dry``, ``is_very_wet``, ``is_heavy_rain``, ``is_no_rain``,
        ``is_extreme_altitude``, ``is_tropical_lat``, ``warm_monsoon``,
        ``ultra_extreme``, ``is_warm_temp``, ``is_extreme_cold_temp``,
        ``season_simple``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    df = df.copy()

    abs_latitude = df['latitudine'].abs()
    altitude = df['altitude']
    air_temp = df['air_temp']
    humidity = df['relative_umidity']
    precipitation = df['precip_mm_day']
    month = df['month']

    # np.select returns the choice of the FIRST condition that holds, which
    # reproduces an if / elif / else chain; rows matching nothing (including
    # rows whose comparisons are False because of NaN) get the default.
    df['climate_zone'] = np.select(
        [(abs_latitude >= 60) | (altitude > 4000), abs_latitude >= 25],
        ['Arctic', 'Temperate'],
        default='Tropical',
    )

    df['altitude_cat'] = np.select(
        [altitude > 2000, altitude > 500],
        ['High', 'Medium'],
        default='Low',
    )

    # Single-threshold indicator flags (1 when the condition holds, else 0).
    df['is_very_hot'] = (air_temp > 45).astype(int)
    df['is_very_cold'] = (air_temp < -20).astype(int)
    df['is_very_dry'] = (humidity < 15).astype(int)
    df['is_very_wet'] = (humidity > 90).astype(int)
    df['is_heavy_rain'] = (precipitation > 60).astype(int)
    df['is_no_rain'] = (precipitation < 1).astype(int)
    df['is_extreme_altitude'] = (altitude > 4500).astype(int)
    df['is_tropical_lat'] = (abs_latitude < 25).astype(int)

    # Combined-condition flags. They are cast to int64 explicitly because the
    # previous row-wise implementation produced int64 on every platform.
    df['warm_monsoon'] = (
        (abs_latitude < 25)
        & (air_temp > 27)
        & (precipitation > 80)
        & (humidity > 90)
    ).astype('int64')

    df['ultra_extreme'] = (
        (altitude > 5000)
        & (air_temp < -10)
        & (df['pressure'] < 600)
    ).astype('int64')

    df['is_warm_temp'] = ((air_temp >= 25) & (air_temp < 35)).astype(int)
    df['is_extreme_cold_temp'] = (air_temp < -35).astype(int)

    # NOTE(review): months 6-8 are labelled 'Summer' regardless of the sign of
    # the latitude -- confirm that ignoring the hemisphere is intended.
    df['season_simple'] = np.select(
        [month.isin([6, 7, 8]), month.isin([12, 1, 2])],
        ['Summer', 'Winter'],
        default='Spring/Fall',
    )

    return df

# Swap the loaded frame for its feature-augmented copy, then print the
# column / dtype / non-null summary to check the new columns are present.
weather_df = weather_df.pipe(add_climate_zones)
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048349 entries, 0 to 1048348
Data columns (total 27 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   data                  1048349 non-null  object 
 1   latitudine            1048349 non-null  float64
 2   longitudine           1048349 non-null  float64
 3   altitude              1048349 non-null  float64
 4   air_temp              1048349 non-null  float64
 5   precip_mm_day         1048349 non-null  float64
 6   wind_mps              1048349 non-null  float64
 7   pressure              1048349 non-null  float64
 8   relative_umidity      1048349 non-null  float64
 9   category              1048349 non-null  object 
 10  month                 1048349 non-null  int64  
 11  season                1048349 non-null  int64  
 12  climate_zone          1048349 non-null  object 
 13  altitude_cat          1048349 non-null  object 
 14  is_very_hot           1048349 non-

**Model Creation**

In [6]:
# Split / selection settings gathered in one place so they are easy to tune.
RANDOM_STATE = 42
TEST_SIZE = 0.2
K_BEST_FEATURES = 15

# Features are every column except the raw date ('data') and the target.
X = weather_df.drop(columns=['data', 'category'], errors='ignore')
y = weather_df['category']

# Stratify on the target so each class keeps its share in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# Select numeric columns by kind rather than by exact width: the engineered
# flags are created with `.astype(int)`, which is a 32-bit integer on some
# platforms, and ColumnTransformer drops every column that is not listed.
numerical_features = X_train.select_dtypes(include='number').columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ]
)

rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    min_samples_split=100,
    min_samples_leaf=50,
    max_features='sqrt',
    class_weight='balanced',
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # Drop zero-variance columns before the ANOVA F-test.
    # NOTE(review): the saved output of this cell shows the fragment
    # `f = msb / msw`, i.e. a divide warning raised inside f_classif; constant
    # features are the usual cause -- confirm the warning is gone after this step.
    ('variance_filter', VarianceThreshold()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=K_BEST_FEATURES)),
    ('classifier', rf_model),
])
pipeline.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = pipeline.predict(X_test)
print("Accuracy score: \n", accuracy_score(y_test, y_pred))
print("Classification report: \n", classification_report(y_test, y_pred))

  f = msb / msw


Accuracy score: 
 0.9787046310869462
Classification report: 
                                precision    recall  f1-score   support

       Arctic Extreme Cold ❄️       1.00      1.00      1.00     21702
                Arctic Mild 🐧       0.95      1.00      0.97     10432
              Arctic Severe 🧊       1.00      1.00      1.00     10619
  Desert Hot & Dry (Summer) 🏜       1.00      1.00      1.00      2053
Equatorial Dry-Season Sunny ☀       0.49      0.89      0.63      3748
            Temperate Cold ❄️       1.00      1.00      1.00     15129
             Temperate Cool 🌲       1.00      0.99      0.99     54407
             Temperate Warm 🍃       1.00      1.00      1.00     14711
              Tropical Cool 🌊       1.00      1.00      1.00     23824
              Tropical Mild 🌿       1.00      1.00      1.00     24320
              Tropical Warm 🌺       0.98      0.88      0.93     28725

                     accuracy                           0.98    209670
             