In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import joblib

In [14]:
# Load the raw weather export into `df`. Judging by the file name it covers
# Hamburg, Berlin, Munich, Cologne and Frankfurt for 2020-2025.
df = pd.read_csv(
    filepath_or_buffer="German_Major_Cities_Hamburg_Berlin_Munich_Colonge_Frankfurt_weather_report_2020_to_2025.csv",
)

# Optional sanity check (disabled): summary statistics of the candidate columns.
# print(df[ [
#     'tempmax', 'tempmin', 'feelslikemax', 'feelslikemin', 'dew',
#     'precipprob', 'precipcover', 'snow', 'snowdepth', 'windgust',
#     'winddir', 'cloudcover', 'visibility', 'solarradiation',
#     'solarenergy', 'uvindex'
# ]].describe())

In [None]:
# Stage 1: fit a multi-output random forest that predicts the weather
# variables in `target_columns` from the city and calendar features only.
# Requires `df` (the raw weather frame) to have been loaded by an earlier cell.

# The rest of this cell refers to the location column as 'city'.
df = df.rename(columns={'name': 'city'})

# Keep necessary columns
target_columns = [
    'tempmax', 'tempmin', 'feelslikemax', 'feelslikemin', 'dew',
    'precipprob', 'precipcover', 'snow', 'snowdepth', 'windgust',
    'winddir', 'cloudcover', 'solarradiation',
    'solarenergy', 'uvindex'
]

# Drop incomplete rows, then reset to a clean 0..n-1 index. Without the reset,
# dropna() leaves gaps in the index and the axis=1 concat further down (which
# aligns on index labels) would pair one-hot rows with the wrong dates and
# introduce NaN rows into X.
df = df[['city', 'datetime'] + target_columns].dropna().reset_index(drop=True)

# Convert datetime and extract cyclical features
df['datetime'] = pd.to_datetime(df['datetime'])
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['dayofyear'] = df['datetime'].dt.dayofyear
# NOTE(review): the period is fixed at 365, so day 366 of a leap year maps to
# the same point as day 1. Kept as-is because any inference code has to use
# the same encoding as the saved model.
df['sin_dayofyear'] = np.sin(2 * np.pi * df['dayofyear'] / 365)
df['cos_dayofyear'] = np.cos(2 * np.pi * df['dayofyear'] / 365)

# One-hot encode city. handle_unknown='ignore' makes an unseen city encode as
# all zeros at prediction time instead of raising.
city_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
city_encoded = city_encoder.fit_transform(df[['city']])
city_encoded_df = pd.DataFrame(
    city_encoded,
    columns=city_encoder.get_feature_names_out(['city']),
    index=df.index,  # share df's index so the concat below is row-for-row
)

# Final feature set (only city and date-based features)
X = pd.concat([city_encoded_df, df[['year', 'month', 'sin_dayofyear', 'cos_dayofyear']]], axis=1)
y = df[target_columns]

# Guard against silent feature/target misalignment before training.
assert len(X) == len(y), "X and y have different numbers of rows"
assert not X.isna().any().any(), "X contains NaN values (index misalignment?)"

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train MultiOutputRegressor with RandomForest (one forest per target column)
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model.fit(X_train, y_train)

# Evaluate. With multi-output targets both metrics use scikit-learn's default
# aggregation (uniform average over the target columns), so the MSE mixes
# targets measured on different scales.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Stage 1 Model MSE: {mse:.4f}")
print(f"Stage 1 Model R²: {r2:.4f}")

os.makedirs("models", exist_ok=True)

# Save model and encoder; the encoder must be reused at inference time so the
# one-hot columns match the ones the model was trained on.
joblib.dump(model, "models/stage1_model.pkl")
joblib.dump(city_encoder, "models/stage1_city_encoder.pkl") # city_encoder_stage1

NameError: name 'df' is not defined