In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
import numpy as np
import joblib 


# --- Configuration ---
# Source CSV that is read with pandas below.
file_path = r'D:\Weather api\cairo_merged_with_season.csv'

# Destination file for the fitted pipeline (dumped with joblib once training is done).
model_save_path = 'random_forest_weather_predictor_reduced_features.joblib'

# Columns the model predicts; all four are fitted together as one multi-column target.
target_columns = [
    'tempmin',
    'tempmax',
    'temp',
    'feelslike',
]

# Columns the model receives as input.
final_feature_columns = [
    'windspeed',
    'winddir',
    'sealevelpressure',
    'season',
    'cloudcover',
]

# --- Load Data ---
df = pd.read_csv(file_path)

# --- Select the input frame (X) and the target frame (y) ---
X = df[final_feature_columns]
y = df[target_columns]

# --- Identify Column Types for Preprocessing ---
# Features routed to the categorical (impute + one-hot) branch of the preprocessor.
# NOTE(review): 'winddir' is handled as a category here; if the CSV stores it as a
# numeric bearing in degrees, one-hot encoding produces one column per distinct
# value -- confirm this is intended.
categorical_features = ['winddir', 'season']
# Every other feature column goes to the numerical branch. Iterating X.columns
# keeps the columns in their original order.
numerical_features = [col for col in X.columns if col not in categorical_features]

# --- Numerical branch ---
# 'imputer': fills missing values with the median of each column.
# 'scaler' : standardizes each column (removes the mean, scales to unit variance).
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# --- Categorical branch ---
# 'imputer': fills missing values with the most frequent value of each column.
# 'onehot' : expands each category into its own binary column; categories not
#            seen during fitting are ignored instead of raising, and the result
#            is a dense (non-sparse) array.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# --- Combine both branches ---
# Each branch is applied only to its own list of columns. Columns in neither
# list would be passed through unchanged; none are expected, because the
# numerical list is built as "every feature that is not categorical".
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)


# --- Create the Full Model Pipeline ---
# Chains the preprocessor with the estimator so that a single fit/predict call
# runs both stages.
# RandomForestRegressor settings:
#   n_estimators=100 -> the forest is built from 100 trees
#   random_state=42  -> fixed seed, so repeated runs give the same model
#   n_jobs=-1        -> use all available CPU cores
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    (
        'regressor',
        RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42),
    ),
])

# --- Split: hold out 20% of the rows for testing (seeded for repeatability) ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# --- Train the Model ---
model_pipeline.fit(X_train, y_train)

# --- Predict on the held-out rows ---
y_pred = model_pipeline.predict(X_test)

# --- Evaluate the Model ---
print("\nEvaluating Model Performance (Mean Absolute Error):")
# Report a per-target MAE only when there is one prediction column per target.
if y_pred.shape[1] == len(target_columns):
    # Transposing lets us walk the prediction columns in target order.
    for target, predicted in zip(target_columns, y_pred.T):
        mae = mean_absolute_error(y_test[target], predicted)
        print(f"  MAE for {target}: {mae:.4f}")

# --- Save the Trained Model ---
print(f"\nSaving the trained model to {model_save_path}...")
joblib.dump(model_pipeline, model_save_path)



--- Feature Setup ---
Using features: ['windspeed', 'winddir', 'sealevelpressure', 'season', 'cloudcover']
Categorical Features: ['winddir', 'season']
Numerical Features: ['windspeed', 'sealevelpressure', 'cloudcover']
Target columns: ['tempmin', 'tempmax', 'temp', 'feelslike']

Data split: 1600 training samples, 400 testing samples

Training the RandomForestRegressor model with reduced features...
Model training complete.

Making predictions on the test set...

Evaluating Model Performance (Mean Absolute Error):
  MAE for tempmin: 1.7692
  MAE for tempmax: 2.1916
  MAE for temp: 1.7784
  MAE for feelslike: 1.8693

Saving the trained model to random_forest_weather_predictor_reduced_features.joblib...


['random_forest_weather_predictor_reduced_features.joblib']