In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import joblib
import json
from pathlib import Path

# --- Step 1: Load Data from Local File ---
# Produces `df`, `features`, `target_columns`, `date_columns` and the
# train/test partitions used by every later step.
data_path = Path("cleaned_aqi_weather_dataset.csv")

# Fail fast with an actionable message when the dataset is absent.
if not data_path.exists():
    raise FileNotFoundError(f"Data file not found: {data_path}. Please ensure cleaned_aqi_weather_dataset.csv exists in the project root.")

# Column roles: the two regression targets and the calendar columns.
target_columns = ['aqi_index', 'Calculated_AQI']
date_columns = ['year', 'month', 'day', 'hour']

df = pd.read_csv(data_path)
print(f" Loaded data from {data_path}")

# Assemble a real datetime column from the calendar parts, then drop every
# row that lacks one of the targets.
df['date'] = pd.to_datetime(df[['year', 'month', 'day']])
df = df.dropna(subset=target_columns)

# Everything that is neither a target nor the derived `date` is a feature
# (the raw year/month/day/hour columns stay in as features).
non_feature_columns = set(target_columns) | {'date'}
features = [col for col in df.columns if col not in non_feature_columns]

X, y = df[features], df[target_columns]

# Fixed 80/20 split; random_state pins the partition between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Step 2: Load Model and Scaler from Local Files ---
# Outputs consumed by later steps: `scaler`, `model` and `Y_predicted`
# (predictions for X_test).
#
# Each stage (scaler, metadata check, model) handles its own failures so that
# a problem in one stage cannot silently trigger the wrong fallback:
#   * an unreadable / unparsable metadata file only produces a warning;
#   * an unusable stored scaler is refitted, and the model is then retrained
#     because the stored model was paired with the discarded scaler;
#   * an unloadable model is retrained on the already-scaled training data.
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts that
# were produced by a trusted run of this script.

# Initialize variables
Y_predicted = None
scaler_created = False

model_path = Path("best_model.pkl")
scaler_path = Path("scaler.pkl")
metadata_path = Path("best_model_metadata.json")


def _versions_differ(current, saved):
    """Return True when `current` and `saved` do not denote the same version.

    Uses `packaging.version` when it is installed and both strings parse;
    otherwise (package missing, or a placeholder such as "unknown") falls
    back to a plain string comparison instead of raising.
    """
    try:
        from packaging import version
        return version.parse(current) != version.parse(saved)
    except (ImportError, TypeError, ValueError):
        # packaging's InvalidVersion derives from ValueError.
        return str(current) != str(saved)


# --- Scaler: reuse the stored one when possible, otherwise fit a new one ---
scaler = None
stored_scaler_unusable = False
if scaler_path.exists():
    try:
        scaler = joblib.load(scaler_path)
        print(f" Loaded scaler from {scaler_path}")
        # Use transform (not fit_transform) since scaler is already fitted
        X_train_scaled = scaler.transform(X_train)
        test_scaled = scaler.transform(X_test)
    except Exception as e:
        # Best-effort: unpickling or transforming can fail in many ways
        # (corrupt file, changed feature set, library mismatch).
        print(f" Could not use scaler from {scaler_path}: {e}. Creating new scaler.")
        scaler = None
        stored_scaler_unusable = True
else:
    print(f" Scaler file not found: {scaler_path}. Creating new scaler.")

if scaler is None:
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)
    # Save the scaler for future use
    joblib.dump(scaler, scaler_path)
    scaler_created = True
    print(f" Created and saved scaler to {scaler_path}")

# --- Model: load the pre-trained one unless it cannot be trusted/used ---
model = None
if stored_scaler_unusable:
    print(" Could not load model: stored scaler was unusable, so the stored model no longer matches the inputs. Training new model.")
elif not model_path.exists():
    print(f" Could not load model: Model file not found: {model_path}. Training new model.")
else:
    # --- Load metadata and check versions (informational only) ---
    if metadata_path.exists():
        try:
            with open(metadata_path, 'r') as f:
                metadata = json.load(f)
            saved_sklearn = metadata.get("sklearn_version", "unknown")
            saved_numpy = metadata.get("numpy_version", "unknown")
        except (OSError, ValueError, AttributeError) as e:
            # json.JSONDecodeError derives from ValueError; AttributeError
            # covers a JSON document that is not an object.
            print(f" Warning: could not read metadata from {metadata_path}: {e}")
        else:
            import sklearn

            print(f" Model saved with scikit-learn {saved_sklearn}, numpy {saved_numpy}")
            print(f" Current scikit-learn: {sklearn.__version__}, numpy: {np.__version__}")

            if _versions_differ(sklearn.__version__, saved_sklearn):
                print(" Warning: scikit-learn version mismatch.")
            if _versions_differ(np.__version__, saved_numpy):
                print(" Warning: numpy version mismatch.")
    else:
        print(f" Metadata file not found: {metadata_path}")

    # --- Load model ---
    try:
        model = joblib.load(model_path)
        print(f" Successfully loaded pre-trained model from {model_path}")
        y_pred = model.predict(test_scaled)
    except Exception as e:
        # Deliberate catch-all: any load/predict failure falls back to
        # training a fresh model below.
        print(f" Could not load model: {str(e)}. Training new model.")
        model = None

if model is None:
    # Train new model
    model = RandomForestRegressor(n_estimators=300, random_state=42)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(test_scaled)
    # Save the newly trained model
    # NOTE(review): best_model_metadata.json is not rewritten here, so its
    # recorded versions may describe a previous model.
    joblib.dump(model, model_path)
    print(f" Trained and saved new model to {model_path}")

Y_predicted = y_pred


# --- Step 3: Evaluate Model ---
# Reports RMSE, MAE and R² on the held-out test split, one target at a time.
# Column i of `Y_predicted` is compared with column i of `y_test`, following
# the order of `target_columns`.

print("\nModel Performance:")
for i, col in enumerate(target_columns):
    actual = y_test.iloc[:, i]
    predicted = Y_predicted[:, i]

    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)

    print(f"  Target: {col}")
    print(f"    RMSE: {rmse:.4f}")
    print(f"    MAE: {mae:.4f}")
    # Label was mis-encoded ("RÂ²") in the original output.
    print(f"    R²: {r2:.4f}")

# --- Step 4: Predict AQI for Next 3 Days ---
# Builds one synthetic feature row per forecast day (noon, starting tomorrow):
# calendar columns come from the forecast date, every other feature starts at
# a historical average, and selected weather columns are perturbed day over
# day. The rows are scaled with `scaler` and fed to `model`.
start_date = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) + timedelta(days=1)
future_dates = [start_date + timedelta(days=i) for i in range(3)]

numeric_features = [col for col in features if col not in date_columns]

# Pick the rows used as the baseline, from most to least specific:
# April 2025 -> any April -> the whole dataset.
# NOTE(review): the reference period is hard-coded and will not follow newer
# rows added to the dataset — confirm whether it should track the latest
# month present in `df` instead.
recent_data = df[(df['year'] == 2025) & (df['month'] == 4)]
if not recent_data.empty:
    baseline_data = recent_data
else:
    similar_season_data = df[df['month'] == 4]
    baseline_data = similar_season_data if not similar_season_data.empty else df
recent_averages = baseline_data[numeric_features].mean().to_dict()

# Prepare future features in a single construction (scalars broadcast to
# every row) rather than inserting one column at a time.
future_features = pd.DataFrame({
    "year": [d.year for d in future_dates],
    "month": [d.month for d in future_dates],
    "day": [d.day for d in future_dates],
    "hour": [12] * len(future_dates),  # Predict for noon
    **recent_averages,
})

# Add small variations to weather-related features: each day is the previous
# day's value changed by up to ±5%. A locally seeded generator keeps the
# forecast (and the files saved from it) reproducible between runs, in line
# with the random_state=42 used elsewhere in this script, without touching
# NumPy's global random state.
rng = np.random.default_rng(42)
weather_features = [
    feature
    for feature in (
        'temperature_2m', 'relative_humidity_2m', 'dew_point_2m',
        'wind_speed_10m', 'wind_direction_10m', 'surface_pressure',
    )
    if feature in future_features.columns
]
for i in range(1, len(future_features)):
    for feature in weather_features:
        variation = rng.uniform(-0.05, 0.05)
        future_features.loc[i, feature] = future_features.loc[i - 1, feature] * (1 + variation)

# Fill missing columns with the training mean so the frame matches what the
# scaler and model were given.
required_columns = X_train.columns.tolist()
for col in required_columns:
    if col not in future_features.columns:
        future_features[col] = X_train[col].mean()

# Order columns correctly
future_features = future_features[required_columns]

# Scale and predict
future_scaled = scaler.transform(future_features)
predictions = model.predict(future_scaled)

# Column 0 / 1 of the predictions follow the order of `target_columns`.
prediction_results = pd.DataFrame({
    "Date": [d.strftime("%Y-%m-%d") for d in future_dates],
    "Predicted_AQI": np.round(predictions[:, 0], 2),
    "Predicted_Calculated_AQI": np.round(predictions[:, 1], 2)
})

print("\nPredicted values for the next 3 days:")
print(prediction_results)

# --- Step 5: Feature Importance ---
# Prefer importances exposed by the model itself; otherwise fall back to the
# first sub-estimator of a model that exposes `estimators_`. Nothing is
# printed when neither source is available.
importance_values = getattr(model, 'feature_importances_', None)
importance_heading = "\nTop 10 most important features:"

if importance_values is None and hasattr(model, 'estimators_'):
    estimator = model.estimators_[0]
    importance_values = getattr(estimator, 'feature_importances_', None)
    importance_heading = "\nTop 10 most important features (from first target model):"

if importance_values is not None:
    importance_table = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': importance_values,
    })
    feature_importance = importance_table.sort_values('Importance', ascending=False)

    print(importance_heading)
    print(feature_importance.head(10))

# --- Step 6: Save Predictions Locally ---
# The model and scaler were already persisted in Step 2 when they had to be
# (re)created, so only the forecast table is written here.

# Write the forecast next to the script, without the row index.
prediction_csv_path = Path("prediction_results.csv")
prediction_results.to_csv(path_or_buf=prediction_csv_path, index=False)
print(f"\n Prediction results saved to {prediction_csv_path}")

In [None]:
# Persist `prediction_results` as a pickle as well, alongside the CSV export.
prediction_pickle_name = "prediction_results.pkl"
prediction_results.to_pickle(prediction_pickle_name)
print(" Prediction results also saved to prediction_results.pkl")
