# Rainfall Prediction Model Development

This notebook develops and evaluates machine learning models for rainfall prediction.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import os
import sys

# Add parent directory to path
# NOTE(review): '..' is a relative sys.path entry, so it is resolved against
# the kernel's current working directory rather than this notebook's location.
# Presumably config.py sits one level above the notebook — verify against the
# repository layout.
sys.path.append('..')
from config import Config

# Plot styling applied globally to the figures drawn in later cells.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## Load and Prepare Data

In [None]:
# Load data
# Read every CSV found in Config.RAW_DATA_DIR into its own frame.
#
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary order; the row order of the concatenated frame decides which rows
# the seeded train/test split selects, so an unsorted listing would make the
# results differ between machines. A missing directory is treated the same as
# an empty one so the friendly message below is shown instead of a traceback.
data_files = []
if os.path.isdir(Config.RAW_DATA_DIR):
    for filename in sorted(os.listdir(Config.RAW_DATA_DIR)):
        if filename.endswith('.csv'):
            filepath = os.path.join(Config.RAW_DATA_DIR, filename)
            data_files.append(pd.read_csv(filepath))

if data_files:
    # Stack the per-file frames into one dataset with a fresh 0..n-1 index.
    df = pd.concat(data_files, ignore_index=True)
    print(f"Dataset shape: {df.shape}")

    # Handle missing values: drop any row that has a NaN in any column.
    df = df.dropna()
    print(f"After removing missing values: {df.shape}")

    # Encode categorical variables as integer codes. The fitted encoders are
    # kept in le_location / le_season because a later cell persists them.
    le_location = LabelEncoder()
    le_season = LabelEncoder()

    df['location_encoded'] = le_location.fit_transform(df['location'])
    df['season_encoded'] = le_season.fit_transform(df['season'])

    print("Data prepared successfully!")
else:
    # `df` is deliberately left undefined here: the later cells check for it
    # and skip themselves when no data was loaded.
    print("No data files found. Please run fetch_data.py first.")

## Feature Selection

In [None]:
if 'df' in locals():
    # Predictor columns: the five weather '*_mean' readings, the two calendar
    # fields and the two label-encoded categorical columns.
    feature_columns = [
        'temperature_2m_mean',
        'relative_humidity_2m_mean',
        'surface_pressure_mean',
        'wind_speed_10m_mean',
        'cloud_cover_mean',
        'month',
        'day',
        'location_encoded',
        'season_encoded',
    ]

    # Feature matrix and target vector; the target column name comes from Config.
    X, y = df[feature_columns], df[Config.TARGET_VARIABLE]

    print(f"Features: {feature_columns}")
    print(f"Target: {Config.TARGET_VARIABLE}")
    print(f"X shape: {X.shape}, y shape: {y.shape}")

## Train/Test Split and Model Training

In [None]:
if 'X' in locals():
    # Hold out a test partition; its size and the shuffle seed come from Config.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        random_state=Config.RANDOM_STATE,
        test_size=Config.TEST_SIZE,
    )

    # Standardize the features. The scaler is fitted on the training rows only
    # and then applied unchanged to both partitions.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print(f"Training set: {X_train_scaled.shape}")
    print(f"Test set: {X_test_scaled.shape}")

In [None]:
if 'X_train_scaled' in locals():
    # Random Forest: 100 trees capped at depth 10, seeded from Config and
    # using all available cores (n_jobs=-1).
    rf_model = RandomForestRegressor(
        n_estimators=100, max_depth=10, random_state=Config.RANDOM_STATE, n_jobs=-1
    )
    rf_pred = rf_model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    # Linear Regression trained and scored on the same scaled partitions.
    lr_model = LinearRegression().fit(X_train_scaled, y_train)
    lr_pred = lr_model.predict(X_test_scaled)

    print("Models trained successfully!")

## Model Evaluation

In [None]:
if 'rf_pred' in locals():
    # Score both models on the held-out test targets, one metric at a time.
    # Each pair is ordered (Random Forest, Linear Regression).
    rf_mse, lr_mse = (
        mean_squared_error(y_test, rf_pred),
        mean_squared_error(y_test, lr_pred),
    )
    rf_rmse, lr_rmse = np.sqrt(rf_mse), np.sqrt(lr_mse)
    rf_mae, lr_mae = (
        mean_absolute_error(y_test, rf_pred),
        mean_absolute_error(y_test, lr_pred),
    )
    rf_r2, lr_r2 = r2_score(y_test, rf_pred), r2_score(y_test, lr_pred)

    # One row per model, one column per metric.
    results = pd.DataFrame(
        [
            ('Random Forest', rf_rmse, rf_mae, rf_r2),
            ('Linear Regression', lr_rmse, lr_mae, lr_r2),
        ],
        columns=['Model', 'RMSE', 'MAE', 'R² Score'],
    )

    print("Model Performance Comparison:")
    display(results)

## Feature Importance

In [None]:
if 'rf_model' in locals():
    # Feature importance
    # Display labels are looked up per training column instead of being kept
    # as a separate positional list: feature_importances_ follows the column
    # order of the training matrix (feature_columns), so a hand-maintained
    # parallel list would silently mislabel bars if that list were reordered
    # or extended. Columns without a label fall back to their raw name.
    display_names = {
        'temperature_2m_mean': 'Temperature',
        'relative_humidity_2m_mean': 'Humidity',
        'surface_pressure_mean': 'Pressure',
        'wind_speed_10m_mean': 'Wind Speed',
        'cloud_cover_mean': 'Cloud Cover',
        'month': 'Month',
        'day': 'Day',
        'location_encoded': 'Location',
        'season_encoded': 'Season',
    }
    feature_names = [display_names.get(column, column) for column in feature_columns]

    # Pair each label with its importance score, most important first.
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False)

    # Horizontal bar chart of the sorted importances.
    plt.figure(figsize=(10, 6))
    sns.barplot(data=importance_df, x='importance', y='feature')
    plt.title('Feature Importance (Random Forest)')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()

    print("Feature Importance:")
    display(importance_df)

## Prediction Visualization

In [None]:
if 'rf_pred' in locals():
    # Actual vs Predicted: one scatter panel per model, side by side.
    plt.figure(figsize=(12, 5))

    # End points of the dashed red y = x reference line drawn in each panel.
    diagonal = [y_test.min(), y_test.max()]

    panels = [
        ('Random Forest: Actual vs Predicted', rf_pred),
        ('Linear Regression: Actual vs Predicted', lr_pred),
    ]
    for position, (panel_title, predictions) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.scatter(y_test, predictions, alpha=0.5)
        plt.plot(diagonal, diagonal, 'r--', lw=2)
        plt.xlabel('Actual Rainfall (mm)')
        plt.ylabel('Predicted Rainfall (mm)')
        plt.title(panel_title)

    plt.tight_layout()
    plt.show()

## Save Best Model

In [None]:
if 'rf_model' in locals():
    # lr_r2 is the last metric the evaluation cell computes, so its presence
    # means every metric used below exists. Without this check the cell would
    # fail with a NameError when the evaluation cell was skipped.
    if 'rf_r2' in locals() and 'lr_r2' in locals():
        # Choose the best model by the lowest test RMSE actually measured,
        # rather than assuming Random Forest wins. min() keeps the first entry
        # on a tie, so Random Forest remains the default.
        candidates = [
            ('Random Forest', rf_model, rf_rmse, rf_r2),
            ('Linear Regression', lr_model, lr_rmse, lr_r2),
        ]
        best_name, best_model, best_rmse, best_r2 = min(
            candidates, key=lambda candidate: candidate[2]
        )

        # Save model and preprocessing components. The file names are fixed so
        # that whatever loads them does not depend on which model was chosen.
        os.makedirs(Config.MODEL_DIR, exist_ok=True)
        joblib.dump(best_model, os.path.join(Config.MODEL_DIR, 'rainfall_model.pkl'))
        joblib.dump(scaler, os.path.join(Config.MODEL_DIR, 'scaler.pkl'))
        joblib.dump(le_location, os.path.join(Config.MODEL_DIR, 'location_encoder.pkl'))
        joblib.dump(le_season, os.path.join(Config.MODEL_DIR, 'season_encoder.pkl'))

        print(f"Model and preprocessing components saved to {Config.MODEL_DIR}")
        print(f"Best model: {best_name}")
        print(f"Best model RMSE: {best_rmse:.2f}")
        print(f"Best model R² Score: {best_r2:.3f}")
    else:
        print("Evaluation metrics not found. Please run the Model Evaluation cell first.")