In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("weather_data.csv")  # Replace with your actual dataset path

# Remove 'date' and 'cloud_cover' columns if they exist
df = df.drop(['date', 'cloud_cover'], axis=1, errors='ignore')

# Encode categorical column ('rain_or_not') if not already encoded
df['rain_or_not'] = df['rain_or_not'].map({'Rain': 1, 'No Rain': 0})  # Convert to binary (1 or 0)

# Handle missing values: Remove rows with NaN in target columns
df = df.dropna(subset=['avg_temperature', 'humidity', 'avg_wind_speed', 'pressure', 'rain_or_not'])

# Define parameters for regressor and classifier
best_regressor_params = {
    'n_estimators': 50,
    'max_depth': None,
    'min_samples_split': 2
}

best_classifier_params = {
    'n_estimators': 50,
    'max_depth': None,
    'min_samples_split': 5
}

# Set N to the number of rows in the dataset
N = 21  # Set N as 21 for predicting the next 21 days

# 🔹 Predicting numerical values using RandomForestRegressor
for target_col in ['avg_temperature', 'humidity', 'avg_wind_speed', 'pressure']:  # Use actual columns
    # Create feature set (X) and target set (y)
    X, y = [], []
    
    # Correct the loop range to properly iterate over the dataset
    for i in range(N, len(df)):  # Ensure starting index allows enough data for N previous days
        X.append(df[target_col].iloc[i-N:i].values)  # Use previous N days of the target column
        y.append(df[target_col].iloc[i])  # Target value for prediction

    X = np.array(X)
    y = np.array(y)

    # Check if X or y are empty
    if X.shape[0] == 0:
        continue

    # Reshape X if necessary for sklearn
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    # Train the model using the regressor parameters
    model = RandomForestRegressor(
        n_estimators=best_regressor_params['n_estimators'],
        max_depth=best_regressor_params['max_depth'],
        min_samples_split=best_regressor_params['min_samples_split'],
        random_state=42
    )
    model.fit(X, y)  # Fit the model to the full data

    # Store predictions for the next 21 days
    future_predictions = model.predict(X[-21:])  # Predict for the next 21 days

    # Output future predictions for the next 21 days
  #  print(f"\n📌 Future Predictions for {target_col} for the next 21 days:")
  #  for i in range(21):  # Print the next 21 days predictions
  #      print(f"  Day {len(df) + i + 1}: Predicted = {future_predictions[i]:.2f}")


# 🔹 Predicting 'rain_or_not' using RandomForestClassifier (for classification)
print("\n🔹 Training model to predict 'rain_or_not'...")

X, y = [], []
for i in range(N, len(df)):  # Ensure proper range for feature set
    X.append(df.drop(columns=['rain_or_not']).iloc[i-N:i].values.flatten())  # Use all other features
    y.append(df['rain_or_not'].iloc[i])  # Target label (rain or not)

X = np.array(X)
y = np.array(y)

# Check if X or y are empty
if X.shape[0] == 0:
    print("Error: X array is empty for rain_or_not")
else:
    # Reshape X to be 2D for the classifier
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    # Train the model using the corrected classifier parameters
    classifier = RandomForestClassifier(
        n_estimators=best_classifier_params['n_estimators'],
        max_depth=best_classifier_params['max_depth'],
        min_samples_split=best_classifier_params['min_samples_split'],
        random_state=42
    )
    classifier.fit(X, y)  # Fit the classifier to the full data

    # Store predictions for the next 21 days
    rain_predictions = classifier.predict(X[-21:])  # Predict for the next 21 days
    rain_probabilities = classifier.predict_proba(X[-21:])[:, 1]  # Get probability for class "Rain"

    # Output predictions for the next 21 days (only for rain-related predictions)
    print("\n📌 Future Predictions for 'rain_or_not' for the next 21 days:")
    for i in range(21):  # Print the next 21 days predictions
        print(f"  Day {len(df) + i + 1}: Predicted Probability = {rain_probabilities[i]:.2f} | Predicted Class = {'Rain' if rain_predictions[i] == 1 else 'No Rain'}")



🔹 Training model to predict 'rain_or_not'...

📌 Future Predictions for 'rain_or_not' for the next 21 days:
  Day 297: Predicted Probability = 0.87 | Predicted Class = Rain
  Day 298: Predicted Probability = 0.93 | Predicted Class = Rain
  Day 299: Predicted Probability = 0.83 | Predicted Class = Rain
  Day 300: Predicted Probability = 0.91 | Predicted Class = Rain
  Day 301: Predicted Probability = 0.29 | Predicted Class = No Rain
  Day 302: Predicted Probability = 0.73 | Predicted Class = Rain
  Day 303: Predicted Probability = 0.83 | Predicted Class = Rain
  Day 304: Predicted Probability = 0.87 | Predicted Class = Rain
  Day 305: Predicted Probability = 0.22 | Predicted Class = No Rain
  Day 306: Predicted Probability = 0.83 | Predicted Class = Rain
  Day 307: Predicted Probability = 0.79 | Predicted Class = Rain
  Day 308: Predicted Probability = 0.26 | Predicted Class = No Rain
  Day 309: Predicted Probability = 0.84 | Predicted Class = Rain
  Day 310: Predicted Probability = 0.2