In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score

# Load dataset
df = pd.read_csv("weather_data.csv")  # Replace with your actual dataset path

# Remove 'date' and 'cloud_cover' columns if they exist
df = df.drop(['date', 'cloud_cover'], axis=1, errors='ignore')

# Check for missing values
print("Missing values before cleaning:\n", df.isnull().sum())

# Drop rows with missing values
df = df.dropna()

# Verify that no missing values remain
print("\nMissing values after cleaning:\n", df.isnull().sum())

# Encode categorical column ('rain_or_not')
categorical_columns = ['rain_or_not']
le = LabelEncoder()
df['rain_or_not'] = le.fit_transform(df['rain_or_not'])  # Convert to binary (0 or 1)

# Number of past days to use as features
N = 270  

# Dictionary to store predictions and feature importance
predictions = {}
feature_importance = {}

# Separate 'rain_or_not' (classification) from numeric columns (regression)
numerical_columns = df.drop(columns=['rain_or_not']).columns.tolist()

# 🔹 Predicting numerical values using RandomForestRegressor
for target_col in numerical_columns:
    print(f"\n🔹 Training model to predict: {target_col}...")

    # Create feature set (X) and target set (y)
    X, y = [], []
    
    for i in range(N, len(df)):
        X.append(df[target_col].iloc[i-N:i].values)  # Use previous N days of the target column
        y.append(df[target_col].iloc[i])  # Target value for prediction

    X = np.array(X)
    y = np.array(y)

    # Split into train, validation, and test sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, shuffle=False)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False)

    # 🔹 Hyperparameter tuning using Grid Search
    param_grid = {
        'n_estimators': [1000],
        'max_depth': [2],
        'min_samples_split': [2]
    }

    grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_  # Select best model from grid search

    # Store predictions
    target_predictions = best_model.predict(X_test)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test, target_predictions))
    print(f"\n📌 RMSE for {target_col}: {rmse:.2f}")

    # Store feature importance
    feature_importance[target_col] = best_model.feature_importances_

    # Print some predicted vs. actual values for reference
    print(f"\n📌 Predictions for {target_col}:")
    for i in range(0, len(y_test), 10):  # Print every 10th prediction
        print(f"  Day {N + i}: Predicted = {target_predictions[i]:.2f} | Actual = {y_test[i]:.2f}")

# 🔹 Predicting 'rain_or_not' using RandomForestClassifier
print("\n🔹 Training model to predict 'rain_or_not'...")

X, y = [], []
for i in range(N, len(df)):
    X.append(df.drop(columns=['rain_or_not']).iloc[i-N:i].values.flatten())  # Use all other features
    y.append(df['rain_or_not'].iloc[i])  # Target label (rain or not)

X = np.array(X)
y = np.array(y)

# Split into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, shuffle=False)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False)

# 🔹 Hyperparameter tuning for classification
param_grid = {
    'n_estimators': [50, 10],
    'max_depth': [None],
    'min_samples_split': [5, 2]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)
best_classifier = grid_search.best_estimator_  # Select best model from grid search

# Store predicted probabilities (for Rain)
rain_probabilities = best_classifier.predict_proba(X_test)

# Extract probabilities of Rain (class 1) from the 2nd column of the output
rain_probabilities = rain_probabilities[:, 1]  # Probabilities of class 1 (Rain)

# Get the predicted classes by thresholding the probabilities (if needed)
predicted_classes = (rain_probabilities >= 0.5).astype(int)

# Calculate Accuracy
accuracy = accuracy_score(y_test, predicted_classes)
print(f"\n📌 Accuracy for 'rain_or_not' on the test set: {accuracy:.2f}")

# Output best parameters for classification model
print(f"\n🔹 Best Parameters for 'rain_or_not' Classification Model:")
print(grid_search.best_params_)

# Print some predicted vs. actual values with probabilities
print("\n📌 Predictions for 'rain_or_not':")
for i in range(0, len(y_test), 10):  # Print every 10th prediction
    print(f"  Day {N + i}: Predicted Probability = {rain_probabilities[i]:.2f} | Actual = {'Rain' if y_test[i] == 1 else 'No Rain'}")

# 🔹 Performance metrics for the last 20 days (instead of full test set)
print("\n🔹 Classification Performance Metrics for the last 20 days:")

# Get the classification performance metrics on the last 20 days
last_20_days_actual = y[-20:]

last_20_days_features = []
for i in range(len(df) - 20, len(df)):
    last_20_days_features.append(df.drop(columns=['rain_or_not']).iloc[i-N:i].values.flatten())

last_20_days_features = np.array(last_20_days_features)

# Get probabilities for the last 20 days
last_20_days_probabilities = best_classifier.predict_proba(last_20_days_features)
last_20_days_probabilities = last_20_days_probabilities[:, 1]  # Probabilities for Rain (class 1)

# Get predicted classes for the last 20 days
last_20_days_predictions = (last_20_days_probabilities >= 0.5).astype(int)

# Calculate accuracy for the last 20 days
last_20_days_accuracy = accuracy_score(last_20_days_actual, last_20_days_predictions)
print(f"\n📌 Accuracy for 'rain_or_not' on the last 20 days: {last_20_days_accuracy:.2f}")

# Print predicted probabilities and final classification for the last 20 days
print("\n📌 Predictions for 'rain_or_not' on the last 20 days:")

for i in range(len(last_20_days_predictions)):
    predicted_class = 'Rain' if last_20_days_predictions[i] == 1 else 'No Rain'
    print(f"  Day {len(df)-20 + i}: Predicted Probability = {last_20_days_probabilities[i]:.2f} | Predicted Class = {predicted_class} | Actual = {'Rain' if last_20_days_actual[i] == 1 else 'No Rain'}")

# Example for full test set predictions (if desired)
print("\n📌 Predictions for 'rain_or_not' on the full test set:")
for i in range(0, len(y_test), 10):  # Print every 10th prediction
    predicted_class = 'Rain' if predicted_classes[i] == 1 else 'No Rain'
    print(f"  Day {N + i}: Predicted Probability = {rain_probabilities[i]:.2f} | Predicted Class = {predicted_class} | Actual = {'Rain' if y_test[i] == 1 else 'No Rain'}")

Missing values before cleaning:
 avg_temperature    15
humidity           15
avg_wind_speed     15
rain_or_not         0
pressure            0
dtype: int64

Missing values after cleaning:
 avg_temperature    0
humidity           0
avg_wind_speed     0
rain_or_not        0
pressure           0
dtype: int64

🔹 Training model to predict: avg_temperature...
Fitting 3 folds for each of 1 candidates, totalling 3 fits

📌 RMSE for avg_temperature: 4.22

📌 Predictions for avg_temperature:
  Day 270: Predicted = 24.18 | Actual = 19.65

🔹 Training model to predict: humidity...
Fitting 3 folds for each of 1 candidates, totalling 3 fits

📌 RMSE for humidity: 12.65

📌 Predictions for humidity:
  Day 270: Predicted = 50.01 | Actual = 42.19

🔹 Training model to predict: avg_wind_speed...
Fitting 3 folds for each of 1 candidates, totalling 3 fits

📌 RMSE for avg_wind_speed: 2.70

📌 Predictions for avg_wind_speed:
  Day 270: Predicted = 9.34 | Actual = 8.09

🔹 Training model to predict: pressure...
Fitt