In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report, confusion_matrix

# Load dataset
df = pd.read_csv("weather_data.csv")  # Replace with your actual dataset path

# Remove 'date' and 'cloud_cover' columns if they exist
df = df.drop(['date', 'cloud_cover'], axis=1, errors='ignore')

# Check for missing values
print("Missing values before cleaning:\n", df.isnull().sum())

# Drop rows with missing values
df = df.dropna()

# Verify that no missing values remain
print("\nMissing values after cleaning:\n", df.isnull().sum())

# Encode categorical column ('rain_or_not')
categorical_columns = ['rain_or_not']
le = LabelEncoder()
df['rain_or_not'] = le.fit_transform(df['rain_or_not'])  # Convert to binary (0 or 1)

# Number of past days to use as features
N = 270  

# Dictionary to store predictions and feature importance
predictions = {}
feature_importance = {}

# Separate 'rain_or_not' (classification) from numeric columns (regression)
numerical_columns = df.drop(columns=['rain_or_not']).columns.tolist()

# 🔹 Predicting numerical values using RandomForestRegressor
for target_col in numerical_columns:
    print(f"\n🔹 Training model to predict: {target_col}...")

    # Create feature set (X) and target set (y)
    X, y = [], []
    
    for i in range(N, len(df)):
        X.append(df[target_col].iloc[i-N:i].values)  # Use previous N days of the target column
        y.append(df[target_col].iloc[i])  # Target value for prediction

    X = np.array(X)
    y = np.array(y)

    # Split into train, validation, and test sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, shuffle=False)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False)

    # 🔹 Hyperparameter tuning using Grid Search
    param_grid = {
        'n_estimators': [50, 10],
        'max_depth': [None],
        'min_samples_split': [2]
    }

    grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_  # Select best model from grid search

    # Store predictions
    target_predictions = best_model.predict(X_test)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test, target_predictions))
    print(f"\n📌 RMSE for {target_col}: {rmse:.2f}")

    # Store feature importance
    feature_importance[target_col] = best_model.feature_importances_

    # Output best parameters for regression model
    print(f"\n🔹 Best Parameters for {target_col} Regression Model:")
    print(grid_search.best_params_)

    # Print some predicted vs. actual values for reference
    print(f"\n📌 Predictions for {target_col}:")
    for i in range(0, len(y_test), 10):  # Print every 10th prediction
        print(f"  Day {N + i}: Predicted = {target_predictions[i]:.2f} | Actual = {y_test[i]:.2f}")

# 🔹 Predicting 'rain_or_not' using RandomForestClassifier
print("\n🔹 Training model to predict 'rain_or_not'...")

X, y = [], []
for i in range(N, len(df)):
    X.append(df.drop(columns=['rain_or_not']).iloc[i-N:i].values.flatten())  # Use all other features
    y.append(df['rain_or_not'].iloc[i])  # Target label (rain or not)

X = np.array(X)
y = np.array(y)

# Split into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, shuffle=False)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False)

# 🔹 Hyperparameter tuning for classification
param_grid = {
    'n_estimators': [50, 10],
    'max_depth': [None],
    'min_samples_split': [5, 2]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)
best_classifier = grid_search.best_estimator_  # Select best model from grid search

# Store predictions
rain_predictions = best_classifier.predict(X_test)

# Calculate Accuracy
accuracy = accuracy_score(y_test, rain_predictions)
print(f"\n📌 Accuracy for 'rain_or_not' on the test set: {accuracy:.2f}")

# Output best parameters for classification model
print(f"\n🔹 Best Parameters for 'rain_or_not' Classification Model:")
print(grid_search.best_params_)

# Print some predicted vs. actual values
print("\n📌 Predictions for 'rain_or_not':")
for i in range(0, len(y_test), 10):  # Print every 10th prediction
    print(f"  Day {N + i}: Predicted = {'Rain' if rain_predictions[i] == 1 else 'No Rain'} | Actual = {'Rain' if y_test[i] == 1 else 'No Rain'}")

# 🔹 Performance metrics for the last 20 days (instead of full test set)
print("\n🔹 Classification Performance Metrics for the last 20 days:")

# Get the classification performance metrics on the last 20 days
print(classification_report(last_20_days_actual, last_20_days_predictions))

# 🔹 Confusion Matrix for the last 20 days
conf_matrix_last_20 = confusion_matrix(last_20_days_actual, last_20_days_predictions)
print("\n🔹 Confusion Matrix for the last 20 days:")
print(conf_matrix_last_20)

# 🔹 Evaluating accuracy on the last 20 days
print("\n🔹 Evaluating accuracy on the last 20 days...")

last_20_days_features = []
last_20_days_actual = y[-20:]

for i in range(len(df) - 20, len(df)):
    last_20_days_features.append(df.drop(columns=['rain_or_not']).iloc[i-N:i].values.flatten())

last_20_days_features = np.array(last_20_days_features)

# Make predictions for the last 20 days
last_20_days_predictions = best_classifier.predict(last_20_days_features)

# Calculate accuracy on the last 20 days
last_20_days_accuracy = accuracy_score(last_20_days_actual, last_20_days_predictions)
print(f"\n📌 Accuracy for 'rain_or_not' on the last 20 days: {last_20_days_accuracy:.2f}")

# Print some predicted vs. actual values for the last 20 days
print("\n📌 Predictions for 'rain_or_not' on the last 20 days:")
for i in range(len(last_20_days_predictions)):
    print(f"  Day {len(df)-20 + i}: Predicted = {'Rain' if last_20_days_predictions[i] == 1 else 'No Rain'} | Actual = {'Rain' if last_20_days_actual[i] == 1 else 'No Rain'}")


Missing values before cleaning:
 avg_temperature    15
humidity           15
avg_wind_speed     15
rain_or_not         0
pressure            0
dtype: int64

Missing values after cleaning:
 avg_temperature    0
humidity           0
avg_wind_speed     0
rain_or_not        0
pressure           0
dtype: int64

🔹 Training model to predict: avg_temperature...
Fitting 3 folds for each of 2 candidates, totalling 6 fits

📌 RMSE for avg_temperature: 4.14

🔹 Best Parameters for avg_temperature Regression Model:
{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 10}

📌 Predictions for avg_temperature:
  Day 270: Predicted = 23.88 | Actual = 19.65

🔹 Training model to predict: humidity...
Fitting 3 folds for each of 2 candidates, totalling 6 fits

📌 RMSE for humidity: 11.77

🔹 Best Parameters for humidity Regression Model:
{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 10}

📌 Predictions for humidity:
  Day 270: Predicted = 49.40 | Actual = 42.19

🔹 Training model to predict: 

In [8]:
# 🔹 Set N as the number of rows in the dataset
N = len(df)  # Use all the available data

# 🔹 Get the most recent data up to today
today_data = df.iloc[-N:]  # This now includes all the data in the dataset

# Prepare the feature set for the next 21 days prediction
future_predictions_rain = []
future_predictions_numerical = {}

# Predict for the next 21 days using the trained RandomForestClassifier and RandomForestRegressor
for day in range(1, 22):  # For 21 days ahead
    print(f"\n🔹 Predicting for Day {len(df) + day}...")

    # Prepare the features for this prediction
    future_features = today_data.drop(columns=['rain_or_not']).values.flatten().reshape(1, -1)

    # Predict 'rain_or_not' for the next day
    future_rain_prediction = best_classifier.predict(future_features)
    future_predictions_rain.append(future_rain_prediction[0])

    # Now update today_data for the next iteration
    future_rain = future_rain_prediction[0]  # 1 for Rain, 0 for No Rain

    # Add the prediction to the 'rain_or_not' column for future prediction
    future_row = today_data.copy()
    future_row['rain_or_not'] = future_rain

    # Update today's data to include the new prediction (simulate the next day)
    today_data = pd.concat([today_data, future_row]).iloc[1:]

    # For each numerical column, predict future values using RandomForestRegressor
    for target_col in numerical_columns:
        future_numerical_prediction = best_model.predict(future_features)
        if target_col not in future_predictions_numerical:
            future_predictions_numerical[target_col] = []
        future_predictions_numerical[target_col].append(future_numerical_prediction[0])

# Output predictions for the next 21 days
print("\n🔹 Future Predictions for 'rain_or_not' (Next 21 Days):")
for i, rain in enumerate(future_predictions_rain):
    print(f"  Day {len(df) + i + 1}: Predicted = {'Rain' if rain == 1 else 'No Rain'}")

# Output predictions for numerical features
print("\n🔹 Future Predictions for Numerical Features (Next 21 Days):")
for target_col in future_predictions_numerical:
    print(f"\n🔹 Predictions for {target_col}:")
    for i, prediction in enumerate(future_predictions_numerical[target_col]):
        print(f"  Day {len(df) + i + 1}: Predicted {target_col} = {prediction:.2f}")



🔹 Predicting for Day 297...


ValueError: X has 1184 features, but RandomForestClassifier is expecting 1080 features as input.