# Rainfall Prediction Model
This notebook loads a rainfall dataset, processes the data, trains two models (Linear Regression and Random Forest), evaluates their performance, and visualizes results.

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Install a blanket "ignore" filter so no warnings are shown from here on.
# NOTE(review): this also hides any warnings raised by the libraries imported
# above — confirm that is intended.
warnings.filterwarnings("ignore")
# Use seaborn's "whitegrid" style for the plots drawn in this notebook.
sns.set(style="whitegrid")


In [None]:
# Read the raw rainfall table from disk.
df = pd.read_csv("D:\\My Work\\Rainfall\\rainfall.csv")

# Impute gaps in numeric columns with that column's mean; non-numeric columns
# are left untouched.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)

# Map every SUBDIVISION label to an integer code. The fitted encoder stays
# available as `le`.
le = LabelEncoder()
le.fit(df['SUBDIVISION'])
df['SUBDIVISION_ENC'] = le.transform(df['SUBDIVISION'])

# Standardise YEAR into a new YEAR_SCALED column. The fitted scaler stays
# available as `scaler`.
scaler = StandardScaler()
scaler.fit(df[['YEAR']])
df['YEAR_SCALED'] = scaler.transform(df[['YEAR']])

# Preview the first rows as the cell output.
df.head()


In [None]:
# Monthly data transformation: reshape the wide table (one column per month)
# into long form with one row per (source row, month).
months = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']

# Vectorised build instead of a per-row iterrows() loop. Every source row
# expands into len(months) consecutive output rows (JAN..DEC), which is the
# same row-major order a nested "for row / for month" loop produces.
n_months = len(months)
monthly_df = pd.DataFrame({
    # Each identifier value is repeated once per month.
    'SUBDIVISION_ENC': np.repeat(df['SUBDIVISION_ENC'].to_numpy(), n_months),
    'YEAR': np.repeat(df['YEAR'].to_numpy(), n_months),
    # 1..12, cycled once per source row.
    'MONTH_NUM': np.tile(np.arange(1, n_months + 1), len(df)),
    # Row-major flatten: row 0 JAN..DEC, then row 1 JAN..DEC, and so on.
    'RAINFALL': df[months].to_numpy().ravel(),
})

# Preview the first rows as the cell output.
monthly_df.head()


In [None]:
# Mean rainfall for each month number, taken over every row of monthly_df.
monthly_mean = monthly_df.groupby('MONTH_NUM')['RAINFALL'].mean().reset_index()

plt.figure(figsize=(12, 6))
sns.lineplot(x='MONTH_NUM', y='RAINFALL', data=monthly_mean, marker='o')
plt.xticks(range(1, 13))  # one tick per month number
plt.xlabel("Month")
plt.ylabel("Rainfall (mm)")
plt.title("Average Rainfall by Month")
plt.grid(True)
plt.show()


In [None]:
# Year-based hold-out: rows with YEAR up to and including 2005 are used for
# training, rows with a later YEAR are held out for evaluation.
features = ['SUBDIVISION_ENC', 'YEAR', 'MONTH_NUM']

is_train = monthly_df['YEAR'] <= 2005
is_test = monthly_df['YEAR'] > 2005
train_df = monthly_df.loc[is_train]
test_df = monthly_df.loc[is_test]

# Model inputs are the `features` columns; the target is RAINFALL.
X_train, y_train = train_df[features], train_df['RAINFALL']
X_test, y_test = test_df[features], test_df['RAINFALL']


In [None]:
# Two regressors trained on the same split: a linear model and a 100-tree
# random forest with a fixed seed.
lr = LinearRegression()
rf = RandomForestRegressor(random_state=42, n_estimators=100)

lr.fit(X_train, y_train)
rf.fit(X_train, y_train)


In [None]:
def evaluate(model, name, X=None, y=None):
    """Print MAE, RMSE and R² for ``model`` and return its predictions.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    name
        Label printed in the report header.
    X, y
        Optional features and targets to score against. When omitted, the
        notebook-level ``X_test`` / ``y_test`` are used, so existing
        two-argument calls behave as before.

    Returns
    -------
    The value returned by ``model.predict(X)``.
    """
    # Fall back to the notebook's hold-out split when no data is supplied.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    preds = model.predict(X)
    print(f"\n{name} Evaluation")
    print("MAE:", mean_absolute_error(y, preds))
    # RMSE is the square root of the mean squared error.
    print("RMSE:", np.sqrt(mean_squared_error(y, preds)))
    print("R²:", r2_score(y, preds))
    return preds


# Score both fitted models on the hold-out split and keep their predictions.
lr_preds = evaluate(lr, "Linear Regression")
rf_preds = evaluate(rf, "Random Forest")


In [None]:
# Hold-out predictions per model; dict order fixes the bar order in each chart.
model_preds = {
    "Linear Regression": lr_preds,
    "Random Forest": rf_preds,
}

# One list per metric, aligned with the "Model" column.
errors = {
    "Model": list(model_preds),
    "MAE": [mean_absolute_error(y_test, p) for p in model_preds.values()],
    "RMSE": [np.sqrt(mean_squared_error(y_test, p)) for p in model_preds.values()],
    "R2": [r2_score(y_test, p) for p in model_preds.values()],
}

error_df = pd.DataFrame(errors)

# One bar chart per metric, side by side.
plt.figure(figsize=(12, 4))
for position, metric in enumerate(["MAE", "RMSE", "R2"], start=1):
    plt.subplot(1, 3, position)
    sns.barplot(data=error_df, x="Model", y=metric)
    plt.title(metric)

# Lay the figure out once, after every subplot exists, instead of once per
# loop iteration.
plt.tight_layout()
plt.show()


In [None]:
# One actual-vs-predicted scatter panel per model, left to right.
panels = [
    ("Linear Regression: Actual vs Predicted", lr_preds),
    ("Random Forest: Actual vs Predicted", rf_preds),
]

plt.figure(figsize=(12, 5))
for position, (panel_title, predicted) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    sns.scatterplot(x=y_test, y=predicted, alpha=0.5)
    plt.title(panel_title)
    plt.xlabel("Actual Rainfall")
    plt.ylabel("Predicted Rainfall")

plt.tight_layout()
plt.show()


In [None]:
# Persist the fitted models and preprocessing objects to the current working
# directory, in this order.
# NOTE(review): `scaler` was fitted on YEAR, but the `features` list used for
# training holds the raw 'YEAR' column — confirm that consumers of
# year_scaler.pkl do not scale the year before calling predict.
artifacts = {
    "linear_regression_model.pkl": lr,
    "random_forest_model.pkl": rf,
    "label_encoder.pkl": le,
    "year_scaler.pkl": scaler,
}
for filename, fitted_obj in artifacts.items():
    joblib.dump(fitted_obj, filename)
