In [1]:
import pandas as pd

# Read the raw rainfall observations; the CSV must sit in the same folder as the notebook.
file_path = "rainfall_dataset.csv"
df = pd.read_csv(file_path)

# Parse the day-first timestamp strings once, then derive every calendar
# feature from the parsed values while the timestamp is still available.
parsed_time = pd.to_datetime(df["date_time"], format="%d-%m-%Y %H:%M")
df = df.assign(
    date_time=parsed_time,
    year=parsed_time.dt.year,
    month=parsed_time.dt.month,
    day=parsed_time.dt.day,
    hour=parsed_time.dt.hour,
    # Month -> season code: Dec-Feb = 1 (Winter), Mar-May = 2 (Spring),
    # Jun-Aug = 3 (Summer), Sep-Nov = 4 (Fall).
    season=parsed_time.dt.month % 12 // 3 + 1,
)

# Columns excluded from the modelling frame: the raw timestamp (already
# expanded above), the sun/moon columns and the location label.
columns_to_drop = [
    "date_time",
    "moonrise",
    "moonset",
    "sunrise",
    "sunset",
    "moon_illumination",
    "location",
]
df_cleaned = df.drop(columns=columns_to_drop)

# Preview the first 5 rows of the cleaned frame
df_cleaned.head()


Unnamed: 0,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,DewPointC,FeelsLikeC,HeatIndexC,WindChillC,WindGustKmph,...,pressure,tempC,visibility,winddirDegree,windspeedKmph,year,month,day,hour,season
0,30,18,0,11.0,6,13,18,18,18,10,...,1013,18,10,114,5,2008,12,11,0,1
1,30,18,0,11.0,6,14,19,19,19,9,...,1013,19,10,125,4,2008,12,11,1,1
2,30,18,0,11.0,6,13,18,18,18,8,...,1013,18,10,136,4,2008,12,11,2,1
3,30,18,0,11.0,6,13,18,18,18,8,...,1013,18,10,147,4,2008,12,11,3,1
4,30,18,0,11.0,6,13,20,20,20,7,...,1014,20,10,136,4,2008,12,11,4,1


In [2]:
from sklearn.model_selection import train_test_split

# Separate the rainfall target from the predictor columns.
target_column = "precipMM"
y = df_cleaned[target_column]                 # Target (rainfall in mm)
X = df_cleaned.drop(columns=[target_column])  # Every remaining column is a feature

# Hold out 20% of the rows for testing; the fixed seed makes the shuffle reproducible.
# NOTE(review): train_test_split shuffles rows by default. The year/month/day/hour
# columns suggest hourly observations, so neighbouring hours may end up on both
# sides of the split -- confirm this is acceptable for the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show (rows, columns) for the training and testing feature matrices
X_train.shape, X_test.shape


((92908, 22), (23228, 22))

In [3]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Hyperparameters for the gradient-boosted regressor, kept together so they
# are easy to spot and tune; random_state pins the model's seed.
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 7,
    "random_state": 42,
}
xgb_model = XGBRegressor(**xgb_params)

# Fit on the training split
xgb_model.fit(X_train, y_train)

# Predict rainfall for the held-out rows
y_pred_xgb = xgb_model.predict(X_test)

# Score the predictions against the true test targets.
# RMSE is derived from MSE as its square root.
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = mse_xgb ** 0.5
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

# Report the metrics
print(f"✅ MAE: {mae_xgb}")
print(f"✅ RMSE: {rmse_xgb}")
print(f"✅ R² Score: {r2_xgb}")


✅ MAE: 0.1406054590109535
✅ RMSE: 0.4704989488886933
✅ R² Score: 0.6787254692887972


In [4]:
import joblib

# Persist the fitted regressor to disk so it can be reloaded without retraining.
model_path = "rainfall_model.pkl"
joblib.dump(xgb_model, model_path)

# Confirmation message; only reached if the dump above did not raise.
print("✅ Model saved successfully as 'rainfall_model.pkl'")


✅ Model saved successfully as 'rainfall_model.pkl'


In [5]:
import pandas as pd

# Re-read the raw CSV. This rebinds `df` to the unprocessed frame, which
# still contains the "location" column.
df = pd.read_csv("rainfall_dataset.csv")

# Collect the distinct city names present in the data
location_column = df["location"]
available_cities = location_column.unique()
print("Available Cities in Dataset:", available_cities)


Available Cities in Dataset: ['pune']
