In [43]:
import pandas as pd
import numpy as np
import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
import joblib


In [74]:
# Set the target city

# "WINDHOEK"  Namibia
# "NDJAMENA"   Chad
# "NIAMEY AERO"  Niger
# "TEJGAON"   Bangladesh

CITY = "TEJGAON"


In [75]:
# Load and clean the dataset
df = pd.read_csv("weather2.csv", parse_dates=["DATE"])
print("data shape before cleaning:", df.shape)

df["LOCATION"] = df["NAME"].apply(lambda x: x.split(",")[0].strip())
df.drop(columns=["STATION", "NAME"], inplace=True)

for col in ["PRCP", "TMAX", "TMIN"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

df["TMAX"] = (df["TMAX"] - 32) * 5/9
df["TMIN"] = (df["TMIN"] - 32) * 5/9

df.dropna(subset=["TMAX", "TMIN"], inplace=True)
df = df[df["LOCATION"].str.upper() == CITY.upper()]
df.set_index("DATE", inplace=True)

print("data shape after cleaning:", df.shape)
print(df.round(2).head(3))


data shape before cleaning: (33292, 7)
data shape after cleaning: (1710, 5)
            PRCP  TAVG   TMAX   TMIN LOCATION
DATE                                         
2000-03-19   NaN  79.0  30.56  21.11  TEJGAON
2001-08-24  0.12  85.0  31.11  25.56  TEJGAON
2002-11-21  0.00  74.0  27.78  18.33  TEJGAON


In [76]:
# make desired dataframe
main_weather1 = df[["PRCP", "TMAX", "TMIN"]].copy()
main_weather1.columns = ["precipitation", "temperature_max", "temperature_min"]
main_weather1 = main_weather1.reset_index()

print("cleaned data preview:")
print(main_weather1.round(2).head(3))


cleaned data preview:
        DATE  precipitation  temperature_max  temperature_min
0 2000-03-19            NaN            30.56            21.11
1 2001-08-24           0.12            31.11            25.56
2 2002-11-21           0.00            27.78            18.33


In [77]:
# make lag features and sort by date
main_weather1 = main_weather1.sort_values("DATE").copy()
main_weather1["precipitation_lag1"] = main_weather1["precipitation"].shift(1)
main_weather1["temperature_max_lag1"] = main_weather1["temperature_max"].shift(1)
main_weather1["temperature_min_lag1"] = main_weather1["temperature_min"].shift(1)
main_weather1.dropna(inplace=True)

print("featured data preview:")
print(main_weather1.round(2).head(3))


featured data preview:
        DATE  precipitation  temperature_max  temperature_min  \
2 2002-11-21            0.0            27.78            18.33   
3 2002-12-09            0.0            27.22            16.11   
4 2003-03-24            0.0            31.67            20.56   

   precipitation_lag1  temperature_max_lag1  temperature_min_lag1  
2                0.12                 31.11                 25.56  
3                0.00                 27.78                 18.33  
4                0.00                 27.22                 16.11  


In [78]:
# define input features and targets
features = ['precipitation', 'temperature_min', 'precipitation_lag1',
            'temperature_max_lag1', 'temperature_min_lag1']
targets = ['temperature_max', 'temperature_min', 'precipitation']

X = main_weather1[features]
y = main_weather1[targets]

print("input shape:", X.shape)
print("target shape:", y.shape)


input shape: (1702, 5)
target shape: (1702, 3)


In [79]:
# train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=250, random_state=42))
model.fit(X_train, y_train)

print("model trained")


model trained


In [80]:
# make 7 day forecast
city_data = main_weather1.sort_values("DATE")
recent_days = city_data.tail(7)

future_dates = [(datetime.datetime.today() + datetime.timedelta(days=i)).strftime('%Y-%m-%d') for i in range(1, 8)]
predictions = []

for i in range(len(recent_days)):
    row = recent_days.iloc[i][features]
    input_df = pd.DataFrame([row])
    result = model.predict(input_df)[0]
    temp_max, temp_min, rain = [round(val, 2) for val in result]
    predictions.append([future_dates[i], temp_max, temp_min, rain])

future_df = pd.DataFrame(predictions, columns=[
    "Date", "Predicted_temperature_max", "Predicted_temperature_min", "Predicted_precipitation"])

print("7-day forecast:")
print(future_df)


7-day forecast:
         Date  Predicted_temperature_max  Predicted_temperature_min  \
0  2025-05-03                      35.59                      25.56   
1  2025-05-04                      35.92                      25.56   
2  2025-05-05                      37.42                      22.78   
3  2025-05-06                      34.26                      25.56   
4  2025-05-07                      34.78                      25.56   
5  2025-05-08                      35.52                      25.00   
6  2025-05-09                      31.00                      22.22   

   Predicted_precipitation  
0                     0.00  
1                     0.00  
2                     0.00  
3                     0.00  
4                     0.00  
5                     0.00  
6                     0.65  


In [81]:
# classify extreme weather types
labels = []
heat = flood = 0

for i in range(len(future_df)):
    row = future_df.iloc[i]
    label = []

    # Heat Wave: Max temperature ≥ 35°C for 3 consecutive days
    if row["Predicted_temperature_max"] >= 35:
        heat += 1
    else:
        heat = 0
    if heat >= 3:
        label.append("Heat Wave")

    # Flash Flood Risk: Precipitation ≥ 20 mm in a day (est. 20–25 mm/hr)
    if row["Predicted_precipitation"] >= 20:
        flood += 1
    else:
        flood = 0
    if flood >= 1:
        label.append("Possibility of Flash Flood")

    # Hazardous Rainfall: Precipitation ≥ 48 mm in a single day (10 mm/hr x 4–6 hrs)
    if row["Predicted_precipitation"] >= 48:
        label.append("Heavy Rain")

    # Extreme Cold: Max temperature < 0°C
    if row["Predicted_temperature_max"] < 0:
        label.append("Extreme Cold")

    labels.append("No extreme weather" if not label else " and ".join(label))

future_df["Extreme_Weather_Type"] = labels

print("Extreme weather in forecast:")
print(future_df[["Date", "Extreme_Weather_Type"]])


Extreme weather in forecast:
         Date Extreme_Weather_Type
0  2025-05-03   No extreme weather
1  2025-05-04   No extreme weather
2  2025-05-05            Heat Wave
3  2025-05-06   No extreme weather
4  2025-05-07   No extreme weather
5  2025-05-08   No extreme weather
6  2025-05-09   No extreme weather


In [None]:
# model evaluation
predictions_test = model.predict(X_test)

comparison = pd.DataFrame({
    "Actual Max": y_test["temperature_max"].values[:5],
    "Predicted Max": predictions_test[:5, 0],
    "Actual Min": y_test["temperature_min"].values[:5],
    "Predicted Min": predictions_test[:5, 1],
    "Actual Precip": y_test["precipitation"].values[:5],
    "Predicted Precip": predictions_test[:5, 2]
})

print("\nmodel prediction vs actual values:")
print(comparison.round(2))




model prediction vs actual values:
   Actual Max  Predicted Max  Actual Min  Predicted Min  Actual Precip  \
0       33.89          32.59       21.11          21.11           0.00   
1       28.33          27.37       16.67          16.67           0.00   
2       23.33          24.53       13.33          13.33           0.00   
3       32.22          33.23       26.67          26.67           0.01   
4       32.22          33.17       23.33          23.33           0.00   

   Predicted Precip  
0              0.00  
1              0.00  
2              0.00  
3              0.01  
4              0.00  

model accuracy (%):
Max Temp: 95.86%
Min Temp: 99.99%
Precipitation: 98.08%


In [83]:
# save model for this city
city_name_clean = CITY.lower().replace(" ", "_")
model_path = f"{city_name_clean}_model.pkl"
joblib.dump(model, model_path)

print(f"model saved as: {model_path}")


model saved as: tejgaon_model.pkl
