In [3]:
# Basic Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler

# ML Models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
# Ignore Warnings
import warnings
warnings.filterwarnings('ignore')


In [4]:
# Load Dataset
df = pd.read_csv('weather_forecast_2022_2024_100k.csv') 

df.head()

Unnamed: 0,forecast_time,temperature,humidity,wind_speed,wind_direction,pressure,precipitation,cloud_coverage,weather_condition
0,2022-01-01 00:00:00,15.993428,90.124617,5.429465,223.713616,1011.603226,0.0,51.7,cloudy
1,2022-01-01 01:00:00,14.747359,94.099472,4.830542,243.206965,1008.126983,0.0,54.6,cloudy
2,2022-01-01 02:00:00,16.343152,87.59647,6.286963,145.135328,1005.557377,0.5,48.1,rainy
3,2022-01-01 03:00:00,18.117722,94.614785,3.643288,301.252915,1009.596029,0.1,52.8,rainy
4,2022-01-01 04:00:00,14.627243,94.475663,5.41601,332.605324,1009.729219,0.0,54.3,cloudy


In [5]:
df.dtypes

forecast_time         object
temperature          float64
humidity             float64
wind_speed           float64
wind_direction       float64
pressure             float64
precipitation        float64
cloud_coverage       float64
weather_condition     object
dtype: object

In [6]:
# Convert forecast_time to datetime
df['forecast_time'] = pd.to_datetime(df['forecast_time'], format="%Y-%m-%d %H:%M:%S")

# Check for missing values
print(df.isnull().sum())


forecast_time        0
temperature          0
humidity             0
wind_speed           0
wind_direction       0
pressure             0
precipitation        0
cloud_coverage       0
weather_condition    0
dtype: int64


In [7]:
# Extract time features
df['hour'] = df['forecast_time'].dt.hour
df['month'] = df['forecast_time'].dt.month
df['day'] = df['forecast_time'].dt.day

# Convert wind to vectors
df['wind_x'] = df['wind_speed'] * np.cos(np.deg2rad(df['wind_direction']))
df['wind_y'] = df['wind_speed'] * np.sin(np.deg2rad(df['wind_direction']))

# Rain Indicator
df['is_rain'] = np.where(df['precipitation'] > 0, 1, 0)

# Encode target variable
le = LabelEncoder()
df['weather_condition'] = le.fit_transform(df['weather_condition'])

df.head()


Unnamed: 0,forecast_time,temperature,humidity,wind_speed,wind_direction,pressure,precipitation,cloud_coverage,weather_condition,hour,month,day,wind_x,wind_y,is_rain
0,2022-01-01 00:00:00,15.993428,90.124617,5.429465,223.713616,1011.603226,0.0,51.7,1,0,1,1,-3.924433,-3.752055,0
1,2022-01-01 01:00:00,14.747359,94.099472,4.830542,243.206965,1008.126983,0.0,54.6,1,1,1,1,-2.177459,-4.311938,0
2,2022-01-01 02:00:00,16.343152,87.59647,6.286963,145.135328,1005.557377,0.5,48.1,2,2,1,1,-5.158481,3.59388,1
3,2022-01-01 03:00:00,18.117722,94.614785,3.643288,301.252915,1009.596029,0.1,52.8,2,3,1,1,1.890199,-3.114594,1
4,2022-01-01 04:00:00,14.627243,94.475663,5.41601,332.605324,1009.729219,0.0,54.3,1,4,1,1,4.808648,-2.492,0


In [8]:
X = df.drop(['forecast_time', 'weather_condition'], axis=1)
y = df['weather_condition']

# Scale Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [10]:
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

In [11]:
y_pred = model.predict(X_test)

print("Accuracy Score:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



Accuracy Score: 0.9998099220680479
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2861
           1       1.00      1.00      1.00       313
           2       1.00      1.00      1.00      2086
           3       0.00      0.00      0.00         1

    accuracy                           1.00      5261
   macro avg       0.75      0.75      0.75      5261
weighted avg       1.00      1.00      1.00      5261



In [12]:
pickle.dump(model, open('weather_model.pkl', 'wb'))
pickle.dump(scaler, open('scaler.pkl', 'wb'))
pickle.dump(le, open('label_encoder.pkl', 'wb'))

In [13]:
df['predicted_weather'] = le.inverse_transform(model.predict(X_scaled))

df.to_csv('weather_predictions.csv', index=False)