In [45]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib
import os


In [46]:
DATA_PATH = "../data/raw_weather_data.csv"
df = pd.read_csv(DATA_PATH)

print(df.head())
print(df.info())
print(df["weather"].value_counts())

       city  temperature  humidity  pressure  wind_speed weather  \
0  Delhi,IN        18.24        23      1020        1.20   Clear   
1  Delhi,IN        17.46        26      1020        1.43   Clear   
2  Delhi,IN        15.37        31      1021        1.13   Clear   
3  Delhi,IN        12.56        36      1021        1.38   Clear   
4  Delhi,IN        11.67        38      1021        1.41  Clouds   

             timestamp  
0  2026-01-12 17:30:00  
1  2026-01-12 20:30:00  
2  2026-01-12 23:30:00  
3  2026-01-13 02:30:00  
4  2026-01-13 05:30:00  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1640 entries, 0 to 1639
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   city         1640 non-null   object 
 1   temperature  1640 non-null   float64
 2   humidity     1640 non-null   int64  
 3   pressure     1640 non-null   int64  
 4   wind_speed   1640 non-null   float64
 5   weather      1640 non-null   object 

In [47]:
# Remove missing values
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
# Remove duplicate rows
df.drop_duplicates(inplace=True)

print("Dataset shape after cleaning:", df.shape)


Dataset shape after cleaning: (1640, 7)


In [48]:
from sklearn.utils import resample

# Check class distribution before balancing
print("Before Balancing:")
print(df["weather"].value_counts())

# Separate majority and minority classes
major = df[df.weather == df.weather.mode()[0]]
minor = df[df.weather != df.weather.mode()[0]]

# Upsample minority class
minor_upsampled = resample(
    minor,
    replace=True,
    n_samples=len(major),
    random_state=42
)

# Combine balanced data
df = pd.concat([major, minor_upsampled])

# Shuffle dataset
df = df.sample(frac=1, random_state=42)

# Check after balancing
print("\nAfter Balancing:")
print(df["weather"].value_counts())

Before Balancing:
weather
Clouds    828
Clear     600
Rain      183
Snow       29
Name: count, dtype: int64

After Balancing:
weather
Clouds    828
Clear     614
Rain      189
Snow       25
Name: count, dtype: int64


In [49]:
le = LabelEncoder()
df["weather"] = le.fit_transform(df["weather"])

print("Encoded Classes:", le.classes_)


Encoded Classes: ['Clear' 'Clouds' 'Rain' 'Snow']


In [None]:
#Remove duplicate columns if any
df = df.loc[:, ~df.columns.duplicated()]

In [None]:
features = ["temperature", "humidity", "pressure", "wind_speed"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[features])

processed_df = pd.DataFrame(X_scaled, columns=features)

#Use .values to avoid index alignment problem
processed_df["weather"] = df["weather"].values

In [52]:
processed_df.to_csv("../data/processed_weather_data.csv", index=False)

joblib.dump(scaler, "../models/scaler.pkl")
joblib.dump(le, "../models/label_encoder.pkl")

print("Preprocessing completed")


Preprocessing completed
