In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv("weatherAUS.csv")
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


# Data Cleaning

In [3]:
df["Date"] = pd.to_datetime(df["Date"])
for col in df.select_dtypes("object").columns:
    df[col] = df[col].astype("category")
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   Date           145460 non-null  datetime64[ns]
 1   Location       145460 non-null  category      
 2   MinTemp        143975 non-null  float64       
 3   MaxTemp        144199 non-null  float64       
 4   Rainfall       142199 non-null  float64       
 5   Evaporation    82670 non-null   float64       
 6   Sunshine       75625 non-null   float64       
 7   WindGustDir    135134 non-null  category      
 8   WindGustSpeed  135197 non-null  float64       
 9   WindDir9am     134894 non-null  category      
 10  WindDir3pm     141232 non-null  category      
 11  WindSpeed9am   143693 non-null  float64       
 12  WindSpeed3pm   142398 non-null  float64       
 13  Humidity9am    142806 non-null  float64       
 14  Humidity3pm    140953 non-null  float64       
 15  

In [4]:
df = df.dropna(subset=["RainTomorrow"]).reset_index(drop=True)

df["RainTomorrow"] = df["RainTomorrow"].astype(str).str.strip().str.upper()
df["RainTomorrow"] = df["RainTomorrow"].map({"YES": "Yes", "NO": "No"})

df["RainToday"] = df["RainToday"].astype(str).str.strip().str.upper()
df["RainToday"] = df["RainToday"].map({"YES": "Yes", "NO": "No"})

df["RainTomorrow"].value_counts()


RainTomorrow
No     110316
Yes     31877
Name: count, dtype: int64

# Feature Engineering

In [5]:
# Date features
if "Date" in df.columns:
    df["Year"] = df["Date"].dt.year
    df["Month"] = df["Date"].dt.month
    df["Day"] = df["Date"].dt.day
    df["DayOfYear"] = df["Date"].dt.dayofyear

    def season(m):
        if m in [12,1,2]: return "Summer"
        elif m in [3,4,5]: return "Autumn"
        elif m in [6,7,8]: return "Winter"
        else: return "Spring"
    
    df["Season"] = df["Month"].apply(season).astype("category")

# Weather logic features
if {"MaxTemp", "MinTemp"}.issubset(df.columns):
    df["TempDiff"] = df["MaxTemp"] - df["MinTemp"]

if {"Pressure9am", "Pressure3pm"}.issubset(df.columns):
    df["PressureDrop"] = df["Pressure9am"] - df["Pressure3pm"]

if {"Humidity3pm", "Humidity9am"}.issubset(df.columns):
    df["HumidityDiff"] = df["Humidity3pm"] - df["Humidity9am"]

if {"WindSpeed3pm", "WindSpeed9am"}.issubset(df.columns):
    df["WindSpeedChange"] = df["WindSpeed3pm"] - df["WindSpeed9am"]

if "WindGustSpeed" in df.columns:
    df["IsWindyDay"] = (df["WindGustSpeed"] > 60).astype(int)

df.head()


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Year,Month,Day,DayOfYear,Season,TempDiff,PressureDrop,HumidityDiff,WindSpeedChange,IsWindyDay
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,2008,12,1,336,Summer,9.5,0.6,-49.0,4.0,0
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,2008,12,2,337,Summer,17.7,2.8,-19.0,18.0,0
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,2008,12,3,338,Summer,12.8,-1.1,-8.0,7.0,0
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,2008,12,4,339,Summer,18.8,4.8,-29.0,-2.0,0
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,2008,12,5,340,Summer,14.8,4.8,-49.0,13.0,0


In [None]:
df["RainTomorrow_flag"] = df["RainTomorrow"].map({"Yes": 1, "No": 0})

y = df["RainTomorrow_flag"]

cols_to_drop = [
    "RainTomorrow", "RainTomorrow_flag",  
    "Date" 
]

X = df.drop(columns=[c for c in cols_to_drop if c in df.columns])

X.head()


Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Year,Month,Day,DayOfYear,Season,TempDiff,PressureDrop,HumidityDiff,WindSpeedChange,IsWindyDay
0,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,...,2008,12,1,336,Summer,9.5,0.6,-49.0,4.0,0
1,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,...,2008,12,2,337,Summer,17.7,2.8,-19.0,18.0,0
2,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,...,2008,12,3,338,Summer,12.8,-1.1,-8.0,7.0,0
3,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,...,2008,12,4,339,Summer,18.8,4.8,-29.0,-2.0,0
4,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,...,2008,12,5,340,Summer,14.8,4.8,-49.0,13.0,0


# Train / Test Split

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

X_train.shape, X_test.shape


((113754, 31), (28439, 31))

In [15]:
numeric_features = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X_train.select_dtypes(include=["category", "object"]).columns.tolist()


### fill missing values

In [16]:
location_col = "Location" if "Location" in categorical_features else None

if location_col:
    other_cats = [c for c in categorical_features if c != location_col]
else:
    other_cats = categorical_features

In [18]:
# Numeric

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])


In [None]:
from sklearn.preprocessing import OneHotEncoder
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")), 
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

In [22]:
# TargetEncoder
import category_encoders as ce

if location_col:
    target_encoder = ce.TargetEncoder(cols=[location_col])


In [None]:
# ColumnTransformer

transformers = []

# Numeric
if numeric_features:
    transformers.append(("num", numeric_transformer, numeric_features))

# Location TargetEncoder
if location_col:
    transformers.append(("loc_te", target_encoder, [location_col]))

# categoricals with OneHot
if other_cats:
    transformers.append(("cat", categorical_transformer, other_cats))
    
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=transformers,
    remainder="drop" 
)

preprocessor


0,1,2
,transformers,"[('num', ...), ('loc_te', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,verbose,0
,cols,['Location']
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [25]:
preprocessor.fit(X_train, y_train)

X_train_prepared = preprocessor.transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

X_train_prepared.shape, X_test_prepared.shape


((113754, 75), (28439, 75))