# Imports

In [1]:
import kagglehub
import os
import shutil

import pandas as pd

from sklearn.model_selection import train_test_split

from autogluon.tabular import TabularDataset, TabularPredictor # type: ignore

  from .autonotebook import tqdm as notebook_tqdm


# Download

In [2]:
# Download the latest version of the dataset; kagglehub returns the directory
# that holds the downloaded files.
path = kagglehub.dataset_download("jsphyg/weather-dataset-rattle-package")

print(f"Path to dataset files: {path}")

# Copy the data file into the data directory of the repo
# Create data directory if it doesn't exist
os.makedirs("../data", exist_ok=True)

# Source and destination paths
source_file = os.path.join(path, "weatherAUS.csv")
dest_file = "../data/weatherAUS.csv"

# Copy (not move) the file: shutil.copy2 leaves the downloaded original in
# place and also carries over file metadata such as timestamps.
shutil.copy2(source_file, dest_file)
print(f"Data file copied from {source_file} to {dest_file}")

Path to dataset files: /Users/richardcollins/.cache/kagglehub/datasets/jsphyg/weather-dataset-rattle-package/versions/2
Data file moved from /Users/richardcollins/.cache/kagglehub/datasets/jsphyg/weather-dataset-rattle-package/versions/2/weatherAUS.csv to ../data/weatherAUS.csv


# Explore

## Open and check

In [3]:
# Load the CSV from the repo data directory, then print its dimensions and
# the first few rows as a quick sanity check.
df = pd.read_csv("../data/weatherAUS.csv")

print("df.shape:")
print(df.shape)

print("df.head():")
print(df.head())

df.shape:
(145460, 23)
df.head():
         Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0  2008-12-01   Albury     13.4     22.9       0.6          NaN       NaN   
1  2008-12-02   Albury      7.4     25.1       0.0          NaN       NaN   
2  2008-12-03   Albury     12.9     25.7       0.0          NaN       NaN   
3  2008-12-04   Albury      9.2     28.0       0.0          NaN       NaN   
4  2008-12-05   Albury     17.5     32.3       1.0          NaN       NaN   

  WindGustDir  WindGustSpeed WindDir9am  ... Humidity9am  Humidity3pm  \
0           W           44.0          W  ...        71.0         22.0   
1         WNW           44.0        NNW  ...        44.0         25.0   
2         WSW           46.0          W  ...        38.0         30.0   
3          NE           24.0         SE  ...        45.0         16.0   
4           W           41.0        ENE  ...        82.0         33.0   

   Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3p

# Drop rows with a missing target

In [4]:
# Rows with no RainTomorrow value carry no label, so they are removed here,
# ahead of the train/test split performed later in the notebook.
print(f"Dataset shape before removing NaN in target: {df.shape}")
print(f"Number of NaN values in RainTomorrow: {df['RainTomorrow'].isna().sum()}")

# Keep only the rows whose target is present; .copy() gives an independent frame
has_target = df['RainTomorrow'].notna()
df_clean = df[has_target].copy()

print(f"Dataset shape after removing NaN in target: {df_clean.shape}")
print(f"Number of NaN values in RainTomorrow after cleaning: {df_clean['RainTomorrow'].isna().sum()}")

# Rebind df so the following cells operate on the cleaned frame
df = df_clean

Dataset shape before removing NaN in target: (145460, 23)
Number of NaN values in RainTomorrow: 3267
Dataset shape after removing NaN in target: (142193, 23)
Number of NaN values in RainTomorrow after cleaning: 0


# Declare Target and Features

In [5]:
# Name of the target column; every other column is kept as a feature
label = "RainTomorrow"
y = df[label]
X = df.drop(columns=[label])

# Split train and test

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=37,
)

# The predictor cells below take one frame that still contains the label
# column, so put features and target back side by side for each partition.
df_train = pd.concat((X_train, y_train), axis="columns")
df_test = pd.concat((X_test, y_test), axis="columns")

# Declare and train the predictor

In [7]:
# Wrap the training frame for AutoGluon, create a predictor for the target
# column, then fit it. The name is rebound to whatever fit() returns, exactly
# as the chained form would.
train_data = TabularDataset(df_train)
predictor = TabularPredictor(label=label)
predictor = predictor.fit(train_data)

No path specified. Models will be saved in: "AutogluonModels/ag-20250712_055935"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.12.11
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.5.0: Tue Apr 22 19:54:43 PDT 2025; root:xnu-11417.121.6~2/RELEASE_ARM64_T8132
CPU Count:          10
Memory Avail:       4.55 GB / 16.00 GB (28.5%)
Disk Space Avail:   352.69 GB / 460.43 GB (76.6%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competition

# Predict

In [8]:
# Wrap the held-out frame, strip the label column so that only features are
# passed to the predictor, and preview the first few predictions.
test_data = TabularDataset(df_test)
test_features = test_data.drop(columns=[label])

y_pred = predictor.predict(test_features)
y_pred.head()

138104    No
32162     No
120242    No
116622    No
56743     No
Name: RainTomorrow, dtype: object

# Evaluate

In [9]:
# Score the fitted predictor on the held-out test frame (label column
# included); the returned dict of metrics is shown as the cell output.
# NOTE(review): silent=True presumably suppresses evaluation logging — confirm
# against the AutoGluon docs for the installed version.
predictor.evaluate(test_data, silent=True)

{'accuracy': 0.8663455114455502,
 'balanced_accuracy': np.float64(0.7623193269237571),
 'mcc': np.float64(0.5895668394376516),
 'roc_auc': np.float64(0.9052601276179671),
 'f1': 0.6596830513027129,
 'precision': 0.7786937222574508,
 'recall': 0.5722273998136067}

In [10]:
# Compare every trained model on the test frame; the resulting table lists
# score_test next to score_val together with prediction and fit timings.
predictor.leaderboard(test_data)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.866346,0.8672,accuracy,0.401961,0.038997,83.013742,0.001717,0.000298,0.042292,2,True,14
1,XGBoost,0.86631,0.8616,accuracy,0.098079,0.011575,2.192455,0.098079,0.011575,2.192455,1,True,11
2,LightGBM,0.865396,0.8584,accuracy,0.076882,0.014081,1.102837,0.076882,0.014081,1.102837,1,True,4
3,NeuralNetFastAI,0.865185,0.8648,accuracy,0.233524,0.012235,42.494683,0.233524,0.012235,42.494683,1,True,10
4,CatBoost,0.862302,0.854,accuracy,0.056966,0.002607,14.669198,0.056966,0.002607,14.669198,1,True,7
5,NeuralNetTorch,0.860649,0.8636,accuracy,0.089838,0.012383,39.37393,0.089838,0.012383,39.37393,1,True,12
6,LightGBMXT,0.859911,0.852,accuracy,0.036579,0.00777,3.976544,0.036579,0.00777,3.976544,1,True,3
7,LightGBMLarge,0.858223,0.8524,accuracy,0.03538,0.005896,1.093835,0.03538,0.005896,1.093835,1,True,13
8,RandomForestEntr,0.858153,0.852,accuracy,0.618244,0.036654,8.025995,0.618244,0.036654,8.025995,1,True,6
9,RandomForestGini,0.857555,0.8528,accuracy,0.464533,0.038201,10.871004,0.464533,0.038201,10.871004,1,True,5
