In [1]:
import numpy as np
import pandas as pd
from tabpfn import TabPFNRegressor  
from sklearn.datasets import fetch_openml
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the food-waste / GDP / population table from disk and preview its
# first rows as the cell output.
csv_path = '../data/FoodWasteRecyleDataWithGDP-Population2002-2023.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Year,Waste Type,Total Food Waste (kt),Waste Food Disposed (kt),Total Food Waste Recycled (kt),Recycling Rate (%),Avg Monthly Disposal (kt),Avg Daily Disposal (kt),Non-Retail Food Establishments,Food Shops,Food Stalls,Supermarkets,National Environment Agency Hawker Stalls,Population (000),GDP per Capita (S$)
0,2002,Food waste,526,495,31,5.93,41.25,1.36,1256,9940,13629,207,,4163.7,46832
1,2003,Food waste,548,515,33,6.0,42.92,1.41,1135,10839,14125,212,,4114.8,46499
2,2004,Food waste,531,500,31,5.9,41.67,1.37,1128,10828,13509,237,,4166.7,50968
3,2005,Food waste,532,495,37,6.9,41.25,1.36,1119,11226,13579,257,,4265.8,54750
4,2006,Food waste,543,498,45,8.2,41.5,1.36,1154,11451,13039,255,,4401.4,59379


In [3]:
# Remove the 'Waste Type' column (it reads "Food waste" in every previewed
# row, so it presumably carries no signal -- confirm against the full file).
# The frame is rebound instead of mutated in place, and errors='ignore' makes
# the cell safe to re-run: the in-place version raised KeyError on a second
# execution because the column had already been removed.
df = df.drop(columns=['Waste Type'], errors='ignore')

In [4]:
# Turn the year values into datetimes (parsed with the '%Y' format) so the
# .dt accessor can be used on the column later on.
year_as_datetime = pd.to_datetime(df['Year'], format='%Y')
df['Year'] = year_as_datetime

In [5]:
# Fill missing hawker-stall counts with the column's median value.
hawker_col = 'National Environment Agency Hawker Stalls'
hawker_median = df[hawker_col].median()
df[hawker_col] = df[hawker_col].fillna(hawker_median)

In [6]:
# Row-over-row percentage change for each waste quantity.
# NOTE(review): pct_change compares each row with the previous one, so this
# assumes the rows are ordered by year -- confirm against the source file.
pct_change_sources = {
    "Total Food Waste % Change": "Total Food Waste (kt)",
    "Waste Disposed % Change": "Waste Food Disposed (kt)",
    "Waste Recycled % Change": "Total Food Waste Recycled (kt)",
}
for change_col, source_col in pct_change_sources.items():
    df[change_col] = df[source_col].pct_change() * 100

In [7]:
# Remove every row whose year is 2020, mutating the frame in place.
# NOTE(review): presumably 2020 is excluded as an atypical year -- confirm.
rows_2020 = df.index[df['Year'].dt.year == 2020]
df.drop(index=rows_2020, inplace=True)

In [8]:
# Rank candidate predictors by the absolute value of their correlation with
# total food waste and keep the three strongest as model features.
# NOTE(review): the ranking is computed on the whole frame, before the
# train/test split -- confirm this is acceptable for the evaluation.
target_col = 'Total Food Waste (kt)'
candidate_cols = [
    'Non-Retail Food Establishments',
    'Food Stalls',
    'Supermarkets',
    'National Environment Agency Hawker Stalls',
    'Population (000)',
    'GDP per Capita (S$)',
]

# Pairwise correlation matrix over the target plus the candidates.
corr = df[[target_col] + candidate_cols].corr()

# Correlation of each candidate with the target (the target's own entry is removed).
target_corr = corr[target_col].drop(target_col)

ranked_by_strength = target_corr.abs().sort_values(ascending=False)
top_three_features = ranked_by_strength.head(3).index.tolist()
print(top_three_features)

['Population (000)', 'Supermarkets', 'GDP per Capita (S$)']


In [9]:
# Feature matrix (the three selected columns) and regression target.
x = df[top_three_features]
y = df['Total Food Waste (kt)']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Baseline model: a TabPFN regressor with default settings, fitted on the
# training split.
regressor = TabPFNRegressor()
regressor.fit(X=x_train, y=y_train)

In [11]:
# Score the fitted TabPFN model on the held-out split.
y_pred = regressor.predict(x_test)

# The two metrics are independent of each other; both compare the held-out
# targets with the predictions.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("TabPFN R² Score:", r2)
print("TabPFN MSE:", mse)

TabPFN R² Score: 0.9621146321296692
TabPFN MSE: 454.9669494628906


In [34]:
from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import AutoTabPFNRegressor
from sklearn.model_selection import GridSearchCV

In [43]:
# Candidate 'max_time' values: 0 to 300 inclusive, in steps of 20.
# NOTE(review): no visible cell consumes this grid -- confirm it is still needed.
grid = {
    'max_time': list(range(0, 301, 20)),
}

In [54]:
# Fit the AutoTabPFN post-hoc ensemble on the training split.
# max_time=100 caps the ensemble search budget (presumably in seconds --
# confirm against the tabpfn-extensions documentation).
# random_state is pinned so the ensemble search, and therefore the reported
# scores, are repeatable on a fresh Restart & Run All; the original call left
# it unset. 42 matches the seed already used for the train/test split.
auto_regressor = AutoTabPFNRegressor(
    max_time=100,
    device="auto",
    random_state=42,
)
auto_regressor.fit(x_train, y_train)

  x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1
  x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1, 1 / (2 - lmbda))
  x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1
  x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1
INFO:tabpfn_extensions.post_hoc_ensembles.greedy_weighted_ensemble:Order of selections: [0, 15, 3, 2, 15, 3, 0, 15, 6, 0, 15, 3, 3, 15, 3, 15, 3, 2, 15, 3, 2, 15, 3, 0, 15]
INFO:tabpfn_extensions.post_hoc_ensembles.greedy_weighted_ensemble:Val loss over iterations: [284.61904332950263, 220.80562286881, 48.84011339973322, 48.84011339973322, 48.84011339973322, 40.326023178465036, 40.326023178465036, 40.326023178465036, 40.326023178465036, 40.326023178465036, 40.326023178465036, 40.326023178465036, 40.326023178465036, 40.326023178465036, 39.84547024709464, 39.84547024709464, 39.262979820209104, 39.262979820209104, 39.262979820209104, 38.65117241209309, 38.65117241209309, 38.65117241209309, 38.469361936264015, 38.469361936264015, 38.4693619362

In [55]:
# Score the fitted AutoTabPFN ensemble on the same held-out split used for
# the baseline model.
y_pred2 = auto_regressor.predict(x_test)

# Independent metrics computed from the held-out targets and predictions.
r22 = r2_score(y_test, y_pred2)
mse2 = mean_squared_error(y_test, y_pred2)

print("AutoTabPFN R² Score:", r22)
print("AutoTabPFN MSE:", mse2)

  x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1, 1 / (2 - lmbda))


AutoTabPFN R² Score: 0.9620130860933079
AutoTabPFN MSE: 456.1863685820219


In [56]:
import joblib

# Persist the fitted ensemble next to the notebook. joblib.dump returns the
# list of files it wrote, which becomes the cell output.
# NOTE: joblib files are pickle-based -- only load them from trusted sources.
joblib.dump(value=auto_regressor, filename="auto_regressor.pkl")

['auto_regressor.pkl']