# Imports

In [8]:
from autogluon.tabular import TabularDataset, TabularPredictor
import os
import numpy as np
import pandas as pd

# Load data, tag each wine with its colour, merge, and split into train/test

In [9]:
# Tunable settings for the random train/test split.
SPLIT_SEED = 42        # fixed seed so the split is reproducible across runs
TRAIN_FRACTION = 0.8   # expected share of rows assigned to the training set

# The CSVs are located relative to the directory the kernel was started in.
cwd = os.getcwd()
wine_quality_dir = os.path.join(cwd, "..", "data", "external", "wine_quality")
red_data_dir = os.path.join(wine_quality_dir, "winequality-red.csv")
white_data_dir = os.path.join(wine_quality_dir, "winequality-white.csv")

### Open
# Both files are semicolon-delimited.
red_df = pd.read_csv(red_data_dir, sep=";")
white_df = pd.read_csv(white_data_dir, sep=";")

### Add red and white ID columns
# The new "colour" column is placed first so the origin survives the merge.
red_df.insert(0, "colour", "red")
white_df.insert(0, "colour", "white")

### Merge
# ignore_index=True gives every row a unique label; without it both frames
# keep their own 0-based index and the merged frame has duplicate labels.
wine_df = pd.concat([red_df, white_df], ignore_index=True)

### Split
# Each row independently lands in the training set with probability
# TRAIN_FRACTION, so the realised ratio is approximate. A seeded generator
# makes the draw identical on every Restart & Run All.
rng = np.random.default_rng(SPLIT_SEED)
mask = rng.random(len(wine_df)) < TRAIN_FRACTION
train_df = wine_df[mask]
test_df = wine_df[~mask]

# The two splits must partition the merged frame exactly.
assert len(train_df) + len(test_df) == len(wine_df)

# Create the training TabularDataset and inspect the target column

In [10]:
# Column the model will learn to predict.
label = "quality"

# Wrap the training split in AutoGluon's dataset type.
train_data = TabularDataset(train_df)

# Summary statistics of the target within the training split.
train_data[label].describe()

count    5214.000000
mean        5.826621
std         0.875664
min         3.000000
25%         5.000000
50%         6.000000
75%         6.000000
max         9.000000
Name: quality, dtype: float64

# Training

In [11]:
# Configure a regression predictor on the target column. No presets, time
# limit or save path are passed, so AutoGluon's defaults apply.
predictor = TabularPredictor(label=label, problem_type="regression")

# Train on the training split; `predictor` is bound to whatever fit() returns.
predictor = predictor.fit(train_data)

No path specified. Models will be saved in: "AutogluonModels/ag-20231205_144103"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231205_144103"
AutoGluon Version:  1.0.0
Python Version:     3.11.5
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 22.6.0: Wed Oct  4 21:25:26 PDT 2023; root:xnu-8796.141.3.701.17~4/RELEASE_X86_64
C

# Prediction

In [13]:
# Wrap the held-out split in AutoGluon's dataset type.
test_data = TabularDataset(test_df)

# Predict from the feature columns only: the target is removed beforehand.
y_pred = predictor.predict(
    test_data.drop(label, axis="columns")
)

# Peek at the first few predictions.
y_pred.head()

8     5.478715
13    5.306387
22    5.246006
26    5.411850
28    5.214265
Name: quality, dtype: float32

# Evaluation

In [14]:
# Score the predictor on the held-out split (target column included).
# NOTE: error metrics come back negated because AutoGluon reports scores in
# "higher is better" form, so values closer to zero are better.
predictor.evaluate(data=test_data, silent=True)

{'root_mean_squared_error': -0.6213086485184605,
 'mean_squared_error': -0.3860244367238358,
 'mean_absolute_error': -0.4518892833801888,
 'r2': 0.4811905271432706,
 'pearsonr': 0.69630169102161,
 'median_absolute_error': -0.3289504051208496}

In [15]:
# Per-model comparison; passing the held-out split adds test-set columns
# (e.g. score_test, pred_time_test) next to the validation ones.
predictor.leaderboard(data=test_data)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTreesMSE,-0.610308,-0.632747,root_mean_squared_error,0.210578,0.059784,1.217504,0.210578,0.059784,1.217504,1,True,4
1,WeightedEnsemble_L2,-0.621309,-0.624532,root_mean_squared_error,0.512509,0.200362,12.307997,0.003867,0.00028,0.141958,2,True,8
2,RandomForestMSE,-0.628298,-0.628359,root_mean_squared_error,0.218742,0.081114,3.413419,0.218742,0.081114,3.413419,1,True,3
3,XGBoost,-0.635051,-0.657876,root_mean_squared_error,0.029678,0.007345,1.876313,0.029678,0.007345,1.876313,1,True,6
4,NeuralNetFastAI,-0.664281,-0.674931,root_mean_squared_error,0.032615,0.034622,5.644249,0.032615,0.034622,5.644249,1,True,5
5,NeuralNetTorch,-0.711262,-0.717353,root_mean_squared_error,0.017452,0.009598,6.140254,0.017452,0.009598,6.140254,1,True,7
6,KNeighborsDist,-0.761512,-0.759131,root_mean_squared_error,0.017029,0.017217,0.014554,0.017029,0.017217,0.014554,1,True,2
7,KNeighborsUnif,-0.818035,-0.826153,root_mean_squared_error,0.017996,0.014972,0.115506,0.017996,0.014972,0.115506,1,True,1
