# Imports

In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor
import os
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


# Open data, edit, merge, and split

In [2]:
### Configuration
SEED = 42             # fixed seed so the train/test split is identical on every run
TRAIN_FRACTION = 0.8  # expected share of rows that end up in the training set

# Build the paths with os.path.join rather than string concatenation so the
# separators are correct on every platform.
cwd = os.getcwd()
data_dir = os.path.join(cwd, os.pardir, "data", "external", "wine_quality")
red_data_dir = os.path.join(data_dir, "winequality-red.csv")
white_data_dir = os.path.join(data_dir, "winequality-white.csv")

### Open
# Both files are read with ";" as the field separator.
red_df = pd.read_csv(red_data_dir, sep=";")
white_df = pd.read_csv(white_data_dir, sep=";")

### Add red and white ID columns
# "colour" is inserted as the first column so the origin of each row
# survives the merge below.
red_df.insert(0, "colour", "red")
white_df.insert(0, "colour", "white")

### Merge
# ignore_index=True gives the merged frame a fresh 0..n-1 index. Without it
# the red and white rows keep their own per-file indices, so index labels are
# duplicated and the labels on later predictions become ambiguous.
wine_df = pd.concat([red_df, white_df], ignore_index=True)

### Split
# Each row is assigned to the training set independently with probability
# TRAIN_FRACTION, so the realised split is only approximately 80/20. A seeded
# generator keeps the assignment reproducible across kernel restarts.
rng = np.random.default_rng(SEED)
mask = rng.random(len(wine_df)) < TRAIN_FRACTION
train_df = wine_df[mask]
test_df = wine_df[~mask]

# Sanity check: the two parts partition the merged frame exactly.
assert len(train_df) + len(test_df) == len(wine_df)

# Create Tabular object

In [3]:
# Name of the target column; later cells reuse it.
label = "quality"

# Wrap the training split in AutoGluon's dataset type.
train_data = TabularDataset(train_df)

# Summary statistics of the target within the training split.
train_data[label].describe()

count    5212.000000
mean        5.806408
std         0.863742
min         3.000000
25%         5.000000
50%         6.000000
75%         6.000000
max         9.000000
Name: quality, dtype: float64

# Training

In [4]:
# Build a regression predictor for the `label` column, then fit it on the
# training data with the "medium_quality" preset.
predictor = TabularPredictor(label=label, problem_type="regression")
predictor = predictor.fit(train_data, presets="medium_quality")

No path specified. Models will be saved in: "AutogluonModels/ag-20231206_091513"
Presets specified: ['medium_quality']
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231206_091513"
AutoGluon Version:  1.0.0
Python Version:     3.11.5
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 22.6.0: Wed Oct  4 21:25:26 PDT 2023; root:xnu-8796.141.3.701.17~4/RELEASE_X86_64
CPU Count:          4
Memory Avail:       2.84 GB / 8.00 GB (35.5%)
Disk Space Avail:   82.22 GB / 233.47 GB (35.2%)
Train Data Rows:    5212
Train Data Columns: 12
Label Column:       quality
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    2908.26 MB
	Train Data (Original)  Memory Usage: 0.74 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to

# Prediction

In [5]:
# Wrap the held-out split, then separate out the feature columns so the
# predictor never sees the target it is asked to predict.
test_data = TabularDataset(test_df)
test_features = test_data.drop(columns=[label])

# Predict the target for every held-out row and preview the first few values.
y_pred = predictor.predict(test_features)
y_pred.head()

3     5.383248
8     5.923350
10    5.151501
14    5.017485
16    5.836979
Name: quality, dtype: float32

# Evaluation

In [6]:
# Score the fitted predictor on the held-out data (label column included).
# In the recorded output the error metrics appear with a negative sign.
predictor.evaluate(data=test_data, silent=True)

{'root_mean_squared_error': -0.6494975548026399,
 'mean_squared_error': -0.4218470736946083,
 'mean_absolute_error': -0.46064129432351675,
 'r2': 0.4896904035055506,
 'pearsonr': 0.7003293029597065,
 'median_absolute_error': -0.3126258850097656}

In [7]:
# Per-model comparison on the held-out data: test and validation scores
# alongside prediction and fit times.
predictor.leaderboard(data=test_data)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTreesMSE,-0.646353,-0.632514,root_mean_squared_error,0.161858,0.062093,1.258153,0.161858,0.062093,1.258153,1,True,4
1,RandomForestMSE,-0.648951,-0.640256,root_mean_squared_error,0.140786,0.056891,3.184291,0.140786,0.056891,3.184291,1,True,3
2,WeightedEnsemble_L2,-0.649498,-0.613095,root_mean_squared_error,0.251028,0.093534,12.888216,0.002473,0.000299,0.151806,2,True,8
3,XGBoost,-0.663104,-0.618476,root_mean_squared_error,0.035449,0.01073,2.135424,0.035449,0.01073,2.135424,1,True,6
4,NeuralNetFastAI,-0.70874,-0.676803,root_mean_squared_error,0.033689,0.011305,5.030684,0.033689,0.011305,5.030684,1,True,5
5,NeuralNetTorch,-0.755813,-0.714768,root_mean_squared_error,0.017559,0.009107,4.312149,0.017559,0.009107,4.312149,1,True,7
6,KNeighborsDist,-0.769562,-0.80794,root_mean_squared_error,0.020912,0.018355,0.012854,0.020912,0.018355,0.012854,1,True,2
7,KNeighborsUnif,-0.833762,-0.871604,root_mean_squared_error,0.025099,0.02655,2.535976,0.025099,0.02655,2.535976,1,True,1
