# Imports

In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor
import os
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


# Open data, edit, merge, and split

In [2]:
cwd = os.getcwd()
red_data_dir = cwd + "/../data/external/wine_quality/winequality-red.csv"
white_data_dir = cwd + "/../data/external/wine_quality/winequality-white.csv"

### Open
red_df = pd.read_csv(red_data_dir, sep=";")
white_df = pd.read_csv(white_data_dir, sep=";")

### Add red and white ID columns
red_df.insert(0, "colour", "red")
white_df.insert(0, "colour", "white")

### Merge
wine_df = pd.concat([red_df, white_df])

### Split
mask = np.random.rand(len(wine_df)) < 0.8
train_df = wine_df[mask]
test_df = wine_df[~mask]

# Create Tabular object

In [3]:
train_data = TabularDataset(train_df)
label = "quality"
train_data[label].describe()

count    5141.000000
mean        5.822603
std         0.874433
min         3.000000
25%         5.000000
50%         6.000000
75%         6.000000
max         9.000000
Name: quality, dtype: float64

# Training

In [4]:
predictor = TabularPredictor(label=label, problem_type="regression").fit(train_data, presets="medium_quality")

No path specified. Models will be saved in: "AutogluonModels/ag-20231217_140452"
Presets specified: ['medium_quality']
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231217_140452"
AutoGluon Version:  1.0.0
Python Version:     3.11.5
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 22.6.0: Tue Nov  7 21:48:06 PST 2023; root:xnu-8796.141.3.702.9~2/RELEASE_X86_64
CPU Count:          4
Memory Avail:       2.16 GB / 8.00 GB (27.0%)
Disk Space Avail:   80.83 GB / 233.47 GB (34.6%)
Train Data Rows:    5141
Train Data Columns: 12
Label Column:       quality
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    2208.11 MB
	Train Data (Original)  Memory Usage: 0.73 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to 

# Prediction

In [8]:
test_data = TabularDataset(test_df)

y_pred = predictor.predict(test_data.drop(columns=[label]))
y_pred

0       5.037997
5       5.123959
13      5.315475
16      5.727938
25      5.062355
          ...   
4873    6.489223
4878    4.747540
4890    6.404612
4891    5.904160
4896    6.462568
Name: quality, Length: 1356, dtype: float32

# Evaluation

In [9]:
predictor.evaluate(test_data, silent=True)

{'root_mean_squared_error': -0.5825961109069905,
 'mean_squared_error': -0.3394182284439503,
 'mean_absolute_error': -0.4138073675048738,
 'r2': 0.5501109290326958,
 'pearsonr': 0.747064765424106,
 'median_absolute_error': -0.28075480461120605}

In [10]:
predictor.leaderboard(test_data)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.582596,-0.603747,root_mean_squared_error,0.272211,0.107154,8.458988,0.003098,0.000351,0.181165,2,True,8
1,ExtraTreesMSE,-0.590674,-0.61229,root_mean_squared_error,0.178916,0.070402,1.275244,0.178916,0.070402,1.275244,1,True,4
2,XGBoost,-0.598586,-0.629873,root_mean_squared_error,0.038167,0.009913,1.723753,0.038167,0.009913,1.723753,1,True,6
3,RandomForestMSE,-0.59929,-0.619266,root_mean_squared_error,0.141341,0.057341,4.569319,0.141341,0.057341,4.569319,1,True,3
4,NeuralNetFastAI,-0.65592,-0.679664,root_mean_squared_error,0.031075,0.011277,5.256911,0.031075,0.011277,5.256911,1,True,5
5,NeuralNetTorch,-0.698528,-0.698861,root_mean_squared_error,0.017857,0.009116,12.226673,0.017857,0.009116,12.226673,1,True,7
6,KNeighborsDist,-0.72059,-0.722298,root_mean_squared_error,0.020955,0.015211,0.021915,0.020955,0.015211,0.021915,1,True,2
7,KNeighborsUnif,-0.795544,-0.80034,root_mean_squared_error,0.072124,0.075412,10.013118,0.072124,0.075412,10.013118,1,True,1
