<a href="https://colab.research.google.com/github/RM-RAMASAMY/CMPE-255-Assignments/blob/main/MultiLabel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install autogluon.tabular[all]


Collecting autogluon.tabular[all]
  Downloading autogluon.tabular-1.2-py3-none-any.whl.metadata (14 kB)
Collecting autogluon.core==1.2 (from autogluon.tabular[all])
  Downloading autogluon.core-1.2-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.features==1.2 (from autogluon.tabular[all])
  Downloading autogluon.features-1.2-py3-none-any.whl.metadata (11 kB)
Collecting catboost<1.3,>=1.2 (from autogluon.tabular[all])
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting boto3<2,>=1.10 (from autogluon.core==1.2->autogluon.tabular[all])
  Downloading boto3-1.35.83-py3-none-any.whl.metadata (6.7 kB)
Collecting autogluon.common==1.2 (from autogluon.core==1.2->autogluon.tabular[all])
  Downloading autogluon.common-1.2-py3-none-any.whl.metadata (11 kB)
Collecting ray<2.40,>=2.10.0 (from ray[default]<2.40,>=2.10.0; extra == "all"->autogluon.core[all]==1.2; extra == "all"->autogluon.tabular[all])
  Downloading ray-2.39.0-cp310-cp310-manylinux2014

In [2]:
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.common.utils.utils import setup_outputdir
from autogluon.core.utils.loaders import load_pkl
from autogluon.core.utils.savers import save_pkl
import os.path

class MultilabelPredictor:
    """Predict several label columns of one table, using one TabularPredictor per label.

    A separate ``TabularPredictor`` is created for every entry of ``labels``.
    With ``consider_labels_correlation=True`` the predictors form a chain in the
    order of ``labels``: the predictor for a label is fit with all *earlier*
    labels kept as ordinary columns, and at inference time those columns are
    overwritten with the earlier predictors' predictions. With
    ``consider_labels_correlation=False`` every predictor is fit with all other
    label columns dropped.

    Parameters
    ----------
    labels : sequence of str
        Columns to predict; at least two are required.
    path : str, optional
        Output directory, resolved through ``setup_outputdir``. The predictor
        of each label is stored in the sub-directory ``Predictor_<label>``.
    problem_types : sequence of str, optional
        Problem type per label, in the same order as ``labels``.
    eval_metrics : sequence, optional
        Evaluation metric per label, in the same order as ``labels``.
    consider_labels_correlation : bool, default True
        Whether to chain the predictors as described above.
    **kwargs
        Forwarded unchanged to every ``TabularPredictor`` constructor.

    Raises
    ------
    ValueError
        If fewer than two labels are given, or if ``problem_types`` /
        ``eval_metrics`` are provided with a length different from ``labels``.
    """

    # File name (inside `self.path`) under which the MultilabelPredictor itself is pickled.
    multi_predictor_file = 'multilabel_predictor.pkl'

    def __init__(self, labels, path=None, problem_types=None, eval_metrics=None, consider_labels_correlation=True, **kwargs):
        # Normalise to a list: a tuple would be read by pandas as ONE (multi-index) key
        # in `data[self.labels]` / `drop(...)` instead of as a collection of columns.
        labels = list(labels)
        if len(labels) < 2:
            raise ValueError("MultilabelPredictor is only intended for predicting MULTIPLE labels (columns), use TabularPredictor for predicting one label (column).")
        if (problem_types is not None) and (len(problem_types) != len(labels)):
            raise ValueError("If provided, `problem_types` must have same length as `labels`")
        if (eval_metrics is not None) and (len(eval_metrics) != len(labels)):
            raise ValueError("If provided, `eval_metrics` must have same length as `labels`")
        self.path = setup_outputdir(path, warn_if_exist=False)
        self.labels = labels
        self.consider_labels_correlation = consider_labels_correlation
        self.predictors = {}  # key = label, value = TabularPredictor or str path to the TabularPredictor for this label
        # Left empty when no metrics are given; fit() then records the metric each predictor reports.
        self.eval_metrics = {} if eval_metrics is None else dict(zip(labels, eval_metrics))
        for i, label in enumerate(labels):
            problem_type = problem_types[i] if problem_types is not None else None
            eval_metric = eval_metrics[i] if eval_metrics is not None else None
            path_i = os.path.join(self.path, "Predictor_" + str(label))
            self.predictors[label] = TabularPredictor(label=label, problem_type=problem_type, eval_metric=eval_metric, path=path_i, **kwargs)

    def fit(self, train_data, tuning_data=None, **kwargs):
        """Fit one TabularPredictor per label, then save this object to disk.

        Parameters
        ----------
        train_data : str or DataFrame-like
            Training table; a string is loaded with ``TabularDataset``.
        tuning_data : str or DataFrame-like, optional
            Validation table, handled the same way as ``train_data``.
        **kwargs
            Forwarded unchanged to every ``TabularPredictor.fit`` call.
        """
        if isinstance(train_data, str):
            train_data = TabularDataset(train_data)
        if isinstance(tuning_data, str):
            tuning_data = TabularDataset(tuning_data)
        train_data_og = train_data.copy()
        tuning_data_og = tuning_data.copy() if tuning_data is not None else None
        # Only record the predictors' metrics when the caller did not choose them.
        save_metrics = len(self.eval_metrics) == 0
        for i, label in enumerate(self.labels):
            predictor = self.get_predictor(label)
            if self.consider_labels_correlation:
                # Chain mode: earlier labels stay as features, later labels are hidden.
                labels_to_drop = list(self.labels[i + 1:])
            else:
                labels_to_drop = [l for l in self.labels if l != label]
            train_i = train_data_og.drop(labels_to_drop, axis=1)
            tuning_i = tuning_data_og.drop(labels_to_drop, axis=1) if tuning_data_og is not None else None
            print(f"Fitting TabularPredictor for label: {label} ...")
            predictor.fit(train_data=train_i, tuning_data=tuning_i, **kwargs)
            # Keep only the on-disk location; get_predictor() reloads it on demand.
            self.predictors[label] = predictor.path
            if save_metrics:
                self.eval_metrics[label] = predictor.eval_metric
        self.save()

    def predict(self, data, **kwargs):
        """Return a DataFrame with one column of predictions per label.

        ``data`` may be a path/URL string or a DataFrame-like object; ``**kwargs``
        are forwarded to every ``TabularPredictor.predict`` call.
        """
        return self._predict(data, as_proba=False, **kwargs)

    def predict_proba(self, data, **kwargs):
        """Return a dict mapping each label to its predictor's ``predict_proba`` output.

        For a label whose predictor reports ``can_predict_proba`` as False, the
        plain predictions are stored instead. ``**kwargs`` are forwarded to every
        ``TabularPredictor.predict_proba`` call.
        """
        return self._predict(data, as_proba=True, **kwargs)

    def evaluate(self, data, **kwargs):
        """Evaluate every label's predictor on ``data``.

        Parameters
        ----------
        data : str or DataFrame-like
            Table containing the true values of all labels.
        **kwargs
            Forwarded to every ``TabularPredictor.evaluate`` call.

        Returns
        -------
        dict
            ``{label: result of that predictor's evaluate()}``.
        """
        data = self._get_data(data)
        eval_dict = {}
        for i, label in enumerate(self.labels):
            print(f"Evaluating TabularPredictor for label: {label} ...")
            predictor = self.get_predictor(label)
            eval_dict[label] = predictor.evaluate(data, **kwargs)
            if self._is_needed_downstream(i):
                # Later predictors must see the *predicted* value of this label, exactly
                # as they would at inference time. Only pass on the keyword arguments
                # predict() understands: evaluate-only options would make it raise.
                predict_kwargs = self._kwargs_accepted_by(predictor.predict, kwargs)
                data[label] = predictor.predict(data, **predict_kwargs)
        return eval_dict

    def save(self):
        """ Save MultilabelPredictor to disk. """
        # Replace live predictors by their paths so the pickle stays small and
        # each TabularPredictor is restored from its own directory.
        for label in self.labels:
            if not isinstance(self.predictors[label], str):
                self.predictors[label] = self.predictors[label].path
        save_pkl.save(path=os.path.join(self.path, self.multi_predictor_file), object=self)
        print(f"MultilabelPredictor saved to disk. Load with: MultilabelPredictor.load('{self.path}')")

    @classmethod
    def load(cls, path):
        """ Load MultilabelPredictor from disk `path` previously specified when creating this MultilabelPredictor. """
        path = os.path.expanduser(path)
        # NOTE(review): load_pkl presumably unpickles the file, which can execute
        # arbitrary code - only load directories from a trusted source.
        return load_pkl.load(path=os.path.join(path, cls.multi_predictor_file))

    def get_predictor(self, label):
        """Return the TabularPredictor for ``label``, loading it from disk if only its path is stored."""
        predictor = self.predictors[label]
        if isinstance(predictor, str):
            return TabularPredictor.load(path=predictor)
        return predictor

    def _get_data(self, data):
        """Return a private table: load it when ``data`` is a string, otherwise copy it."""
        if isinstance(data, str):
            return TabularDataset(data)
        # Copy so that writing predicted label columns never touches the caller's object.
        return data.copy()

    def _is_needed_downstream(self, position):
        """True when the label at ``position`` is used as a feature by a later predictor."""
        return bool(self.consider_labels_correlation) and position < len(self.labels) - 1

    @staticmethod
    def _kwargs_accepted_by(func, kwargs):
        """Return the subset of ``kwargs`` that ``func`` can receive by keyword."""
        import inspect

        if not kwargs:
            return {}
        try:
            params = inspect.signature(func).parameters
        except (TypeError, ValueError):
            # Signature not introspectable: forward everything and let `func` decide.
            return dict(kwargs)
        if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()):
            return dict(kwargs)
        by_keyword = (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY)
        return {name: value for name, value in kwargs.items()
                if name in params and params[name].kind in by_keyword}

    def _predict(self, data, as_proba=False, **kwargs):
        """Shared implementation of predict() and predict_proba().

        Walks the labels in order. When ``as_proba`` is False every label's
        predictions are written into the working copy of ``data`` and the label
        columns are returned. When ``as_proba`` is True a dict of per-label
        probability outputs is returned, and plain predictions are computed only
        where a later predictor needs them as a feature.
        """
        data = self._get_data(data)
        predproba_dict = {}
        for i, label in enumerate(self.labels):
            print(f"Predicting with TabularPredictor for label: {label} ...")
            predictor = self.get_predictor(label)
            if not as_proba:
                data[label] = predictor.predict(data, **kwargs)
                continue
            # kwargs were written for predict_proba(); keep only what predict() accepts.
            predict_kwargs = self._kwargs_accepted_by(predictor.predict, kwargs)
            # NOTE(review): recent AutoGluon releases appear to reject predict_proba()
            # for problem types without probabilities (e.g. regression) and expose
            # `can_predict_proba` for this check - confirm against the installed version.
            if getattr(predictor, "can_predict_proba", True):
                predproba_dict[label] = predictor.predict_proba(data, as_multiclass=True, **kwargs)
                if self._is_needed_downstream(i):
                    data[label] = predictor.predict(data, **predict_kwargs)
            else:
                label_pred = predictor.predict(data, **predict_kwargs)
                predproba_dict[label] = label_pred
                if self._is_needed_downstream(i):
                    data[label] = label_pred
        if as_proba:
            return predproba_dict
        return data[list(self.labels)]

In [3]:
# Load the training split and immediately reduce it to a small, seeded sample.
subsample_size = 500  # subsample subset of data for faster demo, try setting this to much larger values
train_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv'
).sample(n=subsample_size, random_state=0)
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
6118,51,Private,39264,Some-college,10,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,>50K
23204,58,Private,51662,10th,6,Married-civ-spouse,Other-service,Wife,White,Female,0,0,8,United-States,<=50K
29590,40,Private,326310,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,44,United-States,<=50K
18116,37,Private,222450,HS-grad,9,Never-married,Sales,Not-in-family,White,Male,0,2339,40,El-Salvador,<=50K
33964,62,Private,109190,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024,0,40,United-States,>50K


In [6]:
# One row per target column: (label, problem type, evaluation metric).
# Transposing the rows yields the three parallel lists MultilabelPredictor expects.
labels, problem_types, eval_metrics = map(list, zip(
    ('education-num', 'regression', 'mean_absolute_error'),
    ('education', 'multiclass', 'accuracy'),
    ('class', 'binary', 'accuracy'),
))
save_path = 'agModels-predictEducationClass'  # specifies folder to store trained models (optional)

time_limit = 60  # how many seconds to train the TabularPredictor for each label, set much larger in your applications!

In [7]:
# Build one predictor per configured label, then train each for `time_limit` seconds.
multi_predictor = MultilabelPredictor(
    labels=labels,
    path=save_path,
    problem_types=problem_types,
    eval_metrics=eval_metrics,
)
multi_predictor.fit(train_data=train_data, time_limit=time_limit)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Memory Avail:       11.10 GB / 12.67 GB (87.6%)
Disk Space Avail:   74.33 GB / 107.72 GB (69.0%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.
	presets='good'         : Good accuracy with ver

Fitting TabularPredictor for label: education-num ...


	-2.086	 = Validation score   (-mean_absolute_error)
	0.02s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: KNeighborsDist ... Training model for up to 59.80s of the 59.79s of remaining time.
	-2.1856	 = Validation score   (-mean_absolute_error)
	0.01s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: LightGBMXT ... Training model for up to 59.77s of the 59.76s of remaining time.
Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

	-1.7808	 = Validation score   (-mean_absolute_error)
	1.8s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 57.94s of the 57.93s of remaining time.
	-1.7854	 = Validation score   (-mean_absolute_error)
	0.34s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 57.57s of t

Fitting TabularPredictor for label: education ...


		('int', [])       : 6 | ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', ...]
		('int', ['bool']) : 1 | ['sex']
	0.1s = Fit runtime
	13 features in original data used to generate 13 features in processed data.
	Train Data (Processed) Memory Usage: 0.03 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.2s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.2, Train Rows: 390, Val Rows: 98
User-specified model hyperparameters to be fit:
{
	'NN_TORCH': [{}],
	'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, {'learning_rate': 0.03, 'num_leaves': 128, 'feature_fraction': 0.9, 'min_data_in_leaf': 3, 'ag_args': {'name_suffix': 'Large', 'priority': 0, 'hyperparameter_tune_kwargs': None}}],
	'CAT': [{}],
	'XGB': [{}],
	'FASTAI': [{}],
	'RF': [{'criterion': '

Fitting TabularPredictor for label: class ...


		('object', []) : 8 | ['workclass', 'education', 'marital-status', 'occupation', 'relationship', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('category', [])  : 7 | ['workclass', 'education', 'marital-status', 'occupation', 'relationship', ...]
		('int', [])       : 6 | ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', ...]
		('int', ['bool']) : 1 | ['sex']
	0.2s = Fit runtime
	14 features in original data used to generate 14 features in processed data.
	Train Data (Processed) Memory Usage: 0.03 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.21s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.2, Train Rows: 400, Val Rows: 100
User-specified model hyperparameters to be fit:
{
	'NN_TORCH': [{}],
	'GBM': [{'extra_trees': True, 'ag_args': 

MultilabelPredictor saved to disk. Load with: MultilabelPredictor.load('/content/agModels-predictEducationClass')


In [8]:
# Load the test split, keep a seeded sample of the same size as the training sample,
# and derive a label-free view of it.
test_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv'
).sample(n=subsample_size, random_state=0)
test_data_nolab = test_data.drop(labels, axis=1)  # unnecessary, just to demonstrate we're not cheating here
test_data_nolab.head()

Loaded data from: https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv | Columns = 15 / 15 | Rows = 9769 -> 9769


Unnamed: 0,age,workclass,fnlwgt,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
5454,41,Self-emp-not-inc,408498,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States
6111,39,Private,746786,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,55,United-States
5282,50,Private,62593,Married-civ-spouse,Farming-fishing,Husband,Asian-Pac-Islander,Male,0,0,40,United-States
3046,31,Private,248178,Married-civ-spouse,Other-service,Husband,Black,Male,0,0,35,United-States
2162,43,State-gov,52849,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States


In [9]:
# Reloading is unnecessary here; it just demonstrates how to restore a
# previously-trained multilabel predictor from its save directory.
multi_predictor = MultilabelPredictor.load(path=save_path)

# Predict all label columns for the label-free test sample.
predictions = multi_predictor.predict(data=test_data_nolab)
print("Predictions:  \n", predictions)

Predicting with TabularPredictor for label: education-num ...
Predicting with TabularPredictor for label: education ...
Predicting with TabularPredictor for label: class ...
Predictions:  
       education-num      education   class
5454      10.019720   Some-college    >50K
6111      12.256890        HS-grad    >50K
5282       9.378137        HS-grad    >50K
3046       9.313288        HS-grad   <=50K
2162      13.225019      Bachelors    >50K
...             ...            ...     ...
6965       9.365185        HS-grad    >50K
4762       9.465967        HS-grad   <=50K
234        9.721886        HS-grad   <=50K
6291       9.694777        HS-grad   <=50K
9575       9.403322        HS-grad    >50K

[500 rows x 3 columns]


In [10]:
# Score every label's predictor on the labelled test sample and show which
# metric each label was configured with.
evaluations = multi_predictor.evaluate(data=test_data)
print(evaluations)
print("Evaluated using metrics:", multi_predictor.eval_metrics)

Evaluating TabularPredictor for label: education-num ...
Evaluating TabularPredictor for label: education ...
Evaluating TabularPredictor for label: class ...
{'education-num': {'mean_absolute_error': -1.6540824069976807, 'root_mean_squared_error': -2.3013422423663012, 'mean_squared_error': -5.296176116499557, 'r2': 0.3151817321777344, 'pearsonr': 0.5797738922295388, 'median_absolute_error': -0.9387617111206055}, 'education': {'accuracy': 0.28, 'balanced_accuracy': 0.07289523325482246, 'mcc': 0.05657274447502478}, 'class': {'accuracy': 0.814, 'balanced_accuracy': 0.7052696462525335, 'mcc': 0.4716790589850321, 'roc_auc': 0.8472805532919618, 'f1': 0.5714285714285714, 'precision': 0.7045454545454546, 'recall': 0.4806201550387597}}
Evaluated using metrics: {'education-num': 'mean_absolute_error', 'education': 'accuracy', 'class': 'accuracy'}


In [11]:
# Pull out the single-label predictor for 'class' and list the models it trained.
predictor_class = multi_predictor.get_predictor(label='class')
predictor_class.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,XGBoost,0.86,accuracy,0.00864,0.309764,0.00864,0.309764,1,True,11
1,WeightedEnsemble_L2,0.86,accuracy,0.009559,0.412008,0.000918,0.102244,2,True,14
2,CatBoost,0.85,accuracy,0.005226,2.487293,0.005226,2.487293,1,True,7
3,LightGBM,0.85,accuracy,0.006812,0.368301,0.006812,0.368301,1,True,4
4,NeuralNetFastAI,0.84,accuracy,0.016831,0.766431,0.016831,0.766431,1,True,10
5,RandomForestGini,0.84,accuracy,0.089613,0.763478,0.089613,0.763478,1,True,5
6,LightGBMLarge,0.83,accuracy,0.00504,0.521096,0.00504,0.521096,1,True,13
7,LightGBMXT,0.83,accuracy,0.005239,0.36116,0.005239,0.36116,1,True,3
8,NeuralNetTorch,0.83,accuracy,0.013572,1.661514,0.013572,1.661514,1,True,12
9,RandomForestEntr,0.83,accuracy,0.05955,1.02164,0.05955,1.02164,1,True,6
