In [1]:
import os
import pandas as pd
import random
import numpy as np
import autogluon
from autogluon.tabular import TabularDataset, TabularPredictor
import autogluon.core as ag
import warnings
warnings.filterwarnings("ignore") 

def seed_everything(seed):
    """Seed the global Python and NumPy random generators so reruns give the same draws.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.
    NOTE: the running interpreter reads ``PYTHONHASHSEED`` at startup, so that
    assignment can only influence processes started afterwards.

    :param seed: Seed value handed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Fix the global seeds before anything else runs.
seed_everything(seed=42)

# Load the train/test tables and the additional `mixed_desc.csv` table.
train, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')
original = pd.read_csv("mixed_desc.csv")

In [2]:
from pathlib import Path
from datetime import datetime
import logging

# Module-level logger; setup_outputdir uses it for its info and warning messages.
logger = logging.getLogger(__name__)

def setup_outputdir(path, warn_if_exist=True, create_dir=True, path_suffix=None):
    """Resolve (and optionally create) the directory in which models are stored.

    :param path: Target directory as ``str`` or ``pathlib.Path``. When ``None``, a
        name of the form ``AutogluonModels/ag-<UTC timestamp>`` is generated, with a
        ``-NNN`` counter appended if that name is already taken.
    :param warn_if_exist: Only consulted when `path` is given. If true, the directory
        is created (or only checked, see `create_dir`) and a warning is logged when it
        already exists. If false, the filesystem is not touched for an explicit path.
    :param create_dir: If true, create the directory; if false, only test whether it exists.
    :param path_suffix: Optional text appended to the directory name. A single trailing
        path separator on the suffix is stripped first.
    :return: The directory path as ``str``, with a leading ``~`` expanded and always
        ending in ``os.path.sep``.
    :raises RuntimeError: If no unused auto-generated directory name could be found.
    """
    if path:
        assert isinstance(path, (str, Path)), f"Only str and pathlib.Path types are supported for path, got {path} of type {type(path)}."
    if path_suffix is None:
        path_suffix = ""
    if path_suffix and path_suffix[-1] == os.path.sep:
        path_suffix = path_suffix[:-1]
    if path is not None:
        # Expand "~" BEFORE any filesystem access. Previously the expansion happened
        # after os.makedirs/os.path.isdir, so "~/models" created (or checked) a literal
        # "~" directory in the working directory while the expanded path was returned.
        path = os.path.expanduser(f"{path}{path_suffix}")
    if path is None:
        from datetime import timezone  # local import: the file only imports `datetime` itself

        # Timezone-aware replacement for the deprecated datetime.utcnow(); the
        # formatted UTC timestamp text is the same.
        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        base = f"AutogluonModels{os.path.sep}ag-{timestamp}"
        path = f"{base}{path_suffix}{os.path.sep}"
        for i in range(1, 1000):
            try:
                if create_dir:
                    os.makedirs(path, exist_ok=False)
                elif os.path.isdir(path):
                    raise FileExistsError
                break
            except FileExistsError:
                # Name already taken (e.g. another run in the same second): try the next counter.
                path = f"{base}-{i:03d}{path_suffix}{os.path.sep}"
        else:
            raise RuntimeError("more than 1000 jobs launched in the same second")
        # Level 25 sits between logging.INFO (20) and logging.WARNING (30).
        logger.log(25, f'No path specified. Models will be saved in: "{path}"')
    elif warn_if_exist:
        try:
            if create_dir:
                os.makedirs(path, exist_ok=False)
            elif os.path.isdir(path):
                raise FileExistsError
        except FileExistsError:
            logger.warning(f'Warning: path already exists! This predictor may overwrite an existing predictor! path="{path}"')
    # Callers concatenate file names directly onto the result, so guarantee a trailing separator.
    if path[-1] != os.path.sep:
        path = path + os.path.sep
    return path

In [3]:
from autogluon.tabular import TabularDataset, TabularPredictor

from autogluon.core.utils.loaders import load_pkl # 作用是加载pkl文件
from autogluon.core.utils.savers import save_pkl
import os.path


class MultilabelPredictor:
    """Train and query one TabularPredictor per label column.

    Every label in `labels` gets its own TabularPredictor, stored under
    ``<path>Predictor_<label>``. With ``consider_labels_correlation=True`` the predictor
    for the i-th label is trained with the earlier labels kept as features (only the
    later labels are dropped); otherwise all other labels are dropped.
    At prediction time each label's predictions are written into the working copy of
    the data before the next label is predicted.
    """

    # File name (inside `self.path`) under which the pickled MultilabelPredictor is stored.
    multi_predictor_file = 'multilabel_predictor.pkl'

    def __init__(self, labels, path=None, problem_types=None, eval_metrics=None, consider_labels_correlation=False, **kwargs):
        """
        :param labels: Sequence of at least two label column names (one predictor each).
        :param path: Output directory, resolved through ``setup_outputdir``.
        :param problem_types: Optional sequence with one `problem_type` per label.
        :param eval_metrics: Optional sequence with one `eval_metric` per label.
        :param consider_labels_correlation: Whether earlier labels are used as features
            for later labels (see class docstring).
        :param kwargs: Forwarded unchanged to every TabularPredictor constructor.
        :raises ValueError: If fewer than two labels are given, or `problem_types` /
            `eval_metrics` do not have the same length as `labels`.
        """
        # Own list copy: a tuple/Index would break `data[self.labels]` (a tuple is looked
        # up as ONE key), and later caller-side mutation must not affect this object.
        labels = list(labels)
        if len(labels) < 2:
            raise ValueError("MultilabelPredictor is only intended for predicting MULTIPLE labels (columns), use TabularPredictor for predicting one label (column).")
        if (problem_types is not None) and (len(problem_types) != len(labels)):
            raise ValueError("If provided, `problem_types` must have same length as `labels`")
        if (eval_metrics is not None) and (len(eval_metrics) != len(labels)):
            raise ValueError("If provided, `eval_metrics` must have same length as `labels`")
        self.path = setup_outputdir(path, warn_if_exist=False)
        self.labels = labels
        self.consider_labels_correlation = consider_labels_correlation
        self.predictors = {}  # key = label, value = TabularPredictor or str path to the TabularPredictor for this label
        if eval_metrics is None:
            self.eval_metrics = {}
        else:
            self.eval_metrics = dict(zip(labels, eval_metrics))
        for i, label in enumerate(labels):
            # str(label): column labels may be non-strings (e.g. ints); plain string
            # concatenation raised TypeError for those.
            path_i = self.path + "Predictor_" + str(label)
            problem_type = problem_types[i] if problem_types is not None else None
            eval_metric = eval_metrics[i] if eval_metrics is not None else None
            self.predictors[label] = TabularPredictor(label=label, problem_type=problem_type, eval_metric=eval_metric, path=path_i, **kwargs)

    def fit(self, train_data, tuning_data=None, **kwargs):
        """Fit one TabularPredictor per label, then save this object to disk.

        :param train_data: Training table, or a ``str`` that is passed to TabularDataset.
        :param tuning_data: Optional tuning table (same conventions as `train_data`).
        :param kwargs: Forwarded unchanged to every ``TabularPredictor.fit`` call, so
            e.g. a `time_limit` applies to each label separately.
        """
        if isinstance(train_data, str):
            train_data = TabularDataset(train_data)
        if tuning_data is not None and isinstance(tuning_data, str):
            tuning_data = TabularDataset(tuning_data)
        # Keep pristine copies; the per-label frames below are derived from these.
        train_data_og = train_data.copy()
        tuning_data_og = tuning_data.copy() if tuning_data is not None else None
        # Only record the metric chosen by each predictor when the caller gave none.
        save_metrics = len(self.eval_metrics) == 0
        for i, label in enumerate(self.labels):
            predictor = self.get_predictor(label)
            if not self.consider_labels_correlation:
                labels_to_drop = [l for l in self.labels if l != label]
            else:
                # Earlier labels stay in as features; only the later ones are removed.
                labels_to_drop = list(self.labels[i + 1:])
            train_data = train_data_og.drop(labels_to_drop, axis=1)
            if tuning_data is not None:
                tuning_data = tuning_data_og.drop(labels_to_drop, axis=1)
            print(f"Fitting TabularPredictor for label: {label} ...")
            predictor.fit(train_data=train_data, tuning_data=tuning_data, **kwargs)
            # Keep only the path in memory; get_predictor() reloads on demand.
            self.predictors[label] = predictor.path
            if save_metrics:
                self.eval_metrics[label] = predictor.eval_metric
        self.save()

    def predict(self, data, **kwargs):
        """Return a table with one prediction column per label (see `_predict`)."""
        return self._predict(data, as_proba=False, **kwargs)

    def predict_proba(self, data, **kwargs):
        """Return a dict mapping each label to its ``predict_proba`` output (see `_predict`)."""
        return self._predict(data, as_proba=True, **kwargs)

    def evaluate(self, data, **kwargs):
        """Evaluate every label's predictor on `data`.

        :param data: Table containing the label columns, or a ``str`` passed to TabularDataset.
        :param kwargs: Forwarded to ``TabularPredictor.evaluate`` and, when label
            correlation is enabled, also to ``TabularPredictor.predict``.
        :return: Dict mapping each label to the result of its predictor's ``evaluate``.
        """
        data = self._get_data(data)
        eval_dict = {}
        for label in self.labels:
            print(f"Evaluating TabularPredictor for label: {label} ...")
            predictor = self.get_predictor(label)
            eval_dict[label] = predictor.evaluate(data, **kwargs)
            if self.consider_labels_correlation:
                # Replace the true label by its prediction so later predictors see
                # predicted (not true) values for this label.
                data[label] = predictor.predict(data, **kwargs)
        return eval_dict

    def save(self):
        """ Save MultilabelPredictor to disk. """
        # Store paths instead of predictor objects so the pickle stays small.
        for label in self.labels:
            if not isinstance(self.predictors[label], str):
                self.predictors[label] = self.predictors[label].path
        save_pkl.save(path=self.path+self.multi_predictor_file, object=self)
        print(f"MultilabelPredictor saved to disk. Load with: MultilabelPredictor.load('{self.path}')")

    @classmethod
    def load(cls, path):
        """Load a MultilabelPredictor previously written by `save` from directory `path`."""
        path = os.path.expanduser(path)
        if path[-1] != os.path.sep:
            path = path + os.path.sep
        # SECURITY NOTE(review): this deserializes a .pkl file (presumably via pickle),
        # which can execute arbitrary code -- only load from trusted directories.
        return load_pkl.load(path=path+cls.multi_predictor_file)

    def get_predictor(self, label):
        """Return the TabularPredictor for `label`, loading it from disk if only its path is stored."""
        predictor = self.predictors[label]
        if isinstance(predictor, str):
            return TabularPredictor.load(path=predictor)
        return predictor

    def _get_data(self, data):
        """Return a TabularDataset built from a ``str``, otherwise a copy of `data`."""
        if isinstance(data, str):
            return TabularDataset(data)
        return data.copy()

    def _predict(self, data, as_proba=False, **kwargs):
        """Predict all labels in order on a private copy of `data`.

        :param as_proba: If true, collect ``predict_proba`` outputs per label.
        :param kwargs: Forwarded to ``predict`` / ``predict_proba`` of every predictor.
        :return: ``data[labels]`` when `as_proba` is false, else a dict label -> probabilities.
        """
        data = self._get_data(data)
        predproba_dict = {}
        for label in self.labels:
            print(f"Predicting with TabularPredictor for label: {label} ...")
            predictor = self.get_predictor(label)
            if as_proba:
                predproba_dict[label] = predictor.predict_proba(data, as_multiclass=True, **kwargs)
            # Written into `data` so it is available as a column for the following labels.
            data[label] = predictor.predict(data, **kwargs)
        if as_proba:
            return predproba_dict
        # list(...) keeps column selection working even if `labels` was restored as a tuple.
        return data[list(self.labels)]

In [4]:
# External table (`original`): keep only the columns listed in `cols`.
# NOTE: this cell rebinds `original` and `train`; re-run the loading cell before
# executing it a second time.
cols = [
    'CIDs', 'BertzCT', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3v', 'Chi4n',
    'EState_VSA1', 'EState_VSA2', 'ExactMolWt', 'FpDensityMorgan1', 'FpDensityMorgan2',
    'FpDensityMorgan3', 'HallKierAlpha', 'HeavyAtomMolWt', 'Kappa3', 'MaxAbsEStateIndex',
    'MinEStateIndex', 'NumHeteroatoms', 'PEOE_VSA10', 'PEOE_VSA14', 'PEOE_VSA6', 'PEOE_VSA7',
    'PEOE_VSA8', 'SMR_VSA10', 'SMR_VSA5', 'SlogP_VSA3', 'VSA_EState9', 'fr_COO', 'fr_COO2',
    'EC1_EC2_EC3_EC4_EC5_EC6',
]
# Explicit copy so the column assignments below act on an independent frame.
original = original.loc[:, cols].copy()
# Split the underscore-separated 'EC1_EC2_EC3_EC4_EC5_EC6' string and keep the first
# two fields, converted to int, as the EC1 and EC2 columns.
original[['EC1', 'EC2']] = original['EC1_EC2_EC3_EC4_EC5_EC6'].str.split('_', expand=True).iloc[:, :2].astype(int)
original = original.drop(columns=['EC1_EC2_EC3_EC4_EC5_EC6', 'CIDs'])
# Positional row number as `id` (0 .. n-1).
original['id'] = range(len(original))

# Train: drop the EC3..EC6 columns, then append the external rows.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# duplicated row labels from the two sources.
train = pd.concat(
    [train.drop(columns=['EC3', 'EC4', 'EC5', 'EC6']), original],
    ignore_index=True,
)

In [5]:
# Wrap the frames for AutoGluon and remove the identifier column from both.
train_data = TabularDataset(train)
train_data.drop(columns=['id'], inplace=True)
test_data = TabularDataset(test)
test_data.drop(columns=['id'], inplace=True)

# One entry per label column.
labels = ['EC1', 'EC2']
problem_types = ['regression'] * len(labels)
# NOTE(review): `eval_metrics` is defined but not passed to MultilabelPredictor where
# the predictor is constructed, and the training log reports
# -root_mean_squared_error as validation score -- confirm whether 'roc_auc' was intended.
eval_metrics = ['roc_auc'] * len(labels)

In [6]:
# Build one predictor per label, then fit them. `time_limit` (12 hours, in seconds)
# is forwarded to the fit call of every label's predictor.
multi_predictor = MultilabelPredictor(problem_types=problem_types, labels=labels)
multi_predictor.fit(train_data=train_data, time_limit=12 * 60 * 60)

Beginning AutoGluon training ... Time limit = 43200s
AutoGluon will save models to "AutogluonModels\ag-20230804_030758\Predictor_EC1\"
AutoGluon Version:  0.7.0
Python Version:     3.9.16
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Train Data Rows:    15877
Train Data Columns: 31
Label Column: EC1
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    5269.39 MB
	Train Data (Original)  Memory Usage: 3.94 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, sp

Fitting TabularPredictor for label: EC1 ...


	-0.4859	 = Validation score   (-root_mean_squared_error)
	1.66s	 = Training   runtime
	0.17s	 = Validation runtime
Fitting model: KNeighborsDist ... Training model for up to 43198.07s of the 43198.07s of remaining time.
	-0.4873	 = Validation score   (-root_mean_squared_error)
	0.02s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: LightGBMXT ... Training model for up to 43197.98s of the 43197.98s of remaining time.
	-0.4351	 = Validation score   (-root_mean_squared_error)
	0.67s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 43197.3s of the 43197.3s of remaining time.
	-0.4373	 = Validation score   (-root_mean_squared_error)
	0.34s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 43196.95s of the 43196.95s of remaining time.
	-0.4431	 = Validation score   (-root_mean_squared_error)
	7.66s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model

Fitting TabularPredictor for label: EC2 ...


	0.02s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: KNeighborsDist ... Training model for up to 43199.83s of the 43199.82s of remaining time.
	-0.4352	 = Validation score   (-root_mean_squared_error)
	0.02s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: LightGBMXT ... Training model for up to 43199.75s of the 43199.74s of remaining time.
	-0.4055	 = Validation score   (-root_mean_squared_error)
	0.33s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 43199.4s of the 43199.4s of remaining time.
	-0.406	 = Validation score   (-root_mean_squared_error)
	0.33s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 43199.06s of the 43199.06s of remaining time.
	-0.4087	 = Validation score   (-root_mean_squared_error)
	9.24s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 43189.68s of the 43

MultilabelPredictor saved to disk. Load with: MultilabelPredictor.load('AutogluonModels\ag-20230804_030758\')


In [7]:
# Inspect the models trained for label EC1.
predictor_class = multi_predictor.get_predictor('EC1') # fetch the predictor stored for label 'EC1'
predictor_class.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.434096,0.136504,20.651369,0.0,0.172418,2,True,12
1,LightGBMXT,-0.435081,0.003,0.667373,0.003,0.667373,1,True,3
2,CatBoost,-0.435684,0.005003,2.257308,0.005003,2.257308,1,True,6
3,XGBoost,-0.436835,0.004519,0.741095,0.004519,0.741095,1,True,9
4,LightGBM,-0.437272,0.002005,0.342847,0.002005,0.342847,1,True,4
5,NeuralNetFastAI,-0.437279,0.031249,11.383404,0.031249,11.383404,1,True,8
6,LightGBMLarge,-0.438752,0.002998,1.138089,0.002998,1.138089,1,True,11
7,ExtraTreesMSE,-0.442542,0.040089,1.25698,0.040089,1.25698,1,True,7
8,RandomForestMSE,-0.443106,0.043152,7.66298,0.043152,7.66298,1,True,5
9,KNeighborsUnif,-0.485949,0.172225,1.664919,0.172225,1.664919,1,True,1


In [8]:
# Predict every label for the test features; the result has one column per label.
model_pred = multi_predictor.predict(test_data)

Predicting with TabularPredictor for label: EC1 ...
Predicting with TabularPredictor for label: EC2 ...


In [9]:
# Preview the first rows of the predictions.
model_pred.head()

Unnamed: 0,EC1,EC2
0,0.411515,0.764463
1,0.817132,0.77693
2,0.791945,0.734284
3,0.763807,0.787167
4,0.816324,0.779002


In [10]:
# Keep only the identifier from `test` and attach the predicted label columns.
# Selecting ['id'] up front (rather than concatenating all columns and then dropping a
# hard-coded list of feature names in place) yields the same id/EC1/EC2 frame and keeps
# this cell re-runnable: a second run no longer produces duplicate EC1/EC2 columns.
test = pd.concat([test[['id']], model_pred], axis=1)
test.head()


Unnamed: 0,id,EC1,EC2
0,14838,0.411515,0.764463
1,14839,0.817132,0.77693
2,14840,0.791945,0.734284
3,14841,0.763807,0.787167
4,14842,0.816324,0.779002


In [12]:
# Save the id and predicted label columns as the submission file.
submission = test.loc[:, ['id', 'EC1', 'EC2']]
submission.to_csv('submission.csv', index=False)
