In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the raw DGA measurements from disk.
file_path = r'D:\xjtufiles\3ee\DGA\datasets\DGA_data_all.csv'
df = pd.read_csv(file_path)

# Discard the final row of the file.
# NOTE(review): presumably a non-data trailer line -- confirm against the CSV.
df = df.iloc[:-1, :]

# Force the five gas-concentration columns and the target column to numeric;
# anything that cannot be parsed becomes NaN instead of raising.
df = df.assign(
    **{
        name: pd.to_numeric(df[name], errors='coerce')
        for name in ('h2', 'ch4', 'c2h6', 'c2h4', 'c2h2', 'act')
    }
)

# Rebuild a contiguous 0..n-1 index.
df = df.reset_index(drop=True)

In [4]:
def remove_outliers_iqr(df, columns, multiplier=5):
    """Drop rows whose values lie outside IQR-based fences.

    Columns are processed in the order given, and each column's filter is
    applied to the rows that survived the previous columns. The quartiles of
    later columns are therefore computed on already-filtered data, which makes
    the result depend on the order of ``columns``.

    Args:
        df: Input DataFrame. It is copied and never modified.
        columns: Iterable of column labels to screen. Columns whose dtype is
            not numeric are skipped.
        multiplier: Fence width in units of the IQR. Defaults to 5, the value
            that was previously hard-coded (the textbook Tukey fence is 1.5).

    Returns:
        A new DataFrame containing only the rows for which every screened
        column lies within ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]``
        (bounds inclusive). Rows holding NaN in a screened column are dropped
        because NaN never satisfies the range comparison. The original index
        labels are kept.

    Raises:
        KeyError: If a label in ``columns`` is not a column of ``df``.
        ValueError: If ``multiplier`` is negative.
    """
    if multiplier < 0:
        raise ValueError("multiplier must be non-negative")

    df_cleaned = df.copy()
    for col in columns:
        if not pd.api.types.is_numeric_dtype(df_cleaned[col]):
            continue  # nothing to measure on non-numeric data
        q1 = df_cleaned[col].quantile(0.25)
        q3 = df_cleaned[col].quantile(0.75)
        reach = multiplier * (q3 - q1)
        # `between` is inclusive on both ends and evaluates to False for NaN.
        df_cleaned = df_cleaned[df_cleaned[col].between(q1 - reach, q3 + reach)]
    return df_cleaned

# Feature columns that are screened for outliers (the target 'act' is not included)
feature_cols = ['h2', 'ch4', 'c2h6', 'c2h4', 'c2h2']

# Remove outlier rows; the result is a new frame and `df` is left unchanged
df_cleaned = remove_outliers_iqr(df, feature_cols)

# Report the shape of the data after outlier removal
print("离群值处理后数据形状：", df_cleaned.shape)

离群值处理后数据形状： (1596, 6)


In [5]:
# Make sure every feature column carries a numeric dtype; unparsable
# entries are coerced to NaN rather than raising.
df_cleaned[feature_cols] = df_cleaned[feature_cols].apply(pd.to_numeric, errors='coerce')

# Make sure the target column holds integers (raises if it contains NaN).
df_cleaned['act'] = df_cleaned['act'].astype(int)

print(df_cleaned.dtypes)

h2      float64
ch4     float64
c2h6    float64
c2h4    float64
c2h2    float64
act       int32
dtype: object


In [8]:
# Hold out 10% of the cleaned rows as a test set; random_state fixes the shuffle
# so the split is reproducible.
# NOTE(review): the split is not stratified on 'act' -- consider whether the
# class proportions in the 10% test set are representative.
from sklearn.model_selection import train_test_split
df_train, df_test= train_test_split(df_cleaned, test_size = 0.10, random_state=42)

In [10]:
# Alias for the training split (same object as df_train, not a copy).
train_data = df_train

In [11]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,h2,ch4,c2h6,c2h4,c2h2,act
1145,35.0,283.0,121.0,222.0,0.0001,5
434,56.6,366.5,161.5,519.3,1.4,2
910,19.0,62.0,95.0,5.0,0.0001,4
253,65.0,61.0,16.0,143.0,3.0,2
1046,148.7,247.5,178.7,192.2,0.9,4


In [12]:
# Name of the target column; reused by the training and prediction cells.
label = 'act'
# Summary statistics of the target values in the training split.
train_data[label].describe()

count    1436.000000
mean        3.127437
std         1.541003
min         1.000000
25%         2.000000
50%         3.000000
75%         4.000000
max         6.000000
Name: act, dtype: float64

In [13]:
# Fit an AutoGluon predictor for the `label` column on the training split.
# No presets or save path are passed, so AutoGluon falls back to its defaults
# (the captured log reports the 'medium' preset and an auto-generated model folder).
predictor = TabularPredictor(label=label).fit(train_data)

No path specified. Models will be saved in: "AutogluonModels\ag-20250306_141639"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.12
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          8
Memory Avail:       2.34 GB / 14.94 GB (15.7%)
Disk Space Avail:   142.26 GB / 244.14 GB (58.3%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.
	preset

[1000]	valid_set's multi_error: 0.166667


	0.8368	 = Validation score   (accuracy)
	3.69s	 = Training   runtime
	0.11s	 = Validation runtime
Fitting model: LightGBM ...
	0.8333	 = Validation score   (accuracy)
	2.73s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: RandomForestGini ...
	0.8542	 = Validation score   (accuracy)
	1.51s	 = Training   runtime
	0.13s	 = Validation runtime
Fitting model: RandomForestEntr ...
	0.8507	 = Validation score   (accuracy)
	1.64s	 = Training   runtime
	0.18s	 = Validation runtime
Fitting model: CatBoost ...
	0.8264	 = Validation score   (accuracy)
	14.05s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesGini ...
	0.8438	 = Validation score   (accuracy)
	1.64s	 = Training   runtime
	0.13s	 = Validation runtime
Fitting model: ExtraTreesEntr ...
	0.8403	 = Validation score   (accuracy)
	1.45s	 = Training   runtime
	0.13s	 = Validation runtime
Fitting model: XGBoost ...
	0.816	 = Validation score   (accuracy)
	1.78s	 = Training   runtime
	0.01s	 = Valid

In [14]:
# Alias for the held-out split (same object as df_test, not a copy).
test_data = df_test

# Predict on the held-out features only; the label column is dropped first.
y_pred = predictor.predict(test_data.drop(columns=[label]))
# Preview the first predictions.
y_pred.head()

1122    5
1354    6
1313    6
477     2
1888    2
Name: act, dtype: int32

In [16]:
# Score the predictor on the held-out split (label column included);
# silent=True suppresses AutoGluon's own printout, the metrics dict is the cell output.
predictor.evaluate(test_data, silent=True)

{'accuracy': 0.8875,
 'balanced_accuracy': 0.8731255636881771,
 'mcc': 0.8566444647564339}