In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw data set from CSV.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
file_path = r'D:\xjtufiles\3ee\DGA\datasets\DGA_data_all.csv'
df = pd.read_csv(file_path)

# Drop the last row of the file. The explicit copy makes the column assignments
# below act on an independent frame instead of a slice of the raw read, which
# avoids SettingWithCopyWarning in pandas < 3.
df = df.iloc[:-1, :].copy()

# Coerce the five gas columns and the label to numeric dtype; values that
# cannot be parsed become NaN (errors='coerce').
numeric_cols = ['h2', 'ch4', 'c2h6', 'c2h4', 'c2h2', 'act']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df = df.reset_index(drop=True)  # renumber the rows from 0

In [4]:
def remove_outliers_iqr(df, columns, multiplier=5):
    """Drop rows whose values lie outside the IQR fences of the given columns.

    Columns are processed in the order given. For each numeric column, only rows
    whose value lies inside ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]``
    (bounds inclusive) are kept. Filtering is cumulative: the quartiles of a
    column are computed on the rows that survived the previous columns.

    Rows holding NaN in a checked column are dropped as well, because NaN fails
    the range test. Columns with a non-numeric dtype are skipped.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Iterable of column names to screen.
        multiplier: Width of the fences in units of IQR. Defaults to 5, the
            value this function previously hard-coded.

    Returns:
        A new, independent DataFrame containing the surviving rows with their
        original index labels.

    Raises:
        KeyError: If a name in ``columns`` is not a column of ``df``.
    """
    df_cleaned = df.copy()
    for col in columns:
        if not pd.api.types.is_numeric_dtype(df_cleaned[col]):
            continue
        q1 = df_cleaned[col].quantile(0.25)
        q3 = df_cleaned[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - multiplier * iqr
        upper_bound = q3 + multiplier * iqr
        # between() is inclusive on both ends and evaluates to False for NaN,
        # matching an explicit (>= lower) & (<= upper) mask.
        df_cleaned = df_cleaned[df_cleaned[col].between(lower_bound, upper_bound)]
    # Return an independent copy so callers can assign columns on the result
    # without pandas treating it as a slice of another frame.
    return df_cleaned.copy()

# Feature columns that are screened for outliers
feature_cols = [
    'h2',
    'ch4',
    'c2h6',
    'c2h4',
    'c2h2',
]

# Filter out the rows flagged as outliers in any of the feature columns
df_cleaned = remove_outliers_iqr(df=df, columns=feature_cols)

print("离群值处理后数据形状：", df_cleaned.shape)

离群值处理后数据形状： (1596, 6)


In [5]:
# 'act' was parsed with errors='coerce', so it may contain NaN, and the outlier
# filter only screens the feature columns. astype(int) raises on NaN and rows
# without a label cannot be used for supervised training, so drop them first.
# The copy makes the assignments below modify an independent frame rather than
# a filtered slice (avoids SettingWithCopyWarning in pandas < 3).
df_cleaned = df_cleaned.dropna(subset=['act']).copy()

# Make sure every feature column is numeric (unparsable values become NaN)
for col in feature_cols:
    df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')

# Make sure the target column is integer-typed
df_cleaned['act'] = df_cleaned['act'].astype(int)

print(df_cleaned.dtypes)

h2      float64
ch4     float64
c2h6    float64
c2h4    float64
c2h2    float64
act       int32
dtype: object


# 原始数据

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 10 % of the cleaned rows for testing; the fixed seed keeps the split reproducible.
df_train, df_test = train_test_split(
    df_cleaned,
    test_size=0.10,
    random_state=42,
)

In [10]:
# Use the training split under the name that is passed to the AutoGluon fit below.
train_data = df_train

In [11]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,h2,ch4,c2h6,c2h4,c2h2,act
1145,35.0,283.0,121.0,222.0,0.0001,5
434,56.6,366.5,161.5,519.3,1.4,2
910,19.0,62.0,95.0,5.0,0.0001,4
253,65.0,61.0,16.0,143.0,3.0,2
1046,148.7,247.5,178.7,192.2,0.9,4


In [12]:
# Name of the target column to predict.
label = 'act'
# Summary statistics of the target column in the training split.
# NOTE(review): 'act' looks like an integer class id (1-6 in the recorded
# output), so value_counts() may be more informative than mean/std - confirm.
train_data[label].describe()

count    1436.000000
mean        3.127437
std         1.541003
min         1.000000
25%         2.000000
50%         3.000000
75%         4.000000
max         6.000000
Name: act, dtype: float64

In [13]:
# Train AutoGluon models on the raw-feature training split. No save path or
# presets are passed, so AutoGluon falls back to its defaults (see the log below).
predictor = TabularPredictor(label=label).fit(train_data)

No path specified. Models will be saved in: "AutogluonModels\ag-20250306_141639"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.12
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          8
Memory Avail:       2.34 GB / 14.94 GB (15.7%)
Disk Space Avail:   142.26 GB / 244.14 GB (58.3%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.
	preset

[1000]	valid_set's multi_error: 0.166667


	0.8368	 = Validation score   (accuracy)
	3.69s	 = Training   runtime
	0.11s	 = Validation runtime
Fitting model: LightGBM ...
	0.8333	 = Validation score   (accuracy)
	2.73s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: RandomForestGini ...
	0.8542	 = Validation score   (accuracy)
	1.51s	 = Training   runtime
	0.13s	 = Validation runtime
Fitting model: RandomForestEntr ...
	0.8507	 = Validation score   (accuracy)
	1.64s	 = Training   runtime
	0.18s	 = Validation runtime
Fitting model: CatBoost ...
	0.8264	 = Validation score   (accuracy)
	14.05s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesGini ...
	0.8438	 = Validation score   (accuracy)
	1.64s	 = Training   runtime
	0.13s	 = Validation runtime
Fitting model: ExtraTreesEntr ...
	0.8403	 = Validation score   (accuracy)
	1.45s	 = Training   runtime
	0.13s	 = Validation runtime
Fitting model: XGBoost ...
	0.816	 = Validation score   (accuracy)
	1.78s	 = Training   runtime
	0.01s	 = Valid

In [14]:
# Held-out split used for evaluation.
test_data = df_test

# Predict from the test features only (label column removed) and preview the result.
y_pred = predictor.predict(test_data.drop(columns=[label]))
y_pred.head()

1122    5
1354    6
1313    6
477     2
1888    2
Name: act, dtype: int32

In [16]:
# Score the predictor on the labelled test split and return the metrics as a dict.
predictor.evaluate(test_data, silent=True)

{'accuracy': 0.8875,
 'balanced_accuracy': 0.8731255636881771,
 'mcc': 0.8566444647564339}

In [17]:
# Compare every trained model on the test split (score_test next to score_val).
predictor.leaderboard(test_data)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost,0.89375,0.826389,accuracy,0.051987,0.004999,14.046545,0.051987,0.004999,14.046545,1,True,8
1,ExtraTreesEntr,0.89375,0.840278,accuracy,0.238987,0.134983,1.450976,0.238987,0.134983,1.450976,1,True,10
2,XGBoost,0.8875,0.815972,accuracy,0.152904,0.006998,1.78111,0.152904,0.006998,1.78111,1,True,11
3,RandomForestGini,0.8875,0.854167,accuracy,0.224515,0.134745,1.514286,0.224515,0.134745,1.514286,1,True,6
4,ExtraTreesGini,0.8875,0.84375,accuracy,0.249062,0.134575,1.638796,0.249062,0.134575,1.638796,1,True,9
5,RandomForestEntr,0.8875,0.850694,accuracy,0.269929,0.179665,1.637594,0.269929,0.179665,1.637594,1,True,7
6,WeightedEnsemble_L2,0.8875,0.857639,accuracy,0.463583,0.248732,5.492348,0.007999,0.003003,0.284625,2,True,14
7,LightGBMLarge,0.88125,0.829861,accuracy,0.133726,0.026997,4.013716,0.133726,0.026997,4.013716,1,True,13
8,LightGBMXT,0.88125,0.836806,accuracy,0.231069,0.110984,3.693437,0.231069,0.110984,3.693437,1,True,4
9,LightGBM,0.8625,0.833333,accuracy,0.142108,0.045002,2.730601,0.142108,0.045002,2.730601,1,True,5


# 三比值法（另加 C2H6/CH4，共四个比值）：C2H2/C2H4, CH4/H2, C2H6/CH4, C2H4/C2H6

In [18]:
# Imports for a SMOTE + XGBoost example (none of these names are used in the cells shown here)
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV

df_cleaned_trinomial = df_cleaned.copy()

# Build the gas-ratio features from the frame's own columns. The previous
# version read them from `X`, a name that is never defined in this notebook,
# so the cell failed with NameError on a fresh kernel.
# The small constant in the denominator avoids division by zero.
# NOTE(review): when a denominator is exactly 0 the ratio becomes very large
# (about 1.9e+08 in the recorded output); confirm this is acceptable for the model.
ratio_pairs = [
    ('ch4', 'h2'),     # ratio 1: CH4 / H2
    ('c2h6', 'ch4'),   # ratio 2: C2H6 / CH4
    ('c2h4', 'c2h6'),  # ratio 3: C2H4 / C2H6
    ('c2h2', 'c2h4'),  # ratio 4: C2H2 / C2H4
]
for numerator, denominator in ratio_pairs:
    df_cleaned_trinomial[f'{numerator}/{denominator}'] = (
        df_cleaned_trinomial[numerator] / (df_cleaned_trinomial[denominator] + 1e-6)
    )

df_cleaned_trinomial

Unnamed: 0,h2,ch4,c2h6,c2h4,c2h2,act,ch4/h2,c2h6/ch4,c2h4/c2h6,c2h2/c2h4
1,200.83,59.45,41.000,40.48,150.840,1,0.296022,0.689655,9.873170e-01,3.726284
2,71.60,20.20,2.700,34.60,44.200,1,0.282123,0.133663,1.281481e+01,1.277457
3,29.30,24.90,7.300,11.40,7.500,1,0.849829,0.293173,1.561644e+00,0.657895
5,255.00,48.00,8.300,33.00,29.800,1,0.188235,0.172917,3.975903e+00,0.903030
6,138.50,68.40,0.000,193.60,137.000,1,0.493863,0.000000,1.936000e+08,0.707645
...,...,...,...,...,...,...,...,...,...,...
1986,124.00,166.00,87.000,59.00,0.001,4,1.338710,0.524096,6.781609e-01,0.000017
1992,240.00,20.00,5.000,28.00,96.000,3,0.083333,0.250000,5.599999e+00,3.428571
1993,160.00,10.00,3.000,1.00,1.000,3,0.062500,0.300000,3.333332e-01,0.999999
1994,56.00,61.00,75.000,32.00,31.000,3,1.089286,1.229508,4.266667e-01,0.968750


In [19]:
from sklearn.model_selection import train_test_split
# Hold out 10 % of the ratio-augmented rows for testing, with the same test
# fraction and seed as the raw-feature split.
df_cleaned_trinomial_train, df_cleaned_trinomial_test = train_test_split(
    df_cleaned_trinomial,
    test_size=0.10,
    random_state=42,
)

In [20]:
# Use the ratio-augmented training split under the name passed to the AutoGluon fit below.
trinomial_train_data = df_cleaned_trinomial_train

In [21]:
# Preview the first rows of the ratio-augmented training split.
trinomial_train_data.head()

Unnamed: 0,h2,ch4,c2h6,c2h4,c2h2,act,ch4/h2,c2h6/ch4,c2h4/c2h6,c2h2/c2h4
1145,35.0,283.0,121.0,222.0,0.0001,5,8.085714,0.427562,1.834711,4.504504e-07
434,56.6,366.5,161.5,519.3,1.4,2,6.475265,0.440655,3.21548,0.002695937
910,19.0,62.0,95.0,5.0,0.0001,4,3.263158,1.532258,0.052632,2e-05
253,65.0,61.0,16.0,143.0,3.0,2,0.938462,0.262295,8.937499,0.02097902
1046,148.7,247.5,178.7,192.2,0.9,4,1.664425,0.72202,1.075546,0.004682622


In [23]:
# Target column name (same value as assigned for the raw-feature model).
label = 'act'
# Summary statistics of the target column in the ratio-augmented training split.
trinomial_train_data[label].describe()

count    1436.000000
mean        3.127437
std         1.541003
min         1.000000
25%         2.000000
50%         3.000000
75%         4.000000
max         6.000000
Name: act, dtype: float64

In [None]:
# Train a second AutoGluon predictor on the data that includes the ratio features.
# NOTE(review): this cell has no execution count or output, so it appears not to have been run yet.
predictor_trinomial = TabularPredictor(label=label).fit(trinomial_train_data)

In [None]:
# Held-out ratio-augmented split used for evaluation.
trinomial_test_data = df_cleaned_trinomial_test

# Predict from the test features only (label column removed) and preview the result.
y_pred_trinomial = predictor_trinomial.predict(trinomial_test_data.drop(columns=[label]))
y_pred_trinomial.head()

In [None]:
# Score the ratio-feature predictor on its labelled test split.
predictor_trinomial.evaluate(trinomial_test_data, silent=True)

In [None]:
# Compare every model of the ratio-feature predictor on the test split.
predictor_trinomial.leaderboard(trinomial_test_data)