In [57]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
glass_identification = fetch_ucirepo(id=42) 
  
# data (as pandas dataframes) 
X = glass_identification.data.features 
y = glass_identification.data.targets 

print(X.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RI      214 non-null    float64
 1   Na      214 non-null    float64
 2   Mg      214 non-null    float64
 3   Al      214 non-null    float64
 4   Si      214 non-null    float64
 5   K       214 non-null    float64
 6   Ca      214 non-null    float64
 7   Ba      214 non-null    float64
 8   Fe      214 non-null    float64
dtypes: float64(9)
memory usage: 15.2 KB
None


In [52]:
import seaborn as sns
# %config InlineBackend.figure_format = 'svg'
%matplotlib qt5
import matplotlib.pyplot as plt
# 对特征进行配对绘图
sns.pairplot(X)
# 显示图形
plt.show()

### 预处理
> 无缺失值
>
> 异常值
>
> 归一化

In [58]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [59]:


plt.figure(figsize=(10, 6))
plt.rcParams['font.sans-serif'] = ['SimHei']  # 使用黑体
plt.rcParams['axes.unicode_minus'] = False
sns.boxplot(data=X)
plt.title('Boxplot of Glass Composition')
plt.show()


In [54]:
def filter(column):
    # 计算 Q1, Q3 和 IQR
    Q1 = X[column].quantile(0.25)
    Q3 = X[column].quantile(0.75)
    IQR = Q3 - Q1
    # 定义异常值的条件
    condition = (X[column] < (Q1 - 1.5 * IQR)) | (X[column] > (Q3 + 1.5 * IQR))
    X[column] = np.where(condition,np.nan,X[column])

for column in X.columns:
    filter(column)


In [55]:
print("brfore:\n",X.info())
X.interpolate(method='linear',inplace=True)
print("After:\n",X.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RI      197 non-null    float64
 1   Na      207 non-null    float64
 2   Mg      214 non-null    float64
 3   Al      196 non-null    float64
 4   Si      202 non-null    float64
 5   K       207 non-null    float64
 6   Ca      188 non-null    float64
 7   Ba      176 non-null    float64
 8   Fe      202 non-null    float64
dtypes: float64(9)
memory usage: 15.2 KB
brfore:
 None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RI      214 non-null    float64
 1   Na      214 non-null    float64
 2   Mg      214 non-null    float64
 3   Al      214 non-null    float64
 4   Si      214 non-null    float64
 5   K       214 non-null    float64
 6   Ca      214 non-null    float

In [56]:
plt.figure(figsize=(10, 6))
plt.rcParams['font.sans-serif'] = ['SimHei']  # 使用黑体
plt.rcParams['axes.unicode_minus'] = False
sns.boxplot(data=X)
plt.title('After preprocessed:Boxplot of Glass Composition')
plt.show()

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

*****

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [11]:
import joblib
N_CORES = joblib.cpu_count(only_physical_cores=True)
print(f"Number of physical cores: {N_CORES}")

Number of physical cores: 8


#### RandomForeast

In [22]:
from sklearn.model_selection import GridSearchCV
RmFt = RandomForestClassifier(n_estimators=100, n_jobs=N_CORES, random_state=42)
RmFT_paramas = {
    'n_estimators': [10,20,50,100 ,200],
    'max_depth': [None,5, 10],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 5]
}

RmFt_grid = GridSearchCV(RmFt, RmFT_paramas, cv=5, n_jobs=N_CORES,scoring='precision_micro')  
RmFt_grid.fit(X_train, y_train)

print(f"Best parameters: {RmFt_grid.best_params_}")
print(f"Best score: {RmFt_grid.best_score_}")



Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best score: 0.6424080549080549


In [28]:
from sklearn.metrics import precision_score, recall_score, f1_score
RmFt_best = RmFt_grid.best_estimator_
RmFt_y_pred = RmFt_best.predict(X_test)

# 计算准确率
precision = precision_score(y_test, RmFt_y_pred, average='macro')
print(f"Precision: {precision}")

# 计算查全率
recall = recall_score(y_test, RmFt_y_pred, average='micro')
print(f"Recall: {recall}")

# 计算 F1 分数
f1 = f1_score(y_test, RmFt_y_pred, average='macro')
print(f"F1 Score: {f1}")

Precision: 0.8882539682539683
Recall: 0.8153846153846154
F1 Score: 0.8220361509835193


#### GradientBoost

In [29]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# 创建梯度提升模型
gb = GradientBoostingClassifier(random_state=42)

# 定义超参数网格
gb_params = {
    'n_estimators': [10, 20, 50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 5]
}

# 创建 GridSearchCV 对象
gb_grid = GridSearchCV(gb, gb_params, cv=5, scoring='precision_micro')

# 进行超参数搜索
gb_grid.fit(X_train, y_train)

# 输出最佳参数和最佳分数
print(f"Best parameters: {gb_grid.best_params_}")
print(f"Best score: {gb_grid.best_score_}")

Best parameters: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Best score: 0.7050574712643678


In [35]:
from sklearn.metrics import precision_score, recall_score, f1_score
gb_best = gb_grid.best_estimator_
gb_y_pred =gb_best.predict(X_test)

# 计算准确率
precision = precision_score(y_test, gb_y_pred, average='macro')
print(f"Precision: {precision}")

# 计算查全率
recall = recall_score(y_test, gb_y_pred, average='micro')
print(f"Recall: {recall}")

# 计算 F1 分数
f1 = f1_score(y_test, gb_y_pred, average='micro')
print(f"F1 Score: {f1}")

Precision: 0.8075318401405358
Recall: 0.7538461538461538
F1 Score: 0.7538461538461538


In [44]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

# 创建 AdaBoost 模型
ada = AdaBoostClassifier(random_state=42)

# 定义超参数网格
ada_params = {
    'n_estimators': [10, 20, 50, 100, 150,200],
    'learning_rate': [0.01, 0.1, 1, 10]
}

# 创建 GridSearchCV 对象
ada_grid = GridSearchCV(ada, ada_params, cv=5, scoring='accuracy')

# 进行超参数搜索
ada_grid.fit(X_train, y_train)

# 输出最佳参数和最佳分数
print(f"Best parameters: {ada_grid.best_params_}")
print(f"Best score: {ada_grid.best_score_}")

Best parameters: {'learning_rate': 0.1, 'n_estimators': 10}
Best score: 0.4970114942528735


In [48]:
from sklearn.metrics import precision_score, recall_score, f1_score

ada_best = ada_grid.best_estimator_
ada_y_pred = ada_best.predict(X_test)

# 计算准确率
precision = precision_score(y_test, ada_y_pred, average='micro')
print(f"Precision: {precision}")

# 计算查全率
recall = recall_score(y_test, ada_y_pred, average='micro')
print(f"Recall: {recall}")

# 计算 F1 分数
f1 = f1_score(y_test, ada_y_pred, average='micro')
print(f"F1 Score: {f1}")

Precision: 0.4307692307692308
Recall: 0.4307692307692308
F1 Score: 0.43076923076923074
