In [32]:
import pretrait_tools as pt
import feature_engineering as fe  

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import pandas as pd
import nltk
from nltk.corpus import stopwords

In [56]:
# Relative path of the raw dialog file that feeds the whole notebook.
INPUT_FILE = "../data/all.tsv"

# ---------- Run the full preprocessing pipeline ----------
# All three steps are helpers from the project-local `pretrait_tools` module (`pt`).
dialogs = pt.load_dialogs(INPUT_FILE)
df = pt.flatten_dialogs(dialogs)
# The second return value is bound to `mlb`
# (presumably a fitted label binarizer -- TODO confirm in pretrait_tools).
df, mlb = pt.binarize_labels(df)

# Show the first 20 rows as a sanity check
print(df.head(20)) 

    dialog_id  turn_id                                               text  \
0           0        0  as i am trying ti log in to my kindle fire it ...   
1           0        1  hello robert the amazon fire phone is no longe...   
2           0        2               i have a fire and hdx , same on both   
3           0        3  hi robert just additional information to norma...   
4           1        0  i am unable to send emails to group contacts w...   
5           1        1  hi jill , this kind of error can occur if your...   
6           1        2  i am running a microsoft office 365 pro plus ,...   
7           1        3  we thank you for the update that you have prov...   
8           2        0  i bought this game , do n't judge it 's fun , ...   
9           2        1  hi , this issue can be caused by corrupted win...   
10          2        2  as already stated i already tried running wsre...   
11          2        3  start by running the windows store apps troubl...   

In [34]:
from feature_utils import *

In [35]:
import nltk
# Fetch the NLTK resources this notebook relies on.
nltk.download('punkt')          # word / sentence tokenization
nltk.download('vader_lexicon')  # used for sentiment analysis


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/langlang056/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/langlang056/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [57]:
import re, nltk, pandas as pd, numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import SnowballStemmer

# ========= Preprocessing resources =========
# Hand-written positive-word lexicon; tokens are counted against it per utterance.
pos_dict = {
    "good",
    "great",
    "excellent",
    "positive",
    "fortunate",
    "correct",
    "superior",
    "happy",
    "joy",
    "love",
    "wonderful",
    "amazing",
    "awesome",
    "delight",
    "fantastic",
}

# Hand-written negative-word lexicon, the counterpart of pos_dict.
neg_dict = {
    "bad",
    "terrible",
    "awful",
    "negative",
    "unfortunate",
    "wrong",
    "inferior",
    "sad",
    "anger",
    "hate",
    "horrible",
    "disgust",
    "worse",
    "pain",
    "fail",
}

# Shared NLTK helpers: VADER sentiment scorer and English Snowball stemmer.
sia = SentimentIntensityAnalyzer()
stemmer = SnowballStemmer('english')


def tokenize(x):
    """Lower-case the string ``x`` and split it into tokens with NLTK's word tokenizer.

    Plain word tokenization is all that is needed here (as in the paper being followed).
    """
    lowered = x.lower()
    return nltk.word_tokenize(lowered)

# ========= Fit TF-IDF =========
# A custom tokenizer is supplied, so the default token_pattern is never used;
# token_pattern=None says so explicitly and silences scikit-learn's
# "token_pattern will not be used" warning. lowercase=False because
# tokenize() already lower-cases its input.
vec = TfidfVectorizer(tokenizer=tokenize, lowercase=False, min_df=2,
                      token_pattern=None)
# NOTE(review): the vocabulary and IDF weights are fitted on every utterance in
# df, i.e. before the train/test split performed further down. Fitting on the
# training dialogs only would be cleaner (no test-set statistics in the
# similarity features).
vec.fit(df['text'])

# ========= Feature function =========
# Regexes are compiled once here instead of once per utterance (and per word).
_W5H1_WORDS = ["who", "what", "when", "where", "why", "how"]
_W5H1_PATTERNS = {w: re.compile(fr'\b{w}\b', re.I) for w in _W5H1_WORDS}
_THANKS_PATTERN = re.compile(r'\bthanks?\b', re.I)
_FEEDBACK_PATTERN = re.compile(
    r'\b(very|extremely)\s+(good|helpful|useful|nice|excellent)\b', re.I)


def extract_dialog_features(dialog):
    """Compute per-utterance features for one dialog and attach them as columns.

    Parameters
    ----------
    dialog : pandas.DataFrame
        Rows belonging to a single ``dialog_id``, already sorted by ``turn_id``.
        Only the ``text`` column is read. Must contain at least one row.

    Returns
    -------
    pandas.DataFrame
        A new frame (``DataFrame.assign``) with the original columns plus:
        ``init_sim`` / ``thread_sim`` (TF-IDF cosine similarity to the first
        utterance / to the mean of all previous utterances), ``qm``, ``dup``,
        ``abs_pos``, ``norm_pos``, ``post_len``, ``uniq_len``, ``uniq_stm``,
        ``thank``, ``exclam``, ``ve_feedback``, the four VADER scores
        (``pos_score``, ``neg_score``, ``neu_score``, ``comp_score``),
        ``pos_cnt`` / ``neg_cnt`` lexicon hits and one ``<word>_flag`` column
        per 5W1H question word.

    Relies on the notebook-level ``vec``, ``tokenize``, ``stemmer``, ``sia``,
    ``pos_dict`` and ``neg_dict``.
    """
    utters = dialog['text'].tolist()
    n_utters = len(utters)
    tfidf = vec.transform(utters)
    first_vec = tfidf[0]

    # Similarity of every utterance to the opening utterance.
    init_sim = cosine_similarity(tfidf, first_vec).ravel()

    # Similarity of utterance i to the mean of utterances 0..i-1; the first
    # utterance has no history and is given 1.0.
    thread_sim = [1.0]
    running_sum = first_vec
    for i in range(1, n_utters):
        thread_sim.append(cosine_similarity(tfidf[i], running_sum / i).item())
        # Explicit (non in-place) addition: builds a new sparse row each time.
        running_sum = running_sum + tfidf[i]

    # Surface cues: question mark, exact (case-insensitive) repeat of an
    # earlier utterance in the same dialog.
    qm = ["?" in u for u in utters]
    dup = pd.Series(utters).str.lower().duplicated().tolist()

    # One 0/1 flag list per question word.
    w5h1_flags = {
        f'{w}_flag': [int(bool(_W5H1_PATTERNS[w].search(u))) for u in utters]
        for w in _W5H1_WORDS
    }

    # 1-based position in the dialog, absolute and normalised by dialog length.
    abs_pos = np.arange(1, n_utters + 1)
    norm_pos = abs_pos / n_utters

    # Length features: token count, unique tokens, unique stems.
    tokens = [tokenize(u) for u in utters]
    post_len = [len(t) for t in tokens]
    uniq_len = [len(set(t)) for t in tokens]
    uniq_stm = [len({stemmer.stem(w) for w in t}) for t in tokens]

    # Politeness / feedback cues.
    thx = [bool(_THANKS_PATTERN.search(u)) for u in utters]
    excl = ["!" in u for u in utters]
    vef = [bool(_FEEDBACK_PATTERN.search(u)) for u in utters]

    # Sentiment: VADER polarity scores plus hits in the small word lexicons.
    vader = [sia.polarity_scores(u) for u in utters]
    pos_cnt = [sum(w in pos_dict for w in t) for t in tokens]
    neg_cnt = [sum(w in neg_dict for w in t) for t in tokens]

    # --- Attach everything to (a copy of) the dialog frame ---
    dialog = dialog.assign(
        init_sim=init_sim, thread_sim=thread_sim,
        qm=qm, dup=dup,
        abs_pos=abs_pos, norm_pos=norm_pos,
        post_len=post_len, uniq_len=uniq_len, uniq_stm=uniq_stm,
        thank=thx, exclam=excl, ve_feedback=vef,
        pos_score=[v['pos'] for v in vader],
        neg_score=[v['neg'] for v in vader],
        neu_score=[v['neu'] for v in vader],
        comp_score=[v['compound'] for v in vader],
        pos_cnt=pos_cnt, neg_cnt=neg_cnt,
        **w5h1_flags
    )
    return dialog

# ========= Apply to the whole data set =========
# Iterate over the groups explicitly instead of groupby(...).apply(...):
# recent pandas deprecates handing the grouping column (dialog_id) to the
# applied function, and later cells still need that column. Groups come out in
# dialog_id order and each one keeps its turn_id order, so the row order
# matches the sorted frame.
df = pd.concat(
    [extract_dialog_features(dialog)
     for _, dialog in df.sort_values(['dialog_id', 'turn_id']).groupby('dialog_id')]
)

# df now carries all numeric / binary feature columns and can be fed to a
# model or written to disk as is.


  .apply(extract_dialog_features))


In [58]:
print(df.head(5))  # preview the first rows, now including the engineered columns

   dialog_id  turn_id                                               text  \
0          0        0  as i am trying ti log in to my kindle fire it ...   
1          0        1  hello robert the amazon fire phone is no longe...   
2          0        2               i have a fire and hdx , same on both   
3          0        3  hi robert just additional information to norma...   
4          1        0  i am unable to send emails to group contacts w...   

  labels   role  is_starter  CQ  FD  FQ  GG  ...  neu_score  comp_score  \
0   [OQ]   User           1   0   0   0   0  ...      0.826      0.3182   
1   [PA]  Agent           0   0   0   0   0  ...      0.411     -0.1531   
2   [FD]  Agent           0   0   1   0   0  ...      0.714     -0.3400   
3   [PA]  Agent           0   0   0   0   0  ...      0.889      0.3612   
4   [OQ]   User           1   0   0   0   0  ...      0.661      0.3182   

   pos_cnt  neg_cnt  who_flag  what_flag  when_flag  where_flag  why_flag  \
0        0     

In [59]:
# 随机森林
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GroupShuffleSplit
import seaborn as sns
import matplotlib.pyplot as plt


In [60]:
# One-hot label columns (use the column names actually present in the table)
onehot_cols = [
    'CQ','FD','FQ','GG','IR','JK','NF','O',
    'OQ','PA','PF','RQ'
]

# ---- Target y_all: taken straight from the one-hot columns ----
y_all = df[onehot_cols].astype(int).values        # shape = (n_samples, n_labels)

# ---- Features X_all: everything except ids, text and label columns ----
# 'label_list' / 'clean_label_list' are list-valued helper columns that the
# noisy-label cell adds to df. Excluding them keeps this cell re-runnable after
# that cell has executed (astype(float) cannot convert list values); when the
# columns are absent the extra names are simply ignored.
drop_cols = ['dialog_id','turn_id','text','labels','role',
             'label_list','clean_label_list'] + onehot_cols
feature_cols = [c for c in df.columns if c not in drop_cols]

X_all = df[feature_cols].fillna(0).astype(float).values
groups = df['dialog_id'].values                  # used for the dialog-level split



In [61]:

from sklearn.model_selection import GroupShuffleSplit

# One 80/20 split grouped by dialog_id, so a dialog never straddles train and test.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X_all, y_all, groups))

X_train, y_train = X_all[train_idx], y_all[train_idx]
X_test, y_test = X_all[test_idx], y_all[test_idx]


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# Base forest; class_weight is configured here, on the base model itself.
rf = RandomForestClassifier(
    n_estimators=300,
    class_weight='balanced',
    random_state=42,
)

# Multi-output wrapper around the forest, trained on the multi-label y_train.
clf = MultiOutputClassifier(estimator=rf)
clf.fit(X_train, y_train)



In [67]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# === Predicted probabilities: positive-class column of every per-label model ===
proba_list = [model.predict_proba(X_test)[:, 1] for model in clf.estimators_]
y_proba = np.column_stack(proba_list)

# === Threshold at 0.5 ===
y_pred = np.greater_equal(y_proba, 0.5).astype(int)

print("=== 预测结果 ===")
print("y_pred.shape:", y_pred.shape)
print("y_pred[:5]:", y_pred[:5])
print("y_test[:5]:", y_test[:5])

# === Metrics: exact-match accuracy, micro / macro F1, per-label report ===
subset_acc = accuracy_score(y_test, y_pred)
micro_f1 = f1_score(y_test, y_pred, average='micro')
macro_f1 = f1_score(y_test, y_pred, average='macro')

print("Exact-match accuracy:", subset_acc)
print("Micro-F1:", micro_f1.round(3))
print("Macro-F1:", macro_f1.round(3))

report = classification_report(y_test, y_pred,
                               target_names=onehot_cols,
                               zero_division=0, digits=3)
print(report)



=== 预测结果 ===
y_pred.shape: (1958, 12)
y_pred[:5]: [[0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0]]
y_test[:5]: [[0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0]]
Exact-match accuracy: 0.449438202247191
Micro-F1: 0.594
Macro-F1: 0.312
              precision    recall  f1-score   support

          CQ      0.000     0.000     0.000        96
          FD      0.531     0.204     0.295       417
          FQ      0.481     0.102     0.168       128
          GG      0.742     0.333     0.460        69
          IR      0.519     0.147     0.229       191
          JK      0.571     0.286     0.381        14
          NF      1.000     0.018     0.035       113
           O      0.000     0.000     0.000         5
          OQ      0.973     0.964     0.968       445
          PA      0.760     0.748     0.754       738

In [71]:
# Random forest with noisy-label handling
# All one-hot label columns (same list as before)
onehot_cols = ['CQ','FD','FQ','GG','IR','JK','NF','O','OQ','PA','PF','RQ']

# Tags treated as noise when they appear next to any other tag
noisy_tags = {'GG', 'JK', 'O'}

def clean_labels(label_list):
    """Drop noisy tags from ``label_list`` when at least one other tag is present.

    Parameters
    ----------
    label_list : iterable of str
        Tags of one utterance.

    Returns
    -------
    list of str
        De-duplicated tags in their original order. If the utterance carries
        only noisy tags (or no tag at all) they are kept unchanged, so no
        utterance that had a label ends up without one.

    The order is deterministic (first occurrence wins) rather than the
    arbitrary iteration order of a set.
    """
    informative = [tag for tag in label_list if tag not in noisy_tags]
    # Fall back to the original tags when nothing but noise is present.
    kept = informative if informative else label_list
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(kept))

import ast
from sklearn.preprocessing import MultiLabelBinarizer

# Make sure every `labels` entry is a real list: string entries are parsed
# with ast.literal_eval, anything else is passed through untouched.
df['label_list'] = df['labels'].apply(
    lambda raw: raw if not isinstance(raw, str) else ast.literal_eval(raw)
)
df['clean_label_list'] = df['label_list'].apply(clean_labels)

# Re-binarize the cleaned tags using the fixed column order of onehot_cols.
mlb = MultiLabelBinarizer(classes=onehot_cols)
y_all = mlb.fit_transform(df['clean_label_list'])

# Features are built as before; the two helper columns added above are
# excluded as well.
drop_cols = [
    'dialog_id', 'turn_id', 'text', 'labels', 'role',
    'label_list', 'clean_label_list',
    *onehot_cols,
]
feature_cols = [col for col in df.columns if col not in drop_cols]
X_all = df[feature_cols].fillna(0).astype(float).values

# Dialog-level 80/20 split
groups = df['dialog_id'].values
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X_all, y_all, groups))

X_train, y_train = X_all[train_idx], y_all[train_idx]
X_test, y_test = X_all[test_idx], y_all[test_idx]

# Quick look at the first training rows
print(X_train[:5], y_train[:5], sep="\n")



[[ 1.00000000e+00  1.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  1.00000000e+00  2.50000000e-01  3.20000000e+01
   2.70000000e+01  2.70000000e+01  0.00000000e+00  0.00000000e+00
   0.00000000e+00  1.05000000e-01  6.80000000e-02  8.26000000e-01
   3.18200000e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
   1.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [ 0.00000000e+00  1.31466291e-01  1.31466291e-01  0.00000000e+00
   0.00000000e+00  2.00000000e+00  5.00000000e-01  1.00000000e+01
   1.00000000e+01  1.00000000e+01  0.00000000e+00  0.00000000e+00
   0.00000000e+00  2.74000000e-01  3.15000000e-01  4.11000000e-01
  -1.53100000e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [ 0.00000000e+00  1.58431439e-01  2.71627630e-01  0.00000000e+00
   0.00000000e+00  3.00000000e+00  7.50000000e-01  1.00000000e+01
   1.00000000e+01  1.00000000e+01  0.0

In [72]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# Base forest with balanced class weights (set on the base model itself).
rf = RandomForestClassifier(random_state=42,
                            n_estimators=300,
                            class_weight='balanced')

# Wrap it for multi-label output and train on the multi-label y_train.
clf = MultiOutputClassifier(estimator=rf)
clf.fit(X_train, y_train)

In [73]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# === Predicted probabilities: positive-class column of every per-label model ===
proba_list = [model.predict_proba(X_test)[:, 1] for model in clf.estimators_]
y_proba = np.column_stack(proba_list)

# === Threshold at 0.5 ===
y_pred = np.greater_equal(y_proba, 0.5).astype(int)

print("=== 预测结果 ===")
print("y_pred.shape:", y_pred.shape)
print("y_pred[:5]:", y_pred[:5])
print("y_test[:5]:", y_test[:5])

# === Metrics: exact-match accuracy, micro / macro F1, per-label report ===
subset_acc = accuracy_score(y_test, y_pred)
micro_f1 = f1_score(y_test, y_pred, average='micro')
macro_f1 = f1_score(y_test, y_pred, average='macro')

print("Exact-match accuracy:", subset_acc)
print("Micro-F1:", micro_f1.round(3))
print("Macro-F1:", macro_f1.round(3))

report = classification_report(y_test, y_pred,
                               target_names=onehot_cols,
                               zero_division=0, digits=3)
print(report)

=== 预测结果 ===
y_pred.shape: (1958, 12)
y_pred[:5]: [[0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0]]
y_test[:5]: [[0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0]]
Exact-match accuracy: 0.449438202247191
Micro-F1: 0.594
Macro-F1: 0.312
              precision    recall  f1-score   support

          CQ      0.000     0.000     0.000        96
          FD      0.531     0.204     0.295       417
          FQ      0.481     0.102     0.168       128
          GG      0.742     0.333     0.460        69
          IR      0.519     0.147     0.229       191
          JK      0.571     0.286     0.381        14
          NF      1.000     0.018     0.035       113
           O      0.000     0.000     0.000         5
          OQ      0.973     0.964     0.968       445
          PA      0.760     0.748     0.754       738

In [43]:
# AdaBoost
import pandas as pd, numpy as np
from sklearn.model_selection import GroupShuffleSplit

# ---- 1.1 Targets: one-hot dialog-act columns ----
onehot_cols = [
    'CQ','FD','FQ','GG','IR','JK','NF','O',
    'OQ','PA','PF','RQ'         # adjust to the column names actually in the table
]
y_all = df[onehot_cols].astype(int).values      # (n_samples, n_labels)

# ---- 1.2 Features: drop text, label and string columns ----
# 'label_list' / 'clean_label_list' are list-valued helper columns added to df
# by the noisy-label cell that precedes this one in the notebook; without
# excluding them astype(float) fails when the notebook is run top to bottom.
drop_cols = ['dialog_id','turn_id','text','labels','role',
             'label_list','clean_label_list'] + onehot_cols
feature_cols = [c for c in df.columns if c not in drop_cols]
X_all   = df[feature_cols].fillna(0).astype(float).values
groups  = df['dialog_id'].values


In [44]:
# One 80/20 split grouped by dialog_id, so a dialog never straddles train and test.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X_all, y_all, groups))
X_train, y_train = X_all[train_idx], y_all[train_idx]
X_test, y_test = X_all[test_idx], y_all[test_idx]

# Peek at the first five rows of each partition.
print(X_train[:5], y_train[:5], X_test[:5], y_test[:5], sep="\n")


[[ 1.00000000e+00  1.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  1.00000000e+00  2.50000000e-01  3.20000000e+01
   2.70000000e+01  2.70000000e+01  0.00000000e+00  0.00000000e+00
   0.00000000e+00  1.05000000e-01  6.80000000e-02  8.26000000e-01
   3.18200000e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
   1.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [ 0.00000000e+00  1.31466291e-01  1.31466291e-01  0.00000000e+00
   0.00000000e+00  2.00000000e+00  5.00000000e-01  1.00000000e+01
   1.00000000e+01  1.00000000e+01  0.00000000e+00  0.00000000e+00
   0.00000000e+00  2.74000000e-01  3.15000000e-01  4.11000000e-01
  -1.53100000e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [ 0.00000000e+00  1.58431439e-01  2.71627630e-01  0.00000000e+00
   0.00000000e+00  3.00000000e+00  7.50000000e-01  1.00000000e+01
   1.00000000e+01  1.00000000e+01  0.0

In [45]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier

# Base learner: a depth-1 decision stump (same choice as the paper: non-linear, yet unlikely to overfit)
base_stump = DecisionTreeClassifier(max_depth=1, random_state=42)

# NOTE(review): algorithm='SAMME.R' appears to be deprecated in recent
# scikit-learn releases -- confirm against the installed version before
# re-running; switching to 'SAMME' would change the reported numbers.
adb = AdaBoostClassifier(
        estimator      = base_stump,
        n_estimators   = 300,        # the paper uses a value in the hundreds
        learning_rate  = 0.1,
        algorithm      = 'SAMME.R',  # real-valued probability outputs
        random_state   = 42)

# One boosted model per label, fitted in parallel (n_jobs=-1).
clf = MultiOutputClassifier(adb, n_jobs=-1)
clf.fit(X_train, y_train)




In [46]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report

# ---- 4.1 Probability of class 1 for every label ----
proba_list = [model.predict_proba(X_test)[:, 1] for model in clf.estimators_]
y_proba = np.column_stack(proba_list)

# ---- 4.2 Threshold at 0.5 ----
y_pred = np.greater_equal(y_proba, 0.5).astype(int)

# ---- 4.3 Metrics: exact-match accuracy and micro / macro F1 ----
subset_acc = accuracy_score(y_test, y_pred)
micro_f1, macro_f1 = (
    f1_score(y_test, y_pred, average=avg) for avg in ('micro', 'macro')
)

print(
    f"Subset-Acc : {subset_acc:.3f}",
    f"Micro-F1   : {micro_f1:.3f}",
    f"Macro-F1   : {macro_f1:.3f}\n",
    sep="\n",
)

report = classification_report(
    y_test, y_pred,
    target_names=onehot_cols,
    zero_division=0, digits=3)
print(report)


Subset-Acc : 0.427
Micro-F1   : 0.576
Macro-F1   : 0.235

              precision    recall  f1-score   support

          CQ      0.000     0.000     0.000        96
          FD      0.626     0.137     0.224       417
          FQ      0.000     0.000     0.000       128
          GG      0.556     0.145     0.230        69
          IR      0.556     0.052     0.096       191
          JK      0.500     0.071     0.125        14
          NF      0.000     0.000     0.000       113
           O      0.000     0.000     0.000         5
          OQ      0.968     0.964     0.966       445
          PA      0.760     0.741     0.750       738
          PF      0.675     0.313     0.427       179
          RQ      0.000     0.000     0.000        86

   micro avg      0.807     0.447     0.576      2481
   macro avg      0.387     0.202     0.235      2481
weighted avg      0.615     0.447     0.480      2481
 samples avg      0.552     0.498     0.514      2481



In [47]:
# svm1
import numpy as np, pandas as pd
from sklearn.model_selection import GroupShuffleSplit

# --- Targets: one-hot dialog-act columns ---
onehot_cols = ['CQ','FD','FQ','GG','IR','JK','NF','O',
               'OQ','PA','PF','RQ']
y_all = df[onehot_cols].astype(int).values          # (n_samples, n_labels)

# --- Features ---
# 'label_list' / 'clean_label_list' are list-valued helper columns added to df
# by the noisy-label cell earlier in the notebook; they must be excluded or
# astype(float) fails when the notebook is run top to bottom.
drop_cols = ['dialog_id','turn_id','text','labels','role',
             'label_list','clean_label_list'] + onehot_cols
feature_cols = [c for c in df.columns if c not in drop_cols]
X_all  = df[feature_cols].fillna(0).astype(float).values
groups = df['dialog_id'].values

# --- Train/test split grouped by dialog ---
gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
train_idx, test_idx = next(gss.split(X_all, y_all, groups))
X_train, X_test = X_all[train_idx], X_all[test_idx]
y_train, y_test = y_all[train_idx], y_all[test_idx]


In [48]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Scale features to unit variance before the linear SVM. with_mean=False is
# the setting recommended for sparse input; X_train here is a dense array, so
# it only means the features are not centred (kept to preserve the original
# preprocessing).
# random_state is fixed so the run is reproducible, like the other seeded
# models in this notebook.
svc = make_pipeline(
        StandardScaler(with_mean=False),
        LinearSVC(C=1.0, class_weight='balanced', random_state=42)   # C can be tuned
      )

# One binary SVM per label.
clf = OneVsRestClassifier(svc, n_jobs=-1)
clf.fit(X_train, y_train)

# --- Signed distance to the separating hyperplane ---
y_score = clf.decision_function(X_test)            # shape = (n_samples, n_labels)
y_pred  = (y_score >= 0).astype(int)               # distance >= 0 counts as label present


In [49]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Exact-match (subset) accuracy and micro / macro averaged F1 over all labels.
subset_acc = accuracy_score(y_test, y_pred)
micro_f1, macro_f1 = (
    f1_score(y_test, y_pred, average=avg) for avg in ('micro', 'macro')
)

print(
    f"Subset-Acc : {subset_acc:.3f}",
    f"Micro-F1   : {micro_f1:.3f}",
    f"Macro-F1   : {macro_f1:.3f}\n",
    sep="\n",
)

# Per-label precision / recall / F1.
report = classification_report(
    y_test, y_pred,
    target_names=onehot_cols,
    zero_division=0, digits=3)
print(report)


Subset-Acc : 0.197
Micro-F1   : 0.455
Macro-F1   : 0.365

              precision    recall  f1-score   support

          CQ      0.134     0.677     0.224        96
          FD      0.340     0.683     0.454       417
          FQ      0.214     0.711     0.329       128
          GG      0.225     0.870     0.357        69
          IR      0.274     0.738     0.399       191
          JK      0.024     0.714     0.046        14
          NF      0.125     0.566     0.205       113
           O      0.007     0.400     0.015         5
          OQ      0.975     0.964     0.969       445
          PA      0.647     0.828     0.726       738
          PF      0.305     0.810     0.443       179
          RQ      0.118     0.826     0.206        86

   micro avg      0.319     0.796     0.455      2481
   macro avg      0.282     0.732     0.365      2481
weighted avg      0.500     0.796     0.581      2481
 samples avg      0.442     0.817     0.525      2481



In [50]:
# svm2: linear SVM with calibrated probabilities
from sklearn.calibration import CalibratedClassifierCV
# Imported here as well so the cell does not depend on the previous SVM cell
# having been executed.
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# random_state is fixed so the run is reproducible, like the other seeded
# models in this notebook.
# NOTE(review): unlike svm1 the features are not scaled before the SVM here --
# confirm that this difference is intended.
base_svc  = LinearSVC(C=1.0, class_weight='balanced', random_state=42)
svc_cal   = CalibratedClassifierCV(base_svc, method='sigmoid', cv=3)  # Platt scaling
clf_prob  = OneVsRestClassifier(svc_cal, n_jobs=-1)
clf_prob.fit(X_train, y_train)

# --- Probability matrix: positive-class column of every per-label model ---
proba_list = [est.predict_proba(X_test)[:, 1] for est in clf_prob.estimators_]
y_proba = np.column_stack(proba_list)              # (n_samples, n_labels)
y_pred  = (y_proba >= 0.5).astype(int)


In [51]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Exact-match (subset) accuracy and micro / macro averaged F1 over all labels.
subset_acc = accuracy_score(y_test, y_pred)
micro_f1, macro_f1 = (
    f1_score(y_test, y_pred, average=avg) for avg in ('micro', 'macro')
)

print(
    f"Subset-Acc : {subset_acc:.3f}",
    f"Micro-F1   : {micro_f1:.3f}",
    f"Macro-F1   : {macro_f1:.3f}\n",
    sep="\n",
)

# Per-label precision / recall / F1.
report = classification_report(
    y_test, y_pred,
    target_names=onehot_cols,
    zero_division=0, digits=3)
print(report)


Subset-Acc : 0.404
Micro-F1   : 0.550
Macro-F1   : 0.225

              precision    recall  f1-score   support

          CQ      0.000     0.000     0.000        96
          FD      0.527     0.070     0.123       417
          FQ      0.444     0.031     0.058       128
          GG      0.571     0.116     0.193        69
          IR      0.500     0.105     0.173       191
          JK      0.000     0.000     0.000        14
          NF      0.500     0.035     0.066       113
           O      0.000     0.000     0.000         5
          OQ      0.975     0.964     0.969       445
          PA      0.711     0.707     0.709       738
          PF      0.592     0.235     0.336       179
          RQ      0.750     0.035     0.067        86

   micro avg      0.772     0.428     0.550      2481
   macro avg      0.464     0.191     0.225      2481
weighted avg      0.644     0.428     0.457      2481
 samples avg      0.523     0.475     0.488      2481



In [52]:
# Naive Bayes (Bernoulli)
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import make_pipeline

# Min-max scaling squeezes the continuous features into [0, 1]; alpha=1.0 is
# Laplace smoothing.
# NOTE(review): BernoulliNB is built with its default binarization setting, so
# the scaled values are presumably thresholded again inside the model --
# confirm that this is intended.
pipe = make_pipeline(MinMaxScaler(), BernoulliNB(alpha=1.0))

# One pipeline per label, fitted in parallel.
clf = MultiOutputClassifier(pipe, n_jobs=-1).fit(X_train, y_train)

# ---- Positive-class probability per label, thresholded at 0.5 ----
proba_list = [model.predict_proba(X_test)[:, 1] for model in clf.estimators_]
y_proba = np.column_stack(proba_list)
y_pred = np.greater_equal(y_proba, 0.5).astype(int)


In [53]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Exact-match (subset) accuracy and micro / macro averaged F1 over all labels.
subset_acc = accuracy_score(y_test, y_pred)
micro_f1, macro_f1 = (
    f1_score(y_test, y_pred, average=avg) for avg in ('micro', 'macro')
)

print(
    f"Subset-Acc : {subset_acc:.3f}",
    f"Micro-F1   : {micro_f1:.3f}",
    f"Macro-F1   : {macro_f1:.3f}\n",
    sep="\n",
)

# Per-label precision / recall / F1.
report = classification_report(
    y_test, y_pred,
    target_names=onehot_cols,
    zero_division=0, digits=3)
print(report)


Subset-Acc : 0.389
Micro-F1   : 0.526
Macro-F1   : 0.234

              precision    recall  f1-score   support

          CQ      0.000     0.000     0.000        96
          FD      0.417     0.012     0.023       417
          FQ      1.000     0.008     0.016       128
          GG      0.375     0.217     0.275        69
          IR      0.000     0.000     0.000       191
          JK      0.217     0.357     0.270        14
          NF      0.200     0.009     0.017       113
           O      0.125     0.200     0.154         5
          OQ      0.977     0.964     0.971       445
          PA      0.534     0.833     0.651       738
          PF      0.343     0.508     0.410       179
          RQ      1.000     0.012     0.023        86

   micro avg      0.598     0.469     0.526      2481
   macro avg      0.432     0.260     0.234      2481
weighted avg      0.536     0.469     0.413      2481
 samples avg      0.541     0.532     0.520      2481



In [54]:
from sklearn.naive_bayes import GaussianNB
from sklearn.multioutput import MultiOutputClassifier

# Gaussian naive Bayes, one model per label, fitted in parallel.
gnb = GaussianNB(var_smoothing=1e-9)
clf = MultiOutputClassifier(gnb, n_jobs=-1).fit(X_train, y_train)

# ---- Positive-class probability per label, thresholded at 0.5 ----
proba_list = [model.predict_proba(X_test)[:, 1] for model in clf.estimators_]
y_proba = np.column_stack(proba_list)
y_pred = np.greater_equal(y_proba, 0.5).astype(int)


In [55]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Exact-match (subset) accuracy and micro / macro averaged F1 over all labels.
subset_acc = accuracy_score(y_test, y_pred)
micro_f1, macro_f1 = (
    f1_score(y_test, y_pred, average=avg) for avg in ('micro', 'macro')
)

print(
    f"Subset-Acc : {subset_acc:.3f}",
    f"Micro-F1   : {micro_f1:.3f}",
    f"Macro-F1   : {macro_f1:.3f}\n",
    sep="\n",
)

# Per-label precision / recall / F1.
report = classification_report(
    y_test, y_pred,
    target_names=onehot_cols,
    zero_division=0, digits=3)
print(report)


Subset-Acc : 0.191
Micro-F1   : 0.361
Macro-F1   : 0.303

              precision    recall  f1-score   support

          CQ      0.078     0.604     0.138        96
          FD      0.310     0.734     0.436       417
          FQ      0.202     0.625     0.305       128
          GG      0.100     0.913     0.180        69
          IR      0.143     0.848     0.245       191
          JK      0.015     1.000     0.029        14
          NF      0.119     0.354     0.179       113
           O      0.003     0.400     0.006         5
          OQ      0.966     0.964     0.965       445
          PA      0.525     0.963     0.680       738
          PF      0.170     0.894     0.285       179
          RQ      0.109     0.651     0.187        86

   micro avg      0.230     0.839     0.361      2481
   macro avg      0.228     0.746     0.303      2481
weighted avg      0.430     0.839     0.529      2481
 samples avg      0.381     0.858     0.459      2481

