In [1]:
import pretrait_tools as pt
import feature_engineering as fe  

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import pandas as pd
import nltk
from nltk.corpus import stopwords

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier

# All possible labels. Keep this order stable: the list is handed to the
# binarizer as its `classes`, and the same list is reused as `target_names`
# when the classification reports are printed.
onehot_cols = ['CQ','FD','FQ','GG','IR','JK','NF','O','OQ','PA','PF','RQ']
mlb = MultiLabelBinarizer(classes=onehot_cols)

def load_dataset(path):
    """Load one feature file and return ``(X, y)`` for multi-label training.

    Every non-blank line is split on whitespace. The first token holds the
    label(s) -- several labels are joined with ``_`` -- and each remaining
    token is parsed as a float feature value.

    Args:
        path: Location of the feature file to read.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a float ``np.ndarray`` with one row
        per non-blank line and ``y`` is the indicator matrix produced by the
        module-level ``mlb`` for the same rows.

    Raises:
        ValueError: If a feature token is not a valid float, or if a line
            carries a different number of features than the first data line.
            The message names the file and the 1-based line number.
    """
    labels = []
    features = []
    expected_width = None

    # Explicit encoding so parsing does not depend on the platform's locale default.
    with open(path, "r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            # split() with no separator already discards surrounding whitespace,
            # so a blank line simply yields an empty list.
            parts = line.split()
            if not parts:
                continue

            try:
                feat = [float(tok) for tok in parts[1:]]
            except ValueError as exc:
                raise ValueError(
                    f"{path}:{lineno}: non-numeric feature value ({exc})"
                ) from exc

            # Fail early, with context, instead of letting numpy build a ragged array.
            if expected_width is None:
                expected_width = len(feat)
            elif len(feat) != expected_width:
                raise ValueError(
                    f"{path}:{lineno}: expected {expected_width} features, "
                    f"found {len(feat)}"
                )

            labels.append(parts[0].split('_'))  # possibly multi-label, joined by "_"
            features.append(feat)

    # Convert the label lists to an indicator matrix.
    # NOTE(review): this re-fits the shared `mlb` on every call; column order is
    # only stable across splits because `mlb` is built with a fixed `classes` list.
    y = mlb.fit_transform(labels)
    X = np.array(features, dtype=float)
    return X, y

# Read the train / validation / test splits from their feature files.
X_train, y_train = load_dataset("../data/msdialog/train_features.tsv")
X_valid, y_valid = load_dataset("../data/msdialog/valid_features.tsv")
X_test, y_test = load_dataset("../data/msdialog/test_features.tsv")

# Multi-label random forest: the forest is wrapped in a multi-output classifier.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
clf = MultiOutputClassifier(estimator=rf)

# Train on the training split.
clf.fit(X_train, y_train)

# Predict on the validation split and print its report.
y_valid_pred = clf.predict(X_valid)
print("📊 Validation Classification Report:")
print(
    classification_report(
        y_true=y_valid,
        y_pred=y_valid_pred,
        zero_division=0,
        target_names=onehot_cols,
    )
)

# Predict on the test split and print its report.
y_test_pred = clf.predict(X_test)
print("📊 Test Classification Report:")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_test_pred,
        zero_division=0,
        target_names=onehot_cols,
    )
)


📊 Validation Classification Report:
              precision    recall  f1-score   support

          CQ       0.00      0.00      0.00        66
          FD       0.58      0.35      0.44       193
          FQ       0.36      0.08      0.13        65
          GG       0.63      0.46      0.54        41
          IR       0.67      0.24      0.35        84
          JK       1.00      0.08      0.15        12
          NF       0.50      0.02      0.04        47
           O       0.00      0.00      0.00         2
          OQ       0.98      0.97      0.97       221
          PA       0.82      0.86      0.84       365
          PF       0.71      0.41      0.52        97
          RQ       0.29      0.04      0.07        49

   micro avg       0.80      0.55      0.65      1242
   macro avg       0.54      0.29      0.34      1242
weighted avg       0.69      0.55      0.58      1242
 samples avg       0.68      0.61      0.63      1242

📊 Test Classification Report:
             

In [3]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multioutput import MultiOutputClassifier

# Build the model: AdaBoost wrapped in a multi-output classifier.
base_estimator = AdaBoostClassifier(random_state=42, learning_rate=0.8, n_estimators=100)
clf = MultiOutputClassifier(estimator=base_estimator)

# Fit on the training split.
clf.fit(X_train, y_train)

# Predict and evaluate on the validation split.
y_valid_pred = clf.predict(X_valid)
print("📊 Validation Classification Report:")
print(
    classification_report(
        y_true=y_valid,
        y_pred=y_valid_pred,
        zero_division=0,
        target_names=onehot_cols,
    )
)

# Predict and evaluate on the test split.
y_test_pred = clf.predict(X_test)
print("📊 Test Classification Report:")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_test_pred,
        zero_division=0,
        target_names=onehot_cols,
    )
)




📊 Validation Classification Report:
              precision    recall  f1-score   support

          CQ       0.00      0.00      0.00        66
          FD       0.53      0.34      0.41       193
          FQ       0.50      0.12      0.20        65
          GG       0.71      0.49      0.58        41
          IR       0.54      0.15      0.24        84
          JK       0.33      0.08      0.13        12
          NF       0.27      0.06      0.10        47
           O       0.00      0.00      0.00         2
          OQ       0.98      0.97      0.97       221
          PA       0.79      0.86      0.82       365
          PF       0.69      0.49      0.57        97
          RQ       0.57      0.08      0.14        49

   micro avg       0.76      0.56      0.64      1242
   macro avg       0.49      0.30      0.35      1242
weighted avg       0.66      0.56      0.58      1242
 samples avg       0.67      0.62      0.63      1242

📊 Test Classification Report:
             

In [4]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# Build the SVM model: a linear SVC wrapped in a one-vs-rest classifier.
clf = OneVsRestClassifier(estimator=LinearSVC(max_iter=5000, random_state=42))

# Fit on the training split.
clf.fit(X_train, y_train)

# Validation-set predictions and report.
y_valid_pred = clf.predict(X_valid)
print("📊 Validation Classification Report:")
print(
    classification_report(
        y_true=y_valid,
        y_pred=y_valid_pred,
        zero_division=0,
        target_names=onehot_cols,
    )
)

# Test-set predictions and report.
y_test_pred = clf.predict(X_test)
print("📊 Test Classification Report:")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_test_pred,
        zero_division=0,
        target_names=onehot_cols,
    )
)


📊 Validation Classification Report:
              precision    recall  f1-score   support

          CQ       0.00      0.00      0.00        66
          FD       0.56      0.31      0.39       193
          FQ       0.00      0.00      0.00        65
          GG       0.58      0.27      0.37        41
          IR       0.43      0.04      0.07        84
          JK       0.00      0.00      0.00        12
          NF       1.00      0.02      0.04        47
           O       0.00      0.00      0.00         2
          OQ       0.97      0.97      0.97       221
          PA       0.76      0.87      0.81       365
          PF       0.82      0.46      0.59        97
          RQ       0.00      0.00      0.00        49

   micro avg       0.79      0.52      0.63      1242
   macro avg       0.43      0.24      0.27      1242
weighted avg       0.63      0.52      0.54      1242
 samples avg       0.65      0.58      0.60      1242

📊 Test Classification Report:
             

In [5]:
from sklearn.naive_bayes import GaussianNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# Use GaussianNB (suited to continuous features), wrapped in a one-vs-rest classifier.
clf = OneVsRestClassifier(estimator=GaussianNB())

# Fit on the training split.
clf.fit(X_train, y_train)

# Validation-set predictions and report.
y_valid_pred = clf.predict(X_valid)
print("📊 Validation Classification Report:")
print(
    classification_report(
        y_true=y_valid,
        y_pred=y_valid_pred,
        zero_division=0,
        target_names=onehot_cols,
    )
)

# Test-set predictions and report.
y_test_pred = clf.predict(X_test)
print("📊 Test Classification Report:")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_test_pred,
        zero_division=0,
        target_names=onehot_cols,
    )
)


📊 Validation Classification Report:
              precision    recall  f1-score   support

          CQ       0.22      0.33      0.27        66
          FD       0.41      0.32      0.36       193
          FQ       0.26      0.28      0.27        65
          GG       0.15      0.95      0.25        41
          IR       0.21      0.74      0.32        84
          JK       0.03      1.00      0.05        12
          NF       0.16      0.32      0.21        47
           O       0.01      1.00      0.01         2
          OQ       0.93      0.97      0.95       221
          PA       0.69      0.87      0.77       365
          PF       0.32      0.81      0.46        97
          RQ       0.24      0.59      0.35        49

   micro avg       0.31      0.70      0.43      1242
   macro avg       0.30      0.68      0.36      1242
weighted avg       0.52      0.70      0.57      1242
 samples avg       0.43      0.74      0.50      1242

📊 Test Classification Report:
             

In [None]:
import numpy as np
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neighbors import NearestNeighbors

class MyMultiLabelKNN:
    """Multi-label k-nearest-neighbours classifier based on neighbour voting.

    For each query row, the labels of its ``k`` nearest training rows are
    summed per label column; a label is predicted when the fraction of
    neighbours carrying it is at least ``threshold``.
    """

    def __init__(self, k=10, threshold=0.5):
        """Store the hyper-parameters.

        Args:
            k: Number of neighbours consulted per query row.
            threshold: Minimum fraction of the ``k`` neighbours that must carry
                a label for that label to be predicted.
        """
        self.k = k
        self.threshold = threshold

    def fit(self, X, y):
        """Remember the training data and build the neighbour index.

        Args:
            X: Training feature matrix.
            y: Training label indicator matrix, one row per row of ``X``.

        Returns:
            ``self``, so calls can be chained.
        """
        self.X_train = X
        # Coerce to an ndarray so predict() can index it with an index array
        # even when a list of lists was passed in.
        self.y_train = np.asarray(y)
        self.nn = NearestNeighbors(n_neighbors=self.k)
        self.nn.fit(X)
        return self

    def predict(self, X_test):
        """Predict a 0/1 indicator row for every row of ``X_test``.

        Args:
            X_test: Query feature matrix.

        Returns:
            Integer ``np.ndarray`` with one row per query row and one column
            per label column of the training labels.
        """
        # Distances are not needed for the unweighted vote.
        _, indices = self.nn.kneighbors(X_test)

        # Gather the neighbours' label rows for every query at once; with a 2-D
        # label matrix this has one axis per (query, neighbour, label).
        neighbor_labels = np.asarray(self.y_train)[indices]
        # Sum over the neighbour axis: how many neighbours carry each label.
        label_counts = np.sum(neighbor_labels, axis=1)
        # The threshold decides whether each label is switched on.
        label_pred = (label_counts / self.k) >= self.threshold
        return label_pred.astype(int)
    
# Train the custom KNN and predict on both held-out splits.
clf = MyMultiLabelKNN(threshold=0.5, k=10)
clf.fit(X_train, y_train)

# Validation-set predictions and report.
y_valid_pred = clf.predict(X_valid)
print("📊 Validation Classification Report:")
print(
    classification_report(
        y_true=y_valid,
        y_pred=y_valid_pred,
        zero_division=0,
        target_names=onehot_cols,
    )
)

# Test-set predictions and report.
y_test_pred = clf.predict(X_test)
print("📊 Test Classification Report:")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_test_pred,
        zero_division=0,
        target_names=onehot_cols,
    )
)

TypeError: NearestNeighbors.__init__() takes 1 positional argument but 2 were given