In [5]:
import pretrait_tools as pt
import feature_engineering as fe  

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')          # 单词/句子分词
nltk.download('vader_lexicon')  # 情感分析用

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/langlang056/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/langlang056/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [7]:
# =========================================================
# ➊ 读取 & 扁平化  （保持不变）
# =========================================================
INPUT_FILE = "../data/all.tsv"
dialogs = pt.load_dialogs(INPUT_FILE)
df = pt.flatten_dialogs(dialogs)
df, mlb = pt.binarize_labels(df)      # 现在 df 里就有 'CQ' 'FD' … 列

# =========================================================
# ➋ 划分 Train / Test *在这一段加入修正① & ②*
# =========================================================
from sklearn.model_selection import GroupShuffleSplit

gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
train_idx, test_idx = next(gss.split(df, groups=df['dialog_id']))
train_df, test_df   = df.iloc[train_idx].copy(), df.iloc[test_idx].copy()

# —— 修正②：Label-Powerset 变单标签（如果你想保持 one-hot 可跳过）——
onehot_cols = ['CQ','FD','FQ','GG','IR','JK','NF','O',
               'OQ','PA','PF','RQ']
def to_lp(row):
    return '&'.join([lab for lab in onehot_cols if row[lab]==1]) or '__NONE__'

train_df['lp_label'] = train_df.apply(to_lp, axis=1)
test_df ['lp_label'] = test_df .apply(to_lp, axis=1)

# =========================================================
# ➌ 训练 TF-IDF  *在这一段加入修正①*
# =========================================================
from sklearn.feature_extraction.text import TfidfVectorizer

def tokenize(x):           # 论文里就用简单分词即可
    return nltk.word_tokenize(x.lower())

vec = TfidfVectorizer(tokenizer=tokenize, lowercase=False, min_df=2)
vec.fit(train_df['text'])          # ← 只用训练集拟合

# =========================================================
# ➍ 提特征  *把修正③ ④ 放进 extract_dialog_features() 里*
# =========================================================
from feature_utils import *        # 你的其它函数照旧

# ---- 修正④：加载 MPQA 词典，一次即可 ----
mpqa = pd.read_csv('mpqa_subj_lexicon.txt', sep='\t',
                   names=['word','pos','pol'])
pos_dict = set(mpqa.loc[mpqa['pol']=='positive','word'])
neg_dict = set(mpqa.loc[mpqa['pol']=='negative','word'])

# ---- 修正③：改进 duplicate() 写在函数内部 ----
stop = set(nltk.corpus.stopwords.words('english'))
def canon_sent(s):
    toks = [stemmer.stem(t) for t in tokenize(s) if t not in stop]
    return ' '.join(toks)

# ========= 定义特征函数 =========
def extract_dialog_features(dialog):
    """dialog 是同一 dialog_id 的 DataFrame（已按 turn_id 排序）"""
    utters = dialog['text'].tolist()
    tfidf = vec.transform(utters)
    first_vec = tfidf[0]
    
    init_sim  = cosine_similarity(tfidf, first_vec).ravel()
    thread_sim = []
    mean_vec = first_vec.copy()
    for i in range(len(utters)):
        if i == 0:
            thread_sim.append(1.0)
        else:
            thread_sim.append(cosine_similarity(tfidf[i], mean_vec/i).item())
            mean_vec += tfidf[i]
    
    qm   = ["?" in u for u in utters]
    dup  = pd.Series(utters).str.lower().duplicated().tolist()
    
    w_pattern = re.compile(r'\b(who|what|when|where|why|how)\b', re.I)
    w5h1 = [{w: int(bool(re.search(fr'\b{w}\b', u, re.I))) for w in
             ["who","what","when","where","why","how"]} for u in utters]
    
    abs_pos  = np.arange(1, len(utters)+1)
    norm_pos = abs_pos / len(utters)
    
    tokens   = [tokenize(u) for u in utters]
    post_len = [len(t) for t in tokens]
    uniq_len = [len(set(t)) for t in tokens]
    uniq_stm = [len({stemmer.stem(w) for w in t}) for t in tokens]
    
    thx  = [bool(re.search(r'\bthanks?\b', u, re.I)) for u in utters]
    excl = ["!" in u for u in utters]
    vef  = [bool(re.search(r'\b(very|extremely)\s+(good|helpful|useful|nice|excellent)\b', u, re.I))
            for u in utters]
    
    vader = [sia.polarity_scores(u) for u in utters]
    
    pos_cnt = [sum(w in pos_dict for w in t) for t in tokens]
    neg_cnt = [sum(w in neg_dict for w in t) for t in tokens]
    
    # --- 写回原 dialog DataFrame ---
    dialog = dialog.assign(
        init_sim=init_sim, thread_sim=thread_sim,
        qm=qm, dup=dup,
        abs_pos=abs_pos, norm_pos=norm_pos,
        post_len=post_len, uniq_len=uniq_len, uniq_stm=uniq_stm,
        thank=thx, exclam=excl, ve_feedback=vef,
        pos_score=[v['pos'] for v in vader],
        neg_score=[v['neg'] for v in vader],
        neu_score=[v['neu'] for v in vader],
        comp_score=[v['compound'] for v in vader],
        pos_cnt=pos_cnt, neg_cnt=neg_cnt,
        **{f'{w}_flag':[d[w] for d in w5h1] for w in ["who","what","when","where","why","how"]}
    )
    return dialog

# —— 在 train_df / test_df 上分别提特征 —— 
train_df = (train_df.sort_values(['dialog_id','turn_id'])
                    .groupby('dialog_id', group_keys=False)
                    .apply(extract_dialog_features))

test_df  = (test_df.sort_values(['dialog_id','turn_id'])
                    .groupby('dialog_id', group_keys=False)
                    .apply(extract_dialog_features))




FileNotFoundError: [Errno 2] No such file or directory: 'mpqa_subj_lexicon.txt'