In [None]:
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the comment spreadsheet; the file is assumed to be already uploaded
# and reachable at this relative path.
file_path = 'Blossoms_Food_Tiktok4.xlsx'
data = pd.read_excel(file_path)

# Expanded keyword list used for rule-based labelling. A comment containing
# any of these substrings is labelled 1. Each keyword is listed once:
# matching is a plain substring test, so repeated entries add nothing.
keywords = [
    "打卡了", "刚去", "吃了", "去过", "吃过", "好吃", "去吃", "想吃", "超想吃", "度假", "出游",
    "旅游", "打卡", "景点", "想去", "去玩", "参观", "游玩", "去看看", "旅行",
    "探险", "游览", "美景", "景区", "名胜", "胜地", "行程", "攻略", "路线", "预订",
    "游客", "导游", "走走", "好玩", "放松", "休闲", "太美了", "必须去", "不能错过", "好想去",
    "真的不错", "绝了", "爱了", "值得一去",
]
# Sample emoji codes (bracketed text form) that also mark a comment as 1.
emojis = ["[比心]", "[赞]", "[强]", "[舔屏]", "[爱心]", "[送心]", "[玫瑰]"]
# Special placeholder phrases, e.g. the marker for a comment that posted an image.
special_phrases = ["【发表图片】"]

def label_by_keywords(comment, keywords, emojis, special_phrases):
    """Return 1 if *comment* matches any labelling rule, otherwise 0.

    A comment matches when it contains an "@" character, or any entry of
    ``emojis``, ``special_phrases`` or ``keywords`` as a substring.

    Args:
        comment: The comment text to inspect.
        keywords: Keyword substrings to look for.
        emojis: Emoji codes to look for.
        special_phrases: Special phrases to look for, such as 【发表图片】.

    Returns:
        1 when at least one rule matches, 0 when none does.
    """
    # An "@" mention is enough on its own.
    if "@" in comment:
        return 1

    # Check the rule groups in order: emojis, special phrases, keywords.
    for candidates in (emojis, special_phrases, keywords):
        if any(token in comment for token in candidates):
            return 1

    # No rule matched.
    return 0

# Re-label the data with the rule-based labeller. str() guards against
# non-string cells (numbers, NaN) coming out of the spreadsheet.
# NOTE: these labels come from the keyword rules themselves, so the report
# printed at the end measures how well the model reproduces the rules,
# not agreement with human annotation.
data['label'] = data['评论'].apply(lambda x: label_by_keywords(str(x), keywords, emojis, special_phrases))

# Remove missing, whitespace-only and duplicate comments. dropna() alone
# only removes NaN cells, so blank strings are filtered explicitly.
# .copy() makes the result an independent frame so the column assignment
# below does not write into a view of `data`.
data_cleaned = data.dropna(subset=['评论'])
data_cleaned = data_cleaned[data_cleaned['评论'].astype(str).str.strip() != ""]
data_cleaned = data_cleaned.drop_duplicates(subset=['评论']).copy()

# Tokenize each comment with jieba and join tokens with spaces so the
# vectorizer can split on whitespace. str() keeps this consistent with the
# labelling step when a cell is not a string.
data_cleaned['评论_分词'] = data_cleaned['评论'].apply(lambda x: " ".join(jieba.cut(str(x))))

# Vectorize the text with TF-IDF. The default token_pattern only keeps
# tokens of two or more characters; this pattern also keeps
# single-character tokens, which are frequent in segmented Chinese text.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, token_pattern=r"(?u)\b\w+\b")
X = tfidf_vectorizer.fit_transform(data_cleaned['评论_分词'])

# Build the training and test sets. Stratify on the label so both splits
# keep the same class ratio; fall back to a plain split when a class has
# fewer than two samples, which stratification cannot handle.
y = data_cleaned['label']
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

# Initialise the random forest; class_weight='balanced' compensates for
# unequal class frequencies.
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)

# Train the model.
rf_model.fit(X_train, y_train)

# Predict on the test set.
y_pred = rf_model.predict(X_test)

# Print the classification report.
print(classification_report(y_test, y_pred))
