In [2]:
pip install jieba scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [43]:
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the comment data from the Excel workbook. Assumes the file has already
# been uploaded; the relative path is resolved against the current working directory.
file_path = 'Blossoms_Food_Tiktok4.xlsx'
data = pd.read_excel(file_path)

# Expanded keyword list for the rule-based labeller. Each entry is listed once:
# the earlier duplicate "度假" / "出游" entries were removed (first occurrence kept),
# which does not change which comments match.
keywords = [
    "打卡了", "刚去", "吃了", "去过", "吃过", "好吃", "去吃", "想吃", "超想吃", "度假", "出游",
    "旅游", "打卡", "景点", "想去", "去玩", "参观", "游玩", "去看看", "旅行",
    "探险", "游览", "美景", "景区", "名胜", "胜地", "行程", "攻略", "路线", "预订",
    "游客", "导游", "走走", "好玩", "放松", "休闲", "太美了", "必须去", "不能错过", "好想去",
    "真的不错", "绝了", "爱了", "值得一去",
]
# Sample set of bracketed emoji codes as they appear inside the comment text.
emojis = ["[比心]", "[赞]", "[强]", "[舔屏]", "[爱心]", "[送心]", "[玫瑰]"]
# Placeholder text that marks a comment containing a posted picture.
special_phrases = ["【发表图片】"]

def label_by_keywords(comment, keywords, emojis, special_phrases):
    """Return 1 if the comment matches any labelling rule, otherwise 0.

    A comment is labelled 1 when it contains an "@" character, or when any
    entry of ``emojis``, ``special_phrases`` or ``keywords`` occurs in it as a
    substring. Matching is plain substring containment.

    Args:
        comment: Comment text to inspect (a string).
        keywords: Iterable of keyword strings.
        emojis: Iterable of emoji-code strings such as "[赞]".
        special_phrases: Iterable of marker phrases such as "【发表图片】".

    Returns:
        int: 1 when at least one rule matches, 0 when none does.
    """
    # An @-mention alone is enough for a positive label.
    if "@" in comment:
        return 1

    # Check the marker groups in the same order as before: emojis, then
    # special phrases, then keywords. Any single hit yields a positive label.
    for markers in (emojis, special_phrases, keywords):
        if any(marker in comment for marker in markers):
            return 1

    # No rule matched.
    return 0

# Re-label every comment with the rule-based labeller. str() guards against
# non-string cells (e.g. NaN or numbers read from the spreadsheet).
data['label'] = data['评论'].apply(
    lambda x: label_by_keywords(str(x), keywords, emojis, special_phrases)
)

# Drop rows whose comment is missing (NaN) and rows with a duplicate comment.
# The trailing .copy() makes data_cleaned an independent frame, so adding
# columns to it later does not trigger SettingWithCopyWarning.
# Note: dropna() only removes NaN; empty-string comments are kept.
data_cleaned = (
    data
    .dropna(subset=['评论'])
    .drop_duplicates(subset=['评论'])
    .copy()
)

# Preview the first rows of the cleaned data.
print(data_cleaned.head())


          用户名    IP                                                 评论  \
0    Vectivus    辽宁        [思考]有人说鲜得来是老师傅退休配方带走了之后就不好吃了。有上海人肉身感受过么[思考]   
1  星瞳 /•᷅•᷄\୭  IP未知  繁华真的美化太多以前的食物了，小时候就没觉得排骨年糕好吃[呆无辜][呆无辜]阿拉老娘做的面拖...   
2       阐述你的梦    上海                                    终于看到有人说这家了 是真好吃   
3          🌊🌊    上海                                        刚吃的没切【发表图片】   
4      小当当童书馆    上海                                         阿平面馆（我记住了）   

                   时间  回复数   点赞数  label  
0 2024-01-14 12:54:51  114  8358      1  
1 2024-01-14 15:20:34   40  4544      1  
2 2024-01-14 14:16:27   20   282      1  
3 2024-01-14 14:00:47   32   747      1  
4 2024-01-15 13:42:36    3   124      0  


In [44]:
# Segment each comment with jieba and re-join the tokens with spaces so the
# vectorizer can tokenise the text. str() mirrors the labelling step and keeps
# jieba from failing on non-string cells (e.g. purely numeric comments).
data_cleaned['评论_分词'] = data_cleaned['评论'].apply(lambda x: " ".join(jieba.cut(str(x))))

# Vectorise the segmented text with TF-IDF, keeping at most 5000 terms.
# NOTE(review): TfidfVectorizer's default token_pattern keeps only tokens of two
# or more characters, so single-character Chinese words are dropped — confirm
# this is intended.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(data_cleaned['评论_分词'])

# Show the shape of the resulting matrix (documents, terms).
print(X.shape)

(1148, 2340)


In [45]:
# Build the training and test sets.

from sklearn.model_selection import train_test_split

# X is the TF-IDF feature matrix built in the previous cell; y is the
# rule-derived label column of the cleaned data.
# NOTE(review): the vectorizer was fitted on all of data_cleaned before this
# split, so test-set vocabulary/IDF statistics are part of the features.
y = data_cleaned['label']

# Hold out 20% of the rows as the test set, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and test matrices.
print("训练集大小:", X_train.shape)
print("测试集大小:", X_test.shape)


训练集大小: (918, 2340)
测试集大小: (230, 2340)


In [46]:
# Train a support vector machine (SVM) on the training split.

from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Initialise an SVM with a linear kernel.
svm_model = SVC(kernel='linear', random_state=42)

# Fit the model on the training features and labels.
svm_model.fit(X_train, y_train)

# Predict labels for the test split.
y_pred = svm_model.predict(X_test)

# Print the classification report for the test split.
# NOTE(review): y comes from the keyword rules, so these scores measure
# agreement with those rules rather than with human annotation.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.78      0.64      0.71        95
           1       0.78      0.87      0.82       135

    accuracy                           0.78       230
   macro avg       0.78      0.76      0.76       230
weighted avg       0.78      0.78      0.77       230



In [38]:
# Show how many rows of `data` carry each label value.
# NOTE(review): this cell's execution count (38) predates the labelling cell (43),
# and the displayed counts (only 20 zeros) conflict with the 95 zeros in the test
# split above — the output looks stale; re-run after the labelling cell.
print(data['label'].value_counts())


label
1    1400
0      20
Name: count, dtype: int64
