# Import dataset

In [1]:
import json
import pandas as pd
import re
import jieba 
import jieba.analyse
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr
from datetime import datetime
from imblearn.over_sampling import RandomOverSampler
import scipy.stats as stats


2023-12-31 14:45:43.031228: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# models

## Split the dataset into training and test sets

In [13]:
X

<20650x983994 sparse matrix of type '<class 'numpy.float64'>'
	with 11403990 stored elements in Compressed Sparse Row format>

In [155]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## LightGBM

In [156]:
import lightgbm as lgb

In [157]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [158]:
#lightgbm模型参数设置，根据自己的需求调一调
params = {
    'task':'train',
    'boosting_type':'gbdt',
    'objective':'binary',
    'metric':{'accuracy','auc','binary_logloss'},
    'num_leaves':40,
    'learning_rate':0.05,
    'feature_fraction':0.9,
    'bagging_fraction':0.8,
    'bagging_freq':5,
    'verbose':0,
    'is_unbalance':True
      
}

In [159]:
#训练参数设置
gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_eval)
#模型预测
lgb_pre = gbm.predict(X_test) #括号中需要输入与训练时相同的数据格式

In [161]:
# 计算lightGBM指标
accuracy_lightgbm = accuracy_score(y_test, lgb_pre>0.5)
precision_lightgbm = precision_score(y_test, lgb_pre>0.5)
recall_lightgbm = recall_score(y_test, lgb_pre>0.5)
f1_lightgbm = f1_score(y_test, lgb_pre>0.5)

print('lightGBM')
print(f'Accuracy: {accuracy_lightgbm}')
print(f'Precision: {precision_lightgbm}')
print(f'Recall: {recall_lightgbm}')
print(f'F1 Score: {f1_lightgbm}')

lightGBM
Accuracy: 0.9586763518966909
Precision: 0.9704994971505196
Recall: 0.9451518119490695
F1 Score: 0.9576579556731724


## XGBoost

In [162]:
from xgboost.sklearn import XGBClassifier
import xgboost as xgb

In [167]:
# 训练xgboost模型
# xgboost = XGBClassifier(learning_rate=0.3, n_estimators=50, max_depth=2, min_child_weight=1,
#                           subsample=1, colsample_bytree=1, gamma=0.1, reg_alpha=0.01, reg_lambda=3)
xgboost = XGBClassifier()

xgboost.fit(X_train, y_train)

In [168]:
# 预测
y_pred_xgboost = xgboost.predict(X_test)

In [169]:
# 评估
accuracy_xbg = accuracy_score(y_test, y_pred_xgboost)
precision_xgb = precision_score(y_test, y_pred_xgboost)
recall_xgb = recall_score(y_test, y_pred_xgboost)
f1_xgb = f1_score(y_test, y_pred_xgboost)

print('xgBOOST')
print(f'Accuracy: {accuracy_xbg}')
print(f'Precision: {precision_xgb}')
print(f'Recall: {recall_xgb}')
print(f'F1 Score: {f1_xgb}')

xgBOOST
Accuracy: 0.9610976594027442
Precision: 0.9712758851035405
Recall: 0.9493960169768201
F1 Score: 0.9602113257388146


In [40]:
# normal content
new_tweet_content = ['这是 一条 新的 正常 推文']

# depressed content
# new_tweet_content = ['思 诺思 舒乐安定 代开 疫情 期间 开药 困难 开药 断药 关注 南京 兼职 超话志道 合盆友 解释 快 超话 传送门 南京 兼职']

# feature extraction
new_X = vectorizer.transform(new_tweet_content)

# RF
new_y_pred_rfc = xgboost.predict(new_X)

print('\nnew_tweet_content label:')
print('XGBoost:', new_y_pred_rfc)


new_tweet_content label:
XGBoost: [0]


## Random Forest

In [90]:
# 训练随机森林模型
rfc = RandomForestClassifier(n_estimators=13) # 修改树的棵树， 使用10折交叉验证来确定n_estimators的最佳值为13
rfc.fit(X_train, y_train)

In [91]:
# 预测测试集的label
y_pred_rfc = rfc.predict(X_test)

In [92]:
# 计算模型评估指标
accuracy_rf = accuracy_score(y_test, y_pred_rfc)
precision_rf = precision_score(y_test, y_pred_rfc)
recall_rf = recall_score(y_test, y_pred_rfc)
f1_rf = f1_score(y_test, y_pred_rfc)

print('Accuracy:', accuracy_rf)
print('Precision:', precision_rf)
print('Recall:', recall_rf)
print('F1 Score:', f1_rf)

Accuracy: 0.9063761097659403
Precision: 0.9219430485762145
Recall: 0.888028396256857
F1 Score: 0.9046679815910585


In [96]:
# normal content
new_tweet_content = ['这是 一条 新的 正常 推文']

# depressed content
# new_tweet_content = ['思 诺思 舒乐安定 代开 疫情 期间 开药 困难 开药 断药 关注 南京 兼职 超话志道 合盆友 解释 快 超话 传送门 南京 兼职']

# feature extraction
new_X = vectorizer.transform(new_tweet_content)

# RF
new_y_pred_rfc = rfc.predict(new_X)

print('\nnew_tweet_content label:')
print('RF:', new_y_pred_rfc)


new_tweet_content label:
RF: [0]


## SVM

In [97]:
# 训练SVM模型
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)

In [214]:
# 预测测试集的label
y_pred_svc = svc.predict(X_test)

In [215]:
# 计算SVM模型评估指标
accuracy_svc = accuracy_score(y_test, y_pred_svc)
precision_svc = precision_score(y_test, y_pred_svc)
recall_svc = recall_score(y_test, y_pred_svc)
f1_svc = f1_score(y_test, y_pred_svc)

print('\nSupport Vector Machine:')
print('Accuracy:', accuracy_svc)
print('Precision:', precision_svc)
print('Recall:', recall_svc)
print('F1 Score:', f1_svc)


Support Vector Machine:
Accuracy: 0.947861178369653
Precision: 0.9874520738933427
Recall: 0.9080128205128205
F1 Score: 0.9460677909500752


In [216]:
# normal content
# new_tweet_content = ['这是 一条 新的 正常 推文']

# depressed content
new_tweet_content = ['思 诺思 舒乐安定 代开 疫情 期间 开药 困难 开药 断药 关注 南京 兼职 超话志道 合盆友 解释 快 超话 传送门 南京 兼职']

# feature extraction
new_X = vectorizer.transform(new_tweet_content)

# SVM
new_y_pred_svc = svc.predict(new_X)

print('\nnew_tweet_content label:')
print('SVM:', new_y_pred_svc)


new_tweet_content label:
SVM: [1]


In [199]:
# normal content
# new_tweet_content = ['这是 一条 新的 正常 推文']

# depressed content
new_tweet_content = ['思 诺思 舒乐安定 代开 疫情 期间 开药 困难 开药 断药 关注 南京 兼职 超话志道 合盆友 解释 快 超话 传送门 南京 兼职']

# feature extraction
new_X = vectorizer.transform(new_tweet_content)

# SVM
new_y_pred_svc = svc.predict(new_X)
new_y_pred_rfc = rfc.predict(new_X)

print('\nnew_tweet_content label:')
print('SVM:', new_y_pred_svc)
print('RF:', new_y_pred_rfc)


new_tweet_content label:
SVM: [1]
RF: [1]


## BERT

In [10]:
import jieba

try:
    balanced_df_all_cleaned_tokenization = pd.read_csv('~/bert/balanced_df_all_cleaned_tokenization.csv')
except:
    balanced_df_all_cleaned_tokenization = pd.read_csv('balanced_df_all_cleaned_tokenization.csv')
balanced_df_all_cleaned_tokenization['tweet_content'] = balanced_df_all_cleaned_tokenization['tweet_content'].fillna('')
df3 = balanced_df_all_cleaned_tokenization.copy()

# Preprocess the data
X = df3['tweet_content']
y = df3['label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Tokenize the sequences using Jieba tokenizer
def tokenize(text):
    return list(jieba.cut(text))

X_train_sequences = [' '.join(tokenize(x)) for x in X_train]
X_test_sequences = [' '.join(tokenize(x)) for x in X_test]


# Convert tokens to integer sequences
import tensorflow as tf
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(X_train_sequences)

# Convert tokenized sequences to integer sequences
X_train_sequences = tokenizer.texts_to_sequences(X_train_sequences)
X_test_sequences = tokenizer.texts_to_sequences(X_test_sequences)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.387 seconds.
Prefix dict has been built successfully.


In [11]:
# Pad sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_sequence_length = 700  # Set the maximum sequence length for padding
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')


In [12]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

class TextDataset(Dataset):
    def __init__(self, sequence, label):
        self.sequence = sequence
        self.label = label.values
        self.attention_mask = np.where(sequence==0, 0, 1)

    def __getitem__(self, index):
        return self.sequence[index], self.attention_mask[index], self.label[index]

    def __len__(self):
        return len(self.sequence)
    

train_dataset = TextDataset(X_train_padded, y_train)
test_dataset = TextDataset(X_test_padded, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=256, shuffle=False)

In [13]:
from transformers import BertConfig, BertForSequenceClassification, DistilBertConfig
import torch
from torch.optim import AdamW
from tqdm import tqdm
import torch.nn.functional as F

In [70]:
config = BertConfig(vocab_size=len(tokenizer.word_index)+1, hidden_size=32, num_hidden_layers=1, num_attention_heads=2, intermediate_size=32, max_sequence_length=max_sequence_length, max_position_embeddings=max_sequence_length)

In [40]:
device = 'cuda:1' if torch.cuda.is_available() else 'cpu'

In [71]:
bert = BertForSequenceClassification(config).to(device)

bert.train()
optimizer = AdamW(bert.parameters())

epochs = 2
for epoch in tqdm(range(epochs)):
    for i, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        batch = [item.to(device) for item in batch]
        sequence, attention_mask, label = batch
        outputs = bert(sequence, attention_mask=attention_mask, labels=label)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        if i % 10 == 0:
            print(f'Epoch: {epoch}, Step: {i}, Loss: {loss.cpu().item()}')

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 0, Step: 0, Loss: 0.6930980086326599
Epoch: 0, Step: 10, Loss: 0.6923404932022095
Epoch: 0, Step: 20, Loss: 0.6892273426055908
Epoch: 0, Step: 30, Loss: 0.6808115839958191
Epoch: 0, Step: 40, Loss: 0.598413348197937
Epoch: 0, Step: 50, Loss: 0.3680679500102997


 50%|█████     | 1/2 [00:02<00:02,  2.99s/it]

Epoch: 1, Step: 0, Loss: 0.19603076577186584
Epoch: 1, Step: 10, Loss: 0.196017786860466
Epoch: 1, Step: 20, Loss: 0.16065713763237
Epoch: 1, Step: 30, Loss: 0.190627321600914
Epoch: 1, Step: 40, Loss: 0.15493103861808777
Epoch: 1, Step: 50, Loss: 0.10660117119550705


100%|██████████| 2/2 [00:05<00:00,  2.99s/it]


In [72]:
bert.eval()
predictions, labels = [], []
with torch.no_grad():
    for data in tqdm(test_dataloader):
        data = [item.to(device) for item in data]
        outputs = bert(sequence, attention_mask=attention_mask, labels=label)
        predicted_label = F.sigmoid(outputs.logits).argmax(axis=1)
        predictions.append(predicted_label.cpu())
        labels.append(label.cpu())


predictions = torch.cat(predictions).numpy()
labels = torch.cat(labels).numpy()

100%|██████████| 25/25 [00:00<00:00, 98.61it/s]


In [73]:
# 计算BERT模型评估指标
accuracy_bert = accuracy_score(labels, predictions)
precision_bert = precision_score(labels, predictions)
recall_bert = recall_score(labels, predictions)
f1_bert = f1_score(labels, predictions)

print('\nBERT:')
print('Accuracy:', accuracy_bert)
print('Precision:', precision_bert)
print('Recall:', recall_bert)
print('F1 Score:', f1_bert)



BERT:
Accuracy: 0.9747899159663865
Precision: 0.9710144927536232
Recall: 0.9852941176470589
F1 Score: 0.9781021897810219


## Roberta

In [23]:
from transformers import RobertaConfig, RobertaForSequenceClassification

In [42]:
roberta_config = RobertaConfig(vocab_size=len(tokenizer.word_index)+1, hidden_size=32, num_hidden_layers=1, num_attention_heads=2, intermediate_size=32, max_position_embeddings=max_sequence_length * 2)
device = 'cuda:1' if torch.cuda.is_available() else 'cpu'
roberta = RobertaForSequenceClassification(roberta_config)
roberta.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(800193, 32, padding_idx=1)
      (position_embeddings): Embedding(1400, 32, padding_idx=1)
      (token_type_embeddings): Embedding(2, 32)
      (LayerNorm): LayerNorm((32,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=32, out_features=32, bias=True)
              (key): Linear(in_features=32, out_features=32, bias=True)
              (value): Linear(in_features=32, out_features=32, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=32, out_features=32, bias=True)
              (LayerNorm): Laye

In [43]:
optimizer = AdamW(roberta.parameters())

epochs = 2
for epoch in tqdm(range(epochs)):
    for i, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        batch = [item.to(device) for item in batch]
        sequence, attention_mask, label = batch
        outputs = roberta(sequence, attention_mask=attention_mask, labels=label)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        if i % 10 == 0:
            print(f'Epoch: {epoch}, Step: {i}, Loss: {loss.cpu().item()}')

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 0, Step: 0, Loss: 0.6928017735481262
Epoch: 0, Step: 10, Loss: 0.692473828792572
Epoch: 0, Step: 20, Loss: 0.6899939179420471
Epoch: 0, Step: 30, Loss: 0.6554629802703857
Epoch: 0, Step: 40, Loss: 0.623778223991394
Epoch: 0, Step: 50, Loss: 0.4388459026813507


 50%|█████     | 1/2 [01:23<01:23, 83.39s/it]

Epoch: 1, Step: 0, Loss: 0.1890697032213211
Epoch: 1, Step: 10, Loss: 0.163442462682724
Epoch: 1, Step: 20, Loss: 0.20919883251190186
Epoch: 1, Step: 30, Loss: 0.13392075896263123
Epoch: 1, Step: 40, Loss: 0.1986142098903656
Epoch: 1, Step: 50, Loss: 0.11977552622556686


100%|██████████| 2/2 [02:49<00:00, 84.56s/it]


In [44]:
roberta.eval()
predictions, labels = [], []
with torch.no_grad():
    for data in tqdm(test_dataloader):
        data = [item.to(device) for item in data]
        outputs = roberta(sequence, attention_mask=attention_mask, labels=label)
        predicted_label = F.sigmoid(outputs.logits).argmax(axis=1)
        predictions.append(predicted_label.cpu())
        labels.append(label.cpu())


predictions = torch.cat(predictions).numpy()
labels = torch.cat(labels).numpy()

100%|██████████| 25/25 [00:06<00:00,  3.97it/s]


In [45]:
# 计算Roberta模型评估指标
accuracy_bert = accuracy_score(labels, predictions)
precision_bert = precision_score(labels, predictions)
recall_bert = recall_score(labels, predictions)
f1_bert = f1_score(labels, predictions)

print('Roberta:')
print('Accuracy:', accuracy_bert)
print('Precision:', precision_bert)
print('Recall:', recall_bert)
print('F1 Score:', f1_bert)


Roberta:
Accuracy: 0.9747899159663865
Precision: 0.9827586206896551
Recall: 0.9661016949152542
F1 Score: 0.9743589743589743


## TextCNN

In [48]:
import jieba

In [41]:
# Load the dataset
balanced_df_all_cleaned_tokenization = pd.read_csv('balanced_df_all_cleaned_tokenization.csv')

In [42]:
balanced_df_all_cleaned_tokenization['tweet_content'] = balanced_df_all_cleaned_tokenization['tweet_content'].fillna('')

In [43]:
df3 = balanced_df_all_cleaned_tokenization.copy()

In [44]:
df3.head()

Unnamed: 0,ID,label,tweet_content
0,15315,0,害真 挺 吃 颜想 岁 足 表达 减肥 决心 姐妹 谈恋爱 什 感觉 全世界 闺蜜 介绍 成...
1,17978,0,拥抱 世界 暖心 动作 生活 中 难免 遇 挫折 坎坷 安 慌乱 许时 拥抱 会 充满 量 ...
2,221,0,雨天 穿会 滑倒 坐水里 亲测 艾特子 赟 相信 家会 意见 麻雀 妈妈 问 麻雀 天扎什 ...
3,16604,0,种 幸福 做 桌菜 然家 朋友 满足 吃 完 没 帮忙 刷碗 老规矩 昨晚 想 做 早餐 奶...
4,20755,0,日 清明 青草 疫情 中 牺牲 医护 员 公安干警 基层干部 线 工作 逝世 胞 表示 沉痛...


In [45]:
# Preprocess the data
X = df3['tweet_content']
y = df3['label']

In [46]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [67]:
# Tokenize the sequences using Jieba tokenizer
def tokenize(text):
    return list(jieba.cut(text))
X_train_sequences = [' '.join(tokenize(x)) for x in X_train]
X_test_sequences = [' '.join(tokenize(x)) for x in X_test]


In [70]:
# Convert tokens to integer sequences
import tensorflow as tf
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(X_train_sequences)

# Convert tokenized sequences to integer sequences
X_train_sequences = tokenizer.texts_to_sequences(X_train_sequences)
X_test_sequences = tokenizer.texts_to_sequences(X_test_sequences)

In [10]:
# Pad sequences
max_sequence_length = 7000  # Set the maximum sequence length for padding
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')


In [11]:
print(X_test_padded.shape)

(6195, 7000)
