In [2]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import plotly.express as px
%matplotlib inline
warnings.filterwarnings('ignore')

## **Loading Dataset**

In [3]:
df = pd.read_csv('dataset/LiwenliangFull.csv')
# Understand the basic information of dataset and the data types involved
df

Unnamed: 0.1,Unnamed: 0,max_id,微博id,评论页码,评论id,评论时间,评论点赞数,评论者IP归属地,评论者姓名,评论者id,评论者性别,评论者关注数,评论者粉丝数,评论内容
0,0,4893821462382076,4467107636950632,1.0,4893834767763833,2023-04-23 23:43:14,0.0,来自广西,W我想撑下去,5417297741,男,60,63,朋友可以借我一顿饭吗。出门在外找工作遇到困难了，肚子好饿。太艰难了
1,1,4893821462382076,4467107636950632,1.0,4893834284109853,2023-04-23 23:41:19,0.0,来自湖北,一起看日落吧x,5384707935,女,92,332,李医生 昨天武汉下好大雨哦
2,2,4893821462382076,4467107636950632,1.0,4893833496367362,2023-04-23 23:38:11,0.0,来自四川,薄荷甜绿,5255030406,女,169,99,李医生，晚安
3,3,4893821462382076,4467107636950632,1.0,4893832905232325,2023-04-23 23:35:50,0.0,来自法国,我是Liquor,2319895202,女,71,21,四月底了 下雨还是那么冷
4,4,4893821462382076,4467107636950632,1.0,4893831990086470,2023-04-23 23:32:12,0.0,来自河南,pawyer,6072946222,男,884,242,想起了你
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1423466,1423466,4469128381645556,4467107636950632,,4469128385866379,2020-02-07 00:31:42,0.0,,安水果果菓果果,2251357683,女,477,88,全国人民都在守着你，加油！！
1423467,1423467,4469128381645556,4467107636950632,,4469128385134746,2020-02-07 00:31:42,0.0,,赌神奔奔918,5937000680,女,268,39,加油！我们等你回来🙏🙏🙏
1423468,1423468,4469128381645556,4467107636950632,,4469128381647504,2020-02-07 00:31:41,0.0,,奔跑的糯米丸子君,6447681726,女,123,30,是假的对不对，能不能出来辟谣一下
1423469,1423469,4469128381645556,4467107636950632,,4469128381645557,2020-02-07 00:31:41,0.0,,可弟子有惑依旧愚钝,2812656575,女,455,182,求求你，加油加油醒过来！


### **Drop Not Related Column (Based on pre-analysis codebook and NULL value)**

In [4]:
codebook = pd.read_csv('dataset/codebook.csv')
codebook_not_related = pd.DataFrame(data=codebook['Not Related'].unique(),columns=['code'])
codebook_not_related.dropna(inplace=True)
not_related = codebook_not_related['code'].tolist()
print(not_related) #Not related keywords

['图片评论', '图片评论  网页链接', '。', '看看', '1', '截图留个爪']


In [5]:
drop_keywords = df['评论内容'].isin(not_related)
df = df[~drop_keywords]
df = df.dropna(subset='评论内容').reset_index(drop=True)
df

Unnamed: 0.1,Unnamed: 0,max_id,微博id,评论页码,评论id,评论时间,评论点赞数,评论者IP归属地,评论者姓名,评论者id,评论者性别,评论者关注数,评论者粉丝数,评论内容
0,0,4893821462382076,4467107636950632,1.0,4893834767763833,2023-04-23 23:43:14,0.0,来自广西,W我想撑下去,5417297741,男,60,63,朋友可以借我一顿饭吗。出门在外找工作遇到困难了，肚子好饿。太艰难了
1,1,4893821462382076,4467107636950632,1.0,4893834284109853,2023-04-23 23:41:19,0.0,来自湖北,一起看日落吧x,5384707935,女,92,332,李医生 昨天武汉下好大雨哦
2,2,4893821462382076,4467107636950632,1.0,4893833496367362,2023-04-23 23:38:11,0.0,来自四川,薄荷甜绿,5255030406,女,169,99,李医生，晚安
3,3,4893821462382076,4467107636950632,1.0,4893832905232325,2023-04-23 23:35:50,0.0,来自法国,我是Liquor,2319895202,女,71,21,四月底了 下雨还是那么冷
4,4,4893821462382076,4467107636950632,1.0,4893831990086470,2023-04-23 23:32:12,0.0,来自河南,pawyer,6072946222,男,884,242,想起了你
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1338078,1423466,4469128381645556,4467107636950632,,4469128385866379,2020-02-07 00:31:42,0.0,,安水果果菓果果,2251357683,女,477,88,全国人民都在守着你，加油！！
1338079,1423467,4469128381645556,4467107636950632,,4469128385134746,2020-02-07 00:31:42,0.0,,赌神奔奔918,5937000680,女,268,39,加油！我们等你回来🙏🙏🙏
1338080,1423468,4469128381645556,4467107636950632,,4469128381647504,2020-02-07 00:31:41,0.0,,奔跑的糯米丸子君,6447681726,女,123,30,是假的对不对，能不能出来辟谣一下
1338081,1423469,4469128381645556,4467107636950632,,4469128381645557,2020-02-07 00:31:41,0.0,,可弟子有惑依旧愚钝,2812656575,女,455,182,求求你，加油加油醒过来！


## **Loading the pre-trained model and tokenizer for predicting**

In [6]:
from transformers import BertTokenizer,BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup

In [7]:
# 加载 BERT 分词器

model_name_tree_hole = 'model_save/tree_hole'

tokenizer_tree_hole = BertTokenizer.from_pretrained(model_name_tree_hole, do_lower_case=True)
model_tree_hole = BertForSequenceClassification.from_pretrained(
    model_name_tree_hole, # 小写的 12 层预训练模型
    num_labels = 2, # 分类数 --2 表示二分类
                    # 你可以改变这个数字，用于多分类任务  
    output_attentions = False, # 模型是否返回 attentions weights.
    output_hidden_states = False, # 模型是否返回所有隐层状态.
)

model_tree_hole.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21232, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [8]:
model_name_whaling_wall = 'model_save/whaling_wall'

tokenizer_whaling_wall = BertTokenizer.from_pretrained(model_name_whaling_wall, do_lower_case=True)
model_whaling_wall = BertForSequenceClassification.from_pretrained(
    model_name_whaling_wall, # 小写的 12 层预训练模型
    num_labels = 2, # 分类数 --2 表示二分类
                    # 你可以改变这个数字，用于多分类任务  
    output_attentions = False, # 模型是否返回 attentions weights.
    output_hidden_states = False, # 模型是否返回所有隐层状态.
)

model_whaling_wall.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21216, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [9]:
from transformers import TextClassificationPipeline
pipe_tree_hole = TextClassificationPipeline(model=model_tree_hole, tokenizer=tokenizer_tree_hole,device=0)
pipe_whaling_wall = TextClassificationPipeline(model=model_whaling_wall, tokenizer=tokenizer_whaling_wall,device=0)

#### **Tree Hole Prediction**

In [10]:
label_tree_hole = []
sentence_tree_hole = []
rate_tree_hole = []
for i in range(0,len(df['评论内容'])):
    pipelist_tree_hole = pipe_tree_hole(df['评论内容'][i])
    label_tree_hole.append(pipelist_tree_hole[0]['label'])
    sentence_tree_hole.append(df['评论内容'][i])
    rate_tree_hole.append(pipelist_tree_hole[0]['score'])


In [11]:
df_tree_hole = pd.DataFrame({'sentence':sentence_tree_hole,'Tree Hole':label_tree_hole,'Tree Hole Probability':rate_tree_hole})
df_tree_hole = df_tree_hole.reset_index(drop=True)
df_tree_hole['Tree Hole'].replace({'LABEL_0':'No','LABEL_1':'Yes'},inplace=True)
df_tree_hole.to_csv('dataset/LiwenliangTreeHole.csv',index=False)
display(df_tree_hole.sample(n = 60))

Unnamed: 0,sentence,Tree Hole,Tree Hole Probability
547419,一年了……时间真快,Yes,0.871556
351050,新年快乐，李医生,No,0.996847
779920,致敬！,No,0.997094
331530,上天保佑我今天心想事成，渡过这至暗时刻！！！,Yes,0.995216
654099,李叔叔，最近是有点多，忙碌起来也挺不错,Yes,0.985839
832783,李医生九月了，您离开了我们半年多了，,No,0.980794
455504,李医生，我刚去打完新冠疫苗，突然想起了你，谢谢你。,Yes,0.510737
239591,李医生 我真的好难受,Yes,0.955696
548612,一年了,Yes,0.793765
914021,今天工作的时候遇到一个和李医生同名同姓的人，但那又怎样呢？,Yes,0.983831


In [12]:
df_tree_hole.describe()

Unnamed: 0,Tree Hole Probability
count,1338082.0
mean,0.927239
std,0.1197664
min,0.5000004
25%,0.9241186
50%,0.9887887
75%,0.9956762
max,0.9981052


#### **Whaling Wall Prediction**

In [13]:
label_whaling_wall = []
sentence_whaling_wall = []
rate_whaling_wall = []
for i in range(0,len(df['评论内容'])):
    pipelist_whaling_wall = pipe_whaling_wall(df['评论内容'][i])
    label_whaling_wall.append(pipelist_whaling_wall[0]['label'])
    sentence_whaling_wall.append(df['评论内容'][i])
    rate_whaling_wall.append(pipelist_whaling_wall[0]['score'])

In [14]:
df_whaling_wall = pd.DataFrame({'sentence':sentence_whaling_wall,'Whaling Wall':label_whaling_wall,'Whaling Wall Probability':rate_whaling_wall})
df_whaling_wall = df_whaling_wall.reset_index(drop=True)
df_whaling_wall['Whaling Wall'].replace({'LABEL_0':'No','LABEL_1':'Yes'},inplace=True)
df_whaling_wall.to_csv('dataset/LiwenliangWhalingWall.csv',index=False)
display(df_whaling_wall.sample(n = 60))

Unnamed: 0,sentence,Whaling Wall,Whaling Wall Probability
218711,李医生新冠还在继续,Yes,0.87392
1257458,谢谢您 温暖的人啊,Yes,0.987499
94160,李医生今年我可以上岸吗，真的好迷茫还有16天了，感觉没有动力了,No,0.654166
45311,李医生，今天是2022年的最后一天啦，2022年糟糕透了，开放以后周围的人都阳了，希望202...,Yes,0.84255
982371,奶奶还是走了 看到她后来每天被病痛一点点折磨消瘦痛苦的样子让我想起小时候上学每天中午在她那吃...,No,0.934978
841086,在你这受的委屈也不少了，今天算啥委屈啊,No,0.761556
237344,什么时候来呢？,No,0.836189
40725,李医生晚安,Yes,0.996099
207089,晚安 亮哥,Yes,0.9961
511934,李医生，我不知道四月该怎么过，感觉心好累，学业压力好大。感觉好自卑，因为自己能力好差。,No,0.684286


In [15]:
df_whaling_wall.describe()

Unnamed: 0,Whaling Wall Probability
count,1338082.0
mean,0.9273308
std,0.1261127
min,0.5000005
25%,0.9364046
50%,0.9908584
75%,0.9953676
max,0.998557


## **Merging Resullt**

In [39]:
df = df.drop('Unnamed: 0',axis=1)
df.rename(columns={'评论内容':'sentence'},inplace=True)

In [23]:
df_tree_hole.index = df_tree_hole.index+1
df_whaling_wall.index = df_whaling_wall.index+1

In [48]:
dfmarked = pd.concat([df,df_tree_hole,df_whaling_wall],axis=1)
dfmarked

Unnamed: 0,max_id,微博id,评论页码,评论id,评论时间,评论点赞数,评论者IP归属地,评论者姓名,评论者id,评论者性别,评论者关注数,评论者粉丝数,sentence,sentence.1,Tree Hole,Tree Hole Probability,sentence.2,Whaling Wall,Whaling Wall Probability
0,4893821462382076,4467107636950632,1.0,4893834767763833,2023-04-23 23:43:14,0.0,来自广西,W我想撑下去,5417297741,男,60,63,朋友可以借我一顿饭吗。出门在外找工作遇到困难了，肚子好饿。太艰难了,,,,,,
1,4893821462382076,4467107636950632,1.0,4893834284109853,2023-04-23 23:41:19,0.0,来自湖北,一起看日落吧x,5384707935,女,92,332,李医生 昨天武汉下好大雨哦,李医生 昨天武汉下好大雨哦,Yes,0.941288,李医生 昨天武汉下好大雨哦,Yes,0.695900
2,4893821462382076,4467107636950632,1.0,4893833496367362,2023-04-23 23:38:11,0.0,来自四川,薄荷甜绿,5255030406,女,169,99,李医生，晚安,李医生，晚安,No,0.996930,李医生，晚安,Yes,0.996329
3,4893821462382076,4467107636950632,1.0,4893832905232325,2023-04-23 23:35:50,0.0,来自法国,我是Liquor,2319895202,女,71,21,四月底了 下雨还是那么冷,四月底了 下雨还是那么冷,Yes,0.987697,四月底了 下雨还是那么冷,No,0.993291
4,4893821462382076,4467107636950632,1.0,4893831990086470,2023-04-23 23:32:12,0.0,来自河南,pawyer,6072946222,男,884,242,想起了你,想起了你,No,0.995238,想起了你,Yes,0.991719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1338078,4469128381645556,4467107636950632,,4469128385866379,2020-02-07 00:31:42,0.0,,安水果果菓果果,2251357683,女,477,88,全国人民都在守着你，加油！！,全国人民都在守着你，加油！！,No,0.983879,全国人民都在守着你，加油！！,Yes,0.996354
1338079,4469128381645556,4467107636950632,,4469128385134746,2020-02-07 00:31:42,0.0,,赌神奔奔918,5937000680,女,268,39,加油！我们等你回来🙏🙏🙏,加油！我们等你回来🙏🙏🙏,No,0.993155,加油！我们等你回来🙏🙏🙏,Yes,0.993806
1338080,4469128381645556,4467107636950632,,4469128381647504,2020-02-07 00:31:41,0.0,,奔跑的糯米丸子君,6447681726,女,123,30,是假的对不对，能不能出来辟谣一下,是假的对不对，能不能出来辟谣一下,No,0.546834,是假的对不对，能不能出来辟谣一下,No,0.965628
1338081,4469128381645556,4467107636950632,,4469128381645557,2020-02-07 00:31:41,0.0,,可弟子有惑依旧愚钝,2812656575,女,455,182,求求你，加油加油醒过来！,求求你，加油加油醒过来！,No,0.988626,求求你，加油加油醒过来！,Yes,0.996677


## **Output**

In [None]:
dfmarked = dfmarked.drop('max_id',axis=1)
dfmarked = dfmarked.drop('微博id',axis=1)
dfmarked = dfmarked.drop('评论页码',axis=1)
dfmarked.columns = ['Comment_id','Comment_Time','Comment_Like','Commnet_IP','Commenter_Name','Commenter_id','Commenter_gender','Commenter_Following',
                    'Commenter_Follower','Comments','sentence','Tree Hole','Tree Hole Probability','sentence','Whaling Wall','Whaling Wall Probability']
dfmarked = dfmarked.drop('sentence',axis=1)
dfmarked.to_csv('dataset/LiwenliangMarked.csv',index=False)

In [68]:
dfmarked = pd.read_csv('dataset/LiwenliangMarked.csv')
dfmarked

Unnamed: 0,Comment_id,Comment_Time,Comment_Like,Commnet_IP,Commenter_Name,Commenter_id,Commenter_gender,Commenter_Following,Commenter_Follower,Comments,Tree Hole,Tree Hole Probability,Whaling Wall,Whaling Wall Probability
0,4893834767763833,2023-04-23 23:43:14,0.0,来自广西,W我想撑下去,5417297741,男,60,63,朋友可以借我一顿饭吗。出门在外找工作遇到困难了，肚子好饿。太艰难了,Yes,0.996692,No,0.993389
1,4893834284109853,2023-04-23 23:41:19,0.0,来自湖北,一起看日落吧x,5384707935,女,92,332,李医生 昨天武汉下好大雨哦,Yes,0.941288,Yes,0.695900
2,4893833496367362,2023-04-23 23:38:11,0.0,来自四川,薄荷甜绿,5255030406,女,169,99,李医生，晚安,No,0.996930,Yes,0.996329
3,4893832905232325,2023-04-23 23:35:50,0.0,来自法国,我是Liquor,2319895202,女,71,21,四月底了 下雨还是那么冷,Yes,0.987697,No,0.993291
4,4893831990086470,2023-04-23 23:32:12,0.0,来自河南,pawyer,6072946222,男,884,242,想起了你,No,0.995238,Yes,0.991719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1338078,4469128385866379,2020-02-07 00:31:42,0.0,,安水果果菓果果,2251357683,女,477,88,全国人民都在守着你，加油！！,No,0.983879,Yes,0.996354
1338079,4469128385134746,2020-02-07 00:31:42,0.0,,赌神奔奔918,5937000680,女,268,39,加油！我们等你回来🙏🙏🙏,No,0.993155,Yes,0.993806
1338080,4469128381647504,2020-02-07 00:31:41,0.0,,奔跑的糯米丸子君,6447681726,女,123,30,是假的对不对，能不能出来辟谣一下,No,0.546834,No,0.965628
1338081,4469128381645557,2020-02-07 00:31:41,0.0,,可弟子有惑依旧愚钝,2812656575,女,455,182,求求你，加油加油醒过来！,No,0.988626,Yes,0.996677
