In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import plotly.express as px
%matplotlib inline
warnings.filterwarnings('ignore')

## **Loading Dataset**

In [36]:
# Read the comparison-group comments, then print the frame's schema
# (column names, non-null counts and dtypes) as a first sanity check.
df = pd.read_csv(filepath_or_buffer='dataset/ComparisonGroupComments_Split.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3193 entries, 0 to 3192
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   User_ID                3193 non-null   object 
 1   Cause_of_Death         3193 non-null   object 
 2   Last_Post_Time         3193 non-null   object 
 3   Comment_Time           3193 non-null   object 
 4   Relative_Elapsed_Time  3193 non-null   float64
 5   Commenter_Name         3193 non-null   object 
 6   Comment                3183 non-null   object 
 7   Tree_Hole              3193 non-null   int64  
 8   Wailing_Wall           3193 non-null   int64  
 9   Not_Related            3193 non-null   int64  
 10  Comments_Split         3180 non-null   object 
dtypes: float64(1), int64(3), object(7)
memory usage: 274.5+ KB


### **Drop Not-Related Comments (based on the pre-analysis codebook and NULL values)**

In [37]:
# Collect the distinct, non-missing entries of the codebook's 'Not Related' column.
codebook = pd.read_csv('dataset/comparison_group_codebook.csv')

# One-column frame of the unique entries; dropna() removes the NaN that unique()
# keeps when the column has empty cells.
codebook_not_related = pd.DataFrame({'code': codebook['Not Related'].unique()}).dropna()

not_related = list(codebook_not_related['code'])
print(not_related) #Not related keywords

['评论区', '希望', '祝福', '帮忙', '寻找', '@杭州网警', '联系', '公安', '@刘学州a', '@邓飞', '@河北警察', '@三亚警察', '封大号', '韭菜', '图片评论', '求转发', '@湖北网警巡查执法', '@首都网警', '转发微博', '宁波消防纪委科长胡晨欺上瞒下，欺骗糊弄举报人，告知处分了陈彦州隆严重警告并已记入档案，二月已告知可在支队官网查看通报批评，有录音为证！实际至今陈彦州隆并未受到任何处分！', '。', '.']


In [39]:
# Boolean mask: True where the whole comment is exactly one of the not-related entries.
drop_keywords = df['Comment'].isin(not_related)

# Keep the remaining rows, discard rows whose comment is missing,
# and renumber the index 0..n-1.
df = (
    df.loc[~drop_keywords]
      .dropna(subset='Comment')
      .reset_index(drop=True)
)
df

Unnamed: 0,User_ID,Cause_of_Death,Last_Post_Time,Comment_Time,Relative_Elapsed_Time,Commenter_Name,Comment,Tree_Hole,Wailing_Wall,Not_Related,Comments_Split
0,红烧土豆叶,学业压力/抑郁症/自杀,2020-10-13 02:00:00,2020-10-13 22:10:51,0.0,光明绘海Emi,我朋友有家猫咖，二楼阳台左边门没关，家里有哥哥姐姐四五位，脾气都很好，你记得来,1,0,0,"我,朋友,家,猫,咖,二,楼,阳台,左边,门,关,家里,哥哥,姐姐,四五位,脾气,很好,你,记得"
1,红烧土豆叶,学业压力/抑郁症/自杀,2020-10-13 02:00:00,2020-10-13 16:21:28,0.0,郑先生的灯泡,第一次见你介绍自己说叫“旧书”，听你做草坪上弹我的吉他，我们一起在山礼跳舞，给活动帮忙。。。...,0,1,0,"第一次,见,你,介绍,自己,说,叫,“,旧书,”,听,你,做,草坪,上,弹,我,吉他,我们,..."
2,红烧土豆叶,学业压力/抑郁症/自杀,2020-10-13 02:00:00,2020-10-13 14:26:33,0.0,搁浅-color,他最后还在祝国家繁荣昌盛，多么温柔的人啊,0,1,0,"他,最后,还在,祝,国家,繁荣昌盛,多么,温柔,人"
3,红烧土豆叶,学业压力/抑郁症/自杀,2020-10-13 02:00:00,2020-10-14 08:48:47,1.0,苗苗是颗星,一个明星能推动毕业论文的查重率降低，但很多生命都推动不了硕博机制的革新,1,0,0,"一个,明星,推动,毕业,论文,查重率,降低,很多,生命,推动,不了,硕博,机制,革新"
4,红烧土豆叶,学业压力/抑郁症/自杀,2020-10-13 02:00:00,2020-10-13 10:00:46,0.0,Hallucinebula,阶级社会把人变成鬼。冤有头债有主，希望各位同学闲暇时间读点真正的马哲，找到新的生活的意义并且...,0,0,1,"阶级,社会,把,人,变成,鬼,冤有头债有主,希望,各位,同学,闲暇时间,读,点,真正,马哲,..."
...,...,...,...,...,...,...,...,...,...,...,...
3154,走饭,抑郁症/自杀,2012-03-18 10:54:00,2017-09-12 14:22:48,2004.0,宫保方便面,从学校回来了 正在回家的地铁上 好多人 好吵 他们都好开心 我还是离他们远点吧 不要让我恶心到别人,1,0,0,",学校,回来了,正在,回家,地铁,上,好多人,好吵,他们,好,开心,我,还是,离,他们,远,..."
3155,走饭,抑郁症/自杀,2012-03-18 10:54:00,2017-09-12 14:22:16,2004.0,今天也很开心有钱花,有人能理我一下嘛。,1,0,0,"有人,理,我,一下,"
3156,走饭,抑郁症/自杀,2012-03-18 10:54:00,2017-09-12 14:17:53,2004.0,今天也很开心有钱花,我杞人忧天焦虑症真得想死。为什么要让我在体育课上丢人。,1,0,0,"我,杞人忧天,焦虑,症,真,得,想死,为什么,我,体育课,上,丢人,"
3157,走饭,抑郁症/自杀,2012-03-18 10:54:00,2017-09-12 14:17:34,2004.0,今天也很开心有钱花,为什么大家都那么开心。自信。,1,0,0,"为什么,大家,那么,开心,自信,"


## **Loading the pre-trained models and tokenizers for prediction**

In [6]:
from transformers import BertTokenizer,BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup

In [7]:
# Load the BERT tokenizer and the sequence classifier saved under model_save/tree_hole

model_name_tree_hole = 'model_save/tree_hole'

tokenizer_tree_hole = BertTokenizer.from_pretrained(
    model_name_tree_hole,
    do_lower_case=True,
)

model_tree_hole = BertForSequenceClassification.from_pretrained(
    model_name_tree_hole,        # local directory the checkpoint is read from
    num_labels=2,                # number of classes: 2 means binary classification
                                 # (change this number for a multi-class task)
    output_attentions=False,     # whether the model returns attention weights
    output_hidden_states=False,  # whether the model returns all hidden states
)

# Move the model to the GPU; as the cell's last expression this also displays the module.
model_tree_hole.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21232, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [8]:
# Load the BERT tokenizer and the sequence classifier saved under model_save/whaling_wall
model_name_whaling_wall = 'model_save/whaling_wall'

tokenizer_whaling_wall = BertTokenizer.from_pretrained(
    model_name_whaling_wall,
    do_lower_case=True,
)

model_whaling_wall = BertForSequenceClassification.from_pretrained(
    model_name_whaling_wall,     # local directory the checkpoint is read from
    num_labels=2,                # number of classes: 2 means binary classification
                                 # (change this number for a multi-class task)
    output_attentions=False,     # whether the model returns attention weights
    output_hidden_states=False,  # whether the model returns all hidden states
)

# Move the model to the GPU; as the cell's last expression this also displays the module.
model_whaling_wall.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21216, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [9]:
from transformers import TextClassificationPipeline

# One text-classification pipeline per model, each paired with its own tokenizer.
# device=0 places both pipelines on device index 0.
pipe_tree_hole = TextClassificationPipeline(
    model=model_tree_hole,
    tokenizer=tokenizer_tree_hole,
    device=0,
)
pipe_whaling_wall = TextClassificationPipeline(
    model=model_whaling_wall,
    tokenizer=tokenizer_whaling_wall,
    device=0,
)

#### **Tree Hole Prediction**

In [40]:
# Classify every comment with the Tree Hole model.
#
# The comments are taken positionally with .tolist(), so this no longer relies on
# df having a 0..n-1 index (the former df['Comment'][i] lookup was label-based).
sentence_tree_hole = df['Comment'].tolist()

# truncation=True caps each input at the tokenizer's maximum length, so an unusually
# long comment cannot exceed the model's 512 position embeddings and abort the run.
# The empty-input guard keeps the "no rows -> empty lists" behaviour of the old loop.
_raw_tree_hole = (
    pipe_tree_hole(sentence_tree_hole, truncation=True) if sentence_tree_hole else []
)

# Expect one {'label', 'score'} dict per text, in input order; a one-element list
# wrapping that dict is tolerated as well.
_predictions_tree_hole = [
    result[0] if isinstance(result, list) else result
    for result in _raw_tree_hole
]

label_tree_hole = [prediction['label'] for prediction in _predictions_tree_hole]
rate_tree_hole = [prediction['score'] for prediction in _predictions_tree_hole]


In [50]:
# Assemble the Tree Hole predictions into a frame: the text, the predicted label,
# and the pipeline's score for that label. A frame built from plain lists already
# has a fresh 0..n-1 index, so no reset_index is needed.
df_tree_hole = pd.DataFrame({
    'sentence': sentence_tree_hole,
    'Tree Hole Model': label_tree_hole,
    'Tree Hole Probability': rate_tree_hole,
})

# Map the pipeline's label names to '0'/'1'. The result is assigned back to the
# column: calling .replace(..., inplace=True) on df[col] acts on an intermediate
# Series and leaves the frame unchanged when pandas Copy-on-Write is active.
df_tree_hole['Tree Hole Model'] = df_tree_hole['Tree Hole Model'].replace(
    {'LABEL_0': '0', 'LABEL_1': '1'}
)

df_tree_hole.to_csv('dataset/ComparisonTreeHole.csv', index=False)
display(df_tree_hole.sample(n=5))

Unnamed: 0,sentence,Tree Hole Model,Tree Hole Probability
895,这个世界并不完美，但我们仍然可以疗愈自己。加油哦！,1,0.986088
1333,我不想吃药了，不想治了，我好累，我坚持不动了,1,0.995167
2770,我的国家和同胞包容性这么差吗？？？？？？！！！！,1,0.966728
3033,想变聋，不想再让别人刺激我这个玻璃心了,1,0.9962
1764,我们来看过你啦，买了你喜欢的花还有酒还有烟 下次给你带华子嗷，虽然说好的笑的，最后还是我看到...,0,0.89297


In [42]:
# Summary statistics (count, mean, std, min, quartiles, max) of the frame's numeric
# column, i.e. the pipeline score stored in 'Tree Hole Probability'.
df_tree_hole.describe()

Unnamed: 0,Tree Hole Probability
count,3159.0
mean,0.908596
std,0.129542
min,0.502209
25%,0.874136
50%,0.977256
75%,0.993698
max,0.997826


#### **Wailing Wall Prediction**

In [43]:
# Classify every comment with the Whaling Wall model.
#
# The comments are taken positionally with .tolist(), so this no longer relies on
# df having a 0..n-1 index (the former df['Comment'][i] lookup was label-based).
sentence_whaling_wall = df['Comment'].tolist()

# truncation=True caps each input at the tokenizer's maximum length, so an unusually
# long comment cannot exceed the model's 512 position embeddings and abort the run.
# The empty-input guard keeps the "no rows -> empty lists" behaviour of the old loop.
_raw_whaling_wall = (
    pipe_whaling_wall(sentence_whaling_wall, truncation=True) if sentence_whaling_wall else []
)

# Expect one {'label', 'score'} dict per text, in input order; a one-element list
# wrapping that dict is tolerated as well.
_predictions_whaling_wall = [
    result[0] if isinstance(result, list) else result
    for result in _raw_whaling_wall
]

label_whaling_wall = [prediction['label'] for prediction in _predictions_whaling_wall]
rate_whaling_wall = [prediction['score'] for prediction in _predictions_whaling_wall]

In [51]:
# Assemble the Whaling Wall predictions into a frame: the text, the predicted label,
# and the pipeline's score for that label. A frame built from plain lists already
# has a fresh 0..n-1 index, so no reset_index is needed.
df_whaling_wall = pd.DataFrame({
    'sentence': sentence_whaling_wall,
    'Whaling Wall Model': label_whaling_wall,
    'Whaling Wall Probability': rate_whaling_wall,
})

# Map the pipeline's label names to '0'/'1'. The result is assigned back to the
# column: calling .replace(..., inplace=True) on df[col] acts on an intermediate
# Series and leaves the frame unchanged when pandas Copy-on-Write is active.
df_whaling_wall['Whaling Wall Model'] = df_whaling_wall['Whaling Wall Model'].replace(
    {'LABEL_0': '0', 'LABEL_1': '1'}
)

df_whaling_wall.to_csv('dataset/ComparisonWhalingWall.csv', index=False)
display(df_whaling_wall.sample(n=5))

Unnamed: 0,sentence,Whaling Wall Model,Whaling Wall Probability
2341,一个十几岁的孩子，能写出这样的经历，思路清晰，文章通透，一定是个善良且努力的孩子,0,0.993021
2683,一路走好妹子，希望来世不要出生在这个肮脏的环境了,1,0.849857
1112,我虚岁27了 今天又被七大姑八大姨催婚 我心中苦涩 一个破碎的我如何去承担一个家庭的责任 我...,0,0.995141
345,很认真的看完了，竟觉身体冰冷，或许你想传达给世人的种种，已经被赋予了意义。无论是身为陌生人还...,1,0.609983
156,我比你大两三岁，就叫你弟弟吧，你其实已经比我优秀很多啦，我可能可以微微的体会到前天的深夜里你...,0,0.958422


In [45]:
# Summary statistics (count, mean, std, min, quartiles, max) of the frame's numeric
# column, i.e. the pipeline score stored in 'Whaling Wall Probability'.
df_whaling_wall.describe()

Unnamed: 0,Whaling Wall Probability
count,3159.0
mean,0.929108
std,0.115844
min,0.501219
25%,0.92752
50%,0.985871
75%,0.993317
max,0.997334


## **Merging Results**

In [52]:
# Put the human-coded comments and both models' predictions side by side.
#
# pd.concat(axis=1) aligns rows on index labels. If a prediction frame was computed
# from an older version of df (for example, cells run out of order), the concat would
# not fail; it would silently produce NaN-padded rows. Fail loudly instead.
for _frame_name, _frame in (('df_tree_hole', df_tree_hole),
                            ('df_whaling_wall', df_whaling_wall)):
    if not _frame.index.equals(df.index):
        raise ValueError(
            f'{_frame_name} has {len(_frame)} rows but df has {len(df)}; '
            'their indexes do not line up. Re-run the prediction cells on the current df.'
        )

dfmarked = pd.concat([df, df_tree_hole, df_whaling_wall], axis=1)
dfmarked

Unnamed: 0,User_ID,Cause_of_Death,Last_Post_Time,Comment_Time,Relative_Elapsed_Time,Commenter_Name,Comment,Tree_Hole,Wailing_Wall,Not_Related,Comments_Split,sentence,Tree Hole Model,Tree Hole Probability,sentence.1,Whaling Wall Model,Whaling Wall Probability
0,红烧土豆叶,学业压力/抑郁症/自杀,2020-10-13 02:00:00,2020-10-13 22:10:51,0.0,光明绘海Emi,我朋友有家猫咖，二楼阳台左边门没关，家里有哥哥姐姐四五位，脾气都很好，你记得来,1,0,0,"我,朋友,家,猫,咖,二,楼,阳台,左边,门,关,家里,哥哥,姐姐,四五位,脾气,很好,你,记得",我朋友有家猫咖，二楼阳台左边门没关，家里有哥哥姐姐四五位，脾气都很好，你记得来,1,0.965164,我朋友有家猫咖，二楼阳台左边门没关，家里有哥哥姐姐四五位，脾气都很好，你记得来,0,0.970488
1,红烧土豆叶,学业压力/抑郁症/自杀,2020-10-13 02:00:00,2020-10-13 16:21:28,0.0,郑先生的灯泡,第一次见你介绍自己说叫“旧书”，听你做草坪上弹我的吉他，我们一起在山礼跳舞，给活动帮忙。。。...,0,1,0,"第一次,见,你,介绍,自己,说,叫,“,旧书,”,听,你,做,草坪,上,弹,我,吉他,我们,...",第一次见你介绍自己说叫“旧书”，听你做草坪上弹我的吉他，我们一起在山礼跳舞，给活动帮忙。。。...,1,0.909041,第一次见你介绍自己说叫“旧书”，听你做草坪上弹我的吉他，我们一起在山礼跳舞，给活动帮忙。。。...,1,0.915967
2,红烧土豆叶,学业压力/抑郁症/自杀,2020-10-13 02:00:00,2020-10-13 14:26:33,0.0,搁浅-color,他最后还在祝国家繁荣昌盛，多么温柔的人啊,0,1,0,"他,最后,还在,祝,国家,繁荣昌盛,多么,温柔,人",他最后还在祝国家繁荣昌盛，多么温柔的人啊,1,0.648890,他最后还在祝国家繁荣昌盛，多么温柔的人啊,0,0.924531
3,红烧土豆叶,学业压力/抑郁症/自杀,2020-10-13 02:00:00,2020-10-14 08:48:47,1.0,苗苗是颗星,一个明星能推动毕业论文的查重率降低，但很多生命都推动不了硕博机制的革新,1,0,0,"一个,明星,推动,毕业,论文,查重率,降低,很多,生命,推动,不了,硕博,机制,革新",一个明星能推动毕业论文的查重率降低，但很多生命都推动不了硕博机制的革新,1,0.903259,一个明星能推动毕业论文的查重率降低，但很多生命都推动不了硕博机制的革新,0,0.993894
4,红烧土豆叶,学业压力/抑郁症/自杀,2020-10-13 02:00:00,2020-10-13 10:00:46,0.0,Hallucinebula,阶级社会把人变成鬼。冤有头债有主，希望各位同学闲暇时间读点真正的马哲，找到新的生活的意义并且...,0,0,1,"阶级,社会,把,人,变成,鬼,冤有头债有主,希望,各位,同学,闲暇时间,读,点,真正,马哲,...",阶级社会把人变成鬼。冤有头债有主，希望各位同学闲暇时间读点真正的马哲，找到新的生活的意义并且...,1,0.993993,阶级社会把人变成鬼。冤有头债有主，希望各位同学闲暇时间读点真正的马哲，找到新的生活的意义并且...,0,0.992298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3154,走饭,抑郁症/自杀,2012-03-18 10:54:00,2017-09-12 14:22:48,2004.0,宫保方便面,从学校回来了 正在回家的地铁上 好多人 好吵 他们都好开心 我还是离他们远点吧 不要让我恶心到别人,1,0,0,",学校,回来了,正在,回家,地铁,上,好多人,好吵,他们,好,开心,我,还是,离,他们,远,...",从学校回来了 正在回家的地铁上 好多人 好吵 他们都好开心 我还是离他们远点吧 不要让我恶心到别人,1,0.997095,从学校回来了 正在回家的地铁上 好多人 好吵 他们都好开心 我还是离他们远点吧 不要让我恶心到别人,0,0.992052
3155,走饭,抑郁症/自杀,2012-03-18 10:54:00,2017-09-12 14:22:16,2004.0,今天也很开心有钱花,有人能理我一下嘛。,1,0,0,"有人,理,我,一下,",有人能理我一下嘛。,1,0.945645,有人能理我一下嘛。,0,0.988946
3156,走饭,抑郁症/自杀,2012-03-18 10:54:00,2017-09-12 14:17:53,2004.0,今天也很开心有钱花,我杞人忧天焦虑症真得想死。为什么要让我在体育课上丢人。,1,0,0,"我,杞人忧天,焦虑,症,真,得,想死,为什么,我,体育课,上,丢人,",我杞人忧天焦虑症真得想死。为什么要让我在体育课上丢人。,1,0.997031,我杞人忧天焦虑症真得想死。为什么要让我在体育课上丢人。,0,0.994753
3157,走饭,抑郁症/自杀,2012-03-18 10:54:00,2017-09-12 14:17:34,2004.0,今天也很开心有钱花,为什么大家都那么开心。自信。,1,0,0,"为什么,大家,那么,开心,自信,",为什么大家都那么开心。自信。,1,0.993933,为什么大家都那么开心。自信。,0,0.994247


In [62]:
# The predicted labels are stored as '0'/'1' strings; cast them to integers so they
# can be compared with the integer human codes.
for model_column in ('Whaling Wall Model', 'Tree Hole Model'):
    dfmarked[model_column] = dfmarked[model_column].astype('int64')

# Share of rows where the model's label equals the human code
# (the mean of a boolean Series is the fraction of True values).
wailing_wall_equal_ratio = dfmarked['Wailing_Wall'].eq(dfmarked['Whaling Wall Model']).mean()
tree_hole_equal_ratio = dfmarked['Tree_Hole'].eq(dfmarked['Tree Hole Model']).mean()

print('Wailing Wall Model Equal Ratio: ',wailing_wall_equal_ratio*100)
print('Tree Hole Model Equal Ratio: ',tree_hole_equal_ratio*100)

Wailing Wall Model Equal Ratio:  78.25261158594492
Tree Hole Model Equal Ratio:  71.63659385881608


## **Output**

In [None]:
# Export the labelled comparison-group comments.
#
# Why this cell was rewritten: it used to drop 'max_id', '微博id' and '评论页码',
# assign a fixed list of 16 column names, and write to dataset/LiwenliangMarked.csv.
# The frame built in this notebook has none of those three columns and has 17
# columns, so the cell raised before writing anything. Its target was also the file
# that the next cell reads.
#
# errors='ignore' lets the three scrape-only columns be absent; 'sentence' appears
# twice (one copy per prediction frame) and both copies duplicate 'Comment', so both
# are removed. The result goes to a new variable so `dfmarked` keeps the
# '... Model' column names that the agreement-ratio cell uses.
export_columns_to_drop = ['max_id', '微博id', '评论页码', 'sentence']

dfmarked_export = (
    dfmarked
    .drop(columns=export_columns_to_drop, errors='ignore')
    .rename(columns={'Tree Hole Model': 'Tree Hole',
                     'Whaling Wall Model': 'Whaling Wall'})
)

# NOTE(review): file name chosen to match the other Comparison* outputs of this
# notebook; adjust if a different name is expected downstream.
dfmarked_export.to_csv('dataset/ComparisonGroupMarked.csv', index=False)

In [68]:
# Read dataset/LiwenliangMarked.csv from disk and display it.
# NOTE(review): this rebinds `dfmarked`, replacing the comparison-group frame built
# earlier in the notebook. The displayed columns (Comment_id, Commnet_IP,
# Commenter_gender, ...) do not match that frame, so this file appears to come from
# a separate run. Confirm that it belongs in this notebook.
dfmarked = pd.read_csv('dataset/LiwenliangMarked.csv')
dfmarked

Unnamed: 0,Comment_id,Comment_Time,Comment_Like,Commnet_IP,Commenter_Name,Commenter_id,Commenter_gender,Commenter_Following,Commenter_Follower,Comments,Tree Hole,Tree Hole Probability,Whaling Wall,Whaling Wall Probability
0,4893834767763833,2023-04-23 23:43:14,0.0,来自广西,W我想撑下去,5417297741,男,60,63,朋友可以借我一顿饭吗。出门在外找工作遇到困难了，肚子好饿。太艰难了,Yes,0.996692,No,0.993389
1,4893834284109853,2023-04-23 23:41:19,0.0,来自湖北,一起看日落吧x,5384707935,女,92,332,李医生 昨天武汉下好大雨哦,Yes,0.941288,Yes,0.695900
2,4893833496367362,2023-04-23 23:38:11,0.0,来自四川,薄荷甜绿,5255030406,女,169,99,李医生，晚安,No,0.996930,Yes,0.996329
3,4893832905232325,2023-04-23 23:35:50,0.0,来自法国,我是Liquor,2319895202,女,71,21,四月底了 下雨还是那么冷,Yes,0.987697,No,0.993291
4,4893831990086470,2023-04-23 23:32:12,0.0,来自河南,pawyer,6072946222,男,884,242,想起了你,No,0.995238,Yes,0.991719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1338078,4469128385866379,2020-02-07 00:31:42,0.0,,安水果果菓果果,2251357683,女,477,88,全国人民都在守着你，加油！！,No,0.983879,Yes,0.996354
1338079,4469128385134746,2020-02-07 00:31:42,0.0,,赌神奔奔918,5937000680,女,268,39,加油！我们等你回来🙏🙏🙏,No,0.993155,Yes,0.993806
1338080,4469128381647504,2020-02-07 00:31:41,0.0,,奔跑的糯米丸子君,6447681726,女,123,30,是假的对不对，能不能出来辟谣一下,No,0.546834,No,0.965628
1338081,4469128381645557,2020-02-07 00:31:41,0.0,,可弟子有惑依旧愚钝,2812656575,女,455,182,求求你，加油加油醒过来！,No,0.988626,Yes,0.996677
