# POTUS Election 2016 Analysis (2)

python=3.7

Retweet Network Analysis

In [2]:
import pandas as pd
import networkx as nx
import seaborn as sns
import re
from joblib import dump, load
from tldextract import extract
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

## 构建只包含三种媒体的转推网络

In [2]:
# Load the labeled URL table and the full tweet table for this topic.
df_urls = pd.read_csv("data/labeled_urls[topic=POTUS2016].csv")
df = pd.read_csv("data/full_data[topic=POTUS2016].csv")

In [3]:
# Decide whether a labeled URL belongs to one of the three media types.
def is_3media(row)->bool:
    """Return True when the row is flagged as fake news, mainstream or debunking.

    Args:
        row: Mapping-like object (for example one row of ``df_urls``) that
            exposes the ``'fake_news'``, ``'mainstream'`` and ``'debunking'``
            flags.

    Returns:
        A genuine ``bool``. The chained ``or`` on its own returns whichever
        operand was truthy (e.g. the int ``1``), which would produce a
        non-boolean column that cannot be used as a row mask.
    """
    return bool(row['fake_news'] or row['mainstream'] or row['debunking'])

# Build a {url: domain} lookup restricted to URLs of the three media types.
# Flag every labeled URL first (this adds the '3_media' column to df_urls).
df_urls['3_media'] = df_urls.apply(is_3media, axis=1)
# Keep only flagged rows, reduce to url/domain, and index the result by url.
df_url_3media = (
    df_urls.loc[df_urls['3_media'], ['url', 'domain']]
    .copy(deep=True)
    .set_index('url')
)
# Plain dict for url -> domain lookups.
url_dict = df_url_3media['domain'].to_dict()

# Show a few random entries as a sanity check.
df_url_3media.sample(5)

Unnamed: 0_level_0,domain
url,Unnamed: 1_level_1
https://www.washingtonpost.com/news/the-fix/wp/2016/12/16/now-you-can-fact-check-trumps-tweets-in-the-tweets-themselves/?utm_term=.bd461c664346&tid=sm_tw,washingtonpost.com
http://zerohedge.com,zerohedge.com
http://wapo.st/2jwr0GM,washingtonpost.com
http://wapo.st/2jmebyf,washingtonpost.com
http://www.nytimes.com/2016/12/06/us/politics/michael-flynn-son-trump.html?smprod=nytcore-ipad&smid=nytcore-ipad-share,nytimes.com


In [4]:
# Keep only the tweets that link to at least one URL of the three media types.
url_3media_set = set(df_url_3media.index)
# Every double-quoted substring of the stringified 'entities.urls' cell is
# treated as a candidate URL.
quoted_pattern = re.compile(r'"(.+?)"')

# Collect the index *labels* of matching tweets in a single pass over the column
# (avoids a chained df[col][i] lookup per row).
contain_3media_ind = [
    label
    for label, raw_urls in df['entities.urls'].items()
    if not url_3media_set.isdisjoint(quoted_pattern.findall(str(raw_urls)))
]

# The collected values are labels, so select with .loc; positional .iloc only
# gives the same rows while df keeps its default 0..n-1 index.
tweets_contain_3media = df.loc[contain_3media_ind]
tweets_contain_3media.to_csv("data/3media_tweets[topic=POTUS2016].csv", index=False)
tweets_contain_3media.shape

(272149, 83)

In [5]:
# Build the edge list from retweets only: rows that reference a retweeted tweet id.
is_retweet = tweets_contain_3media['referenced_tweets.retweeted.id'].notna()
rt_list = tweets_contain_3media[is_retweet]
print(len(rt_list))

# Disabled experiment: tag every edge with the media domains of its URLs.
# df_urls.index = df_urls['url']
# rt_list['domain'] = ''
# n = 0
# for i in rt_list.index:
#     mtch = list(re.findall('"(.+?)"', str(rt_list['entities.urls'][i])))
#     rt_list['domain'][i] = [df_urls['domain'][url] for url in mtch]
#     n += 1
#     if n % 10000 == 0:
#         print(int(n / 10000), end=' ')

# Save the retweets themselves.
rt_list.to_csv("data/3media_retweets[topic=POTUS2016].csv", index=False)
# Edge list columns: retweeting user, retweeted user, id of the retweet.
edge_list = rt_list[['author.username', 'retweeted_username', 'id']].copy()

212860


In [6]:
# Name the edge-list columns: source = retweeting user, target = retweeted
# user, label = id of the retweet.
edge_list.columns = ['source', 'target', 'label']
edge_list.to_csv("data/edge_list_of_RTnetworks[topic=POTUS2016].csv", index=False)

# Build the directed retweet graph from the edge list and store it as GEXF.
D = nx.from_pandas_edgelist(
    edge_list,
    source='source',
    target='target',
    edge_attr='label',
    create_using=nx.DiGraph,
)
nx.write_gexf(D, "data/RTnetworks[topic=POTUS2016].gexf")

In [7]:
# Extract the 2-core sub-graph and save it.
# Self-loops are removed from D first (in place) before computing the core.
self_loops = list(nx.selfloop_edges(D))
D.remove_edges_from(self_loops)
Dk = nx.k_core(D, k=2)
print(f"number of nodes: {Dk.number_of_nodes()}")
print(f"number of edges: {Dk.number_of_edges()}")
nx.write_gexf(Dk, "data/k-core_RTnetworks[topic=POTUS2016].gexf")

number of nodes: 27989
number of edges: 84971


In [8]:
# Select the retweets whose id is stored as the 'label' of an edge in the k-core graph.
# NOTE(review): a DiGraph keeps a single edge (hence a single label) per ordered
# user pair, so repeated retweets between the same two users may be reduced to
# one row here — confirm this is intended.
labels = {tweet_id for _, _, tweet_id in Dk.edges(data='label')}
kcore_rt = rt_list[rt_list['id'].isin(labels)]
kcore_rt.to_csv("data/retweets[restrict=kcore+3media][topic=POTUS2016].csv", index=False)

## 清洗转推文本

!!!!!!警告!!!!!!这里修改了

In [9]:
# Strip punctuation, URLs, line breaks and similar noise from tweet text.
import string

def wordopt(text):
    """Lower-case ``text`` and remove noise with a fixed sequence of regex passes.

    All patterns are raw strings so that sequences such as ``\\[`` or ``\\S``
    are not parsed as (invalid) Python string escapes.

    Args:
        text: The raw tweet text (``str``).

    Returns:
        The cleaned text. Non-word characters are replaced by spaces, so the
        result may contain runs of spaces and leading/trailing spaces.
    """
    text = text.lower()
    text = re.sub(r'\\n', '', text)                     # literal backslash + "n" sequences
    text = re.sub(r'\[.*?\]', '', text)                 # text enclosed in square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)   # URLs
    text = re.sub(r'\W', ' ', text)                     # every non-word character -> space
    # NOTE(review): the pass above already replaced '<', '>', newlines and all
    # punctuation except '_' with spaces, so the HTML-tag and newline passes
    # below can no longer match, and the punctuation pass only removes '_'.
    # The order is kept as-is to preserve the existing output; move the tag
    # removal above the \W pass if tag stripping is actually wanted.
    text = re.sub(r'<.*?>+', '', text)                  # HTML/XML tags (see note)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # punctuation
    text = re.sub(r'\n', '', text)                      # newline characters (see note)
    text = re.sub(r'\w*\d\w*', '', text)                # "words" containing a digit
    return text

# Analyse only the retweets inside the k-core graph.
# Work on an explicit copy: a plain alias would make the column assignment
# below also rewrite kcore_rt (itself a filtered slice of rt_list) and trigger
# pandas' SettingWithCopy warning.
rtwts_3media = kcore_rt.copy()
rtwts_3media['text'] = rtwts_3media['text'].apply(wordopt)

In [10]:
# 去掉停用词

import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
eng_stopwords = nltk.corpus.stopwords.words("english")
# Set view of the same words: membership is tested once per token, and a
# set lookup avoids scanning the whole list each time.
_eng_stopword_set = frozenset(eng_stopwords)

def remove_eng_stopwords(text):
    """Tokenize ``text`` with NLTK and drop tokens found in the English stop-word list.

    Args:
        text: The text to filter (``str``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept_tokens = [tok for tok in nltk.word_tokenize(text) if tok not in _eng_stopword_set]
    return ' '.join(kept_tokens)

rtwts_3media['text'] = rtwts_3media['text'].apply(remove_eng_stopwords)

In [11]:
# 词形还原

# nltk.download('wordnet')
# nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemm = WordNetLemmatizer()

def word_lemmatizer(text):
    """Lemmatize every NLTK token of ``text`` and re-join the lemmas with single spaces."""
    lemmas = map(lemm.lemmatize, nltk.word_tokenize(text))
    return ' '.join(lemmas)

rtwts_3media['text'] = rtwts_3media['text'].apply(word_lemmatizer)

In [12]:
# 去除噪声
from nltk.corpus import stopwords
# Extra stop words that are merged into the `stop` set below.
# NOTE(review): most entries look like Albanian stop words, yet several are
# also ordinary English words (e.g. "here", "do", "as", "be", "to"); confirm
# that dropping them from English tweets is intended. "t" is listed twice,
# which is harmless once the list is turned into a set.
Word_STOPWORDS = ["e", "te", "i", "me", "qe", "ne", "nje", "a", "per", "sh", "nga", "ka", "u", "eshte", "dhe", "shih", "nuk",
             "m", "dicka", "ose", "si", "shume", "etj", "se", "pa", "sipas", "s", "t", "dikujt", "dike", "mire", "vet",
             "bej", "ai", "vend", "prej", "ja", "duke", "tjeter", "kur", "ia", "ku", "ta", "keq", "dy", "ben", "bere",
             "behet", "dickaje", "edhe", "madhe", "la", "sa", "gjate", "zakonisht", "pas", "veta", "mbi", "disa", "iu",
             "mos", "c", "para", "dikush", "gje", "be", "pak", "tek", "fare", "beri", "po", "bie", "k", "do", "gjithe",
             "vete", "mund", "kam", "le", "jo", "beje", "tij", "kane", "ishte", "jane", "vjen", "ate", "kete", "neper",
             "cdo", "na", "marre", "merr", "mori", "rri", "deri", "b", "kishte", "mban", "perpara", "tyre", "marr",
             "gjitha", "as", "vetem", "nen", "here", "tjera", "tjeret", "drejt", "qenet", "ndonje", "nese", "jap",
             "merret", "rreth", "lloj", "dot", "saj", "nder", "ndersa", "cila", "veten", "ma", "ndaj", "mes", "ajo",
             "cilen", "por", "ndermjet", "prapa", "mi", "tere", "jam", "ashtu", "kesaj", "tille", "behem", "cilat",
             "kjo", "menjehere", "ca", "je", "aq", "aty", "prane", "ato", "pasur", "qene", "cilin", "teper", "njera",
             "tej", "krejt", "kush", "bejne", "ti", "bene", "midis", "cili", "ende", "keto", "kemi", "sic", "kryer",
             "cilit", "atij", "gjithnje", "andej", "siper", "sikur", "ketej", "ciles", "ky", "papritur", "ua",
             "kryesisht", "gjithcka", "pasi", "kryhet", "mjaft", "ketij", "perbashket", "ata", "atje", "vazhdimisht",
             "kurre", "tone", "keshtu", "une", "sapo", "rralle", "vetes", "ishin", "afert", "tjetren", "ketu", "cfare",
             "to", "anes", "jemi", "asaj", "secila", "kundrejt", "ketyre", "pse", "tilla", "mua", "nepermjet", "cilet",
             "ndryshe", "kishin", "ju", "tani", "atyre", "dic", "yne", "kudo", "sone", "sepse", "cilave", "kem", "ty",
             "t'i", "nbsp", "tha", "re", "the", "jr", "t", "n"]
# One lookup set made of NLTK's English stop words, every ASCII punctuation
# character and the custom Word_STOPWORDS list.
punctuation = list(string.punctuation)
text_unknows= Word_STOPWORDS
stop = set(stopwords.words('english')).union(punctuation, text_unknows)

# 去除噪声字符或字符串

from bs4 import BeautifulSoup
def strip_html(text):
    '''Parse text with BeautifulSoup's "html.parser" and return only its text content.'''
    return BeautifulSoup(text, "html.parser").get_text()

def remove_between_square_brackets(text):
    '''Remove square brackets together with the text they enclose.'''
    return re.sub(r'\[[^]]*\]', '', text)

def remove_urls(text):
    '''Remove URLs, i.e. every run of non-space characters starting with "http".

    This used to be a second definition of remove_between_square_brackets,
    which silently replaced the bracket-removal function defined just above.
    '''
    return re.sub(r'http\S+', '', text)

def remove_stopwords(text):
    '''Drop whitespace-separated tokens whose lower-cased form is in the `stop` set.

    Returns the surviving tokens (original casing) joined by single spaces.
    '''
    kept_tokens = [tok.strip() for tok in text.split() if tok.strip().lower() not in stop]
    return " ".join(kept_tokens)

def denoise_text(text):
    '''Strip HTML, bracketed text, URLs and stop words from text, in that order.'''
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_urls(text)
    text = remove_stopwords(text)
    return text

# Run the denoising pipeline over every retweet text.
rtwts_3media['text'] = rtwts_3media['text'].apply(denoise_text)

In [13]:
# Remove punctuation characters.
def punctuation_removal(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    return ''.join(filter(lambda ch: ch not in string.punctuation, text))

# Strip punctuation from every retweet text.
rtwts_3media['text'] = rtwts_3media['text'].map(punctuation_removal)

!!!!!!警告!!!!!!这里修改了

In [14]:
# Merge each user's distinct cleaned texts into one comma-separated string.
# The distinct texts are sorted before joining: iterating a bare set of strings
# yields a different order from run to run, which made the saved CSV (and
# anything computed from it) non-reproducible.
user_texts = (
    rtwts_3media
    .groupby(by='author.username')
    .agg(text=("text", lambda texts: ",".join(sorted(set(texts)))))
    .reset_index(drop=False)
)
user_texts.to_csv("data/user_texts[restrict=kcore+3media+retweet][topic=POTUS2016].csv", index=False)
user_texts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26830 entries, 0 to 26829
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   author.username  26830 non-null  object
 1   text             26830 non-null  object
dtypes: object(2)
memory usage: 419.3+ KB


## 用Perspective API分析语言毒性

In [9]:
# Load the per-user texts together with their Perspective API results.
# NOTE: the same path is written back at the end of this cell, so the input
# file is overwritten in place with the added toxicity_score column.
perspective_res = pd.read_csv("data/user_texts_with_perspective_api_results[restrict=kcore+retweet+3media][topic=POTUS2016].csv")
perspective_res.info()

def get_score_from_json(x):
    """Extract the first ``'score': {'value': ...}`` number from a stringified API result.

    Args:
        x: The stored Perspective API result as text, or a missing value.

    Returns:
        The score as ``float``, or ``None`` when ``x`` is missing or does not
        contain a score entry (e.g. an error payload). Previously a
        non-matching string raised ``AttributeError`` on ``None.group``.
    """
    if pd.isna(x):
        return None
    match = re.search("'score': {'value': (.+?),", x)
    if match is None:
        return None
    return float(match.group(1))

# Parse one toxicity score per user out of the stored API results.
perspective_res['toxicity_score'] = perspective_res['perspective_api_results'].map(get_score_from_json)
# Write the table, now including the score column, back to the same file.
perspective_res.to_csv("data/user_texts_with_perspective_api_results[restrict=kcore+retweet+3media][topic=POTUS2016].csv", index=False)
# Spot-check a few random scores.
perspective_res[['toxicity_score']].sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26830 entries, 0 to 26829
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   author.username          26830 non-null  object 
 1   text                     26830 non-null  object 
 2   perspective_api_results  26830 non-null  object 
 3   toxicity_score           26830 non-null  float64
dtypes: float64(1), object(3)
memory usage: 838.6+ KB


Unnamed: 0,toxicity_score
19521,0.038756
22071,0.146031
17350,0.160431
7192,0.377959
16704,0.109134


## 用LIWC进行分析

In [8]:
# 读取LIWC字典
import liwc
# Path of the LIWC dictionary file to load.
liwcPath = r'data/LIWC2015_English.dic'
# `parse` is called with one token at a time in liwc_analyse and yields that
# token's LIWC categories; `category_names` is returned alongside it.
parse, category_names = liwc.load_token_parser(liwcPath)

In [16]:
# 用LIWC对每个用户进行分析
from sklearn.feature_extraction.text import TfidfVectorizer

def liwc_analyse(user_twts):
    """Compute TF-IDF weights of the LIWC categories found in one user's text.

    Args:
        user_twts: One row of ``user_texts`` (a Series with a ``'text'`` entry).

    Returns:
        A one-row DataFrame: the original row followed by one column per
        category term with its TF-IDF weight. When the text yields no usable
        category term, only the original row is returned.

    Notes:
        The vectorizer is fitted on this single user's document only, so the
        IDF factor is identical for every term and the weights amount to
        normalized category frequencies.
    """
    # Replace every non-alphanumeric character by a space, then map each token
    # to its LIWC categories via the module-level `parse`.
    tokens = re.sub('[^a-zA-Z0-9]', ' ', user_twts['text']).split()
    categories = [category for token in tokens for category in parse(token)]
    corpus = [' '.join(categories)]

    try:
        vectorizer = TfidfVectorizer(max_features=5000)
        weights = vectorizer.fit_transform(corpus)
    except ValueError:
        # Raised for an empty vocabulary (no category terms for this user).
        # Only this case is tolerated: a bare `except` would also hide real
        # failures and silently drop every user's scores.
        df_count = pd.DataFrame()
    else:
        # get_feature_names() was removed in newer scikit-learn releases;
        # prefer its replacement when available.
        if hasattr(vectorizer, 'get_feature_names_out'):
            features = vectorizer.get_feature_names_out()
        else:
            features = vectorizer.get_feature_names()
        df_count = pd.DataFrame(weights.toarray(), columns=features)

    # Turn the input row into a one-row frame so both parts share index 0.
    user_row = pd.DataFrame(user_twts).T.reset_index(drop=True)

    return pd.concat([user_row, df_count], axis=1)  # relatively expensive per row

# user_rt_sentiments = user_texts.apply(liwc_analyse, axis=1).tolist()

In [17]:
from pandarallel import pandarallel

# Start pandarallel with 8 workers and a progress bar.
pandarallel.initialize(nb_workers=8, progress_bar=True)

# Row-wise LIWC analysis in parallel; same call shape as DataFrame.apply.
# The result is a list holding one single-row DataFrame per user.
user_rt_sentiments = user_texts.parallel_apply(liwc_analyse, axis=1).tolist()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3354), Label(value='0 / 3354'))), …

In [18]:
# Persist the list of per-user LIWC result frames with joblib.
dump(user_rt_sentiments, "pkl/user_rt_sentiments[restrict=kcore+3media][topic=POTUS2016][type=list].pkl")

['pkl/user_rt_sentiments[restrict=kcore+3media][topic=POTUS2016][type=list].pkl']

In [19]:
# Re-key the LIWC results by username so they are easy to look up:
# {username: {column name: value}} built from each single-row result frame.
user_rt_sentiments_dict = {}
for result in user_rt_sentiments:
    username = result['author.username'][0]
    user_rt_sentiments_dict[username] = dict(pd.Series(result.T[0]))
# Save the dictionary form of the LIWC results.
dump(user_rt_sentiments_dict, "pkl/user_rt_sentiments_dict[restrict=kcore+3media][topic=POTUS2016].pkl")
len(user_rt_sentiments)

26830

## 社区分类

### 读取大小前8的社区，给用户标上情感和毒性

In [5]:
# Based on earlier results, the 8 largest communities are later grouped into
# 'mainstream', 'fake_news' and 'debunking'.
def get_communities(node_file:str, topn_labels:list)->list:
    '''Extract the communities with the given modularity-class labels.

    Reads the node table from node_file and returns one DataFrame per label in
    topn_labels (same order). Each frame is sorted by 'indegree' in descending
    order and re-indexed from 0; the previous row index is kept in an 'index'
    column.
    '''
    vex = pd.read_csv(node_file)
    return [
        vex[vex['modularity_class'] == label]
        .sort_values(by='indegree', ascending=False)
        .reset_index()
        for label in topn_labels
    ]

# (modularity class, value) pairs; only the class label is used here.
# NOTE(review): the second element is presumably each community's share of nodes — confirm.
top8_list = [(12,0.2551), (13,0.1888), (3,0.1696), (2,0.1603), (7,0.1081), (4,0.0433), (5,0.0333), (0,0.024)]
communities_topn = get_communities(
    "data/(nodes)k-core_RTnetworks[topic=POTUS2016][modularity=1].csv",
    [label for label, _ in top8_list],
)

In [21]:
def get_user_liwc_score(u, category:str):
    """Look up one LIWC score of a user in ``user_rt_sentiments_dict``.

    Args:
        u: The username (dictionary key).
        category: Name of the score entry to read for that user.

    Returns:
        The stored value, or ``None`` when the user or the category is absent.
        Plain ``dict.get`` lookups replace the former bare ``except`` so that
        unrelated errors are no longer swallowed.
    """
    user_scores = user_rt_sentiments_dict.get(u)
    if user_scores is None:
        return None
    return user_scores.get(category)

# Attach the sentiment scores to the users of every community.
# NOTE: joblib files are pickles — only load files produced by this project.
user_rt_sentiments_dict = load("pkl/user_rt_sentiments_dict[restrict=kcore+3media][topic=POTUS2016].pkl")
for c in communities_topn:
    # Each category must match a key stored for the user; otherwise the
    # resulting column is entirely empty.
    for category in ('positive', 'negative', 'affect'):
        c[category + '_score'] = c['Id'].apply(get_user_liwc_score, args=(category,))

In [7]:
# Index the Perspective results by username so scores can be looked up by user.
perspective_res.index = perspective_res['author.username']

def get_user_toxicity(u):
    """Return the toxicity score stored for user ``u``.

    Args:
        u: The username to look up in ``perspective_res``.

    Returns:
        The score, or ``None`` when the user is unknown, the score is missing,
        or the lookup is ambiguous (several rows for the same username). These
        cases are tested explicitly instead of relying on a bare ``raise``
        inside a bare ``except``.
    """
    score = perspective_res['toxicity_score'].get(u)
    if score is None or not pd.api.types.is_scalar(score) or pd.isna(score):
        return None
    return score

# Attach the toxicity score to the users of every community.
for c in communities_topn:
    c['toxicity_score'] = c['Id'].apply(get_user_toxicity)

In [22]:
# Persist the community frames (now carrying the score columns) with joblib.
dump(communities_topn, "pkl/communities_topn[topic=POTUS2016].pkl")

['pkl/communities_topn[topic=POTUS2016].pkl']

### 每个社区按前10用户来分类

In [9]:
# Ten highest-indegree users of community 0 (frames are sorted and 0-indexed).
communities_topn[0].head(10)

Unnamed: 0,index,Id,Label,modularity_class,indegree,outdegree,degree,Color,positive_score,negative_score,affect_score,toxicity_score
0,21281,PrisonPlanet,PrisonPlanet,12,3238,0,3238,#df89ff,,,,
1,11360,FoxNews,FoxNews,12,894,0,894,#df89ff,,,,
2,20315,gatewaypundit,gatewaypundit,12,809,1,810,#df89ff,,,,0.15214
3,3338,LindaSuhler,LindaSuhler,12,711,12,723,#df89ff,,,,0.195994
4,23337,deYook,deYook,12,667,0,667,#df89ff,,,,
5,3719,DanScavino,DanScavino,12,509,0,509,#df89ff,,,,
6,27938,PVeritas_Action,PVeritas_Action,12,504,0,504,#df89ff,,,,
7,14347,BreitbartNews,BreitbartNews,12,491,0,491,#df89ff,,,,
8,15721,DrMartyFox,DrMartyFox,12,453,5,458,#df89ff,,,,0.254629
9,11273,LouDobbs,LouDobbs,12,439,0,439,#df89ff,,,,


认为此社区为虚假信息媒体社区，右翼社区。

- PrisonPlanet：Paul Joseph Watson，生于1982年5月24日，是一位英国的右翼YouTuber、广播主持人和阴谋论者。

- FoxNews：在主流媒体列表和虚假信息媒体列表中。

- gatewaypundit：在虚假信息媒体列表中，极右翼媒体。

- LindaSuhler: 支持特朗普的一个社交媒体影响者，大量转发DonaldJTrumpJr、catturd2等人(发表过阴谋论)的与特朗普相关的推文。

- deYook：7000粉的社交媒体影响者，现在主要发表支持巴勒斯坦的言论，16年时也分享支持特朗普的新闻，只是偶尔分享一些虚假信息媒体的新闻，如infowars。

- DanScavino: Daniel Scavino Jr. 是一位美国政治顾问，他在特朗普政府中担任了白宫副幕僚长兼通讯主任（2020年至2021年），以及社交媒体主任（2017年至2021年）。据报道，Scavino 还被指控在2020年大选后通过社交媒体传播关于选举舞弊的虚假信息。

- PVeritas_Action：Project Veritas Action 是一个由 James O’Keefe 在2010年创立的美国极右翼活动组织。该组织制作了一些经过欺骗性编辑的秘密录音视频，试图破坏主流媒体组织和进步团体的信誉。Project Veritas Action 还使用陷阱来为其目标制造负面新闻，并在其视频和行动中传播虚假信息和阴谋论。

- BreitbartNews：极右翼媒体，在虚假信息媒体列表中。

- DrMartyFox：Dr. Marty Fox，使用比较激烈的言论支持特朗普，但看不出来分享过什么虚假信息。

- LouDobbs：Lou Dobbs（全名Louis Carl Dobbs，1945年9月24日出生）是一位美国保守派政治评论员、作家和前电视主持人，曾经发表过阴谋论，特朗普的坚定支持者。

In [10]:
# Ten highest-indegree users of community 1 (frames are sorted and 0-indexed).
communities_topn[1].head(10)

Unnamed: 0,index,Id,Label,modularity_class,indegree,outdegree,degree,Color,positive_score,negative_score,affect_score,toxicity_score
0,22234,kurteichenwald,kurteichenwald,13,5843,0,5843,#73c000,,,,
1,27013,JoyAnnReid,JoyAnnReid,13,645,3,648,#73c000,,,,0.270478
2,25018,Blue______Wave,Blue______Wave,13,508,0,508,#73c000,,,,
3,16609,ObsoleteDogma,ObsoleteDogma,13,501,0,501,#73c000,,,,
4,22338,jonathanalter,jonathanalter,13,495,0,495,#73c000,,,,
5,24475,mmpadellan,mmpadellan,13,340,0,340,#73c000,,,,
6,12747,ananavarro,ananavarro,13,339,0,339,#73c000,,,,
7,7251,SamuelAAdams,SamuelAAdams,13,289,1,290,#73c000,,,,0.169385
8,25333,DylanByers,DylanByers,13,279,0,279,#73c000,,,,
9,12237,haroldpollack,haroldpollack,13,254,0,254,#73c000,,,,


认为此社区为主流媒体记者社区。

- kurteichenwald：Kurt Alexander Eichenwald（出生于1961年6月28日）是一位美国记者和纽约时报畅销书的作者，他曾是纽约时报的高级作家和调查记者。

- JoyAnnReid：Joy-Ann M. Lomena-Reid，通常被称为Joy Reid，是一位美国记者和电视节目主持人。她是MSNBC的全国记者。

- Blue______Wave：账号名称为“#RemoveTrump”，发表强烈反对特朗普的内容。

- ObsoleteDogma：Matt O’Brien 是一位经济学家和作家，目前在华盛顿邮报担任经济和商业评论员。

- jonathanalter：NBC新闻(主流媒体)的特约记者。

- mmpadellan：BrooklynDad_Defiant!（Majid Padellan）在社交媒体上的活动引起了一些争议，但这主要是因为他接受了一个民主党政治行动委员会的资金，而没有明确披露这一点。然而，没有明确的证据表明他发布过虚假信息。

- ananavarro：Ana Violeta Navarro-Cárdenas（原名Ana Violeta Navarro Flores，1971年12月28日出生）是一位尼加拉瓜裔美国政治策略家和评论员。她在各种电视节目和新闻媒体上露面，包括CNN、CNN en Español、ABC News和Telemundo。她也是白天脱口秀节目《The View》的共同主持人，并因其工作获得艾美奖提名。她是共和党的成员，并将自己的政治立场描述为“中间派”。

- SamuelAAdams：作家。

- DylanByers：Dylan Byers 是一位美国记者，Puck 的创始合伙人和高级记者；此前曾任 NBC 新闻高级媒体记者，也曾在 CNN、Politico 和 Adweek 工作。

- haroldpollack：Harold Pollack是一位美国教授，他在芝加哥大学任教。他的研究主要集中在公共卫生和卫生政策领域。

In [11]:
# Ten highest-indegree users of community 2 (frames are sorted and 0-indexed).
communities_topn[2].head(10)

Unnamed: 0,index,Id,Label,modularity_class,indegree,outdegree,degree,Color,positive_score,negative_score,affect_score,toxicity_score
0,7633,PolitiFact,PolitiFact,3,5509,0,5509,#00c4ff,,,,
1,15807,CNN,CNN,3,2785,0,2785,#00c4ff,,,,
2,13837,politifactlive,politifactlive,3,1336,0,1336,#00c4ff,,,,
3,21422,NPR,NPR,3,1189,2,1191,#00c4ff,,,,0.112437
4,1496,CNNPolitics,CNNPolitics,3,877,1,878,#00c4ff,,,,0.252448
5,25179,nprpolitics,nprpolitics,3,698,3,701,#00c4ff,,,,0.025321
6,23330,cnni,cnni,3,586,1,587,#00c4ff,,,,0.285477
7,12723,TIME,TIME,3,373,0,373,#00c4ff,,,,
8,7827,PPact,PPact,3,322,0,322,#00c4ff,,,,
9,12159,TheDailyEdge,TheDailyEdge,3,277,2,279,#00c4ff,,,,0.094383


认为此社区为主流媒体+辟谣媒体。

- PolitiFact：中左媒体，在辟谣媒体列表中。

- CNN：偏左的主流媒体，在主流媒体列表中。

- politifactlive：PolitiFact子账号。

- NPR：National Public Radio，中左媒体，在主流媒体列表中。

- CNNPolitics：CNN的子账号，主流媒体。

- nprpolitics：NPR子账号，主流媒体。

- cnni：CNN International，CNN的子账号，主要报道国际事件。

- TIME：主流媒体。

- PPact：Planned Parenthood Action Fund（PPAF）是一个非营利性的，非党派的组织。它是Planned Parenthood Federation of America的倡导分支，该组织提供性健康和生殖健康的医疗和教育服务。

- TheDailyEdge：推特简介“打击法西斯主义。揭露共和党的谎言和腐败”。

In [12]:
# Ten highest-indegree users of community 3 (frames are sorted and 0-indexed).
communities_topn[3].head(10)

Unnamed: 0,index,Id,Label,modularity_class,indegree,outdegree,degree,Color,positive_score,negative_score,affect_score,toxicity_score
0,15518,nytimes,nytimes,2,5409,5,5414,#4c463e,,,,0.360437
1,9023,washingtonpost,washingtonpost,2,3948,1,3949,#4c463e,,,,0.025203
2,19873,ScottShaneNYT,ScottShaneNYT,2,818,0,818,#4c463e,,,,
3,20299,USATODAY,USATODAY,2,708,3,711,#4c463e,,,,0.079251
4,1407,NewYorker,NewYorker,2,670,0,670,#4c463e,,,,
5,9156,MSNBC,MSNBC,2,418,0,418,#4c463e,,,,
6,10792,TheLastWord,TheLastWord,2,387,0,387,#4c463e,,,,
7,10729,propublica,propublica,2,354,0,354,#4c463e,,,,
8,17963,anneapplebaum,anneapplebaum,2,337,1,338,#4c463e,,,,0.330896
9,6106,nytopinion,nytopinion,2,332,1,333,#4c463e,,,,0.277503


认为此社区为主流媒体社区。

In [13]:
# Ten highest-indegree users of community 4 (frames are sorted and 0-indexed).
communities_topn[4].head(10)

Unnamed: 0,index,Id,Label,modularity_class,indegree,outdegree,degree,Color,positive_score,negative_score,affect_score,toxicity_score
0,16197,GlennKesslerWP,GlennKesslerWP,7,2511,19,2530,#ff8805,,,,0.254629
1,6344,pbump,pbump,7,939,0,939,#ff8805,,,,
2,19204,greenhousenyt,greenhousenyt,7,498,0,498,#ff8805,,,,
3,13654,TimothyNoah1,TimothyNoah1,7,476,0,476,#ff8805,,,,
4,9961,brianstelter,brianstelter,7,227,6,233,#ff8805,,,,0.276379
5,9387,michikokakutani,michikokakutani,7,168,7,175,#ff8805,,,,0.262931
6,24196,ThePlumLineGS,ThePlumLineGS,7,148,0,148,#ff8805,,,,
7,12026,jonathanweisman,jonathanweisman,7,144,0,144,#ff8805,,,,
8,16786,kylegriffin1,kylegriffin1,7,139,1,140,#ff8805,,,,0.186492
9,6905,RyanLizza,RyanLizza,7,129,0,129,#ff8805,,,,


In [14]:
# Ten highest-indegree users of community 5 (frames are sorted and 0-indexed).
communities_topn[5].head(10)

Unnamed: 0,index,Id,Label,modularity_class,indegree,outdegree,degree,Color,positive_score,negative_score,affect_score,toxicity_score
0,16441,guardian,guardian,4,643,1,644,#ff5584,,,,0.159878
1,8518,Independent,Independent,4,537,5,542,#ff5584,,,,0.15214
2,613,GeorgeMonbiot,GeorgeMonbiot,4,252,0,252,#ff5584,,,,
3,16324,dansinker,dansinker,4,152,0,152,#ff5584,,,,
4,19432,IndyUSA,IndyUSA,4,78,1,79,#ff5584,,,,0.659969
5,21706,GuardianUS,GuardianUS,4,69,1,70,#ff5584,,,,0.016839
6,20543,robdelaney,robdelaney,4,68,0,68,#ff5584,,,,
7,2354,guardiannews,guardiannews,4,39,0,39,#ff5584,,,,
8,27901,Scientists4EU,Scientists4EU,4,35,0,35,#ff5584,,,,
9,18442,hairmich42,hairmich42,4,25,1,26,#ff5584,,,,0.039227


In [15]:
# Ten highest-indegree users of community 6 (frames are sorted and 0-indexed).
communities_topn[6].head(10)

Unnamed: 0,index,Id,Label,modularity_class,indegree,outdegree,degree,Color,positive_score,negative_score,affect_score,toxicity_score
0,18464,nytpolitics,nytpolitics,5,1257,0,1257,#00bd94,,,,
1,25297,postpolitics,postpolitics,5,987,1,988,#00bd94,,,,0.286744
2,27781,SteveRattner,SteveRattner,5,70,0,70,#00bd94,,,,
3,15053,vplus,vplus,5,47,1,48,#00bd94,,,,0.139089
4,15283,FlaDems,FlaDems,5,45,0,45,#00bd94,,,,
5,16057,Simon_Cullen,Simon_Cullen,5,43,0,43,#00bd94,,,,
6,27758,peaceisactive,peaceisactive,5,19,3,22,#00bd94,,,,0.292228
7,23604,velvetus1,velvetus1,5,16,3,19,#00bd94,,,,0.220438
8,12076,firefire100,firefire100,5,14,0,14,#00bd94,,,,
9,23652,juliezweil,juliezweil,5,13,0,13,#00bd94,,,,


In [16]:
# Ten highest-indegree users of community 7 (frames are sorted and 0-indexed).
communities_topn[7].head(10)

Unnamed: 0,index,Id,Label,modularity_class,indegree,outdegree,degree,Color,positive_score,negative_score,affect_score,toxicity_score
0,1134,mmfa,mmfa,0,783,3,786,#d3b3b0,,,,0.717606
1,14776,RBReich,RBReich,0,313,0,313,#d3b3b0,,,,
2,18661,PENamerica,PENamerica,0,153,0,153,#d3b3b0,,,,
3,26694,LOLGOP,LOLGOP,0,115,0,115,#d3b3b0,,,,
4,15813,richardhine,richardhine,0,100,1,101,#d3b3b0,,,,0.079251
5,2437,Smith83K,Smith83K,0,90,2,92,#d3b3b0,,,,0.295883
6,10294,rcooley123,rcooley123,0,50,0,50,#d3b3b0,,,,
7,3498,markfollman,markfollman,0,42,1,43,#d3b3b0,,,,0.03852
8,24182,GoAngelo,GoAngelo,0,22,0,22,#d3b3b0,,,,
9,26992,ggrushko,ggrushko,0,16,0,16,#d3b3b0,,,,


In [23]:
# Assign communities to media types; each value is a position in communities_topn.
ms_list = [3]    # mainstream
fn_list = [0]                   # fake news
db_list = [2]                   # debunking

def communitiy_classifying(communities, label_list, commnity_number_lists):
    '''Group community frames under type labels.

    communities is a list of community DataFrames; the i-th entry of
    commnity_number_lists holds the positions (into communities) of the
    frames that belong to label_list[i]. Returns a dict mapping each label to
    the concatenation of its frames, sorted by 'indegree' in descending order,
    re-indexed from 0 and without the 'index' column.
    '''
    assert len(label_list) == len(commnity_number_lists)
    com_3type = {}
    for label, numbers in zip(label_list, commnity_number_lists):
        merged = pd.concat([communities[no] for no in numbers], axis=0)
        # Rank the merged users by in-degree, highest first.
        merged = merged.sort_values(['indegree'], ascending=False).reset_index(drop=True)
        com_3type[label] = merged.drop(['index'], axis=1)
    return com_3type

# Optional: reload the saved communities instead of recomputing them.
# communities_topn = load("pkl/communities_topn[topic=POTUS2016].pkl")
com_3type = communitiy_classifying(
    communities_topn,
    label_list=['mainstream','fake_news','debunking'],
    commnity_number_lists=[ms_list, fn_list, db_list],
)

### 情感和毒性分析

#### 毒性-情感联合分布

In [None]:
def toxicity_sentiment_heatmap(community, category, xlim=(-0.01, 0.4), ylim=(-0.05, 0.7)):
    '''Draw a filled 2D kernel-density plot of a sentiment score against toxicity.

    community selects the frame in com_3type, category picks the
    "<category>_score" column for the x axis, and xlim / ylim set the visible
    axis ranges.
    '''
    scores = com_3type[community]
    # One new figure per call, with the density drawn on its axes.
    fig, ax = plt.subplots()
    sns.kdeplot(data=scores, x=category+'_score', y='toxicity_score', fill=True, levels=15, ax=ax)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.set_title(community)
    plt.show()

In [4]:
import numpy as np
import plotly.graph_objects as go
from scipy.stats import gaussian_kde

# Interactive 3D view of the joint density of a sentiment score and toxicity.
def toxicity_sentiment_heatmap_3d(community, category, communities=None):
    '''Draw an interactive 3D surface of the score/toxicity kernel density.

    Args:
        community: Key of the community frame to plot.
        category: Sentiment category; the "<category>_score" column is the x axis.
        communities: Optional mapping {community: DataFrame}. Defaults to the
            module-level com_3type, so existing two-argument calls are unchanged.

    Raises:
        ValueError: If fewer than 3 users have both scores; a 2D kernel
            density cannot be estimated from that few points.
    '''
    if communities is None:
        communities = com_3type
    x_col = category + '_score'

    # Keep only users for whom both scores are available.
    frame = communities[community]
    frame = frame[frame[x_col].notna() & frame['toxicity_score'].notna()]
    if len(frame) < 3:
        raise ValueError(
            f"community {community!r} has only {len(frame)} users with both "
            f"{x_col!r} and 'toxicity_score'; at least 3 are needed for a 2D density"
        )
    data = np.vstack([
        frame[x_col].to_numpy(dtype=float),
        frame['toxicity_score'].to_numpy(dtype=float),
    ])

    # Kernel density estimate of the joint distribution.
    kde = gaussian_kde(data)

    # Evaluate the density on a regular 100 x 100 grid spanning the observed ranges.
    xgrid = np.linspace(data[0].min(), data[0].max(), 100)
    ygrid = np.linspace(data[1].min(), data[1].max(), 100)
    Xgrid, Ygrid = np.meshgrid(xgrid, ygrid)
    Z = kde.evaluate(np.vstack([Xgrid.ravel(), Ygrid.ravel()]))

    # 3D surface of the density.
    fig = go.Figure(data=[go.Surface(z=Z.reshape(Xgrid.shape), x=Xgrid, y=Ygrid, colorscale='Jet')])

    # Figure layout: title, fixed size, axis titles, margins.
    fig.update_layout(title=community, autosize=False,
                    width=700, height=700,
                    scene=dict(
                        xaxis_title=x_col,
                        yaxis_title='toxicity_score',
                        zaxis_title='Kernel Density'
                    ),
                    margin=dict(l=65, r=50, b=65, t=90))

    fig.show()

##### Toxicity-Positivity

In [None]:
toxicity_sentiment_heatmap_3d(community='mainstream', category='positive')

In [None]:
toxicity_sentiment_heatmap_3d(community='fake_news', category='positive')

In [None]:
toxicity_sentiment_heatmap_3d(community='debunking', category='positive')

##### Toxicity-Negativity

In [None]:
toxicity_sentiment_heatmap_3d(community='mainstream', category='negative')

In [None]:
toxicity_sentiment_heatmap_3d(community='fake_news', category='negative')

In [None]:
toxicity_sentiment_heatmap_3d(community='debunking', category='negative')

##### Toxicity-Affect

In [None]:
toxicity_sentiment_heatmap_3d(community='mainstream', category='affect')

In [None]:
toxicity_sentiment_heatmap_3d(community='fake_news', category='affect')

In [None]:
toxicity_sentiment_heatmap_3d(community='debunking', category='affect')