# POTUS Election 2016 Analysis ver2 (1)

python=3.7

2016 POTUS Election

筛选与2016年美国总统大选有关的帖子，并处理其中的URL

In [2]:
import pandas as pd
import seaborn as sns
import re
from joblib import dump, load
from joblib import Parallel, delayed  # 并行计算
from tldextract import extract
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

## 筛选2016年10月10日-2017年1月20日有关的英文帖子

In [2]:
# Search terms used to select election-related posts (matched against the
# post text and the hashtag field by `filter`).
keywords = [
    "MAGA",
    "MakeAmericaGreatAgain",
    "Trump",
    "AmericaFirst",
    "Make America Great Again",
    "Hillary",
    "Clinton",
    "ImWithHer",
    "OHHillYes",
    "election2016",
    "elections2016",
    "debates2016",
]

# Filter one raw CSV file (the source files are numbered from 1 to 27)
def filter(csv_file_index, keywords, source_path, target_path, result_file_suffix):
    """Keep the rows of one raw CSV whose text or hashtags mention a keyword.

    Reads ``source_path + str(csv_file_index) + ".csv"``, selects the rows
    whose ``text`` column or ``entities.hashtags`` column matches any keyword
    (case-insensitive), removes rows with a repeated ``id`` (first occurrence
    wins) and writes the result to
    ``target_path + str(csv_file_index) + result_file_suffix + ".csv"``.

    Parameters
    ----------
    csv_file_index : number of the source file; used in both file names.
    keywords : iterable of str, joined with ``|`` into one regular expression,
        so regex metacharacters inside a keyword are interpreted as regex.
    source_path : prefix (folder path including trailing separator) of the input.
    target_path : prefix (folder path including trailing separator) of the output.
    result_file_suffix : text inserted between the index and ``.csv``.

    NOTE: the name shadows the builtin ``filter``; it is kept because callers
    refer to it by this name.
    """
    df = pd.read_csv(source_path + str(csv_file_index) + ".csv")
    pattern = '|'.join(keywords)  # build the alternation once for both searches

    # na=False: rows with missing text count as "no match" instead of putting
    # NaN into the mask, which would make the boolean selection below fail.
    in_text = df['text'].str.contains(pattern, case=False, na=False)
    # Hashtags are cast to str first, so missing values become the text "nan".
    in_tags = df['entities.hashtags'].astype(str).str.contains(pattern, case=False, na=False)

    # A single OR-ed mask keeps the original row order and never duplicates a
    # row that matches in both columns.
    matched = df[in_text | in_tags].drop_duplicates(subset='id', keep='first')
    matched.to_csv(target_path + str(csv_file_index) + result_file_suffix + ".csv", index=False)

In [4]:
# Inclusive range of raw CSV file numbers to process, and where to read/write.
begin = end = 1
source_folder = "/mnt/data/Project7/fakenews/csv/"
target_folder = "data/csv_potus_election2016/"
suffix = "_potus_election2016"
# Parallel filtering of files begin..end (left disabled in this cell):
# Parallel(n_jobs=4)(delayed(filter)(ind, keywords, source_folder, target_folder, suffix) \
#                               for ind in range(begin, end+1))

In [5]:
# Merge the per-file filter results and keep only English posts
results = []
for i in range(begin, end + 1):
    df = pd.read_csv(f"{target_folder}{i}{suffix}.csv")
    results.append(df)
results.reverse()  # concatenate in descending file-number order
# Stack all files, then drop rows whose id was already seen
df_cat = pd.concat(results, axis=0).drop_duplicates(subset='id', keep='first')
# English posts only, with a fresh 0..n-1 row index
df_cat = df_cat[df_cat['lang'] == 'en'].reset_index(drop=True)
df_cat.to_csv("data/full_data[topic=POTUS2016].csv", index=False)  # save

In [45]:
# Show `created_at` from row 1023030 onward to locate the date cutoff; in the
# displayed output, row 1023032 is the first one stamped 2017-01-20.
df_cat.loc[1023030:, ['created_at']]  # 1023032-

Unnamed: 0,created_at
1023030,2017-01-21T00:00:04.000Z
1023031,2017-01-21T00:00:00.000Z
1023032,2017-01-20T23:59:54.000Z
1023033,2017-01-20T23:59:53.000Z
1023034,2017-01-20T23:59:52.000Z
...,...
2084876,2016-10-10T13:46:20.000Z
2084877,2016-10-10T13:46:19.000Z
2084878,2016-10-10T13:46:18.000Z
2084879,2016-10-10T13:46:17.000Z


In [54]:
# Keep the rows from label 1023032 to the end and overwrite the saved data set.
# NOTE(review): the row label is hard-coded from the inspection output of this
# particular run; it must be re-derived if the input data changes.
data = df_cat.loc[1023032:]
data.to_csv("data/full_data[topic=POTUS2016].csv", index=False)

In [47]:
# Count the distinct users that appear in the data set.
# Post authors
authors = set(data['author.username'].dropna())
# Users who were replied to
in_reply_to_users = set(data['in_reply_to_username'].dropna())
# Users who were retweeted
retweeted_users = set(data['retweeted_username'].dropna())
# Users who were quoted
quoted_users = set(data['quoted_username'].dropna())
# Every user involved in any of the roles above
all_users = authors.union(in_reply_to_users, retweeted_users, quoted_users)

# Report the size of each group
for caption, group in (
    ("authors:", authors),
    ("in_reply_to_users:", in_reply_to_users),
    ("retweeted_users:", retweeted_users),
    ("quoted_users:", quoted_users),
    ("all_users:", all_users),
):
    print(caption, len(group))

authors: 387580
in_reply_to_users: 15799
retweeted_users: 24571
quoted_users: 5376
all_users: 394207


## 处理URL

In [55]:
# Reload the data set saved by the earlier cells for the URL processing below.
df = pd.read_csv("data/full_data[topic=POTUS2016].csv")

In [56]:
# Collect the URLs contained in one post's URL field
def get_url_from_url_list(url_list: str, url_set: set):
    """Add every double-quoted substring of ``url_list`` to ``url_set``.

    Parameters
    ----------
    url_list : the raw cell value; only string values are parsed.
    url_set : set that is updated in place with the extracted strings.

    Returns
    -------
    The input unchanged when it is not a string (for example a missing
    value); otherwise ``None`` (the result is delivered through ``url_set``).
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses, which would otherwise be skipped silently.
    if not isinstance(url_list, str):
        return url_list
    # Non-greedy match: each "..." pair yields one entry.
    url_set.update(re.findall(r'"(.+?)"', url_list))

# Gather the distinct URLs across all rows; the cell's value is their count.
url_set = set()
df['entities.urls'].apply(lambda cell: get_url_from_url_list(cell, url_set))
len(url_set)

160248

### 短url（长度小于等于23）

In [57]:
# URLs of at most 23 characters are treated as shortened links.
short_urls = {link for link in url_set if len(link) <= 23}

# Persist the set so it can be processed elsewhere, then report its size.
dump(short_urls, "pkl/short_urls[topic=POTUS2016][type=set].pkl")
print("number of short urls: ", len(short_urls))

number of short urls:  58106


短URL的还原（展开为原始链接）需要在境外服务器上运行；还原结果保存在 data/reverted_short_urls[topic=POTUS2016].csv 中，下面直接读取该结果。

In [3]:
# Load the short URLs together with their reverted (expanded) form; the file
# has the columns `url` and `reverted_url`.
df_reverted_urls = pd.read_csv("data/reverted_short_urls[topic=POTUS2016].csv")
df_reverted_urls.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58106 entries, 0 to 58105
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   url           58106 non-null  object
 1   reverted_url  58106 non-null  object
dtypes: object(2)
memory usage: 908.0+ KB


### 长url（长度大于23）

In [60]:
# URLs longer than 23 characters are treated as already-expanded links.
long_urls = {link for link in url_set if len(link) > 23}

# Persist the set, then report its size.
dump(long_urls, "pkl/long_urls[topic=POTUS2016][type=set].pkl")
# The label previously said "short urls" although this is the long-URL count.
print("number of long urls: ", len(long_urls))

number of short urls:  102142


In [4]:
# To resume from a saved run, restore the set first:
# long_urls = load("pkl/long_urls[topic=POTUS2016][type=set].pkl")
# A set has no defined iteration order, so sort it to get the same row order
# (and therefore the same output files) on every run.
df_long_urls = pd.DataFrame(sorted(long_urls), columns=['url'])
df_long_urls.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102142 entries, 0 to 102141
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     102142 non-null  object
dtypes: object(1)
memory usage: 798.1+ KB


### 长短URL合并&标记

In [11]:
# Long URLs need no expansion, so their reverted form is the URL itself; this
# gives both frames the same `url` / `reverted_url` columns.
df_long_urls['reverted_url'] = df_long_urls['url']
# Stack short and long URLs into one table with a fresh 0..n-1 index.
df_urls = pd.concat([df_reverted_urls, df_long_urls], ignore_index=True)

In [12]:
# Helper that extracts the domain from a URL
from tldextract import extract
def get_domain(url:str) -> str:
    """Return ``<domain>.<suffix>`` for ``url`` as split by ``tldextract``.

    The subdomain part is dropped. The two parts are always joined with a
    dot, so if tldextract reports an empty suffix the result ends with ``.``.
    """
    res = extract(url)
    # Use the named fields instead of positions 1 and 2: positional indexing
    # only works while ExtractResult is a tuple type, which is not the case
    # in newer tldextract releases.
    return res.domain + '.' + res.suffix

df_urls['domain'] = df_urls['reverted_url'].apply(get_domain)

In [13]:
# Load the domain lists for the three kinds of media
# fake news media
fn_df = pd.read_csv("data/FakeNewsDomain_from_iffy.news_23.09.06.csv")
# mainstream media
ms_df = pd.read_csv("data/high_credibility_websites_CoVaxxy_ver2.csv")
# debunking media
db_df = pd.read_csv("data/(Merge_ver2)debunking_fact-checking_sites.csv")

In [14]:
# Prepare the lookup structures used to label each URL
fn_set = set(fn_df['Domain'])
ms_set = set(ms_df['site'])
db_set = set(db_df['domain'].dropna())
# Debunking sites are searched for anywhere inside the full URL. Escape every
# entry so that "." and other regex metacharacters match literally, and fall
# back to a never-matching pattern when the list is empty (an empty pattern
# would match every URL).
# NOTE(review): assumes the CSV holds literal domain/path fragments, not
# hand-written regular expressions — confirm.
db_re = '|'.join(re.escape(site) for site in sorted(db_set)) or r'(?!)'

# Placeholder label columns, filled in by the labeling step
df_urls['fake_news'] = ''
df_urls['mainstream'] = ''
df_urls['debunking'] = ''

# Labeling function (designed to be run once per label, e.g. in worker processes)
def url_labeling(urls:pd.DataFrame, media, label:str):
    """Compute one boolean label column for every row of ``urls``.

    Parameters
    ----------
    urls : frame with a ``domain`` column (used for 'fake_news' and
        'mainstream') and a ``reverted_url`` column (used for 'debunking').
    media : for 'fake_news' / 'mainstream', a collection of domains tested
        for exact membership. For 'debunking', either a regular-expression
        string searched in the full URL, or an iterable of literal strings
        that is turned into such a pattern.
    label : 'fake_news', 'mainstream' or 'debunking'; also the name of the
        column that is written and returned.

    Returns
    -------
    The ``urls[label]`` column holding True/False for every row.
    """
    if label != 'debunking':
        # Exact match of the extracted domain against the media list.
        flags = urls['domain'].isin(media)
    else:
        if isinstance(media, str):
            pattern = media
        else:
            # Literal alternatives; '(?!)' never matches, which avoids an
            # empty pattern matching every URL.
            pattern = '|'.join(re.escape(site) for site in media) or r'(?!)'
        # Search anywhere in the URL; missing URLs are labeled False.
        flags = urls['reverted_url'].str.contains(pattern, regex=True, na=False)

    # One whole-column assignment replaces the per-row chained assignment
    # (urls[label][i] = ...), which pandas does not guarantee to write
    # through to the frame.
    urls[label] = flags
    return urls[label]

# Run the three labeling jobs in separate worker processes
import multiprocessing as mp
from multiprocessing import Pool
pool = Pool(processes=3)
# apply_async returns immediately with a result handle; the computed column is
# fetched later with .get().
res_fn = pool.apply_async(url_labeling, (df_urls, fn_set, 'fake_news'))
res_ms = pool.apply_async(url_labeling, (df_urls, ms_set, 'mainstream'))
res_db = pool.apply_async(url_labeling, (df_urls, db_re, 'debunking'))
# No more tasks will be submitted; wait for the workers to finish.
pool.close()
pool.join()

In [None]:
# Combine the URL columns with the three label columns and save the result
base_cols = df_urls.loc[:, ['url','reverted_url','domain']]
label_cols = [job.get() for job in (res_fn, res_ms, res_db)]
df_urls_labeled = pd.concat([base_cols, *label_cols], axis=1)
df_urls_labeled.to_csv("data/labeled_urls[topic=POTUS2016].csv", index=False)