# COVID-19 Analysis (1)

python=3.7

2020 POTUS Election

筛选与COVID-19有关的帖子和URL处理

In [2]:
import pandas as pd
import seaborn as sns
import re
from joblib import dump, load
from joblib import Parallel, delayed  # 并行计算
from tldextract import extract
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Show every column and the complete cell contents whenever a DataFrame is displayed
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

## 筛选2019年12月-2023年3月14日有关的英文帖子

In [16]:
# Search terms that mark a post as related to COVID-19
keywords = [
    'wuhanpneumonia',
    'COVID19',
    'coronavirus',
    'corona virus',
    '2019-nCoV',
    'SARS-CoV-2',
]

# Filter one raw CSV file (the raw files are numbered 1 to 27)
def filter(csv_file_index, keywords, source_path, target_path, result_file_suffix):
    """Keep the rows of one raw CSV file that mention any keyword and save them.

    A row is kept when any keyword occurs (case-insensitively) in its ``text``
    column or in the string form of its ``entities.hashtags`` column. Rows keep
    their original order and only the first row per ``id`` is retained.

    Args:
        csv_file_index: Number of the file; ``<source_path><index>.csv`` is read.
        keywords: Iterable of search terms, matched literally.
        source_path: Path prefix (including trailing separator) of the input files.
        target_path: Path prefix (including trailing separator) of the output files.
        result_file_suffix: Text appended to the index in the output file name.

    Returns:
        None. The result is written to
        ``<target_path><index><result_file_suffix>.csv``.
    """
    df = pd.read_csv(source_path + str(csv_file_index) + ".csv")  # load the raw file

    # Escape each keyword so regex metacharacters in a term are matched literally.
    pattern = '|'.join(re.escape(keyword) for keyword in keywords)

    # na=False: a post with missing text would otherwise put NaN into the mask,
    # and boolean indexing with a NaN-containing mask raises.
    in_text = df['text'].str.contains(pattern, case=False, na=False)
    # astype(str) turns missing hashtag entries into plain strings before matching.
    in_hashtags = df['entities.hashtags'].astype(str).str.contains(pattern, case=False)

    # A single OR-mask keeps the original row order and never duplicates a row
    # that matched both in the text and in the hashtags.
    df_cat = df[in_text | in_hashtags]
    df_cat = df_cat.drop_duplicates(subset='id', keep='first')  # one row per post id
    df_cat.to_csv(target_path + str(csv_file_index) + result_file_suffix + ".csv", index=False)

In [1]:
# Filter the raw CSV files numbered `begin` through `end` (inclusive) in parallel
begin = 9
end = 27
source_folder = "/mnt/data/Project7/fakenews/csv/"
target_folder = "data/csv_filtered/"
suffix = "_filtered"
# One delayed call per file number, executed by four workers
Parallel(n_jobs=4)(
    delayed(filter)(file_index, keywords, source_folder, target_folder, suffix)
    for file_index in range(begin, end + 1)
)

In [6]:
# Merge the per-file results and keep only the English posts
results = [
    pd.read_csv(target_folder + str(file_no) + suffix + ".csv")
    for file_no in range(9, 27 + 1)
]
# Concatenate starting with the highest-numbered file and ending with the lowest
results = results[::-1]
df_cat = pd.concat(results, axis=0)
df_cat = df_cat.drop_duplicates(subset='id', keep='first')  # one row per post id
is_english = df_cat['lang'] == 'en'
df_cat = df_cat[is_english].reset_index(drop=True)  # English posts, renumbered from 0
df_cat.to_csv("data/full_data[topic=COVID19].csv", index=False)  # save

In [7]:
# Show the creation timestamps as a one-column DataFrame
df_cat[['created_at']]

Unnamed: 0,created_at
0,2023-03-14T23:56:44.000Z
1,2023-03-14T23:44:10.000Z
2,2023-03-14T23:36:54.000Z
3,2023-03-14T23:35:24.000Z
4,2023-03-14T23:31:45.000Z
...,...
3406007,2020-01-16T01:51:20.000Z
3406008,2020-01-16T01:45:56.000Z
3406009,2020-01-16T01:42:33.000Z
3406010,2020-01-16T01:39:07.000Z


In [9]:
# Select every row and column of df_cat as the working dataset `data`,
# then write it to the same CSV path again.
data = df_cat.loc[:, :]
data.to_csv("data/full_data[topic=COVID19].csv", index=False)

In [10]:
# Count the distinct users that appear in the dataset (missing names are ignored)
# Post authors
authors = set(data['author.username'].dropna())
# Users who were replied to
in_reply_to_users = set(data['in_reply_to_username'].dropna())
# Users who were retweeted
retweeted_users = set(data['retweeted_username'].dropna())
# Users who were quoted
quoted_users = set(data['quoted_username'].dropna())
# Every user involved in any of the four roles
all_users = set().union(authors, in_reply_to_users, retweeted_users, quoted_users)

# Report the size of each group
for group_label, group in (
    ("authors:", authors),
    ("in_reply_to_users:", in_reply_to_users),
    ("retweeted_users:", retweeted_users),
    ("quoted_users:", quoted_users),
    ("all_users:", all_users),
):
    print(group_label, len(group))

authors: 1223502
in_reply_to_users: 50785
retweeted_users: 66362
quoted_users: 19370
all_users: 1250926


## 处理URL

In [3]:
# Load the saved COVID-19 post data from disk
df = pd.read_csv("data/full_data[topic=COVID19].csv")

In [4]:
# Collect the URLs referenced by a post
def get_url_from_url_list(url_list: str, url_set: set):
    """Add every double-quoted substring of ``url_list`` to ``url_set``.

    Args:
        url_list: String form of a post's URL list; each entry is expected to
            be wrapped in double quotes. Non-string values (for example a
            missing cell) are ignored.
        url_set: Set that is updated in place with the extracted entries.

    Returns:
        ``url_list`` unchanged when it is not a string, otherwise ``None``.
    """
    # isinstance (rather than an exact type comparison) also accepts str subclasses.
    if not isinstance(url_list, str):
        return url_list
    # Non-greedy match so each quoted entry is captured separately.
    url_set.update(re.findall(r'"(.+?)"', url_list))

# Gather the distinct URLs across all posts, then show how many there are
url_set = set()
for raw_url_list in df['entities.urls']:
    get_url_from_url_list(raw_url_list, url_set)
len(url_set)

310852

### 短url（长度小于等于23）

In [5]:
# URLs of at most 23 characters are treated as short links
short_urls = {candidate for candidate in url_set if len(candidate) <= 23}

dump(short_urls, "pkl/short_urls[topic=COVID19][type=set].pkl")
print("number of short urls: ", len(short_urls))

number of short urls:  42327


去国外服务器上跑转换短URL

In [3]:
# Load the table that maps each short URL (`url`) to its converted form (`reverted_url`)
df_reverted_urls = pd.read_csv("data/reverted_short_urls[topic=COVID19].csv")
# Print a summary of the loaded table (row count, columns, dtypes)
df_reverted_urls.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42327 entries, 0 to 42326
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   url           42327 non-null  object
 1   reverted_url  42327 non-null  object
dtypes: object(2)
memory usage: 661.5+ KB


### 长url（长度大于23）

In [6]:
# URLs longer than 23 characters are treated as long (already expanded) links
long_urls = {candidate for candidate in url_set if len(candidate) > 23}

dump(long_urls, "pkl/long_urls[topic=COVID19][type=set].pkl")
# The label previously read "short urls" although this cell counts the long ones.
print("number of long urls: ", len(long_urls))

number of short urls:  268525


In [4]:
# To resume from the saved pickle instead of recomputing, uncomment:
# long_urls = load("pkl/long_urls[topic=COVID19][type=set].pkl")
# Convert the set to a list first: newer pandas releases refuse to build a
# DataFrame directly from a set because it is unordered.
df_long_urls = pd.DataFrame(list(long_urls), columns=['url'])
df_long_urls.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268525 entries, 0 to 268524
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     268525 non-null  object
dtypes: object(1)
memory usage: 2.0+ MB


### 长短URL合并&标记

In [5]:
# Long URLs are used as-is: copy `url` into `reverted_url` so both tables share the same columns
df_long_urls['reverted_url'] = df_long_urls['url']
# Stack the short-URL table and the long-URL table into one frame with a fresh 0..n-1 index
df_urls = pd.concat([df_reverted_urls, df_long_urls], ignore_index=True)

In [6]:
# Extract the domain name from a URL
from tldextract import extract
def get_domain(url:str) -> str:
    """Return ``<domain>.<suffix>`` for ``url`` as split by tldextract.

    Args:
        url: The URL to analyse.

    Returns:
        The extracted domain and suffix joined by a dot. When tldextract finds
        no suffix, the result ends with a trailing dot.
    """
    res = extract(url)
    # Use the named fields instead of positional indexing, which only works
    # while the result object behaves like a tuple.
    return res.domain + '.' + res.suffix

# Derive the domain of every (converted) URL
df_urls['domain'] = df_urls['reverted_url'].apply(get_domain)

In [8]:
# Load the domain lists of the three media categories:
# fake news media, mainstream media, and debunking media (in that order)
fn_df, ms_df, db_df = (
    pd.read_csv(list_path)
    for list_path in (
        "data/FakeNewsDomain_from_iffy.news_23.09.06.csv",
        "data/high_credibility_websites_CoVaxxy_ver2.csv",
        "data/(Merge_ver2)debunking_fact-checking_sites.csv",
    )
)

In [14]:
# Prepare the lookup structures used to label each URL
fn_set = set(fn_df['Domain'])
ms_set = set(ms_df['site'])
db_set = set(db_df['domain'].dropna())
# Escape every entry so characters such as '.' are matched literally rather
# than as regex wildcards; sorting makes the pattern reproducible.
db_re = '|'.join(re.escape(domain) for domain in sorted(db_set))

# Placeholder label columns, one per media category
df_urls['fake_news'] = ''
df_urls['mainstream'] = ''
df_urls['debunking'] = ''

# Label URLs (written as a function so each label can be computed in its own process)
def url_labeling(urls:pd.DataFrame, media, label:str):
    '''
    Flag every URL that belongs to one media category.

    label = 'fake_news', 'mainstream' or 'debunking'

    urls:  frame with a 'domain' column (used for 'fake_news' and 'mainstream')
           and a 'reverted_url' column (used for 'debunking').
    media: for 'fake_news' / 'mainstream', a collection of domains tested for
           exact membership; for 'debunking', a regex pattern searched for in
           each 'reverted_url'.

    The boolean result is stored in urls[label] and that column is returned.
    '''
    if label != 'debunking':
        # Exact match of the extracted domain against the given domain collection
        flags = urls['domain'].isin(media)
    else:
        # Pattern search over the full URL; the pattern comes from the `media`
        # argument instead of a module-level global. Missing URLs count as no match.
        flags = urls['reverted_url'].str.contains(media, regex=True, na=False)

    # One whole-column assignment instead of per-row chained assignment
    urls[label] = flags
    return urls[label]

# Compute the three labels in separate worker processes
import multiprocessing as mp
from multiprocessing import Pool
# The context manager guarantees the workers are cleaned up even if submitting
# a task raises; close() + join() still wait for all tasks to finish first.
with Pool(processes=3) as pool:
    res_fn = pool.apply_async(url_labeling, (df_urls, fn_set, 'fake_news'))
    res_ms = pool.apply_async(url_labeling, (df_urls, ms_set, 'mainstream'))
    res_db = pool.apply_async(url_labeling, (df_urls, db_re, 'debunking'))
    pool.close()
    pool.join()

In [None]:
# Put the URL columns and the three label columns side by side, then save the result
url_columns = df_urls[['url', 'reverted_url', 'domain']]
label_columns = [pending.get() for pending in (res_fn, res_ms, res_db)]
df_urls_labeled = pd.concat([url_columns, *label_columns], axis=1)
df_urls_labeled.to_csv("data/labeled_urls[topic=COVID19].csv", index=False)