# Time slice analysis

python=3.7
|topic      |platform   |language   |
|-----------|-----------|-----------|
|POTUS2016  |Twitter    |en         |

In [2]:
import pandas as pd
import numpy as np
import string
import re
from joblib import dump, load
import warnings
warnings.filterwarnings('ignore')  # NOTE(review): silences every warning for the whole notebook, including pandas deprecation and chained-assignment notices

# Show all columns when a DataFrame is printed
pd.set_option('display.max_columns', None)
# Show the full content of each cell instead of truncating it
pd.set_option('display.max_colwidth', None)

## Load debunking community (debunking dataset)

这一步只是为了取出debunking数据集，如果保存有debunking数据集文件就不必像下面一样繁琐。

In [2]:
# Community numbers assigned to each of the three categories
ms_list = [0, 3, 4, 5, 6, 7]    # mainstream
fn_list = [1]                   # fake news / disinformation
db_list = [2]                   # debunking

def communitiy_classifying(communities, label_list, commnity_number_lists):
    '''Merge numbered communities into labelled categories.

    Args:
        communities: container indexable by community number; each item is a
            DataFrame that has an 'indegree' column.
        label_list: category labels, one per entry of commnity_number_lists.
        commnity_number_lists: for each label, the community numbers whose
            DataFrames are concatenated into that category.

    Returns:
        dict mapping each label to one DataFrame, sorted by 'indegree' in
        descending order, with a fresh 0..n-1 index and without any 'index'
        column.

    Raises:
        ValueError: if label_list and commnity_number_lists differ in length.
    '''
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if len(label_list) != len(commnity_number_lists):
        raise ValueError(
            "label_list and commnity_number_lists must have the same length "
            "(got %d and %d)" % (len(label_list), len(commnity_number_lists))
        )

    com_3type = {}
    for label, numbers in zip(label_list, commnity_number_lists):
        merged = pd.concat([communities[no] for no in numbers], axis=0)
        # Sort by in-degree, highest first, and renumber the rows
        merged = merged.sort_values(['indegree'], ascending=False).reset_index(drop=True)
        # Drop a leftover 'index' column if the inputs carry one; frames
        # without it are accepted instead of raising KeyError.
        merged = merged.drop(columns='index', errors='ignore')
        com_3type[label] = merged
    return com_3type

# NOTE(review): joblib.load unpickles the file, so only load .pkl files from a trusted source
communities_topn = load("pkl/communities_topn[info=1core][time=16.10.10-16.12.19][topic=POTUS2016].pkl")
# Merge the numbered communities into the mainstream / fake_news / debunking categories
com_3type = communitiy_classifying(communities_topn, ['mainstream','fake_news','debunking'], [ms_list, fn_list, db_list])

In [3]:
# Load all retweets, keep only those authored by debunking-community users, and save the subset
df = pd.read_csv("data/3media_retweets[topic=POTUS2016][time=16.10.10-16.12.19][lang=en].csv")
debunk_users = set(com_3type['debunking']['Id'])
is_debunk_author = df['author.username'].isin(debunk_users)
df_debunk = df.loc[is_debunk_author]
df_debunk.to_csv("data/debunking_retweets[topic=POTUS2016][debunking=media+keywords].csv")
df_debunk.shape

(26325, 84)

## Text Cleaning

In [5]:
# Strip punctuation, URLs, line breaks and similar noise
def wordopt(text):
    """Lower-case *text* and remove noise that carries no lexical content.

    Steps, in order: literal backslash-n sequences, [bracketed] spans, URLs,
    HTML/XML tags, every non-word character (replaced by a space), remaining
    punctuation (in practice only the underscore), and any "word" that
    contains a digit.

    Tag removal must run before the non-word substitution: once '<' and '>'
    have been turned into spaces there is nothing left for the tag pattern to
    match. Real newlines need no separate step because they are non-word
    characters and already become spaces.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string. Runs of spaces are not collapsed.
    """
    text = text.lower()
    text = re.sub(r'\\n', '', text)                      # literal "\n" (backslash + n) sequences
    text = re.sub(r'\[.*?\]', '', text)                  # spans enclosed in square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)    # URLs
    text = re.sub(r'<.*?>+', '', text)                   # HTML / XML tags
    text = re.sub(r'\W', ' ', text)                      # non-word characters -> space
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # leftover punctuation ('_' is a word char)
    text = re.sub(r'\w*\d\w*', '', text)                 # "words" containing a digit
    return text

# 去掉停用词
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
eng_stopwords = nltk.corpus.stopwords.words("english")
# Frozen copy for O(1) membership tests; the list above is kept unchanged for
# any other code that uses it.
_eng_stopword_set = frozenset(eng_stopwords)
def remove_eng_stopwords(text):
    """Tokenize *text* with NLTK and drop the English stopwords.

    Args:
        text: the string to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept = [word for word in nltk.word_tokenize(text) if word not in _eng_stopword_set]
    return ' '.join(kept)

# 词形还原
# nltk.download('wordnet')
# nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemm = WordNetLemmatizer()
def word_lemmatizer(text):
    """Tokenize *text* with NLTK and lemmatize every token with WordNet.

    Returns the lemmas joined by single spaces.
    """
    lemmas = map(lemm.lemmatize, nltk.word_tokenize(text))
    return ' '.join(lemmas)

# 去除噪声
from nltk.corpus import stopwords
# Extra words added to the stop set built below.
# NOTE(review): most entries look like Albanian stopwords mixed with a few
# English/markup tokens ("the", "to", "nbsp") — confirm this list is intended
# for an English corpus. "t" appears twice; harmless once merged into a set.
Word_STOPWORDS = ["e", "te", "i", "me", "qe", "ne", "nje", "a", "per", "sh", "nga", "ka", "u", "eshte", "dhe", "shih", "nuk",
             "m", "dicka", "ose", "si", "shume", "etj", "se", "pa", "sipas", "s", "t", "dikujt", "dike", "mire", "vet",
             "bej", "ai", "vend", "prej", "ja", "duke", "tjeter", "kur", "ia", "ku", "ta", "keq", "dy", "ben", "bere",
             "behet", "dickaje", "edhe", "madhe", "la", "sa", "gjate", "zakonisht", "pas", "veta", "mbi", "disa", "iu",
             "mos", "c", "para", "dikush", "gje", "be", "pak", "tek", "fare", "beri", "po", "bie", "k", "do", "gjithe",
             "vete", "mund", "kam", "le", "jo", "beje", "tij", "kane", "ishte", "jane", "vjen", "ate", "kete", "neper",
             "cdo", "na", "marre", "merr", "mori", "rri", "deri", "b", "kishte", "mban", "perpara", "tyre", "marr",
             "gjitha", "as", "vetem", "nen", "here", "tjera", "tjeret", "drejt", "qenet", "ndonje", "nese", "jap",
             "merret", "rreth", "lloj", "dot", "saj", "nder", "ndersa", "cila", "veten", "ma", "ndaj", "mes", "ajo",
             "cilen", "por", "ndermjet", "prapa", "mi", "tere", "jam", "ashtu", "kesaj", "tille", "behem", "cilat",
             "kjo", "menjehere", "ca", "je", "aq", "aty", "prane", "ato", "pasur", "qene", "cilin", "teper", "njera",
             "tej", "krejt", "kush", "bejne", "ti", "bene", "midis", "cili", "ende", "keto", "kemi", "sic", "kryer",
             "cilit", "atij", "gjithnje", "andej", "siper", "sikur", "ketej", "ciles", "ky", "papritur", "ua",
             "kryesisht", "gjithcka", "pasi", "kryhet", "mjaft", "ketij", "perbashket", "ata", "atje", "vazhdimisht",
             "kurre", "tone", "keshtu", "une", "sapo", "rralle", "vetes", "ishin", "afert", "tjetren", "ketu", "cfare",
             "to", "anes", "jemi", "asaj", "secila", "kundrejt", "ketyre", "pse", "tilla", "mua", "nepermjet", "cilet",
             "ndryshe", "kishin", "ju", "tani", "atyre", "dic", "yne", "kudo", "sone", "sepse", "cilave", "kem", "ty",
             "t'i", "nbsp", "tha", "re", "the", "jr", "t", "n"]
# Combined stop set: NLTK English stopwords + punctuation marks + the custom word list
punctuation = list(string.punctuation)
text_unknows = Word_STOPWORDS
stop = set(stopwords.words('english')).union(punctuation, text_unknows)

# 去除噪声字符或字符串

from bs4 import BeautifulSoup
def strip_html(text):
    '''Parse text as HTML and return only its text content.'''
    return BeautifulSoup(text, "html.parser").get_text()

def remove_between_square_brackets(text):
    '''Remove every [bracketed] span, brackets included.'''
    return re.sub(r'\[[^]]*\]', '', text)

def remove_urls(text):
    '''Remove every run of non-space characters that starts with "http".

    This function used to be defined under the name
    remove_between_square_brackets as well, which silently replaced the
    bracket remover above.
    '''
    return re.sub(r'http\S+', '', text)

def remove_stopwords(text):
    '''Drop every whitespace-separated token whose lower-cased form is in `stop`.

    Returns the surviving tokens (original casing) joined by single spaces.
    '''
    kept = [token for token in text.split() if token.lower() not in stop]
    return " ".join(kept)

def denoise_text(text):
    '''Strip HTML, bracketed spans, URLs and stopwords from text, in that order.'''
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_urls(text)
    text = remove_stopwords(text)
    return text

# Remove punctuation characters
def punctuation_removal(text):
    """Return *text* with every character found in string.punctuation removed."""
    return ''.join(ch for ch in text if ch not in string.punctuation)

In [6]:
def text_cleaning(text):
    """Run *text* through the full cleaning pipeline and return the result.

    Order: wordopt -> remove_eng_stopwords -> word_lemmatizer -> denoise_text
    -> punctuation_removal.
    """
    pipeline = (
        wordopt,
        remove_eng_stopwords,
        word_lemmatizer,
        denoise_text,
        punctuation_removal,
    )
    for step in pipeline:
        text = step(text)
    return text

# Work on an independent copy: df_debunk is a filtered selection of df, and
# assigning a column to it directly is chained assignment (pandas cannot
# guarantee whether the write reaches a view or a temporary copy).
df_txt = df_debunk.copy()
df_txt['text'] = df_txt['text'].apply(text_cleaning)

## Group by date

In [7]:
# Convert the 'created_at' column to datetime (overwrites the original column)
df_txt['created_at'] = pd.to_datetime(df_txt['created_at'])
# Extract the calendar date from 'created_at'; it is the key of each time slice
df_txt['date'] = df_txt['created_at'].dt.date
# Group the tweets by date: one group per time slice
grouped_df = df_txt.groupby('date')

In [8]:
# In each time slice, aggregate texts for each user.
# Duplicate texts of a user are merged once. dict.fromkeys keeps first-seen
# order, so the joined text is identical from run to run; joining a set()
# yields an order that changes with string hash randomization.
time_slices = dict()
for name, group in grouped_df:
    time_slices[name] = group.groupby(by='author.username').agg(
        text=("text", lambda x: ' '.join(dict.fromkeys(x)))
    )

In [9]:
# Tag each per-day table with its date, stack them into one DataFrame
# ordered by date, then save it as csv
for slice_date, slice_df in time_slices.items():
    slice_df['date'] = slice_date

df_merge_slices = pd.concat(time_slices.values()).sort_values(by='date')
df_merge_slices.to_csv("data/time_slices[topic=POTUS2016][lang=en][media=debunk].csv")

## Toxicity detection

In [4]:
# Load the per-user, per-day texts together with their stored Perspective API responses
perspective_res = pd.read_csv("data/toxicity_of_time_slices[topic=POTUS2016][lang=en][media=debunk].csv")
perspective_res.info()

def get_score_from_json(x):
    """Extract the first score value from a stored API response string.

    Args:
        x: the response text to parse, or a missing value (NaN/None).
            NOTE(review): the pattern expects single-quoted keys, i.e. the
            repr of a Python dict rather than strict JSON — confirm how the
            column was written.

    Returns:
        The number following the first "'score': {'value': " as a float, or
        None when x is missing or contains no such pattern.
    """
    if pd.isna(x):
        return None
    match = re.search("'score': {'value': (.+?),", x)
    # A response without a score used to crash with AttributeError on .group()
    if match is None:
        return None
    return float(match.group(1))

# Parse the toxicity score out of every stored API response
perspective_res['toxicity'] = perspective_res['perspective_api_results'].apply(get_score_from_json)
# Written back to the same file that was read above, now with the 'toxicity' column added
perspective_res.to_csv("data/toxicity_of_time_slices[topic=POTUS2016][lang=en][media=debunk].csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13683 entries, 0 to 13682
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   author.username          13683 non-null  object 
 1   text                     13683 non-null  object 
 2   date                     13683 non-null  object 
 3   perspective_api_results  13683 non-null  object 
 4   toxicity                 13683 non-null  float64
dtypes: float64(1), object(4)
memory usage: 534.6+ KB


## Sentiment detection

In [5]:
# Load the LIWC dictionary
import liwc
liwcPath = r'data/LIWC2015_English.dic'
# parse maps a token to the LIWC categories it matches; category_names lists the dictionary's categories
parse, category_names = liwc.load_token_parser(liwcPath)

# Analyse each user's text with LIWC
from sklearn.feature_extraction.text import TfidfVectorizer

def liwc_analyse_ver2(text, categories=('positive','negative','affect')):
    """Score one text on the selected LIWC categories.

    Each alphanumeric token of *text* is mapped to its LIWC categories with
    the module-level `parse`. The resulting category names form a
    one-document corpus that is vectorized with TF-IDF.

    Args:
        text: the string to analyse.
        categories: names of the TF-IDF features to report.

    Returns:
        A pandas Series indexed by *categories*. A category that does not
        occur in the text is None. All values are None when vectorization
        fails with ValueError (e.g. no token matched any category, so the
        vocabulary is empty).
    """
    tokens = re.sub('[^a-zA-Z0-9]', ' ', text).split()
    matched = [category for token in tokens for category in parse(token)]
    corpus = [' '.join(matched)]

    # TF-IDF
    try:
        vectorizer = TfidfVectorizer(max_features=5000)
        weights = vectorizer.fit_transform(corpus)

        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement when available and fall back on older releases.
        if hasattr(vectorizer, 'get_feature_names_out'):
            features = vectorizer.get_feature_names_out()
        else:
            features = vectorizer.get_feature_names()
        df = pd.DataFrame(weights.toarray(), columns=features)
        result_df = pd.DataFrame({col: df.get(col) for col in categories})
    except ValueError:
        # Raised for an empty vocabulary and when none of the requested
        # categories is present (all-scalar DataFrame construction). Other
        # errors are real bugs and are no longer swallowed.
        result_df = pd.DataFrame({k: [None] for k in categories})

    return result_df.T[0]

In [6]:
# Run the LIWC analysis in parallel
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=10)
selected_categories = ['positive','negative','affect']
# liwc_analyse_ver2 is called with its default categories, which match selected_categories
perspective_res.loc[:, selected_categories] = perspective_res['text'].parallel_apply(liwc_analyse_ver2)
# NOTE(review): this is the same path the merged time slices were saved to earlier, so that file is overwritten
perspective_res.to_csv("data/time_slices[topic=POTUS2016][lang=en][media=debunk].csv")

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1369), Label(value='0 / 1369'))), …

## Summarize daily data

In [3]:
def calculate_statistics(df:pd.DataFrame):
    """Summarize the four score columns per day.

    Args:
        df: table with the columns 'date', 'author.username', 'positive',
            'negative', 'affect' and 'toxicity'. NOTE: its 'date' column is
            converted to datetime in place, as before.

    Returns:
        A DataFrame indexed by date with 18 columns: 'date', 'user_count',
        then for each score its '<score>_none_count', '<score>_mean',
        '<score>_median' and '<score>_mean_no_extreme' (mean after dropping
        values outside the 1.5*IQR fences), grouped by statistic.
    """
    scores = ['positive', 'negative', 'affect', 'toxicity']

    # Convert the date column to a datetime type
    df['date'] = pd.to_datetime(df['date'])
    # Group the rows by day; the score statistics only ever need the score columns
    df_groupby_date = df.groupby('date')
    daily_scores = df_groupby_date[scores]

    # Number of distinct users per day
    daily_user_count = df_groupby_date['author.username'].nunique()
    # Number of missing values per score per day
    score_none_count = daily_scores.apply(lambda x: x.isnull().sum())
    # Daily mean and median of each score
    daily_mean = daily_scores.mean()
    daily_median = daily_scores.median()

    def mean_no_extreme(group:pd.DataFrame):
        """Mean of each score column after blanking values outside the 1.5*IQR fences."""
        q1 = group.quantile(0.25)
        q3 = group.quantile(0.75)
        iqr = q3 - q1
        is_extreme = (group < (q1 - 1.5 * iqr)) | (group > (q3 + 1.5 * iqr))
        return group.mask(is_extreme).mean()

    # Applied to the score columns only, so the grouping column and the
    # username column are never touched by the outlier mask.
    daily_mean_no_extreme = daily_scores.apply(mean_no_extreme)

    # Assemble the output columns in a fixed order
    columns = {
        'date': daily_user_count.index,
        'user_count': daily_user_count.values,
    }
    for score in scores:
        # Kept as Series so the resulting frame stays indexed by date
        columns[f'{score}_none_count'] = score_none_count[score]
    for suffix, table in (
        ('mean', daily_mean),
        ('median', daily_median),
        ('mean_no_extreme', daily_mean_no_extreme),
    ):
        for score in scores:
            columns[f'{score}_{suffix}'] = table[score].values

    daily_data = pd.DataFrame(columns)

    return daily_data

In [4]:
# Load the per-user daily scores, summarize them per day and save the summary
data = pd.read_csv("data/time_slices[topic=POTUS2016][lang=en][media=debunk].csv")
daily_statistics = calculate_statistics(data)
daily_statistics.to_csv("data/daily_statistics[topic=POTUS2016][lang=en][media=debunk].csv", index=False)
daily_statistics.head(5)

Unnamed: 0_level_0,date,user_count,positive_none_count,negative_none_count,affect_none_count,toxicity_none_count,positive_mean,negative_mean,affect_mean,toxicity_mean,positive_median,negative_median,affect_median,toxicity_median,positive_mean_no_extreme,negative_mean_no_extreme,affect_mean_no_extreme,toxicity_mean_no_extreme
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2016-10-10,2016-10-10,375,169,248,138,0,0.09229,0.074126,0.240623,0.097578,0.09167,0.060858,0.243432,0.051069,0.09229,0.065993,0.234832,0.078819
2016-10-11,2016-10-11,369,240,244,167,0,0.086339,0.116352,0.256076,0.191134,0.089087,0.089803,0.243207,0.169385,0.070577,0.116352,0.224072,0.180552
2016-10-12,2016-10-12,85,58,62,49,0,0.093911,0.101019,0.269947,0.138629,0.09167,0.089803,0.261744,0.095684,0.093911,0.101019,0.258072,0.126182
2016-10-13,2016-10-13,20,16,10,9,0,0.08211,0.155778,0.342949,0.128621,0.060858,0.174078,0.348155,0.081673,0.060858,0.155778,0.342949,0.095477
2016-10-14,2016-10-14,21,17,11,8,0,0.091512,0.12888,0.254591,0.242763,0.09167,0.148257,0.280056,0.305024,0.107789,0.146836,0.254591,0.242763


In [5]:
# Column dtypes and non-null counts of the daily summary
daily_statistics.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 71 entries, 2016-10-10 to 2016-12-19
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      71 non-null     datetime64[ns]
 1   user_count                71 non-null     int64         
 2   positive_none_count       71 non-null     int64         
 3   negative_none_count       71 non-null     int64         
 4   affect_none_count         71 non-null     int64         
 5   toxicity_none_count       71 non-null     int64         
 6   positive_mean             64 non-null     float64       
 7   negative_mean             67 non-null     float64       
 8   affect_mean               68 non-null     float64       
 9   toxicity_mean             71 non-null     float64       
 10  positive_median           64 non-null     float64       
 11  negative_median           67 non-null     float64       
 12  affe

In [10]:
# Calculate days from the earliest date to the latest (both ends included).
# max()/min() do not depend on row order and avoid integer keys such as [-1]
# on a date-indexed Series, whose positional fallback is deprecated in pandas.
delta = daily_statistics['date'].max() - daily_statistics['date'].min()
print(delta.days + 1)

71
