In [10]:
import pandas as pd
import numpy as np
import json
import os
from src.features import generate_features

In [5]:
# Read the visualisation settings from the JSON config file.
with open('./config/viz_params.json') as f:
    viz_params = json.load(f)

# Unpack every setting into its own module-level name; the order of the
# targets on the left matches the order of the keys on the right.
(
    path,
    top_k,
    top_k_fig_path,
    user_hist_path,
    user_hist_zoom_path,
    good_path,
    bad_path,
    good_tags,
    bad_tags,
    maximum_posts,
) = (
    viz_params[key]
    for key in (
        'path',
        'top_k',
        'top_k_fig_path',
        'user_hist_path',
        'user_hist_zoom_path',
        'good_path',
        'bad_path',
        'good_tags',
        'bad_tags',
        'maximum_posts',
    )
)

In [8]:
# Full paths of every entry in `path` whose name contains 'dataset',
# in sorted filename order.
jsons = [
    os.path.join(path, entry)
    for entry in sorted(os.listdir(path))
    if 'dataset' in entry
]

In [31]:
def hashtag_counts(json, misinformation_hashtags):
    """Count hashtag occurrences in one line-delimited JSON file of tweets.

    Parameters
    ----------
    json : str or path-like
        Location of the JSON-lines file, passed to
        ``pd.read_json(..., lines=True)``. The parameter name shadows the
        ``json`` module inside this function; it is kept so that existing
        keyword callers keep working.
    misinformation_hashtags : iterable of str or None
        When not ``None``, only tweets that carry at least one of these
        hashtags are counted. Any iterable is accepted.

    Returns
    -------
    tuple of (pandas.Series, int)
        The number of occurrences of each hashtag (index = hashtag text),
        and the number of tweets that were counted after filtering.
    """
    from itertools import chain

    df = pd.read_json(json, lines=True)

    # One list of hashtag texts per tweet.
    tag_lists = [
        [tag['text'] for tag in entities['hashtags']]
        for entities in df['entities']
    ]

    if misinformation_hashtags is not None:
        wanted = set(misinformation_hashtags)
        tag_lists = [tags for tags in tag_lists if wanted.intersection(tags)]

    # Flatten in linear time. Summing lists is quadratic, and on an empty
    # selection ``Series.sum()`` returns 0, which used to produce a bogus
    # one-row count for the "hashtag" 0.
    flat_tags = list(chain.from_iterable(tag_lists))
    return pd.Series(flat_tags, dtype=object).value_counts(), len(tag_lists)


def user_counts(json, misinformation_hashtags):
    """Count how many posts each user has made in one JSON-lines file.

    Parameters
    ----------
    json : str or path-like
        Location of the JSON-lines file, passed to
        ``pd.read_json(..., lines=True)``.
    misinformation_hashtags : any
        Accepted but not used by this function.

    Returns
    -------
    tuple of (pandas.Series, int)
        Number of posts per ``screen_name``, and the total number of rows
        read from the file.
    """
    tweets = pd.read_json(json, lines=True)
    screen_names = tweets['user'].map(lambda account: account['screen_name'])
    posts_per_user = screen_names.value_counts()
    return posts_per_user, len(tweets)

In [32]:
def count_features(jsons, misinformation_hashtags = None, top_k = None, mode = 'hashtag'):
    """Aggregate hashtag or user counts over several JSON-lines files.

    Parameters
    ----------
    jsons : sequence of str or path-like
        Files to process; each is handed to the per-file counter.
    misinformation_hashtags : iterable of str or None
        Forwarded unchanged to the per-file counter.
    top_k : int or None
        When given, only the first ``top_k`` entries of the ranked result
        are returned. ``None`` (the default) returns every entry.
    mode : {'hashtag', 'user'}
        Selects ``hashtag_counts`` or ``user_counts`` as the per-file counter.

    Returns
    -------
    pandas.Series
        Summed counts divided by the total number of counted tweets, sorted
        in descending order. Empty when ``jsons`` is empty or when no tweet
        was counted.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'hashtag'`` nor ``'user'``.
    """
    # Validate up front: an unknown mode used to leave `method` unbound and
    # fail later with an UnboundLocalError.
    if mode not in ('hashtag', 'user'):
        raise ValueError(f"mode must be 'hashtag' or 'user', got {mode!r}")

    paths = list(jsons)
    if not paths:
        return pd.Series(dtype='float64')

    # Decide whether to count hashtags or users
    if mode == 'hashtag':
        method = hashtag_counts
    else:
        method = user_counts

    total_series = None
    tweet_count = 0
    for json_path in paths:
        vc_series, vc_count = method(json_path, misinformation_hashtags)
        if total_series is None:
            total_series = vc_series
        else:
            # fill_value=0 keeps entries that appear in only one of the two
            total_series = total_series.add(vc_series, fill_value = 0)
        tweet_count += vc_count
        print(f'vc shape {total_series.shape}', end='\r')

    # Nothing was counted: avoid dividing by zero.
    if tweet_count == 0:
        return pd.Series(dtype='float64')

    ranked = total_series.sort_values(ascending=False) / tweet_count
    if top_k is not None:
        ranked = ranked.head(top_k)
    return ranked

In [39]:
# Hashtag rates restricted to tweets carrying at least one marker hashtag.
marker_rate = count_features(
    jsons,
    misinformation_hashtags={"WuhanVirus", "Hydroxychloroquine"},
)

vc shape (801,)

In [40]:
# Hashtag rates over every tweet (no hashtag filter).
baseline_rate = count_features(jsons, misinformation_hashtags=None, mode='hashtag')

vc shape (118286,)

In [45]:
# First 200 entries of the baseline series.
top_200 = baseline_rate.head(200)

In [49]:
# Display the series computed for the marker-hashtag subset.
marker_rate

WuhanVirus            0.528837
Hydroxychloroquine    0.486804
COVID19               0.186706
coronavirus           0.091887
ChinaVirus            0.051808
                        ...   
DragonVirus19         0.000978
DrZev                 0.000978
DownWithTheCCP        0.000978
DE                    0.000978
RFI_En                0.000978
Length: 801, dtype: float64

In [59]:
# One-column frame holding the baseline rate of the top-200 entries.
test = top_200.to_frame(name="baseline")

# Left-join the marker rates onto the baseline index; entries absent from
# marker_rate get a rate of 0. The joined column keeps whatever name the
# marker_rate series carries.
marker_frame = marker_rate.to_frame()
full_df = test.join(marker_frame).fillna(0)

In [61]:
# Marker rate (column 0 of full_df) minus the baseline rate, per entry.
polarity = full_df[0].sub(full_df["baseline"])

In [62]:
# Display the per-entry difference between marker rate and baseline rate.
polarity

COVID19              0.094138
coronavirus          0.053633
Coronavirus          0.013383
Covid19              0.006578
covid19             -0.004386
                       ...   
HapusCOVID19        -0.000141
Paris               -0.000141
QAnon                0.000838
lockdownextension   -0.000140
UPDATE               0.000838
Length: 200, dtype: float64