# Hashtag Polarities

In [86]:
import pandas as pd
import numpy as np
import json
import os
from src.features import generate_features
import tweepy
import datetime

In [2]:
with open('./config/viz_params.json') as f:
    viz_params = json.load(f)

# Cfg variables
path = viz_params['path']
top_k = viz_params['top_k']
top_k_fig_path = viz_params['top_k_fig_path']
user_hist_path = viz_params['user_hist_path']
user_hist_zoom_path = viz_params['user_hist_zoom_path']
good_path = viz_params['good_path']
bad_path = viz_params['bad_path']
good_tags = viz_params['good_tags']
bad_tags = viz_params['bad_tags']
maximum_posts = viz_params['maximum_posts']
api_keys = viz_params['api_keys']

In [3]:
jsons = [os.path.join(path, name) for name in sorted(os.listdir(path)) if 'dataset' in name]

In [4]:
# Count the number of occurences of every hashtag in the JSON
def hashtag_counts(json, misinformation_hashtags):
    df = pd.read_json(json, lines = True)
    ht = df['entities'].apply(lambda e: [x['text'] for x in e['hashtags']])
       
    if not misinformation_hashtags is None:
        ht = ht.apply(lambda x: x if set(x) & misinformation_hashtags else None).dropna()
    return pd.Series(ht.sum()).value_counts(), len(ht)


# Count the number of posts every user has made in the JSON
def user_counts(json, misinformation_hashtags):
    df = pd.read_json(json, lines=True)
    us = df['user'].apply(lambda x: x['screen_name'])
    return us.value_counts(), len(df)

In [5]:
# Decide whether to count hashtags or users
def count_features(jsons, misinformation_hashtags = None, top_k = None, mode = 'hashtag'):
    # Decide whether to count hashtags or users
    if mode == 'hashtag':
        method = hashtag_counts
    elif mode == 'user':
        method = user_counts
        
    # Compile count of first JSON in list
    total_series, tweet_count = method(jsons[0], misinformation_hashtags)
    print(f'vc shape {total_series.shape}', end='\r')
    
    # Append counts to every subsequent JSON
    for json in jsons[1:]:
        vc_series, vc_count = method(json, misinformation_hashtags)
        total_series = total_series.add(vc_series, fill_value = 0)
        tweet_count += vc_count
        print(f'vc shape {total_series.shape}', end='\r')
        
    # Return the top users/hashtags in all of the data
    return total_series.sort_values().sort_values(ascending=False)/tweet_count

In [6]:
marker_rate = count_features(jsons, set(["WuhanVirus", "Hydroxychloroquine"]))

vc shape (801,)

In [7]:
baseline_rate = count_features(jsons)

vc shape (118286,)

In [8]:
top_200 = baseline_rate.iloc[:200]

In [9]:
marker_rate

WuhanVirus            0.528837
Hydroxychloroquine    0.486804
COVID19               0.186706
coronavirus           0.091887
ChinaVirus            0.051808
                        ...   
DragonVirus19         0.000978
DrZev                 0.000978
DownWithTheCCP        0.000978
DE                    0.000978
RFI_En                0.000978
Length: 801, dtype: float64

In [10]:
test = pd.DataFrame(top_200)
test.columns = ["baseline"]
full_df = test.join(pd.DataFrame(marker_rate)).fillna(0)

In [11]:
polarity = full_df[0] - full_df["baseline"]

In [12]:
polarity = polarity/full_df['baseline']

In [13]:
polarity.sort_values().iloc[:-2]

Ecuador             -1.000000
Health              -1.000000
Video               -1.000000
NewYork             -1.000000
YoMeQuedoEnCasa     -1.000000
                      ...    
HongKong            49.401944
CCP                124.461046
ChineseVirus       212.622862
CCPVirus           215.299841
ChinaVirus         327.033041
Length: 198, dtype: float64

# User polarities

In [14]:
with open(api_keys) as f:
    keys = json.load(f)

    consumer_key = keys['consumer_key']
    consumer_secret = keys['consumer_secret']
    access_token = keys['access_token']
    access_token_secret = keys['access_token_secret']
        

In [15]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

In [16]:
api = tweepy.API(auth)

In [19]:
def get_all_users(jsons):
    all_users = set()
    for json in jsons:
        df = pd.read_json(json, lines=True)
        us = set(df['user'].apply(lambda x: x['screen_name']))
        all_users = all_users.union(us)
        print(len(all_users), end='\r')
    return pd.Series(list(all_users))

In [20]:
all_users = get_all_users(jsons)

774198

In [21]:
all_users

0         BoysandGirlsAid
1            BuzzDramatic
2           SeamasterGMT1
3           PalleShravani
4                 treyn43
               ...       
774193        PrettyJaden
774194       TreforJones2
774195      chukwujekwu90
774196           _TheMann
774197    umakantsingh_IN
Length: 774198, dtype: object

In [115]:
def normalized_user_polarity(x):
    tweets = []
    page = 1
    at_start_date = False
    # Get data FROM 3-1-20 TO 10-1-20
    try:
        while not at_start_date:
            new_tweets = api.user_timeline(screen_name=x, page=page)
            if not new_tweets:
                return None
            for tweet in new_tweets:
                if datetime.datetime(2020, 10, 1) <= tweet.created_at:
                    pass
                elif datetime.datetime(2020, 3, 1) > tweet.created_at:
                    at_start_date = True
                    break
                else:
                    tweets.append(tweet)
            page += 1
    except:
        return None
    
    # Get all hashtags
    ht = [h['text'] for tweet in tweets for h in tweet._json['entities']['hashtags']]
    user_pol = 0
    ht_count = 0
    
    # Add all hashtag polarities
    for hashtag in ht:
        try:
            user_pol += polarity.loc[hashtag]
            ht_count += 1
        except:
            continue
    if ht_count == 0:
        return None
    return user_pol / ht_count

In [107]:
from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [None]:
user_polarities = all_users.progress_apply(normalized_user_polarity)

  0%|          | 2688/774198 [09:37<5997:28:43, 27.99s/it]