# Analysis of Spread of PDAF Protest Sentiment on Twitter

## Executive Summary

## Introduction

## The Dataset

## Methods

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import json
import glob
import ast
import re

import networkx as nx
from itertools import combinations
from collections import Counter 
from datetime import datetime

### Data Preparation

In [None]:
# load data paths: every entry directly under data/
# NOTE(review): glob.glob() returns paths in arbitrary (OS-dependent) order and
# the merge cell pairs these paths with hashtags by position - presumably the
# listing order matches the hashtag list on the original machine; confirm.
paths = glob.glob("data/*")
# keep the per-hashtag tweet files, skipping any previously merged output
paths_tweets_to_merge = [path for path in paths if "tweets" in path and "merged" not in path]
# this file also matches "tweets" but is excluded from the merge
# (list.remove raises ValueError if the path is not present)
paths_tweets_to_merge.remove("data/tweets_mpm_list_following.json")
paths_tweets_to_merge

In [None]:
# Merge tweets dataset: read every per-hashtag file, tag its rows with the
# hashtag, keep tweets from the second half of 2013 (or without a date) and
# save the result to data/tweets_merged.json

# hashtag label for each file in paths_tweets_to_merge, paired by position
# NOTE(review): the pairing relies on paths_tweets_to_merge being in exactly
# this order, and zip() silently stops at the shorter of the two lists - confirm
hashtags = ["millionpeoplemarch", "janetnapoles", "onepinoy", "pdafkalampag", "porkbarrel",
            "porkbarrelscam", "protestangbayan", "pdafscam", "scrappork", "stoppork",
            "tayoangboss", "yesconchitacan"]

# read each file once and collect the frames
frames = []
for path, hashtag in zip(paths_tweets_to_merge, hashtags):

    df = pd.read_json(path)
    # add a column for the #hashtag
    df["hashtag"] = hashtag

    frames.append(df)

# merge with a single concat (concatenating inside the loop copies the
# accumulated frame on every iteration) and renumber the rows from 0
if frames:
    df_tweets_merged = pd.concat(frames).reset_index(drop=True)
else:
    df_tweets_merged = pd.DataFrame()

# filter only those tweets in 2013
# rows without a date (NaT) are kept
condition_1 = df_tweets_merged.date.isnull()
# get time after June 1, 2013
condition_2 = df_tweets_merged.date > pd.Timestamp('2013-06-01 00:00:00')
# get time before Jan 2014
condition_3 = df_tweets_merged.date < pd.Timestamp('2014-01-01 00:00:00')

# merge conditions: no date OR (after the start AND before the end)
# the parentheses spell out the grouping that operator precedence already gave
merged_conditions = condition_1 | (condition_2 & condition_3)
df_tweets_merged = df_tweets_merged[merged_conditions]
df_tweets_merged = df_tweets_merged.sort_values(by="date").reset_index(drop=True)

# save to json, one record per tweet
df_tweets_merged.to_json("data/tweets_merged.json", orient="records")

In [None]:
## IMPLEMENT KEY DUPLICATE REMOVER DURING CLEANING?
## Dictionaries automatically remove duplicates

In [None]:
def read_dict_from_file(path):
    """Read a scraped file containing a Python literal and return its value.

    The content is parsed with ``ast.literal_eval`` rather than a JSON parser,
    so the file may hold any Python literal (callers in this notebook expect a
    dictionary).

    Parameters
    ----------
    path : str
        Location of the scraped file.

    Returns
    -------
    object
        The evaluated literal.

    Raises
    ------
    ValueError, SyntaxError
        If the file content is not a valid Python literal.
    """
    # the context manager closes the file handle even when reading fails
    with open(path) as f:
        content = f.read()

    # convert scraped file from string to dictionary
    return ast.literal_eval(content)

In [None]:
# REMOVE BELOW NON-Number/Alphabet characters
# DO THE SAME FOR THE TWITTER PANDAS DATAFRAME

In [None]:
def clean_dict(dirty_dict, name_type="screen_name"):
    """Normalise the names in a scraped ``{user: [profile, ...]}`` dictionary.

    Every key, and the ``name_type`` field of every profile, is reduced to its
    word characters (letters, digits, underscore) and lower-cased.

    Parameters
    ----------
    dirty_dict : dict
        Maps a user name to a list of profile dictionaries in the format
        ``[{"full_name": "XX", "screen_name": "YY"}]``.
    name_type : str
        Profile field to extract, ``"screen_name"`` (default) or ``"full_name"``.

    Returns
    -------
    dict
        Maps each cleaned key to the list of cleaned names. Keys that clean to
        the same string overwrite each other (the last one wins).

    Raises
    ------
    KeyError
        If a profile dictionary has no ``name_type`` field.
    """
    # pattern for cleaning; compiled once, raw string so "\w" reaches the regex engine
    pattern = re.compile(r"[\w]+")

    def normalise(name):
        # keep word characters only and make lower case
        return "".join(pattern.findall(name)).lower()

    new_dict = {}
    for key, value in dirty_dict.items():
        # make key: value pair using the cleaned name of every listed profile
        new_dict[normalise(key)] = [normalise(d[name_type]) for d in value]

    return new_dict

# Analysis

## Preparing Follower and Following Data

In [None]:
# load dirty followers dictionary
path = "data/user_followers.json"
dirty_followers = read_dict_from_file(path)

# load dirty following dictionary
path = "data/user_following.json"
dirty_following = read_dict_from_file(path)

In [None]:
# get clean followers, full_name
clean_followers = clean_dict(dirty_followers, "screen_name")

# get clean following, full_name
clean_following = clean_dict(dirty_following, "screen_name")

In [None]:
# function to combine (get union) dictionaries
def union_dict(a, b):
    """Merge dictionary of lists ``b`` into ``a`` in place and return ``a``.

    Parameters
    ----------
    a : dict
        Dictionary of lists that is updated in place.
    b : dict
        Dictionary of lists merged into ``a``; it is not modified.

    Returns
    -------
    dict
        The same object as ``a``. For a key present in both inputs the value
        is a new list with the unique items of both lists (in no particular
        order, items must be hashable). A key only present in ``b`` is added
        with the very same list object that ``b`` holds.
    """
    # only keys of b can change a, so walk b instead of scanning all of a;
    # this keeps repeated calls on a growing dictionary cheap
    for k, v in b.items():
        if k in a:
            # get union of unique items (remove redundancy)
            a[k] = list(set(a[k] + v))
        else:
            # append keys in b that are not in a
            a[k] = v

    return a

In [None]:
# Test
a = {"a":[1, 2, 5], "b":[2, 3, 4]}
b = {"a":[1, 2, 3, 4], "c":[8, 9]}
union_dict(a, b)

In [None]:
# convert following to followers format
# e.g. {"user_A": ["followed_user_1", "followed_user_2"]}
# turn to: {"followed_user_1": ["user_A"], "followed_user_2": ["user_A"]}
# So network direction goes from user (followed) to follower

def swap_key_val(old_dic):
    """Invert a dictionary of lists.

    ``{"user_A": ["x", "y"]}`` becomes ``{"x": ["user_A"], "y": ["user_A"]}``:
    every item found in a value list becomes a key whose value lists all the
    original keys that contained it.

    Parameters
    ----------
    old_dic : dict
        Maps a key to a list of hashable items.

    Returns
    -------
    dict
        Maps each item to the list of keys of ``old_dic`` that listed it. Each
        key appears at most once per item, in the iteration order of
        ``old_dic``.
    """
    # initiate new dictionary (swapped)
    new_dic = {}
    for key, vals in old_dic.items():
        # dict.fromkeys drops items repeated within one list, so a key is
        # recorded once per item; a single pass avoids re-merging the whole
        # result for every key
        for val in dict.fromkeys(vals):
            new_dic.setdefault(val, []).append(key)
    return new_dic

In [None]:
# Test
a = {"follower_a":["a", "b"], "follower_b":["b", "c", "d"]}
b = swap_key_val(a)
b

In [None]:
a = clean_followers.copy()
b = clean_following.copy()

# get followers data from following data
followers_from_following = swap_key_val(b)

# get union of followers data (scraped) and followers from following data
unified_followers = union_dict(a, followers_from_following)

In [None]:
# save unified_followers to json
with open('data/unified_followers.json', 'w') as f:
    json.dump(unified_followers, f)

In [None]:
# load saved file

In [None]:
# get unified following data
# get following data from followers data

a = clean_followers.copy()
b = clean_following.copy()

following_from_followers = swap_key_val(a)

# get union of followers data (scraped) and followers from following data
unified_following = union_dict(b, following_from_followers)

In [None]:
# load saved file

In [None]:
# save unified_following to json
with open('data/unified_following.json', 'w') as f:
    json.dump(unified_following, f)

In [None]:
# length of scraped following
len(clean_following)

In [None]:
# length of unified following
len(unified_following)

In [None]:
# num of all users:followers scraped
len(unified_followers.keys())

In [None]:
# num in just clean_followers
len(clean_followers.keys())

In [None]:
# num in user:followers from following data
len(followers_from_following.keys())

## Making Followers Graph - Asymmetric

We generate a graph of all the nodes in the followers and following dataset.

In [None]:
# # load Graph of followers
# G = nx.from_dict_of_lists(unified_followers, create_using=nx.DiGraph())
# len(G.nodes)

### Basic Properties

In [None]:
# # out_degrees
# degs = [k for n, k in G.out_degree]
# avg_deg = np.mean(degs)
# min_deg = np.min(degs)
# max_deg = np.max(degs)

# vals = {"avg_degree":[avg_deg], "min_degree":[min_deg], "max_degree":[max_deg]}
# pd.DataFrame.from_dict(vals).T

In [None]:
# # in_degrees
# degs = [k for n, k in G.in_degree]
# avg_deg = np.mean(degs)
# min_deg = np.min(degs)
# max_deg = np.max(degs)

# vals = {"avg_degree":[avg_deg], "min_degree":[min_deg], "max_degree":[max_deg]}
# pd.DataFrame.from_dict(vals).T

In [None]:
# # degrees
# degs = [k for n, k in G.degree]
# avg_deg = np.mean(degs)
# min_deg = np.min(degs)
# max_deg = np.max(degs)

# vals = {"avg_degree":[avg_deg], "min_degree":[min_deg], "max_degree":[max_deg]}
# pd.DataFrame.from_dict(vals).T

In [None]:
# clustering

### Degree Distribution

In [None]:
# # out degree distribution
# degs = [v for k, v in G.out_degree]

# # count
# deg_count = Counter(degs)

# # plot
# plt.loglog(deg_count.keys(), deg_count.values(), 'o', alpha=0.5)

# plt.xlabel("out_degree")
# plt.ylabel("num nodes")
# plt.title("Out degree distribution")

In [None]:
# # in degree distribution
# degs = [v for k, v in G.in_degree]

# # count
# deg_count = Counter(degs)

# # plot
# plt.loglog(deg_count.keys(), deg_count.values(), 'o', alpha=0.5)

# plt.xlabel("in_degree")
# plt.ylabel("num nodes")
# plt.title("In degree distribution")

In [None]:
# # degree distribution
# degs = [v for k, v in G.degree]

# # count
# deg_count = Counter(degs)

# # plot
# plt.loglog(deg_count.keys(), deg_count.values(), 'o', alpha=0.5)

# plt.xlabel("degree")
# plt.ylabel("num nodes")
# plt.title("Degree distribution")

In [None]:
# VISUALIZATION
# # Use spring layout
# pos = nx.spring_layout(G)

# # draw graph
# plt.figure(figsize=(10, 10))
# nx.draw(G, pos=pos, node_size=2, with_labels=False)

## Filtering Followers Subgraph - Symmetric

The function below selects only the nodes that mutually follow each other (regardless of whether they tweeted about the protest or not). a <--> b will be preserved while c --> d will not be.

In [None]:
# def filter_symmetric(matrix, nodes):
#     # get only symmetric
#     matrix_sym = np.multiply(matrix, matrix.T)
    
#     # make diagonal = 0
#     np.fill_diagonal(matrix_sym, 0)
    
#     # get indices with at least one degree in the symmetric network
#     nonzero = np.array(np.sum(matrix_sym, axis=0))
#     inds = list(np.nonzero(nonzero[0])[0])
    


#     # get nodes corresponding to index of nodes in symm network
#     sym_nodes = np.array(nodes)[inds]
    
#     return matrix_sym, sym_nodes

In [None]:
# # TEST
# matrix = np.matrix([[1, 0, 1, 1], [1, 1, 0, 1], [1, 0, 0, 0], [0, 1, 1, 1]])
# nodes = ['a', 'b', 'c', 'd']
# filter_symmetric(matrix, nodes)

In [None]:
# # make adjacency matrix
# matrix = nx.adjacency_matrix(G).todense()

# # get nodes list
# nodes = list(G.nodes)

# # THIS SHOULD BE EQUAL
# matrix.shape, len(nodes)

In [None]:
# # CAN WE REMOVE THIS? SINCE THE IMPLEMENTATION BELOW WORKS AND THIS BREAKS
# # THIS KILLS THE KERNEL
# # filter symmetric adjacency matrix
# sym_matrix, sym_nodes = filter_symmetric(matrix, nodes)

# print("No. of links in asymmetric network:", np.sum(matrix))
# print("Shape of asymmetric network:", matrix.shape)
# print("No. of links in symmetric network:", np.sum(sym_matrix)/2)
# print("Shape of symmetric network:", sym_matrix.shape, len(sym_nodes))

In [None]:
# Filter symmetric given dictionary of followers only
def filter_symmetric_from_dict(dic):
    """Keep only the reciprocated links of a dictionary of lists.

    A link ``k -> i`` (``i`` is listed under ``k``) is kept only when ``k`` is
    also listed under ``i``.

    Parameters
    ----------
    dic : dict
        Maps a user to the list of users linked to it.

    Returns
    -------
    dict
        Maps each user that has at least one reciprocated link to the list of
        those links, in their original order. Users without any reciprocated
        link are left out.
    """
    # define new dictionary
    sym_dic = {}

    # run through keys and values
    for k, v in dic.items():
        for i in v:
            # i has no entry of its own, so the reverse link cannot exist
            if i not in dic:
                continue
            # check if symmetric by looking for the key among the links of i
            if k in dic[i]:
                sym_dic.setdefault(k, []).append(i)
    return sym_dic

In [None]:
a = {'z':['e', 'b'], 'a':['b', 'c', 'd'], 'b':['a', 'z'], 'c':['a'], 'd':['c'], 'e':['z']}
filter_symmetric_from_dict(a)

In [None]:
# # Filter the network for symmetric only 

# # list of users in symmetric network
# user_list = list(unified_followers.keys())

# # filter only users in symmetric network
# symm_followers = filter_symmetric_from_dict(unified_followers)

# # make graph
# H = nx.from_dict_of_lists(symm_followers, create_using=nx.DiGraph())

In [None]:
# # Number of nodes in symmetric network
# len(H.nodes)

### Basic Properties

In [None]:
# degs = [k for n, k in H.degree]
# avg_deg = np.mean(degs)
# min_deg = np.min(degs)
# max_deg = np.max(degs)

# vals = {"avg_degree":[avg_deg], "min_degree":[min_deg], "max_degree":[max_deg]}
# pd.DataFrame.from_dict(vals).T

### Degree Distribution

In [None]:
# # degree distribution
# degs = [v for k, v in H.degree]

# # count
# deg_count = Counter(degs)

# # plot
# plt.loglog(deg_count.keys(), deg_count.values(), 'o', alpha=0.5)

# plt.xlabel("degree")
# plt.ylabel("num nodes")
# plt.title("Degree distribution")

In [None]:
# VISUALIZATION
# # Use spring layout
# pos = nx.spring_layout(H)

# # draw graph
# plt.figure(figsize=(10, 10))
# nx.draw(H, pos=pos, node_size=2, with_labels=True, font_color='b', font_size=10)

## Fraction of Recruited Users Over Time (Nodes)


In [None]:
## CONSIDER RETWEETS - CURRENTLY CODE BELOW REMOVES RETWEETS DUE TO LACK OF DATE

In [None]:
# loading Tweets as dataframe
tweets_0 = pd.read_json("data/tweets_merged.json")

# remove hashtag column to avoid duplicates (some tweets have >1 hashtags)
tweets_0 = tweets_0.drop("hashtag", axis=1)

# drop repeated tweet_id values but keep every row whose tweet_id is null
# (both masks are computed on the full frame, so they share its index)
missing_id = tweets_0['tweet_id'].isnull()
repeated_id = tweets_0.duplicated(subset='tweet_id', keep='first')
tweets_0 = tweets_0[missing_id | ~repeated_id]

# cleaning up user_screen_name: keep word characters only, in lower case
# (same normalisation as clean_dict; a name without any word character
# becomes an empty string)
pattern = re.compile(r"[\w]+")
tweets_0.user_screen_name = tweets_0.user_screen_name.apply(
    lambda x: "".join(pattern.findall(x)).lower())

# convert date to date time
tweets = tweets_0.copy()
tweets.date = pd.to_datetime(tweets_0.date)

# remove date outside 2013; the half-open interval keeps all of December 31
# (rows without a date fail both comparisons and are dropped)
tweets = tweets[(tweets['date'] >= '2013-01-01 00:00:00')
                & (tweets['date'] < '2014-01-01 00:00:00')]

# HOW ABOUT THE RETWEETS

# sort values by date
tweets = tweets.sort_values(by="date")

# drop duplicates in case there are duplicates
tweets = tweets.drop_duplicates(keep="first")

# reset index; the old index is kept as an "index" column, which a later
# cell drops explicitly
tweets = tweets.reset_index()

tweets.head()

In [None]:
# length of dataframe
len(tweets_0)

In [None]:
# length of tweets only
len(tweets_0[np.logical_and(tweets_0.reply == 0, tweets_0.retweet == 0)])

In [None]:
# number of retweets
len(tweets_0[tweets_0.retweet == 1])

In [None]:
# number of replies
len(tweets_0[tweets_0.reply == 1])

In [None]:
# get only the first time a unique user tweet/retweet or replies with relevant #hash
unique_users = tweets.drop_duplicates(subset="user_screen_name", keep="first")

# sort by date
unique_users = unique_users.sort_values(by="date")

# reset index
unique_users = unique_users.reset_index()

# get cumulative distribution, each user adds 1 to the count
n_users = len(unique_users)
count_cumulative = list(range(1, n_users + 1))
# fraction recruited once the i-th user has joined: runs from 1/N up to 1
prop_cumulative = [count / n_users for count in count_cumulative]

# plot the distribution
plt.plot(unique_users.date, prop_cumulative)
plt.xticks(rotation=90)
plt.xlabel("date")
plt.ylabel("$N_a/N$")
plt.title("Fraction of recruited users over time")

In [None]:
# get days with most frequent tweets
unique_users.date.dt.date.value_counts()[:10]

In [None]:
## DO THE SAME ABOVE FOR COMBINATIONS OF TWEETS, RETWEETS, and REPLIES

In [None]:
# number of unique users
# Excluding retweets
len(unique_users)

In [None]:
# Replace the timestamps in the date column by calendar dates (without time)
# NOTE: this overwrites the column, so the cell can only be run once per load
tweets['date'] = tweets['date'].dt.date

# Number of tweets and unique tweets and replies
# No retweets
print("No. of tweets/replies:", len(tweets))
print("No. of unique tweets/replies:", len(tweets.text.unique()))

# Add date to nodes data for filtering say of degrees?

In [None]:
# filter only those in unified_followers dataset
users_list = list(unified_followers.keys())
tweets_2 = tweets[tweets.user_screen_name.isin(
    users_list)].reset_index(drop=True)
tweets_2 = tweets_2.drop("index", axis=1)
tweets_2.head()

In [None]:
# check for similar tweet_id values - there shouldn't be duplicates
# WE CAN REMOVE THIS
# one pass over the column instead of one full-frame scan per row;
# null ids are excluded because they never compare equal to each other
has_repeated_id = tweets.tweet_id.notnull() & tweets.tweet_id.duplicated(keep=False)
if has_repeated_id.any():
    # show all rows sharing a tweet_id next to each other
    display(tweets[has_repeated_id].sort_values(by="tweet_id"))


In [None]:
# No. of filtered tweets and users
len(tweets_2), len(tweets_2.user_screen_name.unique())

## Filtering Only Users that Tweeted the #Hashtag and their Followers (who may or may not have tweeted the #Hashtag)

### Asymmetric Graph

In [None]:
# get list of unique user_screen_name in pandas dataframe
# this is the list that is both in the followers list and in the tweets data
# Not yet concerned with TIME, so we use tweets_0
# tweets, replies, and retweets
final_users = list(tweets_0.user_screen_name.unique())
len(final_users)

In [None]:
# filter user - follower dictionary to include only those selected above
# Get the intersection of user:followers list and the tweets

# some users in the dataframe have not been scraped yet, so they have no
# entry in unified_followers and are skipped
users_followers = {k: unified_followers[k]
                   for k in final_users if k in unified_followers}

# these are the keys only
len(users_followers)

In [None]:
# make a network using the filtered users_followers above
# load Graph of followers
# includes nodes that may not have tweeted about the protest
G = nx.from_dict_of_lists(users_followers, create_using=nx.DiGraph())

# Use spring layout
# pos = nx.spring_layout(G)

len(G.nodes)

#### Basic Properties

In [None]:
# summary statistics of the out degree of every node in G
degs = [degree for _, degree in G.out_degree]
avg_deg, min_deg, max_deg = np.mean(degs), np.min(degs), np.max(degs)

# one row per statistic
vals = {
    "avg_degree": [avg_deg],
    "min_degree": [min_deg],
    "max_degree": [max_deg],
}
pd.DataFrame.from_dict(vals).T

In [None]:
# summary statistics of the in degree of every node in G
degs = [degree for _, degree in G.in_degree]
avg_deg, min_deg, max_deg = np.mean(degs), np.min(degs), np.max(degs)

# one row per statistic
vals = {
    "avg_degree": [avg_deg],
    "min_degree": [min_deg],
    "max_degree": [max_deg],
}
pd.DataFrame.from_dict(vals).T

In [None]:
# summary statistics of the total degree of every node in G
degs = [degree for _, degree in G.degree]
avg_deg, min_deg, max_deg = np.mean(degs), np.min(degs), np.max(degs)

# one row per statistic
vals = {
    "avg_degree": [avg_deg],
    "min_degree": [min_deg],
    "max_degree": [max_deg],
}
pd.DataFrame.from_dict(vals).T

#### Degree Distribution

In [None]:
# out degree distribution
degs = [v for k, v in G.out_degree]

# count
deg_count = Counter(degs)

# plot
plt.loglog(deg_count.keys(), deg_count.values(), 'o', alpha=0.5)

plt.xlabel("out_degree")
plt.ylabel("num nodes")
plt.title("Out degree distribution")

In [None]:
# in degree distribution
degs = [v for k, v in G.in_degree]

# count
deg_count = Counter(degs)

# plot
plt.loglog(deg_count.keys(), deg_count.values(), 'o', alpha=0.5)

plt.xlabel("in_degree")
plt.ylabel("num nodes")
plt.title("In degree distribution")

In [None]:
# degree distribution
degs = [v for k, v in G.degree]

# count
deg_count = Counter(degs)

# plot
plt.loglog(deg_count.keys(), deg_count.values(), 'o', alpha=0.5)

plt.xlabel("degree")
plt.ylabel("num nodes")
plt.title("Degree distribution")

In [None]:
# # Use spring layout
# pos = nx.spring_layout(G)

# # draw graph
# plt.figure(figsize=(8, 8))
# nx.draw(G, pos=pos, node_size=2, with_labels=True)

### Symmetric Graph

In [None]:
# filter only symmetric
sym_users_followers = filter_symmetric_from_dict(users_followers)
len(sym_users_followers)

In [None]:
# load Graph of followers
H = nx.from_dict_of_lists(sym_users_followers)

#### Degree Distribution

In [None]:
# degree distribution
degs = [v for k, v in H.degree]

# count
deg_count = Counter(degs)

# plot
plt.plot(deg_count.keys(), deg_count.values(), 'o', alpha=0.5)

plt.xlabel("degree")
plt.ylabel("num nodes")
plt.title("Degree distribution")

In [None]:
# Use spring layout
pos = nx.spring_layout(H)

# draw graph
plt.figure(figsize=(10, 10))

# draw graph
nx.draw(H, pos=pos, node_size=2, with_labels=False,
        font_color='b', font_size=20)

# WE CAN'T SCRAPE ALL, LOTS OF MISSING LINKS

In [None]:
# top degrees in symmetric networks
ind = np.argsort([v for k, v in H.degree])
np.array([k for k, v in H.degree])[ind][::-1][:5]

## Filter Only the Networks Whose Nodes Tweeted About the #Hashtags

In [None]:
# get only nodes (following and followers) that are in a specific list
def filter_users_from_dic(followers_dic, user_list):
    """Restrict a dictionary of lists to the users in ``user_list``.

    Both sides of every link are filtered: only keys found in ``user_list``
    are kept, and each kept list only retains the items found in
    ``user_list``.

    Parameters
    ----------
    followers_dic : dict
        Maps a user to a list of (hashable) linked users.
    user_list : iterable
        Users to keep.

    Returns
    -------
    dict
        Filtered copy; keys follow the order of ``user_list``. Users of
        ``user_list`` without an entry in ``followers_dic`` are skipped.
    """
    users = list(user_list)
    # a set makes every membership test O(1) instead of a scan of the list
    allowed = set(users)

    # filter user - follower dictionary to include only those in user_list,
    # then select only the values that are in user_list as well
    filtered_dic = {}
    for k in users:
        if k in followers_dic:
            filtered_dic[k] = [i for i in followers_dic[k] if i in allowed]

    return filtered_dic

### Asymmetric Graph

In [None]:
# get list of unique user_screen_name in pandas dataframe
# this is the list that is both in the followers list and in the tweets data
# Not yet concerned with TIME, so we use tweets_0
# tweets, replies, and retweets
final_users = list(tweets_0.user_screen_name.unique())
len(final_users)

In [None]:
# Filter only the nodes in final_users list (Twitter data)
users_followers = filter_users_from_dic(unified_followers, final_users)
len(users_followers)

In [None]:
# make a network using the filtered users_followers above
# load Graph of followers
# includes nodes that may not have tweeted about the protest
G = nx.from_dict_of_lists(users_followers, create_using=nx.DiGraph())

# Use spring layout
# pos = nx.spring_layout(G)

len(G.nodes)

#### Basic Properties

In [None]:
# summary statistics of the out degree of every node in G
degs = [degree for _, degree in G.out_degree]
avg_deg, min_deg, max_deg = np.mean(degs), np.min(degs), np.max(degs)

# one row per statistic
vals = {
    "avg_degree": [avg_deg],
    "min_degree": [min_deg],
    "max_degree": [max_deg],
}
pd.DataFrame.from_dict(vals).T

In [None]:
# summary statistics of the in degree of every node in G
degs = [degree for _, degree in G.in_degree]
avg_deg, min_deg, max_deg = np.mean(degs), np.min(degs), np.max(degs)

# one row per statistic
vals = {
    "avg_degree": [avg_deg],
    "min_degree": [min_deg],
    "max_degree": [max_deg],
}
pd.DataFrame.from_dict(vals).T

In [None]:
# summary statistics of the total degree of every node in G
degs = [degree for _, degree in G.degree]
avg_deg, min_deg, max_deg = np.mean(degs), np.min(degs), np.max(degs)

# one row per statistic
vals = {
    "avg_degree": [avg_deg],
    "min_degree": [min_deg],
    "max_degree": [max_deg],
}
pd.DataFrame.from_dict(vals).T

#### Degree Distribution

In [None]:
# out degree distribution
degs = [v for k, v in G.out_degree]

# count
deg_count = Counter(degs)

# plot
plt.loglog(deg_count.keys(), deg_count.values(), 'o', alpha=0.5)

plt.xlabel("out_degree")
plt.ylabel("num nodes")
plt.title("Out degree distribution")

In [None]:
# in degree distribution
degs = [v for k, v in G.in_degree]

# count
deg_count = Counter(degs)

# plot
plt.loglog(deg_count.keys(), deg_count.values(), 'o', alpha=0.5)

plt.xlabel("in_degree")
plt.ylabel("num nodes")
plt.title("In degree distribution")

In [None]:
# degree distribution
degs = [v for k, v in G.degree]

# count
deg_count = Counter(degs)

# plot
plt.loglog(deg_count.keys(), deg_count.values(), 'o', alpha=0.5)

plt.xlabel("degree")
plt.ylabel("num nodes")
plt.title("Degree distribution")

In [None]:
# # Use spring layout
# pos = nx.spring_layout(G)

# # draw graph
# plt.figure(figsize=(8, 8))
# nx.draw(G, pos=pos, node_size=2, with_labels=True)

### Symmetric Graph

In [None]:
# filter only symmetric
sym_users_followers = filter_symmetric_from_dict(users_followers)
len(sym_users_followers)

In [None]:
# load Graph of followers
H = nx.from_dict_of_lists(sym_users_followers)

#### Degree Distribution

In [None]:
# degree distribution
degs = [v for k, v in H.degree]

# count
deg_count = Counter(degs)

x = np.array(list(deg_count.keys()))
inds = np.argsort(x)
x = x[inds]
y = np.array(list(deg_count.values()))
y = y[inds]

# plot
plt.plot(x, y, '-', alpha=1)

plt.xlabel("degree")
plt.ylabel("num nodes")
plt.title("Degree distribution")

In [None]:
# Use spring layout
pos = nx.spring_layout(H)

# draw graph
plt.figure(figsize=(10, 10))

# draw graph
nx.draw(H, pos=pos, node_size=2, with_labels=False,
        font_color='b', font_size=20)

# WE CAN'T SCRAPE ALL, LOTS OF MISSING LINKS

In [None]:
# top degrees in symmetric networks
ind = np.argsort([v for k, v in H.degree])
np.array([k for k, v in H.degree])[ind][::-1][:5]

## Recruitment Thresholds Distribution

### Overall Threshold Distribution - Asymmetric

In [None]:
## WE DON'T HAVE DATE FOR RETWEETS. HOW CAN WE SOLVE THIS

In [None]:
def get_thresholds(following_counts, unique_users, following_dict):
    """Compute the recruitment threshold ka/k of every activated user.

    ``ka`` is the number of users followed by a user that activated strictly
    before that user did, ``k`` is the number of users it follows.

    Parameters
    ----------
    following_counts : dict
        Maps a user to the number of users it follows (``k``).
    unique_users : pandas.DataFrame
        One row per activated user with the columns ``user_screen_name`` and
        ``date`` (activation date). If a name is repeated, its first row is
        used.
    following_dict : dict
        Maps a user to the list of users it follows; must have an entry for
        every user of ``following_counts`` found in ``unique_users``.

    Returns
    -------
    dict
        Maps each user that is present in ``unique_users`` and has ``k > 0``
        to ``ka / k``.
    """
    # activation date of every user, built once so that each user below only
    # inspects the users it follows instead of rescanning the whole dataframe
    activation_date = {}
    for name, date in zip(unique_users["user_screen_name"], unique_users["date"]):
        activation_date.setdefault(name, date)

    # initiate threshold dict
    thresh_dic = {}

    for user, k in following_counts.items():

        # check if user in user_screen_name in the dataset otherwise continue
        if user not in activation_date:
            continue

        date_activated = activation_date[user]

        # Count the number of friends who activated before the user did, ka
        # (the set counts a followed user once even if it is listed twice)
        # DOUBLE CHECK THIS THIS SHOULDNT BE UNIFIED FOLLOWING
        ka = sum(
            1
            for followed in set(following_dict[user])
            if followed in activation_date
            and activation_date[followed] < date_activated
        )

        # diagnostic: more activated friends than friends means that
        # following_counts and following_dict disagree
        if ka > k:
            print(ka, k)
            print(user)

        # if no connections, remove
        if k == 0:
            continue

        # get threshold and append to dictionary
        thresh_dic[user] = ka / k

    return thresh_dic

In [None]:
# get following counts
following_counts = {k: len(v) for k, v in unified_following.items()}

# unique_users are unfiltered. Symmetric
unique_users = tweets.drop_duplicates(subset="user_screen_name", keep="first")

# show some threshold values
thresholds = get_thresholds(following_counts, unique_users, unified_following)

# display
len(thresholds)

In [None]:
# mean threshold
np.mean([v for k, v in thresholds.items()])

In [None]:
# plot distribution proportion of nodes vs ka/k
thresh = [v for k, v in thresholds.items()]
thresh_counts = Counter(thresh)
x = np.array(list(thresh_counts.keys()))
inds = np.argsort(x)
x_asym = x[inds]
y_asym = np.array(list(thresh_counts.values()))[inds]
plt.semilogy(x_asym, y_asym)
plt.xlabel("ka/k")
plt.ylabel("num nodes")
plt.title("Num nodes vs ka/k")

### Overall Threshold Distribution - Symmetric

In [None]:
# Filter the network for symmetric only
# list of users and who they follow in symmetric network
symm_following = filter_symmetric_from_dict(unified_following)

# make graph
H = nx.from_dict_of_lists(symm_following, create_using=nx.DiGraph())

len(H.nodes)

In [None]:
# get following counts
following_counts = {k: len(v) for k, v in symm_following.items()}

# unique_users are unfiltered. Symmetric
unique_users = unique_users

# show some threshold values
thresholds = get_thresholds(following_counts, unique_users, symm_following)

len(thresholds)

In [None]:
# mean threshold
np.mean([v for k, v in thresholds.items()])

In [None]:
# plot distribution proportion of nodes vs ka/k
thresh = [v for k, v in thresholds.items()]
thresh_counts = Counter(thresh)
x = np.array(list(thresh_counts.keys()))
inds = np.argsort(x)
x_sym = x[inds]
y_sym = np.array(list(thresh_counts.values()))[inds]
plt.semilogy(x_sym, y_sym)
plt.xlabel("ka/k")
plt.ylabel("num nodes")
plt.title("Num nodes vs ka/k")

In [None]:
# CHANGE THIS TO PROPORTIONS

plt.semilogy(x_sym, y_sym, "-", alpha=0., ms=4)
plt.semilogy(x_asym, y_asym, "-", alpha=0., ms=4)

# f, ax = plt.subplots()
# ax.set(xscale="log", yscale="log")

plt.bar(x_asym, y_asym, label="asymmetric", width=0.02, alpha=0.5)
plt.bar(x_sym, y_sym, label="symmetric", width=0.02, alpha=0.5)

plt.legend()
plt.xlabel("ka/k")
plt.ylabel("num nodes")
plt.title("Num nodes vs ka/k")

Assuming that retaining only the symmetric network is like removing influencers and media (with a huge number of followers but few users followed), or retaining groups of friends in the network, the threshold distribution shifts to higher threshold values. It might be that people in the symmetric subgraph needed more pressure to join the protest / be recruited. Media and influencers could have played a role in activating a significant portion of the users (lower required ka/k).

### Distribution of Low, Med, and High Threshold Users Over Time

In [None]:
# TO DO THIS!!!!
# ASYMMETRIC
# get following counts
following_counts = {k: len(v) for k, v in unified_following.items()}

# unique_users are unfiltered. Symmetric
unique_users = tweets.drop_duplicates(subset="user_screen_name", keep="first")

# show some threshold values
thresholds = get_thresholds(following_counts, unique_users, unified_following)

# display
len(thresholds)

In [None]:
# get the date
unique_users_date = unique_users.loc[:, ["user_screen_name", "date"]]
unique_users_date.head()

In [None]:
# get a dataframe of threshold values
df_thresh = pd.DataFrame.from_dict(thresholds, orient="index").reset_index()
df_thresh.columns = ["user_screen_name", "threshold"]
df_thresh.head()

In [None]:
# combine user_screen_name, date, and thresholds
df_thresh_date = pd.merge(unique_users_date, df_thresh,
         left_on="user_screen_name", right_on="user_screen_name")
df_thresh_date.head()

In [None]:
# inclusive upper bounds of the low and medium threshold bands
low = 0.2
med = 0.5

# boolean masks assigning every user to exactly one band
in_low_band = df_thresh_date.threshold <= low
in_med_band = (low < df_thresh_date.threshold) & (df_thresh_date.threshold <= med)
in_high_band = med < df_thresh_date.threshold

# low thresholds: users of the band, then number of rows per activation date
low_thresh = df_thresh_date[in_low_band].reset_index(drop=True)
low_thresh = low_thresh.groupby("date").count().reset_index()

# medium thresholds: users of the band, then number of rows per activation date
med_thresh = df_thresh_date[in_med_band].reset_index(drop=True)
med_thresh = med_thresh.groupby("date").count().reset_index()

# high thresholds: users of the band, then number of rows per activation date
high_thresh = df_thresh_date[in_high_band].reset_index(drop=True)
high_thresh = high_thresh.groupby("date").count().reset_index()

In [None]:
# size of each threshold category
np.sum(low_thresh.threshold), np.sum(med_thresh.threshold), np.sum(high_thresh.threshold)

In [None]:
# plot, for each threshold band, the share of the band's users that
# activated on each date (daily count divided by the band total)

# raw f-strings so that "\l" in the LaTeX "\leq" is not read as an escape
bands = [
    (low_thresh, rf"low $\leq$ {low}"),
    (med_thresh, rf"{low} < med $\leq$ {med}"),
    (high_thresh, f"{med} < high"),
]

for band, label in bands:
    x = band.date
    y = band.threshold / np.sum(band.threshold)
    plt.plot(x, y, label=label)
plt.legend()

plt.xlabel("date")
plt.ylabel("proportion of users who activated")
plt.title("Proportion of users who activated over time for three threshold categories")

In [None]:
# SYMMETRIC

### Threshold Distribution Before and After Protest

In [None]:
# threshold before protest
tweets_before = tweets_0[tweets_0.date < datetime(2013, 8, 26)].reset_index(drop=True)
len(tweets_before)

In [None]:
# get following counts
# asymmetric
following_counts = {k: len(v) for k, v in unified_following.items()}

# unique_users are unfiltered. Symmetric
unique_users = tweets_before.drop_duplicates(subset="user_screen_name", keep="first")

# show some threshold values
thresholds = get_thresholds(following_counts, unique_users, unified_following)

# display
len(thresholds)

In [None]:
# mean threshold
np.mean([v for k, v in thresholds.items()])

In [None]:
# plot distribution proportion of nodes vs ka/k
# proportion of nodes that activated before protest
thresh = [v for k, v in thresholds.items()]
thresh_counts = Counter(thresh)
x = np.array(list(thresh_counts.keys()))
inds = np.argsort(x)
x_asym_before = x[inds]
y_asym_before = np.array(list(thresh_counts.values()))[inds]
y_asym_before = y_asym_before/np.sum(y_asym_before)
plt.semilogy(x_asym_before, y_asym_before)
plt.xlabel("ka/k")
plt.ylabel("proportion of nodes that activated during that time frame")
plt.title("Prop nodes vs ka/k")

In [None]:
# threshold after protest
tweets_after = tweets_0[tweets_0.date >= datetime(2013, 8, 26)].reset_index(drop=True)
len(tweets_after)

In [None]:
# get following counts (k) from the full, asymmetric following data
following_counts = {k: len(v) for k, v in unified_following.items()}

# first row of every user among the tweets dated on or after 2013-08-26
# NOTE(review): a user who already tweeted before 2013-08-26 and tweets again
# afterwards presumably shows up here as activating after the protest, and
# users who only tweeted before it are absent, so they cannot count towards
# ka - confirm this is intended
unique_users = tweets_after.drop_duplicates(subset="user_screen_name", keep="first")

# compute the ka/k threshold of every user in unique_users
thresholds = get_thresholds(following_counts, unique_users, unified_following)

# number of users with a threshold
len(thresholds)

In [None]:
# mean threshold
np.mean([v for k, v in thresholds.items()])

In [None]:
# Distribution of ka/k for the nodes that activated after the protest:
# count how many nodes share each threshold value, sort by threshold and
# normalise the counts so they sum to 1.
thresh = list(thresholds.values())
thresh_counts = Counter(thresh)
x = np.array(list(thresh_counts.keys()))
inds = np.argsort(x)
x_asym_after = x[inds]
y_asym_after = np.array(list(thresh_counts.values()))[inds]
y_asym_after = y_asym_after / np.sum(y_asym_after)

plt.semilogy(x_asym_after, y_asym_after)
plt.xlabel("ka/k")
# y holds proportions (normalised above), not raw node counts, so the labels
# use the same wording as the pre-protest plot.
plt.ylabel("proportion of nodes that activated during that time frame")
plt.title("Prop nodes vs ka/k")

In [None]:
# Compare the before/after threshold distributions in several views.
# Both y arrays hold proportions (each was normalised to sum to 1), so every
# figure shares the same axis meaning.
x_label = "ka/k"
y_label = "proportion of nodes that activated during that time frame"

# 1) connected markers, log-scaled y axis
plt.semilogy(x_asym_before, y_asym_before, 'o-', alpha=0.7, label="before protest")
plt.semilogy(x_asym_after, y_asym_after, 'o-', alpha=0.7, label="after protest")
plt.xlabel(x_label)
plt.ylabel(y_label)
plt.legend()
plt.show()

# 2) markers only, log-scaled y axis
plt.semilogy(x_asym_before, y_asym_before, 'x', alpha=0.7, label="before protest")
plt.semilogy(x_asym_after, y_asym_after, 'o', alpha=0.7, label="after protest")
plt.xlabel(x_label)
plt.ylabel(y_label)
plt.legend()
plt.show()

# 3) markers only, linear y axis
plt.plot(x_asym_before, y_asym_before, 'x', alpha=0.7, label="before protest")
plt.plot(x_asym_after, y_asym_after, 'o', alpha=0.7, label="after protest")
plt.xlabel(x_label)
plt.ylabel(y_label)
plt.legend()
plt.show()

# 4) bars on a log-scaled y axis (log=True replaces the former invisible
#    semilogy call that was only there to switch the axis to log scale)
plt.bar(x_asym_before, y_asym_before, alpha=0.3, width=0.03, log=True,
        label="before protest")
plt.bar(x_asym_after, y_asym_after, alpha=0.3, width=0.03, log=True,
        label="after protest")
plt.xlabel(x_label)
plt.ylabel(y_label)
plt.legend()
plt.show()

# 5) binned histogram. Each distinct ka/k value is weighted by its proportion
#    of nodes, so the bar heights really are "proportion of nodes"; an
#    unweighted histogram of x would count every distinct value once.
#    Shared bin edges keep the two histograms comparable.
bin_edges = np.histogram_bin_edges(
    np.concatenate([x_asym_before, x_asym_after]), bins=20)
plt.hist(x_asym_before, bins=bin_edges, weights=y_asym_before, alpha=0.5,
         label="before protest")
plt.hist(x_asym_after, bins=bin_edges, weights=y_asym_after, alpha=0.5,
         label="after protest")
plt.xlabel(x_label)
plt.ylabel(y_label)
plt.title("Distribution of ka/k before and after protest")
plt.legend()

## Recruitment Bursts

## Distribution of Cascade Size

In [None]:
# How to identify cascade? retweet of same message?
    # Count how many retweeted same message = cascade size?
    # Count how many used the same #hashtag - cascade size?
    # Add k core attributes per node
    # Find K core of who started = seed/leader
    # Associate cascade size with k core of seed/leader

In [None]:
# Cascade size per parent tweet: number of rows with a non-null date that
# reference each parent_tweet_id.
retweets_per_parent = tweets_0.groupby("parent_tweet_id")["date"].count()
cascade_sizes = retweets_per_parent.to_numpy()

# TODO: decide whether the original tweet should be counted as well,
# i.e. cascade_sizes = cascade_sizes + 1

In [None]:
# Distribution of cascade sizes on log-log axes: for every observed size,
# how many cascades (parent tweets) have that size.
size_counts = Counter(cascade_sizes)  # built once, reused for x and y
sorted_sizes = sorted(size_counts.items())
x = np.array([size for size, _ in sorted_sizes])
y = np.array([n_cascades for _, n_cascades in sorted_sizes])

plt.loglog(x, y, marker=".")
# cascade_sizes counts retweets only (the "+ 1" for the original tweet is not
# applied), and each counted item is a cascade, not a node.
plt.xlabel("cascade size (number of retweets)")
plt.ylabel("number of cascades")
plt.title("Distribution of cascade size")

In [None]:
# WE CAN CHECK WHO ARE THE NODES THAT REACHED MANY PEOPLE

## Node Centrality (K Core) vs Cascade Size

In [None]:
# Table of parent_tweet_id -> cascade size (rows with a non-null date that
# reference the parent).
# To be checked in both the asymmetric and the symmetric network: each
# parent_tweet_id is later associated with its original tweeter, whose
# k-core is then looked up.
cascade_df = (
    tweets_0.groupby("parent_tweet_id")["date"]
    .count()
    .rename("cascade")
    .reset_index()
)
cascade_df.head()

In [None]:
# Node count of G (built earlier in the notebook), on which the k-cores of
# the asymmetric network are computed. G includes followers who do not
# follow back and users who did not tweet about the protest.
len(G.nodes)

In [None]:
# Drop every edge that connects a node to itself before the k-core step.
# The edges are collected into a list first, then removed in one call.
self_loops = list(nx.selfloop_edges(G))
G.remove_edges_from(self_loops)

In [None]:
# Node count of G again, displayed as a check alongside the earlier count.
len(G.nodes)

In [None]:
# k-core number of every node in the asymmetric network, including users who
# did not tweet about the protest.
# Result is a dict of the form {name: k_core, name: k_core}.
k_cores_dict = nx.core_number(G)

# TODO: save the k-core as an attribute of each node
len(k_cores_dict)

In [None]:
# One row per user: "index" holds the node name, column 0 its k-core number.
k_core_by_node = pd.DataFrame.from_dict(k_cores_dict, orient="index")
k_core_df = k_core_by_node.reset_index()
len(k_core_df)

In [None]:
# Get the screen name of the author of each parent tweet.
# Look the parent up among ALL tweets: unique_users keeps only each user's
# first tweet (and was last rebuilt from the post-protest window), so merging
# on it would silently drop every cascade seeded by any other tweet.
parent_authors = tweets_0[["tweet_id", "user_screen_name"]].drop_duplicates(
    subset="tweet_id")  # one row per tweet so the merge cannot duplicate cascades
cascade_df_2 = pd.merge(parent_authors, cascade_df, left_on="tweet_id",
                        right_on="parent_tweet_id").loc[:, ["user_screen_name", "cascade"]]
cascade_df_2.head()

In [None]:
# Preview the per-user k-core table built above (columns "index" and 0).
# NOTE(review): this previously read `k_core_users`, a name not defined in the
# surrounding cells — confirm no earlier cell is meant to supply it.
k_core_df.head()

In [None]:
# Merge cascade size with the k-core of the user who posted the parent tweet.
# Uses k_core_df (the k-core table built from nx.core_number above).
# NOTE(review): this previously read `k_core_users`, a name not defined in the
# surrounding cells — confirm no earlier cell is meant to supply it.
# Column 0 is the k-core number, "cascade" the cascade size.
df_cascade_kcore = pd.merge(k_core_df, cascade_df_2, left_on="index",
                            right_on="user_screen_name")[[0, "cascade"]]
df_cascade_kcore.head()

In [None]:
# Number of (k-core, cascade) rows that survived the merge.
len(df_cascade_kcore)

In [None]:
# Scatter of cascade size against the k-core of the seeding user.
# Rows are ordered by k-core (column 0); cascade sizes are divided by the
# number of rows before plotting.
order = np.argsort(df_cascade_kcore[0].to_numpy())
x = df_cascade_kcore[0].iloc[order]
y = df_cascade_kcore["cascade"].iloc[order]

normalised_cascade = y / len(y)
plt.plot(x, normalised_cascade, 'o')
plt.title("cascade vs k-core")
plt.xlabel("k core")
plt.ylabel("$N_c$/$N$")

In [None]:
# Average cascade size for each k-core value.
# groupby sorts its keys by default, so the result is already ordered by
# k-core and needs no extra sorting before drawing the line.
cascade_kcore_mean = df_cascade_kcore.groupby(by=0).mean().reset_index()

x = cascade_kcore_mean[0]
y = cascade_kcore_mean["cascade"]

plt.plot(x, y, '-')
# The title says "average" to tell this plot apart from the raw scatter.
plt.title("average cascade vs k-core")
plt.xlabel("k core")
plt.ylabel("Average cascade size")

The average cascade size correlates positively with the k-core number. This may mean that tweets from users in higher k-cores reach more people than tweets from users in lower k-cores.

In [None]:
# Degree of every node in G, to relate cascade size to degree as was done
# for the k-core.
degrees = {node: degree for node, degree in G.degree}

# One row per user: "index" holds the node name, column 0 its degree.
degree_by_node = pd.DataFrame.from_dict(degrees, orient="index")
degree_df = degree_by_node.reset_index()



In [None]:
# Merge cascade size with the degree of the user who posted the parent tweet.
# Despite its name, df_degree_k_core holds the columns "degree" and "cascade".
degree_with_cascade = degree_df.merge(
    cascade_df_2, left_on="index", right_on="user_screen_name")
df_degree_k_core = degree_with_cascade[[0, "cascade"]].rename(
    columns={0: "degree"})
df_degree_k_core.head()

In [None]:
# Number of (degree, cascade) rows that survived the merge.
len(df_degree_k_core)

In [None]:
# Scatter of cascade size against the degree of the seeding user, on
# log-log axes. Rows are ordered by degree.
order = np.argsort(df_degree_k_core["degree"].to_numpy())
x = df_degree_k_core["degree"].iloc[order]
y = df_degree_k_core["cascade"].iloc[order]

plt.loglog(x, y, 'o')
plt.title("cascade vs degree")
plt.xlabel("degree")
plt.ylabel("cascade size")

In [None]:
# Average cascade size for each degree value, drawn on log-log axes.
df_degree_k_core_mean = df_degree_k_core.groupby("degree", as_index=False).mean()

order = np.argsort(df_degree_k_core_mean["degree"].to_numpy())
x = df_degree_k_core_mean["degree"].iloc[order]
y = df_degree_k_core_mean["cascade"].iloc[order]

# Draw on a fresh figure; loglog switches both axes to log scale.
f, ax = plt.subplots()
ax.loglog(x, y, 'o')
ax.set(
    title="average cascade vs degree",
    xlabel="degree",
    ylabel="Average cascade size",
)