In [14]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm 
from graph_tool.all import *
import json
tqdm.pandas()
from IPython.display import Image
from datetime import datetime, timedelta

In [10]:
def tweet_ID_to_User(ID_dict, ID):
    """
    Function to return the user name from Tweeter user ID
    using the ID_dictionary; 
    Returns NaN if the user is not found
    """
    try:
        return ID_dict[ID]
    except:
        return np.nan

In [11]:
def df_tweets_processor(df_tweets, period_duration_days = 30):
    # Noting the bounds of the time index
    start_date = pd.to_datetime('2016-01-01')
    curr_date = start_date
    end_date = pd.to_datetime('2020-12-31')
    
    # List to store the Time varying information
    weekly_tweets_info = []
    
    # Traversing through time; time-step defined within the loop
    while(curr_date <= end_date):
        period_end = curr_date + timedelta(days = period_duration_days)
      

        df_temp = df_tweets.loc[curr_date:period_end]
        
        if df_temp.shape[0] > 0 :
            std_centrality, links_num, interactions = tweets_sub_processor(df_temp)
            weekly_tweets_info.append([period_end, std_centrality, links_num, interactions])
        else:
            weekly_tweets_info.append([period_end, np.NaN, np.NaN, np.NaN])

        curr_date = period_end
        
        weekly_tweets_df = pd.DataFrame(weekly_tweets_info, columns = ['date', 'stddev_centrality', 'avg_links', 'interactions'])
        weekly_tweets_df.set_index('date', inplace = True)
        
    return weekly_tweets_df


In [12]:
def tweets_sub_processor(df_tweets_):
    
    # Selecting a subset of the data to extract and store the Tweets author/originator name
    df_IDs_Users_Tweetsid = df_tweets_[['id','user','conversationId']]

    # Converting the Tweet ID and Conversation ID to string type
    df_IDs_Users_Tweetsid.loc[:,'id'] = df_IDs_Users_Tweetsid.loc[:,'id'].astype(str)
    df_IDs_Users_Tweetsid.loc[:,'conversationId'] = df_IDs_Users_Tweetsid.loc[:,'conversationId'].astype(str)

    # Extracting the user name and storing it under username 
    df_IDs_Users_Tweetsid.loc[:,'user_name'] = df_IDs_Users_Tweetsid.loc[:,'user'].apply(lambda user_dict: user_dict['username'])
    
    # Extracting User_names and tweetIDs seperately as a list
    user_list = list(df_IDs_Users_Tweetsid['user_name'].values)
    tweets_ID_list = list(df_IDs_Users_Tweetsid['id'].astype(str).values)

    # Constructing a dictionary of tweet IDs and user names
    dict_tweetsid_username = {}
    for i in range(len(user_list)):
        dict_tweetsid_username[tweets_ID_list[i]] = user_list[i]

    # Extracting and storing the tweets originators/ orginal authors
    df_IDs_Users_Tweetsid.loc[:,'tweet_author'] = df_IDs_Users_Tweetsid.loc[:,'conversationId'].progress_apply(lambda conversationId: tweet_ID_to_User(dict_tweetsid_username, conversationId))

    # Removing NaN containing rows
    df_IDs_Users_Tweetsid = df_IDs_Users_Tweetsid.dropna(how='any', axis=0).reset_index(drop=True)

    # Capturing the unique users list
    unique_user = list(np.unique(df_IDs_Users_Tweetsid.user_name))
    
    # Forming a matrix of # of unique_users x # of unique_users
    matrix = np.zeros((len(unique_user),len(unique_user)))

    # Accounting the tweet interactions from the original authors and replying audience
    for i in tqdm(range(df_IDs_Users_Tweetsid.shape[0])):
        try:
            matrix[unique_user.index(df_IDs_Users_Tweetsid.loc[i,'tweet_author']), unique_user.index(df_IDs_Users_Tweetsid.loc[i,'user_name'])] += 1
        except:
            pass
        
    # Number of interaction between 2 different people
    interactions = np.sum(matrix - np.diag(np.diag(matrix)))
    
    # Number of unique users
    unique_users = matrix.shape[0]
    
    # Graph Library object instantiation
    g1 = Graph()

    # Dict to store the vertices for all the unique users
    dict_vertex = {}
    for user in unique_user:
        dict_vertex[user] = g1.add_vertex()

    # Removing duplicate entries of tweets/tweet_replies
    dict_edges = {}
    df_IDs_Users_Tweets_unique = df_IDs_Users_Tweetsid.drop_duplicates(subset=['user_name','tweet_author'], keep='first').reset_index(drop=True)

    for i in range(df_IDs_Users_Tweets_unique.shape[0]):

        # Neglecting the initial tweets where the originator or original author makes the tweets; Adding Edges for remaining tweets interactions
        if df_IDs_Users_Tweets_unique.loc[i,'user_name'] != df_IDs_Users_Tweets_unique.loc[i,'tweet_author']:
            dict_edges[i] = g1.add_edge(dict_vertex[df_IDs_Users_Tweets_unique.loc[i,'user_name']], dict_vertex[df_IDs_Users_Tweets_unique.loc[i,'tweet_author']])
    
    # PageRank Centrality measure of the graph
    pr = pagerank(g1)

    # Sorting the most influencial users
    df_page_rank = pd.DataFrame(list(pr.a), columns = ['page_rank score'], index = unique_user).sort_values(ascending= False, by='page_rank score')
    
    # Std Dev of centrality
    std_centrality = df_page_rank.std(axis = 0).values[0]
    
    # Number of links
    links_num = np.sum(np.sign((matrix - np.diag(np.diag(matrix))).ravel()))
  
    
    return std_centrality, links_num, interactions

In [41]:
df_tweets = pd.read_pickle('gs://afp_bucket/SP_500/MSFT.pkl')

In [42]:
df_tweets.shape

(573870, 22)

In [43]:
df_tweets.set_index('date', inplace= True)
df_tweets.sort_index(inplace= True)
df_info = df_tweets_processor(df_tweets)
df_info.to_csv(r'./Sample_tests_files/MSFT_net_results_monthly.csv')

  start_slice, end_slice = self.slice_locs(start, end, step=step, kind=kind)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
100%|██████████| 6621/6621 [00:00<00:00, 679184.28it/s]
100%|██████████| 6480/6480 [00:00<00:00, 24001.06it/s]
  start_slice, end_slice = self.slice_locs(start, end, step=step, kind=kind)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the ca

In [44]:
df_info

Unnamed: 0_level_0,stddev_centrality,avg_links,interactions
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-01-31,0.000065,16.0,16.0
2016-03-01,0.000068,7.0,8.0
2016-03-31,0.000065,8.0,8.0
2016-04-30,0.000055,14.0,18.0
2016-05-30,0.000077,13.0,13.0
...,...,...,...
2020-09-06,0.000164,501.0,553.0
2020-10-06,0.000180,330.0,365.0
2020-11-05,0.000085,247.0,267.0
2020-12-05,0.000154,165.0,171.0
