In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from ipywidgets import interact, interactive, fixed
from ipywidgets import IntSlider
import networkx as nx
import re
import sys
sys.path.append('..')
from transform.read_transform import read_transform

In [2]:
data = read_transform(path_tweets='../data/TW.csv', path_retweets='../data/RT.csv', 
                      join_method='concat', 
                      hashtags_as_list=False, mentions_as_list=False)
print(data.shape)

(44334, 19)


In [3]:
data.head(3)

Unnamed: 0,creation_date,description,entities_hashtags,favourite_count,followers_count,friends_count,full_text,location,mentions,profile_created_at,protected,retweet_count,search_key,tweet_id,user_id,user_name,user_screen_name,is_retweet,RT_of_ID
0,2021-04-01 02:06:11+00:00,I retweet #Anime stuff🔥 Follow for a cookie🍪,"anime, food, animefood, hungry, 백종원, 고든램지",0,5064,1,RT @byOceanEyes: OceanEyes\nonly for anime foo...,Animeland,'byOceanEyes',2018-07-24 18:53:54+00:00,False,115,#food,1377442135604084742,1021830850357473280,RE:AnimeTron,ReAnimetron,True,
1,2021-04-01 02:06:03+00:00,,"Food, FoodPorn, FoodPorn",0,16037,9925,"Might not look as appetizing as other posts, b...",,,2015-08-01 07:11:08+00:00,False,0,#food,1377442100900421632,3303170203,Dining Cooking,DiningCooking,False,
2,2021-04-01 02:05:30+00:00,NYT/#1 int'l bestselling author THOSE WHO SAVE...,"poetry, food, PoetsCafe",0,6090,758,"Come for the #poetry, stay for the #food—or vi...","Boston, MA","'AMIGHTYBLAZE', 'fascicles'",2009-09-02 04:30:11+00:00,False,0,#food,1377441963746721796,70882212,Jenna Blum,Jenna_Blum,False,


## Get all the mentions in a dictionary

In [4]:
tmp = data[['mentions', 'user_screen_name']]
tmp.loc[:,'mentions'] = tmp.loc[:,'mentions'].str.replace("'", "").str.split(', ')
tmp = tmp.fillna(0)
print(tmp.shape)

tmp = tmp.set_index('user_screen_name')
tmp.head(2)

(44334, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Unnamed: 0_level_0,mentions
user_screen_name,Unnamed: 1_level_1
ReAnimetron,[byOceanEyes]
DiningCooking,0


In [5]:
tmp.loc['Greenpeace'] == 0

mentions    True
Name: Greenpeace, dtype: bool

In [6]:
mentions = tmp.to_dict()['mentions']
S = pd.Series(mentions)

In [7]:
import itertools
dict(itertools.islice(mentions.items(), 3))

{'ReAnimetron': ['byOceanEyes'],
 'DiningCooking': 0,
 'Jenna_Blum': ['AMIGHTYBLAZE', 'fascicles']}

## Get all the attributes in a dictionary

In [8]:
tmp = data[['user_screen_name', 'followers_count']].set_index('user_screen_name')
follower_counts = tmp.to_dict()['followers_count']
pd.Series(follower_counts)[:2]

ReAnimetron       5064
DiningCooking    16020
dtype: int64

In [9]:
pd.Series(follower_counts).loc['Greenpeace']

1897589

In [10]:
dict(itertools.islice(follower_counts.items(), 3))

{'ReAnimetron': 5064, 'DiningCooking': 16020, 'Jenna_Blum': 6090}

## Get Retweet count in a dictionary

In [11]:
tmp = data[['user_screen_name', 'retweet_count']].groupby('user_screen_name').mean()
retweet_counts = tmp.to_dict()['retweet_count']
print(pd.Series(retweet_counts)[:2])
len(retweet_counts)

004clinden    3.1
007201_       1.0
dtype: float64


18626

In [12]:
pd.Series(retweet_counts).loc['Greenpeace']

23.0

In [13]:
dict(itertools.islice(retweet_counts.items(), 3))

{'004clinden': 3.1, '007201_': 1.0, '0123Mara': 84.0}

## Get Activity leve of users

In [14]:
tmp = data.drop_duplicates(subset=['tweet_id','user_screen_name'])['user_screen_name'].value_counts()
user_activity = tmp.to_dict()
print(pd.Series(user_activity)[:3])
len(user_activity)

researchmrx       929
CounterIreland    326
WhatsOnOLIO       252
dtype: int64


18626

In [15]:
print(pd.Series(user_activity).loc['Greenpeace'])

1


## Create Graph-File

In [16]:
data[data['user_screen_name'].str.contains('Greenpeace')]['followers_count'].values

array([1897589], dtype=int64)

In [17]:
pd.Series(mentions).loc['Greenpeace']

0

In [18]:
G = nx.DiGraph()

In [None]:
# Add nodes
for key, mention in mentions.items():
    G.add_node(key, followers_count=follower_counts[key])
    if type(mentions) != 0:
        for m in mentions:
            G.add_node(m, followers_count=follower_counts[m])

In [None]:
len(G.nodes())

In [None]:
S = pd.Series(nx.get_node_attributes(G,'followers_count'))

In [None]:
S.loc['Greenpeace']

### Graph with more attributes: followers, retweets, user activity

In [19]:
G = nx.DiGraph()

In [20]:
# Add nodes
iteration = 1
for key, mention in mentions.items():
    sys.stdout.flush()
    sys.stdout.write('\r---- {} of {} Nodes ----'.format(iteration, len(mentions)+1))
    G.add_node(key, followers_count=follower_counts[key], user_activity=user_activity[key], retweet_counts=retweet_counts[key])
    if type(mentions) != 0:
        for m in mentions:
            G.add_node(m, followers_count=follower_counts[m], user_activity=user_activity[m], retweet_counts=retweet_counts[m])
            
    iteration += 1

---- 18626 of 18627 Nodes ----

In [21]:
len(G.nodes())

18626

In [22]:
pd.Series(nx.get_node_attributes(G, 'followers_count'))['researchmrx']

7104

In [23]:
pd.Series(nx.get_node_attributes(G, 'user_activity'))['researchmrx']

929

In [24]:
pd.Series(nx.get_node_attributes(G, 'retweet_counts'))['Greenpeace']

23.0

In [25]:
pd.Series(mentions).loc['Greenpeace']

0

In [26]:
len(mentions)

18626

In [27]:
# Add edges
iteration = 1
for key, val in mentions.items():
    sys.stdout.flush()
    sys.stdout.write('\r---- {} of {} Edges ----'.format(iteration, len(mentions)+1))
    if val != 0:
        for v in val:
            G.add_edge(key, v)
    iteration += 1

---- 12996 of 18627 Edges ----

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



---- 17033 of 18627 Edges ----

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



---- 18626 of 18627 Edges ----

In [28]:
nx.write_gexf(G, path='../data/gephi/many_attr.gexf')

In [None]:
data[data['user_screen_name']]

In [36]:
data[data['user_screen_name'].str.contains('Youtube')]

Unnamed: 0,creation_date,description,entities_hashtags,favourite_count,followers_count,friends_count,full_text,location,mentions,profile_created_at,protected,retweet_count,search_key,tweet_id,user_id,user_name,user_screen_name,is_retweet,RT_of_ID
25135,2021-04-06 19:00:00+00:00,"High-end documentaries on Nature, History, Cul...","GuineaPig, Experiments, AnimalTesting, AnimalL...",0,400,61,Bye Bye Guinea Pig? The Battle to STOP Animal ...,,'YouTube',2013-06-07 10:41:39+00:00,False,0,#AnimalRights,1379509209369296902,1490094044,Free Documentary,FreeYoutubedocs,False,
