In [26]:
#!pip install -r requirements.txt

In [27]:
import json
import pandas as pd
import datetime as dt

In [28]:
def gen_date_ranges(start_date, end_date):
    dates = []
    if start_date.month == end_date.month:
        dates.append([start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d")])
    else:
        data_ranges = pd.date_range(start= start_date, end = end_date, freq="M")
        for date_end in data_ranges:
            date_start = dt.date(date_end.year,date_end.month,1).strftime("%Y-%m-%d")
            date_end = date_end.strftime("%Y-%m-%d")
            dates.append([date_start, date_end])
        dates[-1][1] = end_date.strftime("%Y-%m-%d")
    return dates

In [29]:
def get_nodes_edges_from_json(tweet_json):
    nodes = []
    edges = []
    for batch in tweet_json:
        tweets1 = batch['data']
        tweets2 = batch['includes']['tweets']
        users = batch['includes']['users']
        

        for tweet in tweets1:
            nodes.append({'id':tweet['id'], 'label':tweet['id'], 'type':'tweet', 'text':tweet['text'], 'created_at':tweet['created_at']}) # main tweet id
            #edge tweets
            edges.append({'source':tweet['author_id'], 'target':tweet['id'], 'type':'tweets', 'created_at':tweet['created_at']})

            try:
                for hashtag in tweet['entities']['hashtags']:
                    #node hashtag
                    nodes.append({'id':'#'+hashtag['tag'], 'label':'#'+hashtag['tag'], 'type':'hashtag', 'created_at':tweet['created_at']})
                    #edge has_hashtag
                    edges.append({'source':tweet['id'], 'target':'#'+hashtag['tag'], 'type':'has_hashtag', 'created_at':tweet['created_at']})
            except:
                pass

            try:
                for url in tweet['entities']['urls']:
                    #node url
                    nodes.append({'id':url['url'], 'label':url['url'], 'type':'url', 'created_at':tweet['created_at']})
                    #edge has_url
                    edges.append({'source':tweet['id'], 'target':url['tag'], 'type':'has_url', 'created_at':tweet['created_at']})
            except:
                pass
            
            try:
                for media_key in tweet['attachments']['media_keys']:
                    #node media
                    nodes.append({'id':media_key, 'label':media_key, 'type':'media', 'created_at':tweet['created_at']})
                    #edge has_media
                    edges.append({'source':tweet['id'], 'target':media_key, 'type':'has_media', 'created_at':tweet['created_at']})
            except:
                pass
            try:
                for mentioned_user in tweet['entities']['mentions']:
                    #node user
                    nodes.append({'id':mentioned_user['id'], 'label':'@'+mentioned_user['username'], 'type':'user'})
                    #edge mentions
                    edges.append({'source':tweet['id'], 'target':mentioned_user['id'], 'type':'mentions', 'created_at':tweet['created_at']})
            except:
                pass

            try:
                #node tweet (retweeted and replied_to)
                    #nodes.append({'id':tweet['referenced_tweets'][0]['id'], 'label':tweet['referenced_tweets'][0]['id'], 'type':'tweet'}) we removed it because it has no text
                #edge retweets
                edges.append({'source':tweet['author_id'], 'target':tweet['referenced_tweets'][0]['id'], 'type':tweet['referenced_tweets'][0]['type'], 'created_at':tweet['created_at']})
            except:
                pass
        
        for user in users:
            try:
                nodes.append({'id':user['id'], 'label':'@'+user['username'], 'name':user['name'], 'location':user['location'], 'type':'user'}) # user node
            except: #in case there is no user location
                nodes.append({'id':user['id'], 'label':'@'+user['username'], 'name':user['name'], 'location':None, 'type':'user'}) # user node    
        

        for tweet in tweets2:
            nodes.append({'id':tweet['id'], 'label':tweet['id'], 'type':'tweet', 'text':tweet['text'], 'created_at':tweet['created_at']}) # main tweet id
            edges.append({'source':tweet['author_id'], 'target':tweet['id'], 'type':'tweets', 'created_at':tweet['created_at']})
            try:
                nodes.append({'id':tweet['author_id'],'label':'@'+tweet['author']['username'],'name':tweet['author']['name'], 'type':'user'}) # author id
                #edge tweets
            except:
                pass

            try:
                for hashtag in tweet['entities']['hashtags']:
                    #node hashtag
                    nodes.append({'id':'#'+hashtag['tag'], 'label':'#'+hashtag['tag'], 'type':'hashtag', 'created_at':tweet['created_at']})
                    #edge has_hashtag
                    edges.append({'source':tweet['id'], 'target':'#'+hashtag['tag'], 'type':'has_hashtag', 'created_at':tweet['created_at']})
            except:
                pass

            try:
                for url in tweet['entities']['urls']:
                    #node url
                    nodes.append({'id':url['url'], 'label':url['url'], 'type':'url', 'created_at':tweet['created_at']})
                    #edge has_url
                    edges.append({'source':tweet['id'], 'target':url['tag'], 'type':'has_url', 'created_at':tweet['created_at']})
            except:
                pass
            
            try:
                for media_key in tweet['attachments']['media_keys']:
                    #node media
                    nodes.append({'id':media_key, 'label':media_key, 'type':'media', 'created_at':tweet['created_at']})
                    #edge has_media
                    edges.append({'source':tweet['id'], 'target':media_key, 'type':'has_media', 'created_at':tweet['created_at']})
            except:
                pass
            try:
                for mentioned_user in tweet['entities']['mentions']:
                    #node user
                    nodes.append({'id':mentioned_user['id'], 'label':'@'+mentioned_user['username'], 'type':'user'})
                    #edge mentions
                    edges.append({'source':tweet['id'], 'target':mentioned_user['id'], 'type':'mentions', 'created_at':tweet['created_at']})
            except:
                pass

            try:
                #node tweet (quoted and replied_to)
                    #nodes.append({'id':tweet['referenced_tweets'][0]['id'], 'label':tweet['referenced_tweets'][0]['id'], 'type':'tweet'}) we removed it because it has no text
                #edge retweets
                edges.append({'source':tweet['author_id'], 'target':tweet['referenced_tweets'][0]['id'], 'type':tweet['referenced_tweets'][0]['type'], 'created_at':tweet['created_at']})
            except:
                pass
    return(nodes,edges)

In [30]:
colombian_valid_tweeets_ids = list(pd.read_csv('./../data/texts/colombian_valid_tweets.csv')['Id'])

In [31]:
start_date = dt.date(2014,1,1)
end_date = dt.date(2014,12,31)

nodes = []
edges = []
for range_i in gen_date_ranges(start_date, end_date):
    date_str = range_i[0][0:-3] #range is generated in the format yyyy-mm-dd, date_str has the yyyy-mm format
    
    with open('./../data/monthly_jsons_complete/'+date_str+'.json') as f:
        tweets = json.load(f) 
    
    nodes_i,edges_i = get_nodes_edges_from_json(tweets)
    nodes = nodes + nodes_i; edges = edges+ edges_i

In [32]:
nodes_dt = pd.DataFrame.from_dict(nodes)
edges_dt = pd.DataFrame.from_dict(edges)
nodes_dt.drop_duplicates(subset = 'id', inplace=True)

In [37]:
[id in colombian_valid_tweeets_ids for id in nodes_dt['id']]

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,


In [34]:
nodes_dt = nodes_dt[nodes_dt['id'] in colombian_valid_tweeets_ids] [[a and b for a, b in zip(boo1, boo2)]]
boo1 = edges_dt[edges_dt['source'] in colombian_valid_tweeets_ids]
boo2 = edges_dt[edges_dt['target'] in colombian_valid_tweeets_ids]
edges_dt = edges_dt[[a and b for a, b in zip(boo1, boo2)]]

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
[a and b for a, b in zip(boo1, boo2)]

NameError: name 'boo1' is not defined

In [None]:
with pd.ExcelWriter('./../data/nodes_edges/'+start_date.__str__()[0:-2]+end_date.__str__()[0:-3]+'.xlsx') as writer: 
    edges_dt.to_excel(writer, sheet_name= 'edges',index=False)
    nodes_dt.to_excel(writer, sheet_name= 'nodes',index=False)

KeyboardInterrupt: 