In [76]:
import glob
import json
import pandas as pd
import numpy as np
import datetime
from math import inf

Features to extract :
- Average cost of character (# / @) on the whole dataset
- Average use of characters (@ / #) for each user (N_usages / N_tweets)
- Frequency of tweets published (N_tweets / Delta_t dataset)
- Frequency of follow actions (Delta_friends / Delta_t, if positive)

In [77]:
# Dataset-wide accumulators; they start "empty" and are filled while the
# tweet files are scanned.
min_timestamp, max_timestamp = inf, 0   # running bounds of the tweet timestamps
cost_mention, cost_hashtag = [], []     # one length per @mention / #hashtag seen
N_Tweets = 0                            # tweet counter, initialised to zero

In [78]:
# Folder containing the raw tweet files (*.json).
data_folder = "data/Woldcup2008/Tweet Worldcup/raw"

# glob.glob returns paths in arbitrary, filesystem-dependent order. Sort them so
# that a slice such as files[:500] selects the same files on every run/machine.
files = sorted(glob.glob(f"{data_folder}/*.json"))
print(f"Found {len(files)} files")

Found 2285 files


In [79]:
# Scan the raw tweet files and accumulate:
#   * per-user counters in users_dict (keyed by tweet['user']['id'])
#   * dataset-wide lists of hashtag / mention lengths
#   * the smallest and largest tweet['current_time'] seen
# Each file is read line by line, one JSON document per line.

# Re-initialise every accumulator this cell fills, so that re-running the cell
# on its own starts from a clean state instead of double-counting tweets.
users_dict = {}
cost_mention = []
cost_hashtag = []
min_timestamp = inf
max_timestamp = 0

# Only a subset of the files is processed; keep it in one place so the
# progress message reports the real total.
files_to_process = files[:500]

for i, file in enumerate(files_to_process):

    if i % 100 == 0 and i > 0:
        print(f"Opening file {i}/{len(files_to_process)}")

    with open(file, 'r', encoding='utf-8') as f:
        # Stream the file: no need to hold all of its tweets in memory at once.
        for line in f:
            # A blank line (e.g. a trailing newline) is not a JSON document.
            if not line.strip():
                continue

            tweet = json.loads(line)

            # ---- User data ----

            user_id = tweet['user']['id']

            # Create the user if not already in the dict
            user_stats = users_dict.get(user_id)
            if user_stats is None:
                user_stats = users_dict[user_id] = {
                    'n_tweets' : 0,
                    'max_friends' : 0,
                    'min_friends' : inf,
                    'n_mention' : 0,
                    'n_hashtag' : 0,
                    'total_cost_mention' : 0,
                    'total_cost_hashtag' : 0
                }

            # "Cost" of an entity = number of characters of its text.
            # Computed once and reused for both the user and the global stats.
            mention_costs = [len(mention['screen_name'])
                             for mention in tweet['entities']['user_mentions']]
            hashtag_costs = [len(hashtag['text'])
                             for hashtag in tweet['entities']['hashtags']]

            # Add the tweet to the user count
            user_stats['n_tweets'] += 1
            user_stats['n_mention'] += len(mention_costs)
            user_stats['n_hashtag'] += len(hashtag_costs)
            user_stats['total_cost_mention'] += sum(mention_costs)
            user_stats['total_cost_hashtag'] += sum(hashtag_costs)

            # Update min and max friends if necessary
            n_friends = tweet['user']['friends_count']
            user_stats['max_friends'] = max(user_stats['max_friends'], n_friends)
            user_stats['min_friends'] = min(user_stats['min_friends'], n_friends)

            # ---- Global data ----

            cost_hashtag.extend(hashtag_costs)
            cost_mention.extend(mention_costs)

            # Update min and max timestamp if necessary. These must be two
            # independent checks: with an `elif`, a tweet that sets a new
            # minimum (such as the very first one) would never be considered
            # for the maximum.
            timestamp = tweet['current_time']
            if timestamp < min_timestamp:
                min_timestamp = timestamp
            if timestamp > max_timestamp:
                max_timestamp = timestamp

Opening file 100/2285
Opening file 200/2285
Opening file 300/2285
Opening file 400/2285


Execution time estimation :
- 10 files : 7"
- 100 files : 53"
- 200 files : 1'58"
- 500 files : 5'32"

In [80]:
# Compute the delta time between the first and the last tweet, in milliseconds
# (the timestamps are divided by 1000 before being passed to
# datetime.fromtimestamp, which expects seconds).
delta_t = max_timestamp - min_timestamp

In [81]:
# Summary of the scan: entity counts, then the time span covered by the tweets.
n_hashtags_seen = len(cost_hashtag)
print(f"Total number of # : {n_hashtags_seen}")

n_mentions_seen = len(cost_mention)
print(f"Total number of Mentions (@) : {n_mentions_seen}")

# The timestamps are divided by 1000 to get the seconds fromtimestamp expects.
first_tweet_at = datetime.datetime.fromtimestamp(min_timestamp / 1000)
print(f"First tweet : {first_tweet_at}")

last_tweet_at = datetime.datetime.fromtimestamp(max_timestamp / 1000)
print(f"Last tweet : {last_tweet_at}")

Total number of # : 1638497
Total number of Mentions (@) : 895104
First tweet : 2018-06-14 04:14:25.098000
Last tweet : 2018-06-16 13:43:00.391000


Create dataframe

In [97]:
# Turn the per-user dict into a table: one row per user, with the dict key
# exposed as a regular 'user_id' column instead of the index.
users = (
    pd.DataFrame.from_dict(users_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'user_id'})
)
users.head()

Unnamed: 0,user_id,n_tweets,max_friends,min_friends,n_mention,n_hashtag,total_cost_mention,total_cost_hashtag
0,39084553,1,826,826,1,2,14,18
1,1564678657,1,686,686,0,0,0,0
2,967244973824339968,1,44,44,1,0,9,0
3,4872447178,2,3189,3189,2,3,17,30
4,35628099,1,407,407,2,2,25,11


In [101]:
# Persist the per-user table; index=False leaves the row index out of the file.
users.to_csv(path_or_buf='data/users.csv', index=False)

In [102]:
# Reload the saved table from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer='data/users.csv')
df.head()

Unnamed: 0,user_id,n_tweets,max_friends,min_friends,n_mention,n_hashtag,total_cost_mention,total_cost_hashtag
0,39084553,1,826,826,1,2,14,18
1,1564678657,1,686,686,0,0,0,0
2,967244973824339968,1,44,44,1,0,9,0
3,4872447178,2,3189,3189,2,3,17,30
4,35628099,1,407,407,2,2,25,11
