In [104]:
import datetime
import glob
import json
import os
from math import inf

import numpy as np
import pandas as pd

## 1. Extract data about each user

Features to extract :
- Average cost of character (# / @) on the whole dataset
- Average use of characters (@ / #) for each user (N_usages / N_tweets)
- Frequency of tweets published (N_tweets / Delta_t dataset)
- Frequency of follow actions (Delta_friends / Delta_t, if positive)

In [105]:
# Dataset-wide accumulators, filled while the raw files are scanned.
# The timestamp bounds start at sentinel values so that the first comparison
# against a real timestamp replaces them.
min_timestamp, max_timestamp = inf, 0
# One entry per mention / hashtag occurrence (character cost of that entity).
cost_mention, cost_hashtag = [], []
# Tweet counter; it is only initialised here and not updated in the cells shown.
N_Tweets = 0

In [106]:
# Locate the raw tweet dumps (one .json file per dump).
data_folder = "../data/Woldcup2008/Tweet Worldcup/raw"
# glob returns paths in arbitrary order. Sorting keeps the processing order,
# and therefore the row order of the user table built from it, identical
# between runs and machines.
files = sorted(glob.glob(f"{data_folder}/*.json"))
print(f"Found {len(files)} files")

Found 2285 files


In [107]:
# Single pass over the raw files: builds one counter record per user in
# `users_dict` and feeds the dataset-wide accumulators (cost_hashtag,
# cost_mention, min_timestamp, max_timestamp).
# NOTE: those accumulators are created in an earlier cell; re-run that cell
# before re-running this one, otherwise the lists keep growing.
users_dict = {}
n_files = len(files)

for i, file in enumerate(files[:n_files]) :

    print(f"Opening file {i+1}/{n_files}", end="\r")

    # Each line of a file is one JSON-encoded tweet. Tweets are processed as
    # they are read, so a whole file never has to be held in memory.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            tweet = json.loads(line)

            author = tweet['user']
            entities = tweet['entities']
            tweet_time = tweet['current_time']

            # User data

            user_id = author['id']

            # Create the user record on first sight. Key order matters: it
            # becomes the column order of the DataFrame built from this dict.
            stats = users_dict.get(user_id)
            if stats is None:
                stats = users_dict[user_id] = {
                    'n_tweets' : 0,
                    'n_replies' : 0,
                    'n_quotes' : 0,
                    'n_retweets' : 0,
                    'max_friends' : 0,
                    'min_friends' : inf,
                    'max_followers' : 0,
                    'n_mentions' : 0,
                    'n_hashtags' : 0,
                    'n_urls' : 0,
                    'n_out_urls' : 0,
                    'first_active' : inf,
                    'last_active' : 0,
                }

            # Add the tweet to the user counters
            stats['n_tweets'] += 1
            stats['n_replies'] += int(tweet['in_reply_to_user_id'] is not None)
            stats['n_quotes'] += int(tweet['is_quote_status'])
            stats['n_retweets'] += int('retweeted_status' in tweet)
            stats['n_mentions'] += len(entities['user_mentions'])
            stats['n_hashtags'] += len(entities['hashtags'])
            stats['n_urls'] += len(entities['urls'])

            # Count links that leave twitter.com. The previous version popped
            # from the list while enumerating it, which skipped the element
            # following every removed one and over-counted external links.
            stats['n_out_urls'] += sum(
                not link['expanded_url'].startswith("https://twitter.com")
                for link in entities['urls']
            )

            # Track the range of the friends count seen for this user
            n_friends = author['friends_count']
            stats['max_friends'] = max(stats['max_friends'], n_friends)
            stats['min_friends'] = min(stats['min_friends'], n_friends)

            # Track the largest followers count seen for this user
            stats['max_followers'] = max(stats['max_followers'], author['followers_count'])

            # Track first and last activity time of this user
            stats['first_active'] = min(stats['first_active'], tweet_time)
            stats['last_active'] = max(stats['last_active'], tweet_time)

            # Global data

            # Character cost of each hashtag / mention: length of the text plus
            # one (presumably for the leading '#' / '@' character).
            cost_hashtag.extend(len(tag['text']) + 1 for tag in entities['hashtags'])
            cost_mention.extend(len(mention['screen_name']) + 1 for mention in entities['user_mentions'])

            # Update the dataset time bounds. Two independent tests: with an
            # if/elif a tweet that lowers the minimum could never raise the
            # maximum, leaving max_timestamp at 0 for a single tweet or for
            # timestamps that only decrease.
            if tweet_time < min_timestamp:
                min_timestamp = tweet_time
            if tweet_time > max_timestamp:
                max_timestamp = tweet_time

Opening file 2285/2285

Execution time estimation :
- 10 files : 5"
- 100 files : 53"
- 200 files : 1'58"
- 500 files : 5'32"
- 2285 files : 18'

In [108]:
# Time covered by the dataset. Timestamps are in milliseconds (they are
# divided by 1000 before being handed to datetime.fromtimestamp), so the span
# is converted to hours with 60 * 60 * 1000 ms per hour.
ms_per_hour = 60 * 60 * 1000
delta_t = max_timestamp - min_timestamp
delta_t_h = delta_t / ms_per_hour

In [109]:
# Dataset-wide summary: number of hashtags / mentions, their mean character
# cost, and the first and last timestamps (converted from ms to seconds and
# rendered by datetime.fromtimestamp).
first_tweet = datetime.datetime.fromtimestamp(min_timestamp / 1000)
last_tweet = datetime.datetime.fromtimestamp(max_timestamp / 1000)

summary_lines = [
    f"Total number of # : {len(cost_hashtag)}",
    f"Average cost of # : {np.mean(cost_hashtag):.2f} chars",
    f"Total number of Mentions (@) : {len(cost_mention)}",
    f"Average cost of Mentions (@) # : {np.mean(cost_mention):.2f} chars",
    f"First tweet : {first_tweet}",
    f"Last tweet : {last_tweet}",
]
print("\n".join(summary_lines))

Total number of # : 7564843
Average cost of # : 8.84 chars
Total number of Mentions (@) : 3927542
Average cost of Mentions (@) # : 11.57 chars
First tweet : 2018-06-14 04:14:25.098000
Last tweet : 2018-06-17 19:01:13.214000


Create dataframe

In [110]:
# One row per user: the dict keys (user ids) become the index, which is then
# moved into a regular `user_id` column.
users = (
    pd.DataFrame.from_dict(users_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'user_id'})
)
users.head()

Unnamed: 0,user_id,n_tweets,n_replies,n_quotes,n_retweets,max_friends,min_friends,max_followers,n_mentions,n_hashtags,n_urls,n_out_urls,first_active,last_active
0,39084553,1,0,0,1,826,826,556,1,2,0,0,1528942465098,1528942465098
1,1564678657,1,0,1,0,686,686,918,0,0,1,0,1528942465150,1528942465150
2,967244973824339968,2,0,2,1,44,44,34,1,0,2,0,1528942465165,1528957562932
3,4872447178,4,0,0,3,3189,3189,2807,3,7,1,0,1528942465499,1528948008587
4,35628099,1,0,0,1,407,407,48,2,2,0,0,1528942466051,1528942466051


In [111]:
# # Compute average use of # and @
# users['average_use_mention'] = users['n_mentions'] / users['n_tweets']
# users['average_use_hashtag'] = users['n_hashtags'] / users['n_tweets']

In [112]:
# Compute frequency of actions (by hour): on the user's own active time span for
# users with more than one tweet, on the global dataset span for the others.
i_mult_tweets = users['n_tweets'] > 1

# Per-user active span in hours (timestamps are in milliseconds).
active_hours = (users['last_active'] - users['first_active']) / 3600000
friends_delta = users['max_friends'] - users['min_friends']

# A user can have several tweets carrying the same timestamp, which gives a
# zero-length span and an inf/NaN rate. Those users fall back to the global
# window, exactly like single-tweet users.
use_own_span = i_mult_tweets & (active_hours > 0)
rate_window_h = active_hours.where(use_own_span, delta_t_h)

users['f_tweets'] = users['n_tweets'] / rate_window_h
users['f_friends'] = friends_delta / rate_window_h

# Compute active timeframe (left as NaN for single-tweet users)
users['time_active'] = active_hours.where(i_mult_tweets)

In [113]:
# Preview the first rows, now including the f_tweets, f_friends and time_active columns.
users.head()

Unnamed: 0,user_id,n_tweets,n_replies,n_quotes,n_retweets,max_friends,min_friends,max_followers,n_mentions,n_hashtags,n_urls,n_out_urls,first_active,last_active,f_tweets,f_friends,time_active
0,39084553,1,0,0,1,826,826,556,1,2,0,0,1528942465098,1528942465098,0.011523,0.0,
1,1564678657,1,0,1,0,686,686,918,0,0,1,0,1528942465150,1528942465150,0.011523,0.0,
2,967244973824339968,2,0,2,1,44,44,34,1,0,2,0,1528942465165,1528957562932,0.476892,0.0,4.193824
3,4872447178,4,0,0,3,3189,3189,2807,3,7,1,0,1528942465499,1528948008587,2.59783,0.0,1.539747
4,35628099,1,0,0,1,407,407,48,2,2,0,0,1528942466051,1528942466051,0.011523,0.0,


In [114]:
# Share of each tweet type among all tweets of the dataset.
total_tweets = users['n_tweets'].sum()
for label, column in (('retweets', 'n_retweets'), ('quotes', 'n_quotes'), ('replies', 'n_replies')):
    print(f"Proportion of {label} : {users[column].sum() / total_tweets:.2f}")

Proportion of retweets : 0.66
Proportion of quotes : 0.14
Proportion of replies : 0.02


Remove useless features

In [115]:
# The min/max and first/last columns were only needed to derive the frequency
# features; drop them before saving.
intermediate_columns = ['max_friends', 'min_friends', 'first_active', 'last_active']
users = users.drop(columns=intermediate_columns)
users.head()

Unnamed: 0,user_id,n_tweets,n_replies,n_quotes,n_retweets,max_followers,n_mentions,n_hashtags,n_urls,n_out_urls,f_tweets,f_friends,time_active
0,39084553,1,0,0,1,556,1,2,0,0,0.011523,0.0,
1,1564678657,1,0,1,0,918,0,0,1,0,0.011523,0.0,
2,967244973824339968,2,0,2,1,34,1,0,2,0,0.476892,0.0,4.193824
3,4872447178,4,0,0,3,2807,3,7,1,0,2.59783,0.0,1.539747
4,35628099,1,0,0,1,48,2,2,0,0,0.011523,0.0,


Save Dataframe

In [116]:
# Persist the per-user features. The output folder is created first so that
# the save does not fail when `data/` does not exist yet.
os.makedirs('data', exist_ok=True)
users.to_csv('data/users.csv', index=False)

## 2. Select users to annotate

In [117]:
# NOTE(review): this assertion always fails. Presumably it is a deliberate stop
# so that "Run All" halts after section 1 instead of starting the second pass
# over the raw files — confirm, and consider an explicit flag instead.
assert 1 == 0

AssertionError: 

Load Dataframe

In [118]:
# Reload the per-user feature table from disk.
users_csv_path = 'data/users.csv'
df = pd.read_csv(users_csv_path)
df.head()

Unnamed: 0,user_id,n_tweets,n_replies,n_quotes,n_retweets,max_followers,n_mentions,n_hashtags,n_urls,n_out_urls,f_tweets,f_friends,time_active
0,39084553,1,0,0,1,556,1,2,0,0,0.011523,0.0,
1,1564678657,1,0,1,0,918,0,0,1,0,0.011523,0.0,
2,967244973824339968,2,0,2,1,34,1,0,2,0,0.476892,0.0,4.193824
3,4872447178,4,0,0,3,2807,3,7,1,0,2.59783,0.0,1.539747
4,35628099,1,0,0,1,48,2,2,0,0,0.011523,0.0,


In [119]:
# (rows, columns) of the reloaded user table.
df.shape

(1843439, 13)

In [120]:
# Intended number of users to sample for annotation (inferred from the name — confirm).
users_to_select = 200
# Upper bound on the number of tweets stored per selected user.
max_tweets_by_user = 100

In [121]:
# Draw the annotation sample without replacement; the fixed seed makes the
# draw repeatable for a given row order of `df`.
np.random.seed(42)
# The sample size comes from `users_to_select` instead of a second hard-coded 200.
rand_indexes = np.random.choice(df.index.to_numpy(), size=users_to_select, replace=False)
selected_ids = df.loc[rand_indexes, 'user_id'].to_numpy()

In [122]:
# Adapt to file folder
data_folder = "../data/Woldcup2008/Tweet Worldcup/raw"
# glob returns paths in arbitrary order. Sorting fixes the scan order, which
# decides which tweets are kept once a user reaches the per-user cap.
files = sorted(glob.glob(f"{data_folder}/*.json"))
print(f"Found {len(files)} files")
n_files = len(files)

Found 2285 files


In [123]:
# Create an empty list per selected user to collect that user's tweets
user_tweets = {user_id: [] for user_id in selected_ids}

# Go through each file to find tweets from selected users
for i, file in enumerate(files[:n_files]) :

    print(f"Opening file {i+1}/{n_files}", end="\r")

    # Each line is one JSON-encoded tweet; process it as soon as it is read
    # instead of loading the whole file into a list first.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            tweet = json.loads(line)

            # Filter to selected users with a constant-time dict lookup; the
            # previous `user_id in selected_ids` test scanned the whole NumPy
            # array for every single tweet.
            collected = user_tweets.get(tweet['user']['id'])

            # Add tweet to list if limit not reached
            if collected is not None and len(collected) < max_tweets_by_user:
                collected.append(tweet)

Opening file 2285/2285

In [124]:
# Every selected user must have at least one collected tweet.
# (Remove this check when working with only a part of the dataset.)
assert all(len(collected) > 0 for collected in user_tweets.values())

In [125]:
# Write one JSON file per selected user, named after the user id.
output_dir = 'data/user_tweets'
# Create the target folder up front: without it the first open() fails, after
# the long scan of the raw files has already been paid for.
os.makedirs(output_dir, exist_ok=True)

for user_id, tweets in user_tweets.items():

    filename = f'{output_dir}/{user_id}.json'

    with open(filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII tweet text readable in the file
        json.dump(tweets, f, ensure_ascii=False, indent=2)
