In [79]:
import numpy as np
import pandas as pd
import glob
import json
from pathlib import Path
from IPython.display import clear_output

In [80]:
def get_full_text(tweet):
    """Return the text of a tweet, preferring the extended form when present.

    If the tweet carries a 'retweeted_status' entry, the text is read from
    that nested (retweeted) tweet instead of from the tweet itself. In both
    cases 'extended_tweet' -> 'full_text' is used when available, otherwise
    the plain 'text' field.

    Args:
        tweet: dict-like tweet object.

    Returns:
        The selected text field of the tweet.

    Raises:
        KeyError: if the selected tweet has neither an 'extended_tweet'
            entry nor a 'text' field.
    """
    # For a retweet, read from the original tweet rather than the wrapper.
    source = tweet['retweeted_status'] if 'retweeted_status' in tweet else tweet

    # Extended payload wins over the plain 'text' field.
    if 'extended_tweet' in source:
        return source['extended_tweet']['full_text']
    return source['text']

In [81]:
def print_tweet(tweet, j, n_tweets):
    """Print a plain-text summary of one tweet.

    Prints, in order: the tweet's position in the list, whether it is a
    retweet, its text (via get_full_text), creation time, hashtags,
    retweet count and favorite count, followed by blank lines.

    Args:
        tweet: dict-like tweet object; must provide 'created_at',
            'entities' -> 'hashtags', 'retweet_count' and 'favorite_count'.
        j: zero-based index of the tweet (displayed one-based).
        n_tweets: total number of tweets, used for the "j/n" header.
    """
    position = j + 1
    print(f"Tweet {position}/{n_tweets}")

    is_retweet = 'retweeted_status' in tweet
    print(f"Retweet ? : {is_retweet}")

    print("Content of the tweet :")
    content = get_full_text(tweet)
    print(content)

    print("Time :")
    print(tweet['created_at'])

    print("Hashtags used :")
    hashtags = [entry['text'] for entry in tweet['entities']['hashtags']]
    print(hashtags)

    print("Retweets count :")
    print(tweet['retweet_count'])

    print("favourite count :")
    print(tweet['favorite_count'])

    print("\n")

In [82]:
def show_tweets(user_files, user_ids, i, n_to_annotate):
    """Clear the cell output and display every tweet of the i-th user.

    Loads the JSON file at user_files[i], prints the user description taken
    from the first tweet, then prints each tweet with print_tweet.

    Args:
        user_files: indexable collection of paths to per-user JSON files.
        user_ids: indexable collection of user ids, aligned with user_files.
        i: zero-based index of the user to display.
        n_to_annotate: total number of users in this session (shown in the
            progress header only).
    """
    clear_output()
    print(f"Opening tweets from user {user_ids[i]} ({i+1}/{n_to_annotate})")

    with open(user_files[i], 'r', encoding='utf-8') as f:
        tweets = json.load(f)

    # An empty file has no first tweet to read the description from;
    # report it instead of raising IndexError and aborting the session.
    if not tweets:
        print("No tweets found for this user.")
        return

    print("User description : ")
    print(tweets[0]['user']['description'], "\n")

    n_tweets = len(tweets)
    for j, tweet in enumerate(tweets):
        print_tweet(tweet, j, n_tweets)


In [83]:
def ask_label():
    """Prompt until the annotator enters a valid label, then return it.

    Leading and trailing whitespace in the reply is ignored. Any reply other
    than '0' or '1' prints a reminder and asks again.

    Returns:
        int: 1 if the user was marked suspicious, 0 otherwise.
    """
    accepted = ('0', '1')
    while True:
        reply = input("Is this user suspicious? (1 = Yes, 0 = No): ").strip()
        if reply not in accepted:
            print("Please enter 1 (Yes) or 0 (No).")
            continue
        return int(reply)

___

Get files containing tweets for each user

In [84]:
# glob returns paths in arbitrary order; sorting makes the set of users
# presented for annotation reproducible across runs and machines.
user_files = np.array(sorted(glob.glob("data/user_tweets/*.json")))
# Each file name (without extension) is parsed as the integer user id.
user_ids = np.array([Path(file).stem for file in user_files]).astype(np.int64)
print(f"Found {len(user_files)} files")

Found 200 files


Create the empty annotations dataset (needed on the first run only)

In [85]:
# Create the annotations file only if it does not exist yet. An existing
# file holds earlier annotations and must never be replaced by an empty one,
# so this cell is safe to run on every "Restart & Run All".
annotations_path = Path("data/annotated_users.csv")
if not annotations_path.exists():
    annotations_path.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(columns=['user_id', 'suspicious']).to_csv(annotations_path, index=False)

Open annotated users dataset

In [90]:
# Load the annotations written by previous sessions.
annotated_df = pd.read_csv("data/annotated_users.csv")
n_already_annotated = len(annotated_df)
print(f"Already {n_already_annotated} annotated users")

Already 100 annotated users


Find ids not annotated yet

In [87]:
# Boolean mask: True for users that have no row in annotated_df yet.
annotated_ids = annotated_df['user_id'].astype(np.int64)
idx_to_annotate = np.isin(user_ids, annotated_ids, invert=True)

# Keep files and ids aligned by filtering both with the same mask.
user_files, user_ids = user_files[idx_to_annotate], user_ids[idx_to_annotate]
print(f"{idx_to_annotate.sum()} users to annotate")

117 users to annotate


View the tweets of some users to annotate them

**Read the user's tweets in the scrollable output, then type `1` (suspicious) or `0` (not suspicious) at the prompt.**

In [None]:
# Number of users to annotate
n_to_annotate = 10

# Re-filter against annotated_df so that re-running this cell never asks for
# (and never stores) a second label for a user annotated in a previous run.
pending_mask = ~np.isin(user_ids, annotated_df['user_id'].astype(np.int64))
pending_files = user_files[pending_mask]
pending_ids = user_ids[pending_mask]

# Never index past the end when fewer users remain than requested.
n_batch = min(n_to_annotate, len(pending_ids))

new_rows = []
try:
    for i in range(n_batch):

        show_tweets(pending_files, pending_ids, i, n_batch)

        label = ask_label()
        new_rows.append({'user_id': pending_ids[i], 'suspicious': label})

        print(f"Annotated successfuly user {pending_ids[i]}")
finally:
    # Runs even if the session is interrupted, so labels entered so far are
    # kept; a single concat avoids rebuilding the frame on every iteration.
    if new_rows:
        annotated_df = pd.concat(
            [annotated_df, pd.DataFrame(new_rows)],
            ignore_index=True,
        )

clear_output()
print(f"Annotated {annotated_df.shape[0]} users in total")

Annotated 100 users in total


In [89]:
# Write all annotations (previous and new) back to disk, without the index.
annotations_csv = "data/annotated_users.csv"
annotated_df.to_csv(annotations_csv, index=False)