Gran volum de tweets segons una llista d'usuaris (no pública)

In [None]:
import asyncio
import pickle
from datetime import datetime, timedelta

import pandas as pd
from tqdm import tqdm
from twitter_scrape_utils import setup_accounts, tweets2df
from twscrape import API, gather
from twscrape.logger import set_log_level

# Turn on verbose twscrape logging.
set_log_level("DEBUG")
api = API()  # initialise the API with the default accounts database `accounts.db`

# Do not truncate long cell values when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

In [None]:
# Load both account lists; each CSV is expected to provide a `username` column.
far_right_accounts = pd.read_csv("far-right_users.csv")
other_accounts = pd.read_csv("non-far-right_users.csv")

# Keep each handle once, as plain Python lists.
right_users = far_right_accounts["username"].unique().tolist()
left_users = other_accounts["username"].unique().tolist()

# Cell output: number of distinct handles in each list.
len(right_users), len(left_users)

In [None]:
# Run the project helper that prepares the scraping accounts for `api`
# (helper is defined in twitter_scrape_utils; details not shown here).
await setup_accounts(api)

In [None]:
start_date = datetime(2024, 1, 1)
since_date = start_date.strftime("%Y-%m-%d")

# Get user_id
# user = await api.user_by_login(user_name)
# user_id = user.id

# tweets = {} # Executar només la primera vegada, així podem parar i rependre quan vulguem
TWEETS_LIMIT = 20
await api.pool.login_all()  # Ensure accounts are logged in
for user_name in tqdm(left_users):

    if user_name in tweets:
        continue

    query = f"from:{user_name} -is:retweet since:{since_date}"
    tweets[user_name] = await gather(api.search(query, limit=TWEETS_LIMIT))
    # tweets[user_name] = await gather(api.user_tweets(user_name, limit=tweets_limit))

In [None]:
# Save an intermediate snapshot every so often; the file name records how many
# users have been collected so far.
snapshot_path = f"left_{len(tweets)}.pkl"
with open(snapshot_path, "wb") as snapshot_file:
    pickle.dump(tweets, snapshot_file)

In [None]:
# # Load already scraped tweets
# with open("tweets.pkl", "rb") as f:
#     tweets = pickle.load(f)

In [None]:
def parse_tweet(tweet):
    """Flatten a scraped tweet object into a plain dict.

    Args:
        tweet: Object exposing ``id``, ``user.username``, ``rawContent``,
            ``date``, ``media.photos``, ``media.videos`` and
            ``inReplyToTweetId``.

    Returns:
        dict with the keys ``id``, ``username``, ``content``, ``created_at``,
        ``photos``, ``videos`` and ``parent`` (in that order).
    """
    author = tweet.user
    media = tweet.media
    record = dict(
        id=tweet.id,
        username=author.username,
        content=tweet.rawContent,
        created_at=tweet.date,
        photos=media.photos,
        videos=media.videos,
        parent=tweet.inReplyToTweetId,
    )
    return record


def get_df(tweets, allow_retweets=False):
    """Build a single flat DataFrame from the per-user tweet collections.

    Args:
        tweets: Mapping of username -> collection of scraped tweets. Each
            collection is converted with ``tweets2df`` using ``parse_tweet``.
        allow_retweets: When False (default), keep only rows whose
            ``username`` equals the mapping key, i.e. posts authored by the
            user themselves.

    Returns:
        DataFrame with the columns ``id``, ``username``, ``content``,
        ``created_at`` plus the boolean flags ``has_photo``, ``has_video`` and
        ``is_response``. When there is nothing to concatenate, an empty
        DataFrame with exactly those columns is returned.
    """
    dfs = {k: tweets2df(v, parse_tweet) for k, v in tweets.items()}

    if not allow_retweets:
        # keep only posts from the user, no retweets
        dfs = {k: df[df.username == k] if not df.empty else df for k, df in dfs.items()}

    # pd.concat raises ValueError on an empty list, and empty frames may lack
    # the expected columns, so drop them and handle the "nothing left" case.
    frames = [df for df in dfs.values() if not df.empty]
    if not frames:
        return pd.DataFrame(
            columns=[
                "id",
                "username",
                "content",
                "created_at",
                "has_photo",
                "has_video",
                "is_response",
            ]
        )

    # ignore_index gives the result a unique index instead of repeating each
    # per-user frame's own labels.
    df = pd.concat(frames, ignore_index=True)

    df["has_photo"] = df["photos"].str.len() != 0
    df["has_video"] = df["videos"].str.len() != 0
    df["is_response"] = df["parent"].notna()
    return df.drop(columns=["photos", "videos", "parent"])

In [None]:
# Flatten everything collected so far and export it; the file name carries the
# number of users present in `tweets`.
output_csv = f"left_{len(tweets)}.csv"
df = get_df(tweets)
df.to_csv(path_or_buf=output_csv, index=False)