In [None]:
import re
import pandas as pd
from pandas.io.json import json_normalize

In [None]:
#Data preprocessing made following this guide: https://www.kaggle.com/code/prathamsharma123/clean-raw-json-tweets-data
def data_preprocessing(data):
    """Flatten the fields the queries need out of the raw tweets frame.

    Adds ``userId`` and ``userName`` columns taken from the ``"id"`` and
    ``"username"`` entries of each row's ``user`` mapping, and rewrites the
    ``date`` column as ``YYYY-MM-DD`` strings.

    Parameters
    ----------
    data : pandas.DataFrame
        Tweets frame with a ``user`` column (one mapping per row, indexable
        by ``"id"`` and ``"username"``) and a ``date`` column of
        timestamp-like values.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in; it is modified in place.

    Raises
    ------
    KeyError
        If a ``user`` entry lacks the ``"id"`` or ``"username"`` key.
    """
    user_records = data["user"]
    # Plain lists are assigned positionally, so row order is preserved
    # regardless of the frame's index.
    data["userId"] = [user["id"] for user in user_records]
    data["userName"] = [user["username"] for user in user_records]

    # Vectorized formatting; a missing date becomes a missing value instead of
    # raising as a per-row NaT.strftime() call would.
    data["date"] = pd.to_datetime(data["date"]).dt.strftime("%Y-%m-%d")

    return data

In [None]:
#Top 10 tweets by number of retweets
def query1(data):
    """Rank tweets from most to least retweeted.

    Returns a new DataFrame containing every row of ``data`` ordered by the
    ``retweetCount`` column, highest first. No rows are dropped here; the
    caller decides how many of the leading rows to show.
    """
    return data.sort_values("retweetCount", ascending=False)

In [None]:
#Top 10 users by number of tweets
def query2(data):
    """Count tweets per user and rank users by that count.

    Rows are grouped on the (``userId``, ``userName``) pair. The result has
    one row per pair with the group size in a ``tweetCount`` column, ordered
    from the largest count to the smallest. All groups are returned; the
    caller decides how many of the leading rows to show.
    """
    tweets_per_user = (
        data.groupby(by=["userId", "userName"])
        .size()
        .reset_index(name="tweetCount")
    )
    return tweets_per_user.sort_values(by="tweetCount", ascending=False)

In [None]:
#Top 10 dates by number of tweets
def query3(data):
    """Count tweets per date and rank dates by that count.

    Rows are grouped on the ``date`` column. The result has one row per
    distinct date with the group size in a ``tweetCount`` column, ordered
    from the busiest date to the quietest. All dates are returned; the caller
    decides how many of the leading rows to show.
    """
    tweets_per_date = (
        data.groupby(by="date")
        .size()
        .reset_index(name="tweetCount")
    )
    return tweets_per_date.sort_values(by="tweetCount", ascending=False)

In [None]:
#Top 10 hashtags by number of uses
def query4(data):
    """Count hashtag occurrences across all tweets and rank them.

    A hashtag is ``#`` followed by one or more ASCII letters, digits or
    underscores. Matching is case-sensitive, so ``#Tag`` and ``#tag`` are
    counted separately, and every occurrence counts (a tag repeated inside
    one tweet is counted each time).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``content`` column holding the tweet text. Cells that
        are not strings (for example ``None`` or ``NaN``) are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per distinct hashtag with columns ``Hashtag`` and ``count``,
        ordered from most to least used. All hashtags are returned; the
        caller decides how many of the leading rows to show.
    """
    # Compile once rather than re-resolving the pattern for every tweet.
    hashtag_pattern = re.compile('#[a-zA-Z0-9_]+')
    hashtags = []
    for tweet in data["content"]:
        # re raises TypeError on non-string input, so missing text is skipped.
        if isinstance(tweet, str):
            hashtags.extend(hashtag_pattern.findall(tweet))
    hashtags = pd.DataFrame(hashtags, columns = ["Hashtag"])
    grouped = hashtags.groupby(by = "Hashtag").size().reset_index(name = "count")
    grouped = grouped.sort_values(by = "count", ascending = False)
    return grouped

In [None]:
def main():
    """Load the tweets file, preprocess it, then answer queries interactively.

    After loading and preprocessing, prompts in a loop for a query ID.
    IDs "1" to "4" print a heading and display the first 10 rows of the
    matching query's result, "-1" ends the loop, and any other input prints
    an error message and prompts again.
    """
    print("Loading data. This may take a few minutes...")
    raw_data = pd.read_json("farmers-protest-tweets-2021-03-5.json", lines = True)
    print("Data loaded. Preprocessing data...")
    tweets = data_preprocessing(raw_data)
    print("Data processed.")

    # Query ID -> (heading printed before the table, function that ranks the data).
    menu = {
        "1": ("Top 10 tweets by number of retweets:", query1),
        "2": ("Top 10 users by number of tweets:", query2),
        "3": ("Top 10 dates by number of tweets:", query3),
        "4": ("Top 10 hashtags by number of uses:", query4),
    }

    while True:
        choice = input("Please enter query ID (1, 2, 3, 4) or press -1 to exit.")
        if choice == "-1":
            break
        if choice not in menu:
            print("Invalid value. Please try again.")
            continue
        heading, run_query = menu[choice]
        print(heading)
        result = run_query(tweets)
        # `display` is not imported in this file; presumably the rich-display
        # helper that IPython/Jupyter provides in the notebook namespace.
        display(result.head(10))
    

In [None]:
# Start the interactive query session only when this code runs as the
# top-level module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()