En este archivo puedes escribir lo que estimes conveniente. Te recomendamos detallar tu solución y todas las suposiciones que estás considerando. Aquí puedes ejecutar las funciones que definiste en los otros archivos de la carpeta src, medir el tiempo, memoria, etc.

In [1]:
from typing import List, Tuple
from datetime import datetime
import json
import time
import sqlite3
import emoji

file_path = "farmers-protest-tweets-2021-2-4.json"

In [2]:
"""
Las top 10 fechas donde hay más tweets. Mencionar el usuario (username) que más publicaciones tiene por cada uno de esos días
{
"date": "2021-02-04",
    "user"={
"id": 123456,
"username": "user1",
}
}

"""


def q1_time(file_path: str) -> List[Tuple[datetime.date, str]]:
    conn = sqlite3.connect("tweets.db")
    c = conn.cursor()

    c.execute(
        """CREATE TABLE IF NOT EXISTS tweets
                 (date TEXT, user TEXT, tweet TEXT)"""
    )
    conn.commit()

    c.execute("SELECT COUNT(*) FROM tweets")
    count = c.fetchone()[0]
    if count == 0:
        with open(file_path, "r") as file:
            for line in file:
                tweet = json.loads(line)
                date = tweet["date"][:10]
                user = tweet["user"]["username"]
                tweet_text = tweet["content"]
                c.execute(
                    "INSERT INTO tweets (date, user, tweet) VALUES (?, ?, ?)",
                    (date, user, tweet_text),
                )
        conn.commit()

    query_top_dates = """
    SELECT date
    FROM tweets
    GROUP BY date
    ORDER BY COUNT(*) DESC
    LIMIT 10
    """
    c.execute(query_top_dates)
    top_dates = [row[0] for row in c.fetchall()]

    top_users_per_date = []
    for date in top_dates:
        query_top_user = f"""
        SELECT user, COUNT(*) as tweet_count
        FROM tweets
        WHERE date = '{date}'
        GROUP BY user
        ORDER BY tweet_count DESC
        LIMIT 1
        """
        c.execute(query_top_user)
        top_user,_ = c.fetchone()
        top_users_per_date.append(
            (datetime.strptime(date, "%Y-%m-%d").date(), top_user)
        )
    conn.close()

    return top_users_per_date

In [3]:
# BST para almacenar los tweets por fecha y usuario y luego obtener los top 10 usuarios, esto esta pensando para que las proximas consultas sean mas rapidas
def q1_memory(file_path: str) -> List[Tuple[datetime.date, str]]:
    class BSTNode:
        def __init__(self, date, user):
            self.date = date
            self.users = {user: 1}
            self.left = None
            self.right = None

        def insert(self, date, user):
            if date < self.date:
                if self.left is None:
                    self.left = BSTNode(date, user)
                else:
                    self.left.insert(date, user)
            elif date > self.date:
                if self.right is None:
                    self.right = BSTNode(date, user)
                else:
                    self.right.insert(date, user)
            else:
                if user in self.users:
                    self.users[user] += 1
                else:
                    self.users[user] = 1

        def get_top_users(self):
            data = []
            if self.left:
                data.extend(self.left.get_top_users())
            data.append(
                (
                    self.date,
                    max(self.users, key=self.users.get),
                    sum(self.users.values()),
                )
            )
            if self.right:
                data.extend(self.right.get_top_users())
            return data

    root = None
    with open(file_path, "r") as file:
        for line in file:
            tweet = json.loads(line)
            date = datetime.fromisoformat(tweet["date"]).date()
            user = tweet["user"]["username"]
            if root is None:
                root = BSTNode(date, user)
            else:
                root.insert(date, user)

    all_data = root.get_top_users() if root else []
    top_dates = sorted(all_data, key=lambda x: x[2], reverse=True)[:10]
    return [(date, user) for date, user, _ in top_dates]

In [4]:
%reload_ext memory_profiler
%memit top_dates = q1_memory(file_path)


peak memory: 87.45 MiB, increment: 6.45 MiB


In [5]:
%reload_ext memory_profiler
%memit top_dates = q1_time(file_path)

peak memory: 90.27 MiB, increment: 4.85 MiB


In [6]:
start_time = time.time()
top_dates = q1_time(file_path)
end_time = time.time()
print(top_dates)
print("q1_time:", end_time - start_time, "s")

[(datetime.date(2021, 2, 12), 'RanbirS00614606'), (datetime.date(2021, 2, 13), 'MaanDee08215437'), (datetime.date(2021, 2, 17), 'RaaJVinderkaur'), (datetime.date(2021, 2, 16), 'jot__b'), (datetime.date(2021, 2, 14), 'rebelpacifist'), (datetime.date(2021, 2, 18), 'neetuanjle_nitu'), (datetime.date(2021, 2, 15), 'jot__b'), (datetime.date(2021, 2, 20), 'MangalJ23056160'), (datetime.date(2021, 2, 23), 'Surrypuria'), (datetime.date(2021, 2, 19), 'Preetm91')]
q1_time: 1.0452218055725098 s


In [7]:
start_time = time.time()
top_dates = q1_memory(file_path)
end_time = time.time()
print(top_dates)
print("q1_memory:", end_time - start_time, "s")

[(datetime.date(2021, 2, 12), 'RanbirS00614606'), (datetime.date(2021, 2, 13), 'MaanDee08215437'), (datetime.date(2021, 2, 17), 'RaaJVinderkaur'), (datetime.date(2021, 2, 16), 'jot__b'), (datetime.date(2021, 2, 14), 'rebelpacifist'), (datetime.date(2021, 2, 18), 'neetuanjle_nitu'), (datetime.date(2021, 2, 15), 'jot__b'), (datetime.date(2021, 2, 20), 'MangalJ23056160'), (datetime.date(2021, 2, 23), 'Surrypuria'), (datetime.date(2021, 2, 19), 'Preetm91')]
q1_memory: 3.090492010116577 s


In [8]:
"""
Los top 10 emojis más usados con su respectivo conteo
{
"content":"emoji"
}
"""


def q2_time(file_path: str) -> List[Tuple[str, int]]:
    conn = sqlite3.connect("emojis.db")
    c = conn.cursor()

    c.execute(
        """CREATE TABLE IF NOT EXISTS tweets
                 (content TEXT)"""
    )
    conn.commit()

    c.execute("SELECT COUNT(*) FROM tweets")
    count = c.fetchone()[0]
    if count == 0:
        with open(file_path, "r") as file:
            for line in file:
                tweet = json.loads(line)
                tweet_text = tweet["content"]
                c.execute(
                    "INSERT INTO tweets (content) VALUES (?)",
                    (str(tweet_text),),
                )
        conn.commit()

    query_top_emojis = """ SELECT content FROM tweets"""
    c.execute(query_top_emojis)

    emoji_counter = {}
    for row in c.fetchall():
        tweet = row[0]
        for char in tweet:
            if char in emoji.UNICODE_EMOJI['en']:
                if char in emoji_counter:
                    emoji_counter[char] += 1
                else:
                    emoji_counter[char] = 1
    conn.close()
        
    return sorted(emoji_counter.items(), key=lambda x: x[1], reverse=True)[:10]

In [9]:
def q2_memory(file_path: str) -> List[Tuple[str, int]]:
    class BSTNode:
        def __init__(self, emoji):
            self.emoji = emoji
            self.counter = 1
            self.left = None
            self.right = None

        def insert(self, emoji):
            if emoji < self.emoji:
                if self.left is None:
                    self.left = BSTNode(emoji)
                else:
                    self.left.insert(emoji)
            elif emoji > self.emoji:
                if self.right is None:
                    self.right = BSTNode(emoji)
                else:
                    self.right.insert(emoji)
            else:
                self.counter += 1

        def get_top_emojis(self):
            data = []
            if self.left:
                data.extend(self.left.get_top_emojis())
            data.append((self.emoji, self.counter))
            if self.right:
                data.extend(self.right.get_top_emojis())
            return data

    root = None
    with open(file_path, "r") as file:
        for line in file:
            tweet = json.loads(line)
            for char in tweet["content"]:
                if char in emoji.UNICODE_EMOJI["en"]:
                    if root is None:
                        root = BSTNode(char)
                    else:
                        root.insert(char)
            

    all_data = root.get_top_emojis() if root else []
    return sorted(all_data, key=lambda x: x[1], reverse=True)[:10]

In [10]:
%reload_ext memory_profiler
%memit top_dates = q2_memory(file_path)

peak memory: 86.35 MiB, increment: 0.00 MiB


In [11]:
%reload_ext memory_profiler
%memit top_dates = q2_time(file_path)

peak memory: 129.96 MiB, increment: 43.91 MiB


In [12]:
start_time = time.time()
top_emojis = q2_time(file_path)
end_time = time.time()
print(top_emojis)
print("q2_time:", end_time - start_time, "s")

[('🙏', 7286), ('😂', 3072), ('🚜', 2972), ('✊', 2411), ('🌾', 2363), ('🏻', 2080), ('❤', 1779), ('🤣', 1668), ('🏽', 1218), ('👇', 1108)]
q2_time: 7.929040193557739 s


In [13]:
start_time = time.time()
top_emojis = q2_memory(file_path)
end_time = time.time()
print(top_emojis)
print("q2_memory:", end_time - start_time, "s")

[('🙏', 7286), ('😂', 3072), ('🚜', 2972), ('✊', 2411), ('🌾', 2363), ('🏻', 2080), ('❤', 1779), ('🤣', 1668), ('🏽', 1218), ('👇', 1108)]
q2_memory: 11.087182521820068 s
