In [1]:
import src.tweet_data_processor as tdp
import src.twitter_queries as tq
import time
import random
import numpy as np
import pickle 

# Query front-end exercised by the timing experiments; the experiment cells
# probe its `tweet_cache` / `user_cache` attributes and call its search methods.
# NOTE(review): the log output of this cell suggests that constructing these
# objects starts APScheduler background jobs (e.g. "Cache.save_to_disk") —
# confirm in src.twitter_queries / src.tweet_data_processor.
search_app = tq.TwitterQueries()
# Data-access object; the stats cells use its `mysql_conn`, `tweet_collection`
# and `neo4j_connection` attributes.
dbs = tdp.TweetDataProcessor()

2024-04-24 17:55:48,493 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
2024-04-24 17:55:48,494 - apscheduler.scheduler - INFO - Added job "Cache.save_to_disk" to job store "default"
2024-04-24 17:55:48,494 - apscheduler.scheduler - INFO - Scheduler started
2024-04-24 17:55:48,495 - apscheduler.scheduler - DEBUG - Looking for jobs to run
2024-04-24 17:55:48,498 - apscheduler.scheduler - DEBUG - Next wakeup is due at 2024-04-24 17:56:48.492165-04:00 (in 59.994001 seconds)
2024-04-24 17:55:48,500 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
2024-04-24 17:55:48,501 - apscheduler.scheduler - INFO - Added job "Cache.save_to_disk" to job store "default"
2024-04-24 17:55:48,501 - apscheduler.scheduler - INFO - Scheduler started
2024-04-24 17:55:48,502 - apscheduler.scheduler - DEBUG - Looking for jobs to run
2024-04-24 17:55:48,519 - apscheduler.scheduler - 

## Database Stats

In [2]:
# One buffered cursor on the MySQL connection; later cells reuse `cursor`.
cursor = dbs.mysql_conn.cursor(buffered=True)

# Total number of rows in the users table (single-row, single-column result).
cursor.execute('SELECT count(*) as n_users FROM twitter.users')
(n_users,) = cursor.fetchone()
print(f'{n_users} users in MySQL')

75377 users in MySQL


In [3]:
# Number of distinct hashtags stored in MySQL.
# NOTE(review): the SQL alias `n_users` looks like a copy-paste leftover from
# the users query; the row is read by position, so the alias has no effect.
cursor.execute('SELECT count(distinct hashtag) as n_users FROM twitter.hashtags')
(n_hashtags,) = cursor.fetchone()
print(f'{n_hashtags} hashtags in MySQL')

6008 hashtags in MySQL


In [4]:
# Count the documents in the tweet collection using an empty filter.
n_tweets = dbs.tweet_collection.count_documents({})
print(f'{n_tweets} tweets in MongoDB')

86656 tweets in MongoDB


In [5]:
# Count every directed relationship in the graph; the query returns one record
# whose first field is the count.
res = dbs.neo4j_connection.execute_query("MATCH ()-[r]->() RETURN count(r) as n")
n_relationships = res.records[0][0]
print(f'{n_relationships} relationships between users in Neo4J')

62744 relationships between users in Neo4J


## Experiments

In [6]:
# --- Experiment configuration ------------------------------------------------
n_experiments = 10000      # timed lookups performed per experiment
stats_sample = 1000        # timings drawn per bucket when summarising results
n_sample_users = 5000      # user ids drawn at random from MySQL
n_sample_hashtags = 3000   # single-tweet hashtags drawn at random from MySQL
n_sample_tweets = 5000     # tweets drawn at random from MongoDB

# --- Users: random sample of user ids ----------------------------------------
# The sample size is bound as a query parameter instead of being hard-coded.
query = """
SELECT id_str FROM twitter.users
ORDER BY RAND()
LIMIT %s
"""
cursor.execute(query, (n_sample_users,))
sample_users = cursor.fetchall()
sample_users_list = [x[0] for x in sample_users]

# --- Hashtags: only those attached to exactly one tweet ----------------------
# Restricting to single-tweet hashtags gives each sampled hashtag one matching
# tweet_id, which the hashtag experiment uses to check the tweet cache.
query = """
WITH tb as 
(SELECT hashtag
FROM (SELECT hashtag, count(tweet_id) as n_tweet FROM twitter.hashtags
GROUP BY 1
HAVING n_tweet = 1) a)

SELECT hashtag, tweet_id
FROM twitter.hashtags a
WHERE hashtag in (select hashtag from tb)
ORDER BY RAND()
LIMIT %s
"""
cursor.execute(query, (n_sample_hashtags,))
sample_hashtags = cursor.fetchall()
# Parallel lists: position i in both lists refers to the same (hashtag, tweet) row.
sample_hashtags_list = [x[0] for x in sample_hashtags]
tweets_hashtags_list = [x[1] for x in sample_hashtags]

# --- Tweets: random sample of tweet ids from MongoDB -------------------------
tweets = dbs.tweet_collection.aggregate([{ "$sample": { "size": n_sample_tweets } }])
tweets_list = [i['id_str'] for i in tweets]

# Fail fast: the experiment cells pick random positions in these lists and
# would otherwise raise an opaque ValueError on an empty sample.
assert sample_users_list, 'no users sampled from MySQL'
assert sample_hashtags_list, 'no single-tweet hashtags sampled from MySQL'
assert tweets_list, 'no tweets sampled from MongoDB'

#### Hashtags

In [7]:
# Latencies (in seconds) of hashtag searches, split by whether the hashtag's
# tweet was found in the tweet cache just before the search ran.
time_list_cache, time_list_wo_cache = [], []

for _ in range(n_experiments):
    # The two lists are parallel, so one random position selects a matching
    # (hashtag, tweet) pair.
    pos = random.randrange(len(sample_hashtags_list))
    hashtag, hashtag_tweet = sample_hashtags_list[pos], tweets_hashtags_list[pos]

    # Probe the cache before the timed call so the timing can be bucketed.
    was_cached = search_app.tweet_cache.get(hashtag_tweet) is not None

    started = time.perf_counter()
    search_app.search_tweets_by_hashtag(hashtag)
    elapsed = time.perf_counter() - started

    bucket = time_list_cache if was_cached else time_list_wo_cache
    bucket.append(elapsed)

# Persist the raw timings next to the notebook.
results = dict(with_cache=time_list_cache, without_cache=time_list_wo_cache)
with open('hashtag_results.pkl', 'wb') as f:
    pickle.dump(results, f)

2024-04-24 17:56:48,506 - apscheduler.scheduler - DEBUG - Looking for jobs to run
2024-04-24 17:56:48,506 - apscheduler.scheduler - DEBUG - Looking for jobs to run
2024-04-24 17:56:48,508 - apscheduler.executors.default - INFO - Running job "Cache.save_to_disk (trigger: interval[0:01:00], next run at: 2024-04-24 17:57:48 EDT)" (scheduled at 2024-04-24 17:56:48.500163-04:00)
2024-04-24 17:56:48,508 - apscheduler.scheduler - DEBUG - Next wakeup is due at 2024-04-24 17:57:48.500163-04:00 (in 59.992053 seconds)
2024-04-24 17:56:48,513 - apscheduler.executors.default - INFO - Running job "Cache.save_to_disk (trigger: interval[0:01:00], next run at: 2024-04-24 17:57:48 EDT)" (scheduled at 2024-04-24 17:56:48.492165-04:00)
2024-04-24 17:56:48,513 - apscheduler.scheduler - DEBUG - Next wakeup is due at 2024-04-24 17:57:48.492165-04:00 (in 59.979053 seconds)
2024-04-24 17:56:48,513 - apscheduler.executors.default - INFO - Job "Cache.save_to_disk (trigger: interval[0:01:00], next run at: 2024-04

In [8]:
# Summarise the hashtag timings collected by the preceding experiment cell.
# Up to `stats_sample` timings are drawn without replacement from each bucket.
# The draw is capped at the bucket size so random.sample does not raise
# ValueError when a bucket holds fewer than `stats_sample` entries.
time_list_wo_cache = np.array(time_list_wo_cache)
time_list_cache = np.array(time_list_cache)

for label, timings in (('Without cache', time_list_wo_cache),
                       ('With cache', time_list_cache)):
    print(label)
    if timings.size == 0:
        # Nothing landed in this bucket; mean/std would be undefined.
        print('no timings collected')
        continue
    chosen_ids = random.sample(range(timings.size), min(stats_sample, timings.size))
    # Timings are stored in seconds; report them in milliseconds.
    print('mean:',f'{np.mean(timings[chosen_ids])*1000:0.4f} ms')
    print('std:',f'{np.std(timings[chosen_ids])*1000:0.4f} ms')

Without cache
mean: 46.2792 ms
std: 18.9015 ms
With cache
mean: 29.2719 ms
std: 6.5254 ms


#### User

In [9]:

# Latencies (in seconds) of user lookups, split by whether the user was found
# in the user cache just before the lookup ran.
time_list_cache, time_list_wo_cache = [], []

for _ in range(n_experiments):
    # Pick one sampled user uniformly at random.
    pos = random.randrange(len(sample_users_list))
    sampled_user = sample_users_list[pos]

    # Probe the cache before the timed call so the timing can be bucketed.
    was_cached = search_app.user_cache.get(sampled_user) is not None

    started = time.perf_counter()
    search_app.get_user_data([sampled_user])
    elapsed = time.perf_counter() - started

    bucket = time_list_cache if was_cached else time_list_wo_cache
    bucket.append(elapsed)

# Persist the raw timings next to the notebook.
results = dict(with_cache=time_list_cache, without_cache=time_list_wo_cache)
with open('user_results.pkl', 'wb') as f:
    pickle.dump(results, f)



2024-04-24 18:01:48,518 - apscheduler.scheduler - DEBUG - Looking for jobs to run
2024-04-24 18:01:48,521 - apscheduler.scheduler - DEBUG - Next wakeup is due at 2024-04-24 18:02:48.492165-04:00 (in 59.971156 seconds)
2024-04-24 18:01:48,521 - apscheduler.executors.default - INFO - Running job "Cache.save_to_disk (trigger: interval[0:01:00], next run at: 2024-04-24 18:02:48 EDT)" (scheduled at 2024-04-24 18:01:48.492165-04:00)
2024-04-24 18:01:48,545 - apscheduler.scheduler - DEBUG - Looking for jobs to run
2024-04-24 18:01:48,554 - apscheduler.scheduler - DEBUG - Looking for jobs to run
2024-04-24 18:01:48,556 - apscheduler.scheduler - DEBUG - Next wakeup is due at 2024-04-24 18:02:48.522608-04:00 (in 59.966081 seconds)
2024-04-24 18:01:48,558 - apscheduler.executors.default - INFO - Running job "TrendingHashtags.save_trending_hashtags (trigger: interval[0:01:00], next run at: 2024-04-24 18:02:48 EDT)" (scheduled at 2024-04-24 18:01:48.522608-04:00)
2024-04-24 18:01:48,580 - apschedul

In [10]:
# Summarise the user-lookup timings collected by the preceding experiment cell.
# Up to `stats_sample` timings are drawn without replacement from each bucket.
# The draw is capped at the bucket size so random.sample does not raise
# ValueError when a bucket holds fewer than `stats_sample` entries.
time_list_wo_cache = np.array(time_list_wo_cache)
time_list_cache = np.array(time_list_cache)

for label, timings in (('Without cache', time_list_wo_cache),
                       ('With cache', time_list_cache)):
    print(label)
    if timings.size == 0:
        # Nothing landed in this bucket; mean/std would be undefined.
        print('no timings collected')
        continue
    chosen_ids = random.sample(range(timings.size), min(stats_sample, timings.size))
    # Timings are stored in seconds; report them in milliseconds.
    print('mean:',f'{np.mean(timings[chosen_ids])*1000:0.4f} ms')
    print('std:',f'{np.std(timings[chosen_ids])*1000:0.4f} ms')

Without cache
mean: 28.4548 ms
std: 4.9709 ms
With cache
mean: 13.9489 ms
std: 3.0011 ms


#### Tweet

In [None]:
# Latencies (in seconds) of tweet fetches, split by whether the tweet was found
# in the tweet cache just before the fetch ran.
time_list_cache, time_list_wo_cache = [], []

for _ in range(n_experiments):
    # Pick one sampled tweet uniformly at random.
    pos = random.randrange(len(tweets_list))
    sampled_tweet = tweets_list[pos]

    # Probe the cache before the timed call so the timing can be bucketed.
    was_cached = search_app.tweet_cache.get(sampled_tweet) is not None

    started = time.perf_counter()
    search_app.fetch_tweets_from_mongodb([sampled_tweet])
    elapsed = time.perf_counter() - started

    bucket = time_list_cache if was_cached else time_list_wo_cache
    bucket.append(elapsed)

# Persist the raw timings next to the notebook.
results = dict(with_cache=time_list_cache, without_cache=time_list_wo_cache)
with open('tweet_results.pkl', 'wb') as f:
    pickle.dump(results, f)

In [None]:
# Summarise the tweet-fetch timings collected by the preceding experiment cell.
# Up to `stats_sample` timings are drawn without replacement from each bucket.
# The draw is capped at the bucket size so random.sample does not raise
# ValueError when a bucket holds fewer than `stats_sample` entries.
time_list_wo_cache = np.array(time_list_wo_cache)
time_list_cache = np.array(time_list_cache)

for label, timings in (('Without cache', time_list_wo_cache),
                       ('With cache', time_list_cache)):
    print(label)
    if timings.size == 0:
        # Nothing landed in this bucket; mean/std would be undefined.
        print('no timings collected')
        continue
    chosen_ids = random.sample(range(timings.size), min(stats_sample, timings.size))
    # Timings are stored in seconds; report them in milliseconds.
    print('mean:',f'{np.mean(timings[chosen_ids])*1000:0.4f} ms')
    print('std:',f'{np.std(timings[chosen_ids])*1000:0.4f} ms')

Without cache
mean: 15.6105 ms
std: 3.8069 ms
With cache
mean: 0.0022 ms
std: 0.0028 ms


2024-04-24 18:31:48,502 - apscheduler.scheduler - DEBUG - Looking for jobs to run
2024-04-24 18:31:48,505 - apscheduler.scheduler - DEBUG - Next wakeup is due at 2024-04-24 18:32:48.492165-04:00 (in 59.986345 seconds)
2024-04-24 18:31:48,505 - apscheduler.executors.default - INFO - Running job "Cache.save_to_disk (trigger: interval[0:01:00], next run at: 2024-04-24 18:32:48 EDT)" (scheduled at 2024-04-24 18:31:48.492165-04:00)
2024-04-24 18:31:48,519 - apscheduler.scheduler - DEBUG - Looking for jobs to run
2024-04-24 18:31:48,532 - apscheduler.scheduler - DEBUG - Next wakeup is due at 2024-04-24 18:32:48.500163-04:00 (in 59.967185 seconds)
2024-04-24 18:31:48,534 - apscheduler.executors.default - INFO - Running job "Cache.save_to_disk (trigger: interval[0:01:00], next run at: 2024-04-24 18:32:48 EDT)" (scheduled at 2024-04-24 18:31:48.500163-04:00)
2024-04-24 18:31:48,534 - apscheduler.scheduler - DEBUG - Looking for jobs to run
2024-04-24 18:31:48,557 - apscheduler.scheduler - DEBUG 