In [1]:
#This file contains raw cache implementation with a simulated testing senario along with tweet data example. 
# The cache-checkpoint.json file is also attach for reference.

In [9]:
import time
class LRUCache1:
    def __init__(self, capacity, checkpoint_interval):
        self.capacity = capacity
        self.cache = {}
        self.access_times = {}
        self.checkpoint_interval = checkpoint_interval
        self.last_checkpoint_time = time.time()

    def get(self, key):
        if key in self.cache:
            self.access_times[key] = time.time()
            return self.cache[key]
        return None
    
    def put(self, key, value):
        if len(self.cache) >= self.capacity:
            self.evict_least_recently_used()

        self.cache[key] = value
        self.access_times[key] = time.time()

        if time.time() - self.last_checkpoint_time > self.checkpoint_interval:
            self.checkpoint()
            self.last_checkpoint_time = time.time()

    def process_tweets_for_cache(self, tweets): #This function goes in with search application functions.
        for tweet in tweets:
            # Exclude the _id field from the tweet
            tweet_without_id = {key: value for key, value in tweet.items() if key != '_id'}
            self.put(tweet_without_id['Tweet_Id'], tweet_without_id)
        #self.checkpoint()
        
    def evict_least_recently_used(self):
        if not self.cache:
            return
        oldest_key = min(self.access_times, key=self.access_times.get)
        del self.cache[oldest_key]
        del self.access_times[oldest_key]

    def checkpoint(self):
        with open('cache_check-point.json', 'w') as f:
            json.dump({'cache': self.cache, 'access_times': self.access_times}, f)
        self.last_checkpoint_time = time.time()


    def load_checkpoint(self):
        try:
            with open('cache_check-point.json', 'r') as f:
                data = json.load(f)
                self.cache = data['cache']
                self.access_times = data['access_times']
                self.last_checkpoint_time = time.time()
        except FileNotFoundError:
            pass
        
    def search_tweet(self, tweet_id):
        for key, value in self.cache.items():
            if key == tweet_id:
                return value
        return None

In [5]:
# Simulated implementation
import random
import string

# Function to generate a random tweet
def generate_tweet(user_id):
    tweet_id = ''.join(random.choices(string.ascii_letters + string.digits, k=8))
    return {'Tid': tweet_id, 'user_id': user_id, 'text': f'Tweet {tweet_id}'}

tweets = []
for i in range(10000):
    for _ in range(500):
        tweets.append(generate_tweet(i))


cache = LRUCache1(capacity=200, checkpoint_interval=10)

for tweet in tweets:
    Tid = tweet['Tid']
    if Tid not in cache.cache:
        cache.put(Tid, [tweet])

print("Cache contents:")
for Tid, tweet in cache.cache.items():
    print(f"Tweet Id: {Tid}, Tweet: {[tweet]}")


Cache contents:
Tweet Id: dQ49K0pM, Tweet: [[{'Tid': 'dQ49K0pM', 'user_id': 9999, 'text': 'Tweet dQ49K0pM'}]]
Tweet Id: 7gPNY0jD, Tweet: [[{'Tid': '7gPNY0jD', 'user_id': 9999, 'text': 'Tweet 7gPNY0jD'}]]
Tweet Id: w8OXGCZ7, Tweet: [[{'Tid': 'w8OXGCZ7', 'user_id': 9999, 'text': 'Tweet w8OXGCZ7'}]]
Tweet Id: WYnMPCbZ, Tweet: [[{'Tid': 'WYnMPCbZ', 'user_id': 9999, 'text': 'Tweet WYnMPCbZ'}]]
Tweet Id: YOswlbD5, Tweet: [[{'Tid': 'YOswlbD5', 'user_id': 9999, 'text': 'Tweet YOswlbD5'}]]
Tweet Id: J10uFzpO, Tweet: [[{'Tid': 'J10uFzpO', 'user_id': 9999, 'text': 'Tweet J10uFzpO'}]]
Tweet Id: geZFLbXE, Tweet: [[{'Tid': 'geZFLbXE', 'user_id': 9999, 'text': 'Tweet geZFLbXE'}]]
Tweet Id: NeJcgZYp, Tweet: [[{'Tid': 'NeJcgZYp', 'user_id': 9999, 'text': 'Tweet NeJcgZYp'}]]
Tweet Id: wQKadTsj, Tweet: [[{'Tid': 'wQKadTsj', 'user_id': 9999, 'text': 'Tweet wQKadTsj'}]]
Tweet Id: 5kwLka6i, Tweet: [[{'Tid': '5kwLka6i', 'user_id': 9999, 'text': 'Tweet 5kwLka6i'}]]
Tweet Id: 896puEv6, Tweet: [[{'Tid': '896puE

In [3]:
#searching inside cache

In [6]:
tweetasas = cache.search_tweet('ArRmoLQO')
print(tweetasas)

[{'Tid': 'ArRmoLQO', 'user_id': 9999, 'text': 'Tweet ArRmoLQO'}]


In [2]:
# Cache integrated with search function example

In [10]:
# Connect to MongoDB database
import pymongo
from datetime import datetime

mongo_client = pymongo.MongoClient("mongodb+srv://ashtikarshridhar:shridhar@cluster0.ft49nzt.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
mongo_db = mongo_client["Twitter_DB"]
tweet_collection = mongo_db["Tweets"]
quoted_tweet_collection = mongo_db["Quoted-Tweets"]
retweet_collection = mongo_db["Retweets"]

In [14]:
cache = LRUCache1(capacity=200, checkpoint_interval=300) #Initializing cache with 200 tweets capacity and 5 minutes checkpointing interval

In [15]:
def search_by_user_name(user_name):
    start_time = time.time()
    cached_tweets = [tweet for tweet in cache.cache.values() if tweet.get("User_Name") == user_name] # Searching in cache
    
    if cached_tweets:
        end_time = time.time()
        runtime = end_time - start_time
        print("Fetched from cache. ")
        print(f"Runtime of my_function: {runtime} seconds")
        return cached_tweets  # If in cache return results from cache
    
    
    tweets = list(tweet_collection.find({"User_Name": user_name}))
    retweets = list(retweet_collection.find({"User_Name": user_name}))
    quoted_tweets = list(quoted_tweet_collection.find({"User_Name": user_name}))

    combined_result = tweets + retweets + quoted_tweets
    end_time = time.time()
    runtime = end_time - start_time
    print("Fetched from Data Base . ")
    print(f"Runtime of my_function: {runtime} seconds")
    cache.process_tweets_for_cache(combined_result) # If not in cache, store in cache
    return combined_result

In [16]:
search_by_user_name("Gucci Mane") # Getting tweets from a particular user name. 1st time data will load from DB

Fetched from Data Base . 
Runtime of my_function: 0.13840317726135254 seconds


[{'_id': ObjectId('66249dad7339addd98f6a5a5'),
  'created_at': '2020-04-12 12:36:30',
  'Tweet_Id': '1249315454797168641',
  'Text': 'I pray my haters die of corona virus 😷',
  'Hashtag': [],
  'User_Id': '46769281',
  'User_Name': 'Gucci Mane',
  'User_Screen_Name': 'gucci1017',
  'Retweet_Count': 19690,
  'Quote_count': 5545,
  'Likes_Count': 75446,
  'sentiment': 'Negative'}]

In [17]:
search_by_user_name("Gucci Mane") #Since this result was already fetched and stored previously in cache, it loaded from cache

Fetched from cache. 
Runtime of my_function: 0.0 seconds


[{'created_at': '2020-04-12 12:36:30',
  'Tweet_Id': '1249315454797168641',
  'Text': 'I pray my haters die of corona virus 😷',
  'Hashtag': [],
  'User_Id': '46769281',
  'User_Name': 'Gucci Mane',
  'User_Screen_Name': 'gucci1017',
  'Retweet_Count': 19690,
  'Quote_count': 5545,
  'Likes_Count': 75446,
  'sentiment': 'Negative'}]