In [1]:
import time
import pymongo
import psycopg2
import pandas as pd
import json
from datetime import datetime
from cache import SearchCache
import certifi
ca = certifi.where()

In [2]:
class SearchEngine:
    def __init__(self, cache_size=100, cache_ttl=3600):
        """
        Initializes a SearchEngine object with a specified database type and cache settings.

        Args:
        - cache_size (int): Maximum number of items to store in cache
        - cache_ttl (int): Time-to-live (in seconds) for cached items
        """
        # initialize a cache object for the search engine using the SearchCache class
        self.cache = SearchCache(cache_size, cache_ttl)
        self.db_client = pymongo.MongoClient('mongodb+srv://twitter_user:dbms@cluster0.wkyhu.mongodb.net/?retryWrites=true&w=majority',tlsCAFile=ca)
        self.tweets_collection = self.db_client['twitter_db']['tweets_data']
        self.db_conn = psycopg2.connect(database="postgres", user="postgres", password="kueen", host="localhost")
        self.users_cursor = self.db_conn.cursor()
        self.user_table = 'twitter_users_partitioned'
        self.pipeline = [
            {
                '$project': {
                    '_id': 0,
                    'tweet_id': 1,
                    'user': 1,
                    'name': 1,
                    'date': 1,
                    'text': 1,
                    'retweet': {
                        '$cond': {
                            'if': { '$eq': ['$is_retweet', True] },
                            'then': '$retweet',
                            'else': None
                        }
                    },
                    'quote': {
                        '$cond': {
                            'if': { '$eq': ['$is_quote', True] },
                            'then': '$quote',
                            'else': None
                        }
                    },
                    'retweet_count': {
                        '$sum': {
                            '$cond': [
                                { '$eq': ['$quote', None] },
                                {
                                    '$cond': [
                                        { '$eq': ['$retweet', None] },
                                        '$retweet_count',
                                        '$retweet.retweet_count'
                                    ]
                                },
                                '$quote.retweet_count'
                            ]
                        }
                    },
                    'reply_count': {
                        '$sum': {
                            '$cond': [
                                { '$eq': ['$quote', None] },
                                {
                                    '$cond': [
                                        { '$eq': ['$retweet', None] },
                                        '$reply_count',
                                        '$retweet.reply_count'
                                    ]
                                },
                                '$quote.reply_count'
                            ]
                        }
                    },
                    'favorite_count': {
                        '$sum': {
                            '$cond': [
                                { '$eq': ['$quote', None] },
                                {
                                    '$cond': [
                                        { '$eq': ['$retweet', None] },
                                        '$favorite_count',
                                        '$retweet.favorite_count'
                                    ]
                                },
                                '$quote.favorite_count'
                            ]
                        }
                    },
                    'quote_count': {
                        '$sum': {
                            '$cond': [
                                { '$eq': ['$quote', None] },
                                {
                                    '$cond': [
                                        { '$eq': ['$retweet', None] },
                                        '$quote_count',
                                        '$retweet.quote_count'
                                    ]
                                },
                                '$quote.quote_count'
                            ]
                        }
                    },
                }
            },
            {
                '$addFields': {
                    'engagement': {
                        '$add': [
                            {'$multiply': ['$retweet_count', 0.2]},
                            {'$multiply': ['$favorite_count', 0.2]},
                            {'$multiply': ['$reply_count', 0.3]},
                            {'$multiply': ['$quote_count', 0.3]}
                        ]
                    }
                }
            },
            {
                '$sort': {
                    'engagement': pymongo.DESCENDING,
                    'date': pymongo.DESCENDING
                }
            }
        ]

    
    def most_popular_users(self, n=10):
        """
        Returns the n most popular Twitter users along with their tweets.

        Args:
        - n (int): Number of users to return.

        Returns:
        - list: List of the top n Twitter users, each represented as a dictionary with a 'username' key and a 'tweets' key.
        """
        start_time = time.time()
        
        if 'most_popular_users' in self.cache:
            print("Retrieving 'most popular users' from cache!")
            end_time = time.time()
            print(f"Query took {end_time - start_time:.4f} seconds\n")
            return self.cache['most_popular_users']
        else:
            print(f"New entry, retrieving 'most popular users' from database!")

        query = f"""
            SELECT * FROM 
                (SELECT distinct user_id, name, twitter_join_date, location, 
            verified, followers_count, friends_count, favourites_count,
                    dense_rank () over (partition by user_id order by followers_count desc) rnk 
                FROM 
                (SELECT * FROM {self.user_table} order by followers_count desc) A
                ) B where rnk = 1 
                order by followers_count desc limit {n}"""
        
        self.users_cursor.execute(query)
        results = self.users_cursor.fetchall()

        users = []
        for row in results:
            user = {
                'user_id': row[0],
                'name': row[1],
                'twitter_join_date': row[2],
                'location': row[3],
                'verified': row[4],
                'followers_count': row[5],
                'friends_count': row[6],
                'favourites_count': row[7],
            }
            users.append(user)

        users = pd.DataFrame(users)
        self.cache['most_popular_users'] = users.to_json(orient='records')
        self.cache.save_checkpoint()
        end_time = time.time()
        print(f"Query took {end_time - start_time:.4f} seconds\n")
        
        return users
    
    
    def most_engaging_tweets(self, n=10):
        """
        Returns the most engaging n tweets in the database, where engagement is defined as the sum of retweet
        count, reply count, quote count, and favorite count.

        Args:
        - n (int): Number of tweets to return.

        Returns:
        - list: List of the top n tweets, each represented as a dictionary.
        """
        start_time = time.time()
        
        if 'most_engaging_tweets' in self.cache:
            print("Retrieving 'most engaging tweets' from cache!")
            end_time = time.time()
            print(f"Query took {end_time - start_time:.4f} seconds\n")            
            return self.cache['most_engaging_tweets']
        else:
            print(f"New entry, retrieving 'most engaging tweets' from database!")

        tweets = list(self.tweets_collection.aggregate(self.pipeline + [{'$limit': n}]))
        self.cache['most_engaging_tweets'] = tweets
        
        end_time = time.time()
        print(f"Query took {end_time - start_time:.4f} seconds\n")
        
        return tweets

    
    def most_popular_hashtags(self, n=10):
        """
        Returns the n most popular hashtags in the database.

        Args:
        - n (int): Number of hashtags to return.

        Returns:
        - list: List of the top n hashtags, each represented as a dictionary.
        """
        start_time = time.time()
        
        if 'most_popular_hashtags' in self.cache:
            print("Retrieving 'most popular hashtags' from cache!")
            end_time = time.time()
            print(f"Query took {end_time - start_time:.4f} seconds\n")
            return self.cache['most_popular_hashtags']
        else:
            print(f"New entry, retrieving 'most popular hashtags' from database!")

        pipeline = [
            {
                '$match': {
                    '$or': [
                        {'media.hashtags': {'$exists': True}},
                        {'retweet.media.hashtags': {'$exists': True}},
                        {'quote.media.hashtags': {'$exists': True}},
                    ]
                }
            },
            {
                '$project': {
                    '_id': 0,
                    'hashtags': {
                        '$concatArrays': [
                            {'$ifNull': ['$media.hashtags', []]},
                            {'$ifNull': ['$retweet.media.hashtags', []]},
                            {'$ifNull': ['$quote.media.hashtags', []]},
                        ]
                    }
                }
            },
            {
                '$unwind': '$hashtags'
            },
            {
                '$group': {
                    '_id': '$hashtags',
                    'count': {'$sum': 1}
                }
            },
            {
                '$sort': {
                    'count': pymongo.DESCENDING,
                    'date': pymongo.DESCENDING
                }
            },
            {
                '$limit': n
            }
        ]

        
        hashtags = list(self.tweets_collection.aggregate(pipeline))
        hashtags = [{x["_id"]: x["count"]} for x in hashtags]
        self.cache['most_popular_hashtags'] = hashtags
        self.cache.save_checkpoint()
        end_time = time.time()
        print(f"Query took {end_time - start_time:.4f} seconds\n")
        
        return hashtags

    
    def search_by_date_range(self, start_date_str, end_date_str):
        """
        Returns the top n tweets in the database that were posted within the specified date range.

        Args:
        - start_date_str (str): Start date of the range in the format 'Fri Apr 24 10:06:09 +0000 2020'
        - end_date_str (str): End date of the range in the format 'Fri Apr 24 10:06:09 +0000 2020'
        - n (int): Number of tweets to return.

        Returns:
        - list: List of the top n tweets, each represented as a dictionary.
        """
        start_date = datetime.strptime(start_date_str, '%a %b %d %H:%M:%S %z %Y')
        end_date = datetime.strptime(end_date_str, '%a %b %d %H:%M:%S %z %Y')

        query = {
            'date': {
                '$gte': start_date,
                '$lte': end_date
            }
        }
        
        tweets = self.tweets_collection.find(query)
        return tweets


    def search_by_username(self, username, n=10):
        """
        Returns the top n users in the database matching the given username.

        Args:
        - username (str): The username to search for.
        - n (int): Number of users to return.

        Returns:
        - list: List of the top n users, each represented as a dictionary.
        """
        start_time = time.time()
        
        if username in self.cache:
            print(f"Retrieving '{username}' from cache!")
            end_time = time.time()
            print(f"Query took {end_time - start_time:.4f} seconds\n")

            return self.cache[username]
        else:
            print(f"New entry, retrieving '{username}' from database!")

        query = f"""
            SELECT user_id, name, twitter_join_date, location, 
            verified, followers_count, friends_count, favourites_count
            FROM {self.user_table}
            WHERE name LIKE '%{username}%'
            AND (name, date, followers_count) IN (
                SELECT name, MAX(date), MAX(followers_count)
                FROM {self.user_table}
                WHERE name LIKE '%{username}%'
                GROUP BY name, user_id
            )
            ORDER BY followers_count DESC, verified DESC
            LIMIT {n}
        """

        self.users_cursor.execute(query)
        results = self.users_cursor.fetchall()

        users = []
        for row in results:
            user = {
                'user_id': row[0],
                'name': row[1],
                'twitter_join_date': row[2],
                'location': row[3],
                'verified': row[4],
                'followers_count': row[5],
                'friends_count': row[6],
                'favourites_count': row[7],
            }
            users.append(user)

        users = pd.DataFrame(users)
        self.cache[username] = users.to_json(orient='records')
        self.cache.save_checkpoint()
        end_time = time.time()
        print(f"Query took {end_time - start_time:.4f} seconds\n")
        
        return users
    
    
    def search_by_keyword(self, keyword, start_time, end_time, n=10):
        """
        Returns the top n tweets in the database that contain the given word.

        Args:
        - keyword (str): The word to search for.
        - n (int): Number of tweets to return.

        Returns:
        - list: List of the top n tweets, each represented as a dictionary.
        """
        st = time.time()
        
        if keyword in self.cache:
            print(f"Retrieving '{keyword}' from cache!")
            et = time.time()
            print(f"Query took {et - st:.4f} seconds\n")

            return self.cache[keyword]
        else:
            print(f"New entry, retrieving '{keyword}' from database!")
            
        start_date = datetime.strptime(start_time, '%a %b %d %H:%M:%S %z %Y')
        end_date = datetime.strptime(end_time, '%a %b %d %H:%M:%S %z %Y')

        pipeline = [
            {
                '$match': {
                    '$text': {
                        '$search': keyword
                    }
                }
            },
            {
                '$limit': n
            }
        ]
        
        if start_time and end_time:
            tweets = list(self.search_by_date_range(start_time, end_time))
        else:
            tweets = list(self.tweets_collection.aggregate(pipeline + self.pipeline))
        self.cache[keyword] = tweets
        self.cache.save_checkpoint()
        et = time.time()
        print(f"Query took {et - st:.4f} seconds\n")
        
        return tweets
    
    
    def search_by_hashtag(self, hashtag, n=10):
        """
        Returns the top n tweets in the database that contain the given hashtag.

        Args:
        - hashtag (str): The hashtag to search for.
        - n (int): Number of tweets to return.

        Returns:
        - list: List of the top n tweets, each represented as a dictionary.
        """
        
        start_time = time.time()
        
        if hashtag in self.cache:
            print(f"Retrieving '#{hashtag}' from cache!")
            end_time = time.time()
            print(f"Query took {end_time - start_time:.4f} seconds\n")
            return self.cache[hashtag]
        else:
            print(f"New entry, retrieving '#{hashtag}' from database!")

        pipeline = [
            {
                '$match': {
                    '$or': [
                        {'media.hashtags': {'$in': [hashtag]}},
                        {'retweet.media.hashtags': {'$in': [hashtag]}},
                        {'quote.media.hashtags': {'$in': [hashtag]}},
                    ]
                }
            },
            {
                '$project': {
                    '_id': 0,
                    'tweet_id': 1,
                    'user': 1,
                    'name': 1,
                    'date': 1,
                    'text': 1,
                    'retweet': {
                        '$cond': {
                            'if': { '$eq': ['$is_retweet', True] },
                            'then': '$retweet',
                            'else': None
                        }
                    },
                    'quote': {
                        '$cond': {
                            'if': { '$eq': ['$is_quote', True] },
                            'then': '$quote',
                            'else': None
                        }
                    },
                    'hashtags': '$media.hashtags',
                    'retweet_count': {
                        '$sum': {
                            '$cond': [
                                { '$eq': ['$retweet', None] },
                                '$retweet_count',
                                '$retweet.retweet_count'
                            ]
                        }
                    },
                    'reply_count': {
                        '$sum': {
                            '$cond': [
                                { '$eq': ['$quote', None] },
                                {
                                    '$cond': [
                                        { '$eq': ['$retweet', None] },
                                        '$reply_count',
                                        '$retweet.reply_count'
                                    ]
                                },
                                '$quote.reply_count'
                            ]
                        }
                    },
                    'favorite_count': {
                        '$sum': {
                            '$cond': [
                                { '$eq': ['$retweet', None] },
                                '$favorite_count',
                                '$retweet.favorite_count'
                            ]
                        }
                    },
                    'quote_count': {
                        '$max': {
                            '$cond': [
                                { '$eq': ['$quote', None] },
                                '$quote_count',
                                '$quote.quote_count'
                            ]
                        }
                    },
                },
            },
            {
                '$unwind': '$hashtags'
            },
            {
                '$addFields': {
                    'engagement': {
                        '$toInt': {
                            '$add': [
                                {'$multiply': ['$retweet_count', 0.2]},
                                {'$multiply': ['$favorite_count', 0.2]},
                                {'$multiply': ['$reply_count', 0.3]},
                                {'$multiply': ['$quote_count', 0.3]}
                            ]
                        }
                    },
                }
            },

            {
                '$sort': {
                    'engagement': pymongo.DESCENDING,
                    'date': pymongo.DESCENDING
                }
            },
            {
                '$limit': n
            }
        ]

        tweets = list(self.tweets_collection.aggregate(pipeline))
        self.cache['#'+hashtag] = tweets
        self.cache.save_checkpoint()
        end_time = time.time()
        print(f"Query took {end_time - start_time:.4f} seconds\n")
        
        return tweets
    

## Testing

In [3]:
# Create the SearchEngine used by every cell below, with a cache size of 50 and a cache TTL
# of 30 seconds. Constructing it also opens the MongoDB and PostgreSQL connections.
search_engine = SearchEngine(cache_size=50, cache_ttl=30)

Cache file is corrupted.
Creating new cache.
Checkpoint saved!


In [4]:
# Time window passed to the keyword search further down, written in the
# '%a %b %d %H:%M:%S %z %Y' format that the SearchEngine date parsing expects.
start_time='Sat Apr 25 14:19:11 +0000 2020'
end_time='Sat Apr 25 14:30:00 +0000 2020'

In [5]:
# First call: nothing is cached yet, so the users are read from the database.
search_engine.most_popular_users()

New entry, retrieving 'most popular users' from database!
Checkpoint saved!
Query took 0.4114 seconds



Unnamed: 0,user_id,name,twitter_join_date,location,verified,followers_count,friends_count,favourites_count
0,813286,Barack Obama,2007-03-05,"Washington, DC",True,115603427,607612,11
1,18839785,Narendra Modi,2009-01-10,India,True,55786179,2364,0
2,807095,The New York Times,2007-03-02,New York City,True,46361159,904,18483
3,145125358,Amitabh Bachchan,2010-05-18,"Mumbai, India",True,41596464,1833,75
4,101311381,Shah Rukh Khan,2010-01-02,,True,40028019,77,32
5,471741741,PMO India,2012-01-23,India,True,34461808,486,0
6,113419517,Hrithik Roshan,2010-02-11,,True,28170371,90,172
7,92724677,Virender Sehwag,2009-11-26,India,True,20571543,143,4627
8,405427035,Arvind Kejriwal,2011-11-05,India,True,18339248,221,618
9,14293310,TIME,2008-04-03,,True,17057740,494,536


In [6]:
# Immediate repeat of the same call: answered from the cache, which holds the result as a
# JSON string rather than as a DataFrame.
search_engine.most_popular_users()

Retrieving 'most popular users' from cache!
Query took 0.0000 seconds



'[{"user_id":"813286","name":"Barack Obama","twitter_join_date":1173052800000,"location":"Washington, DC","verified":true,"followers_count":115603427,"friends_count":607612,"favourites_count":11},{"user_id":"18839785","name":"Narendra Modi","twitter_join_date":1231545600000,"location":"India","verified":true,"followers_count":55786179,"friends_count":2364,"favourites_count":0},{"user_id":"807095","name":"The New York Times","twitter_join_date":1172793600000,"location":"New York City","verified":true,"followers_count":46361159,"friends_count":904,"favourites_count":18483},{"user_id":"145125358","name":"Amitabh Bachchan","twitter_join_date":1274140800000,"location":"Mumbai, India","verified":true,"followers_count":41596464,"friends_count":1833,"favourites_count":75},{"user_id":"101311381","name":"Shah Rukh Khan","twitter_join_date":1262390400000,"location":null,"verified":true,"followers_count":40028019,"friends_count":77,"favourites_count":32},{"user_id":"471741741","name":"PMO India","

In [7]:
# First call: runs the engagement-ranking aggregation against the tweets collection.
search_engine.most_engaging_tweets()

New entry, retrieving 'most engaging tweets' from database!
Query took 1.4234 seconds



[{'tweet_id': 1254051230822944770,
  'user': 1039346340449452033,
  'name': 'Grace',
  'date': 'Sat Apr 25 14:14:47 +0000 2020',
  'text': 'But Joe, what if I WANT to drink bleach? What if I wanted to do that even before the orange man said to inject Lysol into our veins to stop corona? What if?',
  'retweet': None,
  'quote': {'tweet_id': 1253751812194070529,
   'user_id': 939091,
   'user_name': 'Joe Biden',
   'quote_count': 32237,
   'reply_count': 46159,
   'retweet_count': 263475,
   'favorite_count': 1280593,
   'media': {'hashtags': [], 'urls': [], 'mentions': []}},
  'retweet_count': 263475,
  'reply_count': 46159,
  'favorite_count': 1280593,
  'quote_count': 32237,
  'engagement': 332332.4},
 {'tweet_id': 1254044290344521728,
  'user': 863214058845200384,
  'name': 'Control Roboto 🤖 💚 Cine y Series',
  'date': 'Sat Apr 25 13:47:12 +0000 2020',
  'text': 'Esto es terrible, si no los mata el corona, se matan entre ellos con rifles o tomando lavandina.\nDe pedo siguen vivos.',


In [8]:
# Inspect which keys the cache holds at this point.
search_engine.cache.get_keys()

['most_popular_users', 'most_engaging_tweets']

In [9]:
# First call: hashtag counts are computed by the database (counting is case-sensitive).
search_engine.most_popular_hashtags()

New entry, retrieving 'most popular hashtags' from database!
Checkpoint saved!
Query took 0.6596 seconds



[{'Corona': 9800},
 {'Mattarella': 3406},
 {'25Aprile': 3371},
 {'corona': 3263},
 {'AltaredellaPatria': 1829},
 {'COVID19': 1722},
 {'PideAlmayaDiyeÇıkıp': 1599},
 {'Liberazione': 1573},
 {'Covid_19': 1545},
 {'coronavirus': 1160}]

In [10]:
# Immediate repeat of the same call: answered from the cache.
search_engine.most_popular_hashtags()

Retrieving 'most popular hashtags' from cache!
Query took 0.0001 seconds



[{'Corona': 9800},
 {'Mattarella': 3406},
 {'25Aprile': 3371},
 {'corona': 3263},
 {'AltaredellaPatria': 1829},
 {'COVID19': 1722},
 {'PideAlmayaDiyeÇıkıp': 1599},
 {'Liberazione': 1573},
 {'Covid_19': 1545},
 {'coronavirus': 1160}]

In [11]:
# First username search: a substring match on the user's name, read from the database.
search_engine.search_by_username("Sözcü")

New entry, retrieving 'Sözcü' from database!
Checkpoint saved!
Query took 0.0351 seconds



Unnamed: 0,user_id,name,twitter_join_date,location,verified,followers_count,friends_count,favourites_count
0,218078497,Sözcü,2010-11-21,İstanbul,True,2398838,28,0
1,3142919739,Sözcü Dünya,2015-04-07,"İstanbul, Türkiye",False,39909,11,0
2,3501964461,Sözcü Ekonomi,2015-08-31,,False,23393,13,0


In [12]:
# Immediate repeat of the same search: answered from the cache as a JSON string.
search_engine.search_by_username("Sözcü")

Retrieving 'Sözcü' from cache!
Query took 0.0002 seconds



'[{"user_id":"218078497","name":"S\\u00f6zc\\u00fc","twitter_join_date":1290297600000,"location":"\\u0130stanbul","verified":true,"followers_count":2398838,"friends_count":28,"favourites_count":0},{"user_id":"3142919739","name":"S\\u00f6zc\\u00fc D\\u00fcnya","twitter_join_date":1428364800000,"location":"\\u0130stanbul, T\\u00fcrkiye","verified":false,"followers_count":39909,"friends_count":11,"favourites_count":0},{"user_id":"3501964461","name":"S\\u00f6zc\\u00fc Ekonomi","twitter_join_date":1440979200000,"location":null,"verified":false,"followers_count":23393,"friends_count":13,"favourites_count":0}]'

In [13]:
# Keyword search using the start_time / end_time window defined in an earlier cell.
# Note that `keyword` holds the search RESULT here, not the search term.
keyword = search_engine.search_by_keyword("Modiji", start_time, end_time)
keyword

New entry, retrieving 'Modiji' from database!
Checkpoint saved!
Query took 0.0223 seconds



[]

In [14]:
# Hashtag search; the hashtag is given without the leading '#'.
search_engine.search_by_hashtag("corona")

New entry, retrieving '#corona' from database!
Checkpoint saved!
Query took 0.3340 seconds



[{'tweet_id': 1254052339301978112,
  'user': 304065893,
  'name': 'Paul Bocken (@🏚 indien mogelijk)',
  'date': 'Sat Apr 25 14:19:11 +0000 2020',
  'text': "Wie maakt een filmpje om ook alle BN'ers te bedanken?\n#SARSCoV2 #Covid19 #corona https://t.co/DKtz9eMU3n",
  'retweet': {'tweet_id': 1253626273395507200,
   'user_id': 18078366,
   'user_name': 'Zara-Blue Exotic 🐅',
   'quote_count': 2704,
   'reply_count': 816,
   'retweet_count': 14185,
   'favorite_count': 26702,
   'created_at': 'Fri Apr 24 10:06:09 +0000 2020',
   'media': {'hashtags': [], 'urls': [], 'mentions': []}},
  'quote': {'tweet_id': 1253626273395507200,
   'user_id': 18078366,
   'user_name': 'Zara-Blue Exotic 🐅',
   'quote_count': 2704,
   'reply_count': 816,
   'retweet_count': 14185,
   'favorite_count': 26702,
   'media': {'hashtags': [], 'urls': [], 'mentions': []}},
  'hashtags': 'SARSCoV2',
  'retweet_count': 14185,
  'reply_count': 816,
  'favorite_count': 26702,
  'quote_count': 2704,
  'engagement': 9233},

In [17]:
# Same username search again, run later in the session; in the recorded run the cache
# reported the entry as expired (the engine above was created with a 30-second TTL).
search_engine.search_by_username("Sözcü")

Retrieving 'Sözcü' from cache!
Query took 0.0006 seconds

Cache entry with key 'Sözcü' has expired.


In [15]:
# Show the entries currently stored in the cache.
search_engine.cache.get_items()

[('most_popular_users',
  ('[{"user_id":"813286","name":"Barack Obama","twitter_join_date":1173052800000,"location":"Washington, DC","verified":true,"followers_count":115603427,"friends_count":607612,"favourites_count":11},{"user_id":"18839785","name":"Narendra Modi","twitter_join_date":1231545600000,"location":"India","verified":true,"followers_count":55786179,"friends_count":2364,"favourites_count":0},{"user_id":"807095","name":"The New York Times","twitter_join_date":1172793600000,"location":"New York City","verified":true,"followers_count":46361159,"friends_count":904,"favourites_count":18483},{"user_id":"145125358","name":"Amitabh Bachchan","twitter_join_date":1274140800000,"location":"Mumbai, India","verified":true,"followers_count":41596464,"friends_count":1833,"favourites_count":75},{"user_id":"101311381","name":"Shah Rukh Khan","twitter_join_date":1262390400000,"location":null,"verified":true,"followers_count":40028019,"friends_count":77,"favourites_count":32},{"user_id":"4717