In [1]:
import time
import pymongo
import psycopg2
import pandas as pd
from cache import SearchCache

In [2]:
class SearchEngine:
    def __init__(self, cache_size=100, cache_ttl=3600):
        """
        Initializes a SearchEngine object with a specified database type and cache settings.

        Args:
        - cache_size (int): Maximum number of items to store in cache
        - cache_ttl (int): Time-to-live (in seconds) for cached items
        """
        # initialize a cache object for the search engine using the SearchCache class
        self.cache = SearchCache(cache_size, cache_ttl)
        self.db_client = pymongo.MongoClient('mongodb+srv://twitter_user:dbms@cluster0.wkyhu.mongodb.net/?retryWrites=true&w=majority')
        self.tweets_collection = self.db_client['twitter_db']['tweets_data']
        #self.db_conn = psycopg2.connect(database="postgres", user="postgres", password="kueen", host="localhost")
        #self.users_cursor = self.db_conn.cursor()
        #self.user_table = 'twitter_users_partitioned'
        self.pipeline = [
            {
                '$project': {
                    '_id': 0,
                    'tweet_id': 1,
                    'user': 1,
                    'name': 1,
                    'date': 1,
                    'text': 1,
                    'retweet': {
                        '$cond': {
                            'if': { '$eq': ['$is_retweet', True] },
                            'then': '$retweet',
                            'else': None
                        }
                    },
                    'quote': {
                        '$cond': {
                            'if': { '$eq': ['$is_quote', True] },
                            'then': '$quote',
                            'else': None
                        }
                    },
                    'retweet_count': {
                        '$sum': {
                            '$cond': [
                                { '$eq': ['$quote', None] },
                                {
                                    '$cond': [
                                        { '$eq': ['$retweet', None] },
                                        '$retweet_count',
                                        '$retweet.retweet_count'
                                    ]
                                },
                                '$quote.retweet_count'
                            ]
                        }
                    },
                    'reply_count': {
                        '$sum': {
                            '$cond': [
                                { '$eq': ['$quote', None] },
                                {
                                    '$cond': [
                                        { '$eq': ['$retweet', None] },
                                        '$reply_count',
                                        '$retweet.reply_count'
                                    ]
                                },
                                '$quote.reply_count'
                            ]
                        }
                    },
                    'favorite_count': {
                        '$sum': {
                            '$cond': [
                                { '$eq': ['$quote', None] },
                                {
                                    '$cond': [
                                        { '$eq': ['$retweet', None] },
                                        '$favorite_count',
                                        '$retweet.favorite_count'
                                    ]
                                },
                                '$quote.favorite_count'
                            ]
                        }
                    },
                    'quote_count': {
                        '$sum': {
                            '$cond': [
                                { '$eq': ['$quote', None] },
                                {
                                    '$cond': [
                                        { '$eq': ['$retweet', None] },
                                        '$quote_count',
                                        '$retweet.quote_count'
                                    ]
                                },
                                '$quote.quote_count'
                            ]
                        }
                    },
                }
            },
            {
                '$addFields': {
                    'engagement': {
                        '$add': [
                            {'$multiply': ['$retweet_count', 0.2]},
                            {'$multiply': ['$favorite_count', 0.2]},
                            {'$multiply': ['$reply_count', 0.3]},
                            {'$multiply': ['$quote_count', 0.3]}
                        ]
                    }
                }
            },
            {
                '$sort': {
                    'engagement': pymongo.DESCENDING,
                    'date': pymongo.DESCENDING
                }
            }
        ]

    
    def most_popular_users(self, n=10):
        """
        Returns the n most popular Twitter users along with their tweets.

        Args:
        - n (int): Number of users to return.

        Returns:
        - list: List of the top n Twitter users, each represented as a dictionary with a 'username' key and a 'tweets' key.
        """
        start_time = time.time()
        
        if 'most_popular_users' in self.cache:
            print("Retrieving 'most popular users' from cache!")
            return self.cache['most_popular_users']
        else:
            print(f"New entry, retrieving 'most popular users' from database!")

        query = f"""
            SELECT * FROM 
                (SELECT distinct user_id, name, twitter_join_date, location, 
            verified, followers_count, friends_count, favourites_count,
                    dense_rank () over (partition by user_id order by followers_count desc) rnk 
                FROM 
                (SELECT * FROM {self.user_table} order by followers_count desc) A
                ) B where rnk = 1 
                order by followers_count desc limit {n}"""
        
        self.users_cursor.execute(query)
        results = self.users_cursor.fetchall()

        users = []
        for row in results:
            user = {
                'user_id': row[0],
                'name': row[1],
                'twitter_join_date': row[2],
                'location': row[3],
                'verified': row[4],
                'followers_count': row[5],
                'friends_count': row[6],
                'favourites_count': row[7],
            }
            users.append(user)

        users = pd.DataFrame(users)
        self.cache['most_popular_users'] = users
        
        end_time = time.time()
        print(f"Query took {end_time - start_time:.4f} seconds\n")
        
        return users
    
    
    def most_engaging_tweets(self, n=10):
        """
        Returns the most engaging n tweets in the database, where engagement is defined as the sum of retweet
        count, reply count, quote count, and favorite count.

        Args:
        - n (int): Number of tweets to return.

        Returns:
        - list: List of the top n tweets, each represented as a dictionary.
        """
        start_time = time.time()
        
        if 'most_engaging_tweets' in self.cache:
            print("Retrieving 'most engaging tweets' from cache!")
            return self.cache['most_engaging_tweets']
        else:
            print(f"New entry, retrieving 'most engaging tweets' from database!")

        tweets = list(self.tweets_collection.aggregate(self.pipeline + [{'$limit': n}]))
        self.cache['most_engaging_tweets'] = tweets
        
        end_time = time.time()
        print(f"Query took {end_time - start_time:.4f} seconds\n")
        
        return tweets

    
    def most_popular_hashtags(self, n=10):
        """
        Returns the n most popular hashtags in the database.

        Args:
        - n (int): Number of hashtags to return.

        Returns:
        - list: List of the top n hashtags, each represented as a dictionary.
        """
        start_time = time.time()
        
        if 'most_popular_hashtags' in self.cache:
            print("Retrieving 'most popular hashtags' from cache!")
            return self.cache['most_popular_hashtags']
        else:
            print(f"New entry, retrieving 'most popular hashtags' from database!")

        pipeline = [
            {
                '$match': {
                    '$or': [
                        {'media.hashtags': {'$exists': True}},
                        {'retweet.media.hashtags': {'$exists': True}},
                        {'quote.media.hashtags': {'$exists': True}},
                    ]
                }
            },
            {
                '$project': {
                    '_id': 0,
                    'hashtags': {
                        '$concatArrays': [
                            {'$ifNull': ['$media.hashtags', []]},
                            {'$ifNull': ['$retweet.media.hashtags', []]},
                            {'$ifNull': ['$quote.media.hashtags', []]},
                        ]
                    }
                }
            },
            {
                '$unwind': '$hashtags'
            },
            {
                '$group': {
                    '_id': '$hashtags',
                    'count': {'$sum': 1}
                }
            },
            {
                '$sort': {
                    'count': pymongo.DESCENDING,
                    'date': pymongo.DESCENDING
                }
            },
            {
                '$limit': n
            }
        ]

        hashtags = list(self.tweets_collection.aggregate(pipeline))
        hashtags = [{x["_id"]: x["count"]} for x in hashtags]
        self.cache['most_popular_hashtags'] = hashtags
        
        end_time = time.time()
        print(f"Query took {end_time - start_time:.4f} seconds\n")
        
        return hashtags
    
        
    def search_by_username(self, username, n=10):
        """
        Returns the top n users in the database matching the given username.

        Args:
        - username (str): The username to search for.
        - n (int): Number of users to return.

        Returns:
        - list: List of the top n users, each represented as a dictionary.
        """
        start_time = time.time()
        
        if username in self.cache:
            print(f"Retrieving '{username}' from cache!")
            return self.cache[username]
        else:
            print(f"New entry, retrieving '{username}' from database!")

        query = f"""
            SELECT user_id, name, twitter_join_date, location, 
            verified, followers_count, friends_count, favourites_count
            FROM {self.user_table}
            WHERE name LIKE '%{username}%'
            AND (name, date, followers_count) IN (
                SELECT name, MAX(date), MAX(followers_count)
                FROM {self.user_table}
                WHERE name LIKE '%{username}%'
                GROUP BY name, user_id
            )
            ORDER BY followers_count DESC, verified DESC
            LIMIT {n}
        """

        self.users_cursor.execute(query)
        results = self.users_cursor.fetchall()

        users = []
        for row in results:
            user = {
                'user_id': row[0],
                'name': row[1],
                'twitter_join_date': row[2],
                'location': row[3],
                'verified': row[4],
                'followers_count': row[5],
                'friends_count': row[6],
                'favourites_count': row[7],
            }
            users.append(user)

        users = pd.DataFrame(users)
        self.cache[username] = users
        
        end_time = time.time()
        print(f"Query took {end_time - start_time:.4f} seconds\n")
        
        return users
    
    
    def search_by_keyword(self, keyword, n=10):
        """
        Returns the top n tweets in the database that contain the given word.

        Args:
        - keyword (str): The word to search for.
        - n (int): Number of tweets to return.

        Returns:
        - list: List of the top n tweets, each represented as a dictionary.
        """
        start_time = time.time()
        
        if keyword in self.cache:
            print(f"Retrieving '{keyword}' from cache!")
            return self.cache[keyword]
        else:
            print(f"New entry, retrieving '{keyword}' from database!")

        pipeline = [
            {
                '$match': {
                    '$text': {'$search': keyword}
                }
            },
            {
                '$limit': n
            }
        ]
            
        tweets = list(self.tweets_collection.aggregate(pipeline + self.pipeline))
        self.cache[keyword] = tweets
        
        end_time = time.time()
        print(f"Query took {end_time - start_time:.4f} seconds\n")
        
        return tweets
    
    
    def search_by_hashtag(self, hashtag, n=10):
        """
        Returns the top n tweets in the database that contain the given hashtag.

        Args:
        - hashtag (str): The hashtag to search for.
        - n (int): Number of tweets to return.

        Returns:
        - list: List of the top n tweets, each represented as a dictionary.
        """
        
        start_time = time.time()
        
        if hashtag in self.cache:
            print(f"Retrieving '{hashtag}' from cache!")
            return self.cache[hashtag]
        else:
            print(f"New entry, retrieving '{hashtag}' from database!")

        pipeline = [
            {
                '$match': {
                    '$or': [
                        {'media.hashtags': {'$in': [hashtag]}},
                        {'retweet.media.hashtags': {'$in': [hashtag]}},
                        {'quote.media.hashtags': {'$in': [hashtag]}},
                    ]
                }
            },
            {
                '$project': {
                    '_id': 0,
                    'tweet_id': 1,
                    'user': 1,
                    'name': 1,
                    'date': 1,
                    'text': 1,
                    'retweet': {
                        '$cond': {
                            'if': { '$eq': ['$is_retweet', True] },
                            'then': '$retweet',
                            'else': None
                        }
                    },
                    'quote': {
                        '$cond': {
                            'if': { '$eq': ['$is_quote', True] },
                            'then': '$quote',
                            'else': None
                        }
                    },
                    'hashtags': '$media.hashtags',
                    'retweet_count': {
                        '$sum': {
                            '$cond': [
                                { '$eq': ['$retweet', None] },
                                '$retweet_count',
                                '$retweet.retweet_count'
                            ]
                        }
                    },
                    'reply_count': {
                        '$sum': {
                            '$cond': [
                                { '$eq': ['$quote', None] },
                                {
                                    '$cond': [
                                        { '$eq': ['$retweet', None] },
                                        '$reply_count',
                                        '$retweet.reply_count'
                                    ]
                                },
                                '$quote.reply_count'
                            ]
                        }
                    },
                    'favorite_count': {
                        '$sum': {
                            '$cond': [
                                { '$eq': ['$retweet', None] },
                                '$favorite_count',
                                '$retweet.favorite_count'
                            ]
                        }
                    },
                    'quote_count': {
                        '$max': {
                            '$cond': [
                                { '$eq': ['$quote', None] },
                                '$quote_count',
                                '$quote.quote_count'
                            ]
                        }
                    },
                },
            },
            {
                '$unwind': '$hashtags'
            },
            {
                '$addFields': {
                    'engagement': {
                        '$toInt': {
                            '$add': [
                                {'$multiply': ['$retweet_count', 0.2]},
                                {'$multiply': ['$favorite_count', 0.2]},
                                {'$multiply': ['$reply_count', 0.3]},
                                {'$multiply': ['$quote_count', 0.3]}
                            ]
                        }
                    },
                }
            },

            {
                '$sort': {
                    'engagement': pymongo.DESCENDING,
                    'date': pymongo.DESCENDING
                }
            },
            {
                '$limit': n
            }
        ]

        tweets = list(self.tweets_collection.aggregate(pipeline))
        self.cache[hashtag] = tweets
        
        end_time = time.time()
        print(f"Query took {end_time - start_time:.4f} seconds\n")
        
        return tweets
    

## Testing

In [3]:
# create a SearchEngine object with cache size of 50 and cache TTL of 30 seconds
search_engine = SearchEngine(cache_size=50, cache_ttl=30)

In [4]:
search_engine.most_popular_users()

New entry, retrieving 'most popular users' from database!


AttributeError: 'SearchEngine' object has no attribute 'user_table'

In [5]:
search_engine.most_engaging_tweets()

New entry, retrieving 'most engaging tweets' from database!
Query took 1.2672 seconds



[{'tweet_id': 1254051230822944770,
  'user': 1039346340449452033,
  'name': 'Grace',
  'date': 'Sat Apr 25 14:14:47 +0000 2020',
  'text': 'But Joe, what if I WANT to drink bleach? What if I wanted to do that even before the orange man said to inject Lysol into our veins to stop corona? What if?',
  'retweet': None,
  'quote': {'tweet_id': 1253751812194070529,
   'user_id': 939091,
   'user_name': 'Joe Biden',
   'quote_count': 32237,
   'reply_count': 46159,
   'retweet_count': 263475,
   'favorite_count': 1280593,
   'media': {'hashtags': [], 'urls': [], 'mentions': []}},
  'retweet_count': 263475,
  'reply_count': 46159,
  'favorite_count': 1280593,
  'quote_count': 32237,
  'engagement': 332332.4},
 {'tweet_id': 1254044290344521728,
  'user': 863214058845200384,
  'name': 'Control Roboto 🤖 💚 Cine y Series',
  'date': 'Sat Apr 25 13:47:12 +0000 2020',
  'text': 'Esto es terrible, si no los mata el corona, se matan entre ellos con rifles o tomando lavandina.\nDe pedo siguen vivos.',


In [6]:
search_engine.most_popular_hashtags()

New entry, retrieving 'most popular hashtags' from database!
Query took 0.5838 seconds



[{'Corona': 9800},
 {'Mattarella': 3406},
 {'25Aprile': 3371},
 {'corona': 3263},
 {'AltaredellaPatria': 1829},
 {'COVID19': 1722},
 {'PideAlmayaDiyeÇıkıp': 1599},
 {'Liberazione': 1573},
 {'Covid_19': 1545},
 {'coronavirus': 1160}]

In [7]:
search_engine.search_by_username("Sözcü")

New entry, retrieving 'Sözcü' from database!


AttributeError: 'SearchEngine' object has no attribute 'user_table'

In [8]:
search_engine.search_by_keyword("deceiving")

New entry, retrieving 'deceiving' from database!
Query took 0.0425 seconds



[{'tweet_id': 1249403771781816323,
  'user': 2950846039,
  'name': 'Rizzu',
  'date': 'Sun Apr 12 18:27:26 +0000 2020',
  'text': "@ArvindKejriwal This guy is all praising Modiji, awesome Mr.Kejriwal for taking a U turn and deceiving people's mandate...\n\nWe should have started all measures in early February itself where our neighbor China was on a Corona rampage..",
  'retweet': None,
  'quote': None,
  'retweet_count': 0,
  'reply_count': 0,
  'favorite_count': 0,
  'quote_count': 0,
  'engagement': 0.0},
 {'tweet_id': 1254027086697725954,
  'user': 76485304,
  'name': 'gideon_lfc 🇬🇭',
  'date': 'Sat Apr 25 12:38:50 +0000 2020',
  'text': "@bossoklive @chubiei Don't let corona be deceiving you",
  'retweet': None,
  'quote': None,
  'retweet_count': 0,
  'reply_count': 0,
  'favorite_count': 0,
  'quote_count': 0,
  'engagement': 0.0},
 {'tweet_id': 1254023888671068161,
  'user': 1049457667348029441,
  'name': 'Ladyblahblah',
  'date': 'Sat Apr 25 12:26:08 +0000 2020',
  'text': '@B

In [9]:
search_engine.search_by_hashtag("corona")

New entry, retrieving 'corona' from database!
Query took 0.2966 seconds



[{'tweet_id': 1254052339301978112,
  'user': 304065893,
  'name': 'Paul Bocken (@🏚 indien mogelijk)',
  'date': 'Sat Apr 25 14:19:11 +0000 2020',
  'text': "Wie maakt een filmpje om ook alle BN'ers te bedanken?\n#SARSCoV2 #Covid19 #corona https://t.co/DKtz9eMU3n",
  'retweet': {'tweet_id': 1253626273395507200,
   'user_id': 18078366,
   'user_name': 'Zara-Blue Exotic 🐅',
   'quote_count': 2704,
   'reply_count': 816,
   'retweet_count': 14185,
   'favorite_count': 26702,
   'created_at': 'Fri Apr 24 10:06:09 +0000 2020',
   'media': {'hashtags': [], 'urls': [], 'mentions': []}},
  'quote': {'tweet_id': 1253626273395507200,
   'user_id': 18078366,
   'user_name': 'Zara-Blue Exotic 🐅',
   'quote_count': 2704,
   'reply_count': 816,
   'retweet_count': 14185,
   'favorite_count': 26702,
   'media': {'hashtags': [], 'urls': [], 'mentions': []}},
  'hashtags': 'SARSCoV2',
  'retweet_count': 14185,
  'reply_count': 816,
  'favorite_count': 26702,
  'quote_count': 2704,
  'engagement': 9233},

In [10]:
search_engine.search_by_username("Sözcü")

New entry, retrieving 'Sözcü' from database!


AttributeError: 'SearchEngine' object has no attribute 'user_table'