# Queries for the search application

In [1]:
import pandas as pd
import json
import pymongo
import io
import pprint
import psycopg2

## PostgreSQL

In [24]:
#connect to postgreSQL database
conn = psycopg2.connect(host="localhost", port = 5432, database="twitter", user="postgres", password="postgres@329")
cur = conn.cursor()

### About user data

In [12]:
#Row sample from the database 
cur.execute("select * from user_df limit 5")
user = pd.DataFrame(cur.fetchall())
user.columns = [desc[0] for desc in cur.description]
user

Unnamed: 0,user_id,user_id_str,name,screen_name,location,description,protected,followers_count,friends_count,listed_count,created_at,favourites_count,verified,statuses_count
0,498137972,498137972,Diana Dumitru,anaiduza,,Dubito ergo...,False,396,2099,225,Mon Feb 20 18:40:31 +0000 2012,4259,False,80987
1,65466158,65466158,Pedro da Costa,pdacosta,"Washington, DC",Federal Reserve & economy watcher at Market Ne...,False,136283,28095,4398,Thu Aug 13 20:59:18 +0000 2009,82454,True,239352
2,1399854920,1399854920,Kobe,BryantBWild,The Wolfpack,"Fall In Order To Grow, Lose In Order To Gain. ...",False,886,635,1,Fri May 03 14:16:58 +0000 2013,11986,False,42034
3,940032448057896960,940032448057896960,JR Thatcher 📈,JrThatcher,"West TX, USA",Graduate Student | B.S. Economics | Finance | ...,False,1145,535,13,Mon Dec 11 01:36:15 +0000 2017,178003,False,97937
4,808545,808545,Bruce Reyes-Chow 🗽,breyeschow,"San Francisco, CA",he/him/his\n#WarrenDemocrat\n@fpcpaloalto @brc...,False,10005,1748,562,Sat Mar 03 15:46:53 +0000 2007,25727,False,60731


In [25]:
## Types of user accounts (Verified vs non-verified)
#Row sample from the database 
cur.execute("SELECT count(CASE WHEN verified THEN 1 END) FROM user_df;")
verified = cur.fetchall()
cur.execute("SELECT count(CASE WHEN not verified THEN 0 END) FROM user_df;")
non_verified = cur.fetchall()

"Number of verified user accounts:" + str(verified[0][0]) + " and non_verified account:" + str(non_verified[0][0])

'Number of verified user accounts:935 and non_verified account:15863'

### Queries based on only user data

In [18]:
# Top N users and their location based on
# Status_count
# Favourites_count
# Followers_count

cur.execute("select user_id, name, location, statuses_count from user_df order by statuses_count desc limit 5")
top_n_statuses_count = pd.DataFrame(cur.fetchall())
top_n_statuses_count.columns = [desc[0] for desc in cur.description]
top_n_statuses_count

cur.execute("select user_id, name, location, favourites_count from user_df order by favourites_count desc limit 5")
top_n_favourites_count = pd.DataFrame(cur.fetchall())
top_n_favourites_count.columns = [desc[0] for desc in cur.description]
top_n_favourites_count

cur.execute("select user_id, name, location, followers_count from user_df order by followers_count desc limit 5;")
top_n_followers_count = pd.DataFrame(cur.fetchall())
top_n_followers_count.columns = [desc[0] for desc in cur.description]
top_n_followers_count

Unnamed: 0,user_id,name,location,followers_count
0,807095,The New York Times,New York City,46425645
1,1652541,Reuters,Around the world,21866712
2,1652541,Reuters,Around the world,21866711
3,1652541,Reuters,Around the world,21866710
4,1652541,Reuters,Around the world,21866710


In [23]:
#Close the cursor and connection to the database
cur.close()
conn.close()

## MongoDB

In [27]:
#Connect to the mongoDB database
client = pymongo.MongoClient()
db = client["tweet_database"]
tweets = db.tweets_collection

In [28]:
#Document(row) sample from the mongoDB database 
myquery = {}
mydoc = list(tweets.find(myquery))
mydoc[0]

{'_id': ObjectId('5ea9c38017dfdac82a918317'),
 'id': 1255560342383603721,
 'user_name': 'Lean Consultancy',
 'user_id': '990868256360751104',
 'content': '@Growth_Lean &gt;&gt; U.S. pending home sales fall sharply in March https://t.co/8Lud9zsELG #lean https://t.co/k6PzFSfL1S',
 'created_at': '2020-04-29 18:11:27',
 'location': 'Europe',
 'hashtags': ['lean'],
 'mentions': ['990868256360751104'],
 'in_reply_to_user_id': 990868256360751104,
 'in_reply_to_status_id': None,
 'retweedt_orid_tweetID': -1,
 'retweetedFrom_id': -1,
 'retweetedFrom_name': 'NA',
 'media': ['https://t.co/k6PzFSfL1S'],
 'retweet': False,
 'FavCount': 0,
 'Orig_retweet_fav': -1,
 'source': 'Twibble.io',
 'retweet_count': 0,
 'lang': 'en'}

### About data

In [9]:
## Types of tweets
count_retweet = tweets.count_documents({"retweet":True})
count_original = tweets.count_documents({"retweet":False})
"Number of original tweet:" + str(count_original) + " and number of retweets:" + str(count_retweet)

'Number of original tweet:3306 and number of retweets:12123'

In [10]:
## Type of media in our dataset
type_count = {"text": 0, "Only_image":0, "both":0}
for tweet in tweets.find():
    md = tweet["media"]
    if len(md) != 0:
        if tweet["content"] == "":
            type_count["image"] += 1 # Only Image
        else:
            type_count["both"] += 1 # Image and text
    else:
        type_count["text"] += 1 # Only text

type_count

{'text': 14738, 'Only_image': 0, 'both': 691}

In [15]:
## Number of distint user in database
distinct_users = set()

for tweet in tweets.find():
    distinct_users.add(tweet['user_id'])

print("{} distinct user out of {} total user".format(len(distinct_users), tweets.count()))

14040 distinct user out of 15429 total user


  import sys


In [16]:
## Tweet with  maximum retweet count
max_retweet_count = 0
for tweet in tweets.find():
    if tweet['retweet_count']>max_retweet_count:
        max_retweet_count = tweet['retweet_count']
        tweet_id = tweet['id_str']
print(max_retweet_count)

myquery = {'id_str': tweet_id}
twts = list(tweets.find(myquery))
twts[0]

12676


{'_id': ObjectId('5ea77dc7c9283bc56a270ebc'),
 'id_str': '1254112304754556929',
 'user_name': 'J',
 'user_id': '785827332',
 'content': "3 companies with ties to the Trump admin. received millions under the PPP program. Another got a loan from a bank that once employed its board's chair. https://t.co/GcgqwGPe43",
 'created_at': '2020-04-25 18:17:28',
 'location': 'The World',
 'hashtags': [],
 'mentions': ['14173315'],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': '14173315',
 'media': [],
 'retweet': True,
 'FavCount': -1,
 'source': 'Twitter for iPhone',
 'retweet_count': 12676}

In [17]:
## Tweet with  maximum favorite count
max_FavCount = 0
for tweet in tweets.find():
    if tweet['FavCount']>max_FavCount:
        max_FavCount = tweet['FavCount']
        tweet_id = tweet['id_str']
print(max_FavCount)

myquery = {'id_str': tweet_id}
twts = list(tweets.find(myquery))
twts[0]

4708


{'_id': ObjectId('5ea77daec9283bc56a2709fb'),
 'id_str': '1254168393638449153',
 'user_name': 'POLITICO',
 'user_id': '9300262',
 'content': 'Dr. Anthony Fauci says the U.S. should at least double coronavirus testing in the coming weeks before easing into reopening the economy https://t.co/9LrRsvl1qX',
 'created_at': '2020-04-25 22:00:21',
 'location': 'Washington, D.C.',
 'hashtags': [],
 'mentions': [],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': -1,
 'media': [],
 'retweet': False,
 'FavCount': 4708,
 'source': 'Hootsuite Inc.',
 'retweet_count': 1752}

### Search by word

In [4]:
#Find number of tweets with word
myquery = {"content":{"$regex":"covid19","$options" :'i'}}
tweets.count_documents(myquery)

413

In [5]:
#Tweets
twts = list(tweets.find(myquery))
twts[0]

{'_id': ObjectId('5ea66a14fb8f5ff7fd4712a9'),
 'id_str': '1254636770051928071',
 'user_name': 'Pamela Moore #WWG1WGA #DIGITALSOLDIERS',
 'user_id': '1188695886781698050',
 'content': 'Met virtually with our terrific @AmChamPH Board on how U.S. firms are supporting the Philippine economy, workforce, and #COVID19 response. We’re all in this together, and great to \u200bsee U.S. firms contributing in so many ways. #FriendsPartnersAllies #goodbUSinessPH',
 'created_at': '2020-04-27 05:01:30',
 'location': 'Broken Arrow, OK',
 'hashtags': [],
 'mentions': ['441540997', '4745918652'],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': '441540997',
 'media': [],
 'retweet': True,
 'FavCount': -1,
 'source': 'Twitter for Android',
 'retweet_count': 118}

### Search by a hashtag

In [8]:
# #Find number of tweets with #covid19 (Exact string put the specified string between the ^(Starts with) and $(Ends with))
# myquery = {"hashtags":{"$regex":"^covid19$","$options" :'i'}}
#Search using a hashtag.
myquery = {"content":{"$regex":"#covid19","$options" :'i'}}
twts = list(tweets.find(myquery))
twts[0]

{'_id': ObjectId('5ea66a14fb8f5ff7fd4712a9'),
 'id_str': '1254636770051928071',
 'user_name': 'Pamela Moore #WWG1WGA #DIGITALSOLDIERS',
 'user_id': '1188695886781698050',
 'content': 'Met virtually with our terrific @AmChamPH Board on how U.S. firms are supporting the Philippine economy, workforce, and #COVID19 response. We’re all in this together, and great to \u200bsee U.S. firms contributing in so many ways. #FriendsPartnersAllies #goodbUSinessPH',
 'created_at': '2020-04-27 05:01:30',
 'location': 'Broken Arrow, OK',
 'hashtags': [],
 'mentions': ['441540997', '4745918652'],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': '441540997',
 'media': [],
 'retweet': True,
 'FavCount': -1,
 'source': 'Twitter for Android',
 'retweet_count': 118}

In [9]:
len(twts)

397

### Search by time range

In [31]:
# Search by Time range
twts = tweets.find({"created_at":{ "$gte": "2020-04-27 05:13:10", "$lt": "2020-04-30 05:13:50" }})
twts = list(twts)
twts[0]

{'_id': ObjectId('5ea9c38017dfdac82a918317'),
 'id': 1255560342383603721,
 'user_name': 'Lean Consultancy',
 'user_id': '990868256360751104',
 'content': '@Growth_Lean &gt;&gt; U.S. pending home sales fall sharply in March https://t.co/8Lud9zsELG #lean https://t.co/k6PzFSfL1S',
 'created_at': '2020-04-29 18:11:27',
 'location': 'Europe',
 'hashtags': ['lean'],
 'mentions': ['990868256360751104'],
 'in_reply_to_user_id': 990868256360751104,
 'in_reply_to_status_id': None,
 'retweedt_orid_tweetID': -1,
 'retweetedFrom_id': -1,
 'retweetedFrom_name': 'NA',
 'media': ['https://t.co/k6PzFSfL1S'],
 'retweet': False,
 'FavCount': 0,
 'Orig_retweet_fav': -1,
 'source': 'Twibble.io',
 'retweet_count': 0,
 'lang': 'en'}

In [32]:
len(twts)

16898

### Search by user name

In [12]:
# Search by user_name
twts = tweets.find({"user_name":"Howard"})
twts = list(twts)
twts[0]

{'_id': ObjectId('5ea66a14fb8f5ff7fd471282'),
 'id_str': '1254639809068519424',
 'user_name': 'Howard',
 'user_id': '2337724718',
 'content': 'U.S.: bailout small businesses by giving grants to 5% of them\n\nJapan: bailout small businesses by giving 100% of them all the payroll money they need\nhttps://t.co/10BjEDjjn3',
 'created_at': '2020-04-27 05:13:35',
 'location': 'New York, NY',
 'hashtags': [],
 'mentions': ['2172596028'],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': '2172596028',
 'media': [],
 'retweet': True,
 'FavCount': -1,
 'source': 'Twitter for iPhone',
 'retweet_count': 240}

In [13]:
len(twts)

1

### Search by user_id

In [14]:
# Search by user id
twts = tweets.find({"user_id":"123628682"})
twts = list(twts)
twts[0]

{'_id': ObjectId('5ea66a14fb8f5ff7fd471280'),
 'id_str': '1254639872641576960',
 'user_name': 'Xcntrik',
 'user_id': '123628682',
 'content': 'Many U.S. businesses unlikely to seek government aid: NABE survey https://t.co/9bPT4YQN5I https://t.co/0B3ekpqymK',
 'created_at': '2020-04-27 05:13:50',
 'location': 'Southeast Texas',
 'hashtags': [],
 'mentions': ['1652541'],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': '1652541',
 'media': ['https://t.co/0B3ekpqymK'],
 'retweet': True,
 'FavCount': -1,
 'source': 'Twitter for Android',
 'retweet_count': 7}

In [15]:
len(twts)

2

### Search by two words

In [16]:
#search by two words
twts = tweets.find( {
    "$and" : [
        {"content":{"$regex":"covid19","$options" :'i'}} , {"content":{"$regex":"#trump","$options" :'i'}}
    ]
} )
twts = list(twts)
twts[0]

{'_id': ObjectId('5ea66a1ffb8f5ff7fd47154e'),
 'id_str': '1254579387464212487',
 'user_name': 'J Cassidy',
 'user_id': '718527880284610561',
 'content': "@realDonaldTrump Read the Constitution. You're embarrassing yourself again... #Trump #coronavirus #COVID19  https://t.co/bCRHRusB3I",
 'created_at': '2020-04-27 01:13:29',
 'location': '',
 'hashtags': ['Trump', 'coronavirus', 'COVID19'],
 'mentions': ['123281100', '25073877'],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': '123281100',
 'media': [],
 'retweet': True,
 'FavCount': -1,
 'source': 'Twitter Web App',
 'retweet_count': 826}

In [17]:
len(twts)

9

### Top five hashtags used

In [18]:
hashtag_count = {}
for tweet in tweets.find({"hashtags":{"$ne":[]}},{"hashtags":1, "_id":0}):
    ht = tweet["hashtags"]
    for hash in ht:
        hash = hash.lower()
        if hash not in hashtag_count:
            hashtag_count[hash] = 1
        else:
            hashtag_count[hash] += 1
hashtags = pd.DataFrame.from_dict(hashtag_count, orient='index')
hashtags = hashtags.rename(columns={0:"count"})
hashtags = hashtags.sort_values("count", ascending=False)
hashtags.head(5)

Unnamed: 0,count
covid19,222
trump2020,213
2a41a,209
veterans,209
alwayssupportourveterans,209


In [None]:
client.close()

## Queries based on MongoDB and postgreSQL

In [14]:
#Connect to postgreSQL
conn = psycopg2.connect(host="localhost", port = 5432, database="twitter", user="postgres", password="postgres@329")
cur = conn.cursor()

In [3]:
#Connect to MongoDB
client = pymongo.MongoClient()
db = client["tweet_database"]
tweets = db.tweets_collection

### Original user of maximum retweeted tweet

In [4]:
#Original user of maximum retweeted tweet
twts = list(tweets.find({"retweet": {"$ne": False}},{"user_id":1, "user_name":1, "_id":0}).sort("retweet_count",-1).limit(1))
twts

[{'user_name': 'The_War_Economy', 'user_id': '856978948965769220'}]

### What is the location of Maximum tweets.

In [5]:
#Location of maximum tweets
user_twt_count = pd.DataFrame(tweets.aggregate([
  {
    "$group": {
       "_id": "$user_id",
       "count": { "$sum": 1 }
    }
  }
]))
user_twt_count.columns = ["user_id","count"]
user_twt_count = user_twt_count.astype({'user_id': 'int64'})

In [6]:
cur.execute("SELECT user_id,location FROM user_df where location <> '';")
user_loc = pd.DataFrame(cur.fetchall())
user_loc.columns = [desc[0] for desc in cur.description]

In [7]:
loc_df = pd.merge(user_loc,user_twt_count,on="user_id")
loc_df = loc_df[["location","count"]].groupby("location").count()
loc_df.index[loc_df["count"].argmax()]

'United States'

### Tweet with given keyword, hashtag and threshold on number of followers.

In [21]:
cur.execute("SELECT user_id,followers_count FROM user_df where followers_count > 100 order by followers_count desc;")
users = pd.DataFrame(cur.fetchall())
users.columns = [desc[0] for desc in cur.description]

In [23]:
# Search by word and hastag
twts = pd.DataFrame(tweets.find( {
    "$and" : [
        {"content":{"$regex":"covid19","$options" :'i'}} , {"content":{"$regex":"#trump","$options" :'i'}}
    ]
}))
del twts["_id"]
twts = twts.astype({'user_id': 'int64'})

twts_df = pd.merge(users,twts,on="user_id")
twts_df.head()

Unnamed: 0,user_id,followers_count,id,user_name,content,created_at,location,hashtags,mentions,in_reply_to_user_id,...,retweedt_orid_tweetID,retweetedFrom_id,retweetedFrom_name,media,retweet,FavCount,Orig_retweet_fav,source,retweet_count,lang
0,3565028294,8056,1255491197415706624,Jose Enriquez,Much of U.S. economy still plugging along desp...,2020-04-29 13:36:42,,"[coronavirus, COVID19]","[3565028294, 25073877]",,...,1255480003871027200,3565028294,972_834,[],True,0,2,Twitter for Android,1,en
1,3565028294,8056,1255480003871027200,Jose Enriquez,Much of U.S. economy still plugging along desp...,2020-04-29 12:52:13,,"[coronavirus, COVID19, COVID19, MAGA, KAG2020,...",[25073877],,...,-1,-1,,[],False,2,-1,Twitter for Android,1,en
2,70132768,6314,1255327046705741826,Tinamarief49,"@IngrahamAngle so let me get this straight, al...",2020-04-29 02:44:25,,"[COVID19, TrumpLiesAmericansDie, TrumpLiesPeop...",[50769180],50769180.0,...,-1,-1,,[],False,0,-1,Twitter Web App,0,en
3,1076344051614531584,2778,1255313707418169345,WQ2627 🌟🌟🌟,"All U.S. Corporations Including Ford, Delta, A...",2020-04-29 01:51:25,Anonville-Potus45 Pompeo Barr,"[QAnon, COVID19, Economy, WWG1WGA, Trump2020]",[],,...,-1,-1,,[],False,0,-1,Twitter Web App,0,en
4,92492429,2068,1255156351619260417,JusticeMatters!,@WhiteHouse @realDonaldTrump President @Barack...,2020-04-28 15:26:08,Under God's Grace,"[COVIDー19, Trump, COVID19, coronavirus, pandemic]","[822215673812119553, 25073877, 813286]",8.222157e+17,...,-1,-1,,[https://t.co/0jGSF6ksOy],False,0,-1,Twitter for Android,0,en


In [24]:
### Tweet with given keyword, hashtag and a location.

In [31]:
# Search by word and hastag
twts = pd.DataFrame(tweets.find( {
    "$and" : [
        {"content":{"$regex":"covid19","$options" :'i'}} ,
        {"content":{"$regex":"","$options" :'i'}}, 
        {"location":{"$regex":"^New York", "$options" : "i"}}
    ]
}))
del twts["_id"]
twts.head()

Unnamed: 0,id,user_name,user_id,content,created_at,location,hashtags,mentions,in_reply_to_user_id,in_reply_to_status_id,retweedt_orid_tweetID,retweetedFrom_id,retweetedFrom_name,media,retweet,FavCount,Orig_retweet_fav,source,retweet_count,lang
0,1255258012320743424,Agolo,961384868,The US goods trade deficit widened in March am...,2020-04-28 22:10:06,"New York, NY","[pandemic, covid19]",[],,,-1,-1,,[],False,0,-1,Hootsuite Inc.,0,en
1,1255027876086403072,Great White Sharkℹ,45536016,"#COVID19Nigeria #Plandemic as done 10, + 10 y...",2020-04-28 06:55:37,"New York, USA","[COVID19Nigeria, Plandemic]",[2936714848],,,-1,-1,,[],False,1,-1,Twitter for Android,0,en
2,1254919002842730496,Wonder Media Network,1021752303978725377,Gender equity will play a critical role in hel...,2020-04-27 23:43:00,"New York, NY",[COVID19],[557261881],,,-1,-1,,[],False,3,-1,TweetDeck,0,en
3,1254886223887376385,Christopher Johnson,1517716502,Met virtually with our terrific @AmChamPH Boar...,2020-04-27 21:32:45,New York City,[],"[441540997, 4745918652]",,,1253671313467678726,441540997,USAmbManila,[],True,0,679,Twitter for iPhone,138,en


In [13]:
#Close the cursor and connection to the database
cur.close()
conn.close()

In [42]:
client.close()

## Misc

In [21]:
hashtag_count = {}
for tweet in tweets.find():
    ht = tweet["hashtags"]
    for hash in ht:
        hash = hash.lower()
        if hash not in hashtag_count:
            hashtag_count[hash] = 1
        else:
            hashtag_count[hash] += 1

hc = sorted(hashtag_count, key=hashtag_count.get, reverse=True)  # Sorting the top 10.
count = 0

for z in hc:
    if count == 10:
        break
    jsonx = {"hashtag": z, "count": str(hashtag_count[z])}
    count += 1
# {k: v for k, v in sorted(hashtag_count.items(), key=lambda item: item[1])}

## Experimenting with text index

In [12]:
#Connect to the mongoDB database
client = pymongo.MongoClient()
db1 = client["TWT_DB"]
test = db1["test"]

In [29]:
#Create a test index
test.create_index([("content",pymongo.TEXT)])

'content_text'

In [30]:
twts1 = test.find({"$text": {"$search": "\"#covid19\"" } } )
twts1 = list(twts1)
twts1[0]

{'_id': ObjectId('5ea66afbfb8f5ff7fd474756'),
 'id_str': '1253752842361483265',
 'user_name': 'Dr. Marcell Vollmer #SocialDistancing #StayHome',
 'user_id': '99674560',
 'content': 'The Great #Lockdown of the economy has been completely unprecedented, both in terms of the speed of the shutdown and its impact on jobs.\n\n #coronavirus #COVID19 #health #COVIDー19 #worklife #motivation #FutureofWork #Leadership #WorkLifeBalance #USA\n\n https://t.co/yoEHiyJcxc',
 'created_at': '2020-04-24 18:29:05',
 'location': 'Munich, Bavaria',
 'hashtags': ['Lockdown',
  'coronavirus',
  'COVID19',
  'health',
  'COVIDー19',
  'worklife',
  'motivation',
  'FutureofWork',
  'Leadership',
  'WorkLifeBalance',
  'USA'],
 'mentions': [],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': -1,
 'media': [],
 'retweet': False,
 'FavCount': 6,
 'source': 'Twitter for iPad',
 'retweet_count': 4}

In [31]:
len(twts1)

396

In [58]:
twts1 = test.find({"$text": {"$search": "covid19"} } )
twts1 = list(twts1)
twts1[0]

{'_id': ObjectId('5ea66afbfb8f5ff7fd474756'),
 'id_str': '1253752842361483265',
 'user_name': 'Dr. Marcell Vollmer #SocialDistancing #StayHome',
 'user_id': '99674560',
 'content': 'The Great #Lockdown of the economy has been completely unprecedented, both in terms of the speed of the shutdown and its impact on jobs.\n\n #coronavirus #COVID19 #health #COVIDー19 #worklife #motivation #FutureofWork #Leadership #WorkLifeBalance #USA\n\n https://t.co/yoEHiyJcxc',
 'created_at': '2020-04-24 18:29:05',
 'location': 'Munich, Bavaria',
 'hashtags': ['Lockdown',
  'coronavirus',
  'COVID19',
  'health',
  'COVIDー19',
  'worklife',
  'motivation',
  'FutureofWork',
  'Leadership',
  'WorkLifeBalance',
  'USA'],
 'mentions': [],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': -1,
 'media': [],
 'retweet': False,
 'FavCount': 6,
 'source': 'Twitter for iPad',
 'retweet_count': 4}

In [56]:
len(twts1)

408

In [44]:
test.find({"$text":{"$search": "covid19"}}).explain()

{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'TWT_DB.test',
  'indexFilterSet': False,
  'parsedQuery': {'$text': {'$search': 'covid19',
    '$language': 'english',
    '$caseSensitive': False,
    '$diacriticSensitive': False}},
  'winningPlan': {'stage': 'TEXT',
   'indexPrefix': {},
   'indexName': 'content_text',
   'parsedTextQuery': {'terms': ['covid19'],
    'negatedTerms': [],
    'phrases': [],
    'negatedPhrases': []},
   'textIndexVersion': 3,
   'inputStage': {'stage': 'TEXT_MATCH',
    'inputStage': {'stage': 'FETCH',
     'inputStage': {'stage': 'OR',
      'inputStage': {'stage': 'IXSCAN',
       'keyPattern': {'_fts': 'text', '_ftsx': 1},
       'indexName': 'content_text',
       'isMultiKey': True,
       'isUnique': False,
       'isSparse': False,
       'isPartial': False,
       'indexVersion': 2,
       'direction': 'backward',
       'indexBounds': {}}}}}},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 408,
  'exe

In [36]:
#Connect to the mongoDB database
client = pymongo.MongoClient()
db = client["TWT_DB"]
tweets = db["tweets"]

In [50]:
tweets.find({"content":{"$regex":"covid19","$options" :'i'}}).explain()

{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'TWT_DB.tweets',
  'indexFilterSet': False,
  'parsedQuery': {'content': {'$regex': 'covid19', '$options': 'i'}},
  'winningPlan': {'stage': 'COLLSCAN',
   'filter': {'content': {'$regex': 'covid19', '$options': 'i'}},
   'direction': 'forward'},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 413,
  'executionTimeMillis': 11,
  'totalKeysExamined': 0,
  'totalDocsExamined': 15429,
  'executionStages': {'stage': 'COLLSCAN',
   'filter': {'content': {'$regex': 'covid19', '$options': 'i'}},
   'nReturned': 413,
   'executionTimeMillisEstimate': 0,
   'works': 15431,
   'advanced': 413,
   'needTime': 15017,
   'needYield': 0,
   'saveState': 120,
   'restoreState': 120,
   'isEOF': 1,
   'direction': 'forward',
   'docsExamined': 15429},
  'allPlansExecution': []},
 'serverInfo': {'host': 'DESKTOP-T0J0HGC',
  'port': 27017,
  'version': '4.2.6',
  'gitVersion': '20364840b8f1af16917e4c23c1b5f5efd8b3

In [13]:
tweets.find({"content":{"$regex":"trump","$options" :'i'}}).explain()

{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'TWT_DB.tweets',
  'indexFilterSet': False,
  'parsedQuery': {'content': {'$regex': 'trump', '$options': 'i'}},
  'winningPlan': {'stage': 'COLLSCAN',
   'filter': {'content': {'$regex': 'trump', '$options': 'i'}},
   'direction': 'forward'},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 2240,
  'executionTimeMillis': 37,
  'totalKeysExamined': 0,
  'totalDocsExamined': 15429,
  'executionStages': {'stage': 'COLLSCAN',
   'filter': {'content': {'$regex': 'trump', '$options': 'i'}},
   'nReturned': 2240,
   'executionTimeMillisEstimate': 0,
   'works': 15431,
   'advanced': 2240,
   'needTime': 13190,
   'needYield': 0,
   'saveState': 120,
   'restoreState': 120,
   'isEOF': 1,
   'direction': 'forward',
   'docsExamined': 15429},
  'allPlansExecution': []},
 'serverInfo': {'host': 'DESKTOP-T0J0HGC',
  'port': 27017,
  'version': '4.2.6',
  'gitVersion': '20364840b8f1af16917e4c23c1b5f5efd8b352f

In [14]:
test.find({"content":{"$regex":"trump","$options" :'i'}}).explain()

{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'TWT_DB.test',
  'indexFilterSet': False,
  'parsedQuery': {'content': {'$regex': 'trump', '$options': 'i'}},
  'winningPlan': {'stage': 'COLLSCAN',
   'filter': {'content': {'$regex': 'trump', '$options': 'i'}},
   'direction': 'forward'},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 2240,
  'executionTimeMillis': 18,
  'totalKeysExamined': 0,
  'totalDocsExamined': 15429,
  'executionStages': {'stage': 'COLLSCAN',
   'filter': {'content': {'$regex': 'trump', '$options': 'i'}},
   'nReturned': 2240,
   'executionTimeMillisEstimate': 0,
   'works': 15431,
   'advanced': 2240,
   'needTime': 13190,
   'needYield': 0,
   'saveState': 120,
   'restoreState': 120,
   'isEOF': 1,
   'direction': 'forward',
   'docsExamined': 15429},
  'allPlansExecution': []},
 'serverInfo': {'host': 'DESKTOP-T0J0HGC',
  'port': 27017,
  'version': '4.2.6',
  'gitVersion': '20364840b8f1af16917e4c23c1b5f5efd8b352f8'

In [33]:
client.close()