# Queries for the search application

In [1]:
import pandas as pd
import json
import pymongo
import io
import pprint
import psycopg2

## PostgreSQL

In [2]:
#connect to postgreSQL database
conn = psycopg2.connect(host="localhost", port = 5432, database="twitter", user="postgres", password="postgres@329")
cur = conn.cursor()

### About user data

In [3]:
#Row sample from the database 
cur.execute("select * from user_df limit 5")
user = pd.DataFrame(cur.fetchall())
user.columns = [desc[0] for desc in cur.description]
user

Unnamed: 0,user_id,user_id_str,name,screen_name,location,description,protected,followers_count,friends_count,listed_count,created_at,favourites_count,verified,statuses_count
0,10140,10140,nicco mele,nicco,"Cambridge, MA",Managing Director @DRKFoundation - teaching @K...,False,9289,6534,403,Mon Oct 23 13:44:36 +0000 2006,2895,False,9760
1,12917,12917,"Cordelya Sharpe, isolated AF",Cordelya,"Delta, PA",See crafty tweets at @cordymakes / Venmo: @cor...,False,157,453,0,Sat Nov 18 01:49:36 +0000 2006,3161,False,24876
2,51943,51943,That Goan Guy,schmmuck,½ way btwn d gutter & d stars,I'm just an eight ulcer man on four ulcer pay.,False,13763,2040,352,Fri Dec 08 21:03:19 +0000 2006,21706,False,285388
3,58523,58523,Ricochet,Ricochet,,Conservative conversation and community. And p...,False,20860,1691,635,Mon Dec 11 21:54:27 +0000 2006,6790,True,71963
4,66983,66983,Kevin Farner,kevinfarner,"Minneapolis, MN",,False,891,1158,74,Thu Dec 14 13:19:09 +0000 2006,2374,False,7399


In [4]:
## Types of user accounts (Verified vs non-verified)
#Row sample from the database 
cur.execute("SELECT count(CASE WHEN verified THEN 1 END) FROM user_df;")
verified = cur.fetchall()
cur.execute("SELECT count(CASE WHEN not verified THEN 0 END) FROM user_df;")
non_verified = cur.fetchall()

"Number of verified user accounts:" + str(verified[0][0]) + " and non_verified account:" + str(non_verified[0][0])

'Number of verified user accounts:707 and non_verified account:14168'

### Queries based on only user data

In [5]:
# Top N users and their location based on
# Status_count
# Favourites_count
# Followers_count

cur.execute("select user_id, name, location, statuses_count from user_df order by statuses_count desc limit 5")
top_n_statuses_count = pd.DataFrame(cur.fetchall())
top_n_statuses_count.columns = [desc[0] for desc in cur.description]
top_n_statuses_count

cur.execute("select user_id, name, location, favourites_count from user_df order by favourites_count desc limit 5")
top_n_favourites_count = pd.DataFrame(cur.fetchall())
top_n_favourites_count.columns = [desc[0] for desc in cur.description]
top_n_favourites_count

cur.execute("select user_id, name, location, followers_count from user_df order by followers_count desc limit 5;")
top_n_followers_count = pd.DataFrame(cur.fetchall())
top_n_followers_count.columns = [desc[0] for desc in cur.description]
top_n_followers_count

Unnamed: 0,user_id,name,location,followers_count
0,807095,The New York Times,New York City,46425645
1,1652541,Reuters,Around the world,21866693
2,3108351,The Wall Street Journal,"New York, NY",17693180
3,91478624,Forbes,"New York, NY",15986043
4,2467791,The Washington Post,"Washington, DC",15650435


In [6]:
#Close the cursor and connection to the database
cur.close()
conn.close()

## MongoDB

In [7]:
#Connect to the mongoDB database
client = pymongo.MongoClient()
db = client["tweet_database"]
tweets = db.tweets_collection

In [8]:
#Document(row) sample from the mongoDB database 
myquery = {}
mydoc = list(tweets.find(myquery))
mydoc[0]

{'_id': ObjectId('5ea9c38017dfdac82a918317'),
 'id': 1255560342383603721,
 'user_name': 'Lean Consultancy',
 'user_id': '990868256360751104',
 'content': '@Growth_Lean &gt;&gt; U.S. pending home sales fall sharply in March https://t.co/8Lud9zsELG #lean https://t.co/k6PzFSfL1S',
 'created_at': '2020-04-29 18:11:27',
 'location': 'Europe',
 'hashtags': ['lean'],
 'mentions': ['990868256360751104'],
 'in_reply_to_user_id': 990868256360751104,
 'in_reply_to_status_id': None,
 'retweedt_orid_tweetID': -1,
 'retweetedFrom_id': -1,
 'retweetedFrom_name': 'NA',
 'media': ['https://t.co/k6PzFSfL1S'],
 'retweet': False,
 'FavCount': 0,
 'Orig_retweet_fav': -1,
 'source': 'Twibble.io',
 'retweet_count': 0,
 'lang': 'en'}

### About data

In [9]:
## Types of tweets
count_retweet = tweets.count_documents({"retweet":True})
count_original = tweets.count_documents({"retweet":False})
"Number of original tweet:" + str(count_original) + " and number of retweets:" + str(count_retweet)

'Number of original tweet:4623 and number of retweets:12275'

In [10]:
## Type of media in our dataset
type_count = {"text": 0, "Only_image":0, "both":0}
for tweet in tweets.find():
    md = tweet["media"]
    if len(md) != 0:
        if tweet["content"] == "":
            type_count["image"] += 1 # Only Image
        else:
            type_count["both"] += 1 # Image and text
    else:
        type_count["text"] += 1 # Only text

type_count

{'text': 15685, 'Only_image': 0, 'both': 1213}

In [11]:
## Number of distint user in database
distinct_users = set()

for tweet in tweets.find():
    distinct_users.add(tweet['user_id'])

print("{} distinct user out of {} total user".format(len(distinct_users), tweets.count()))

14937 distinct user out of 16898 total user


  import sys


In [13]:
## Tweet with  maximum retweet count
max_retweet_count = 0
for tweet in tweets.find():
    if tweet['retweet_count']>max_retweet_count:
        max_retweet_count = tweet['retweet_count']
        tweet_id = tweet['id']
print(max_retweet_count)

myquery = {'id': tweet_id}
twts = list(tweets.find(myquery))
twts[0]

34671


{'_id': ObjectId('5ea9c40717dfdac82a919ee3'),
 'id': 1255401632264830979,
 'user_name': 'The_War_Economy',
 'user_id': '856978948965769220',
 'content': 'The only reason the U.S. has reported one million cases of CoronaVirus is that our Testing is sooo much better than any other country in the World. Other countries are way behind us in Testing, and therefore show far fewer cases!',
 'created_at': '2020-04-29 07:40:48',
 'location': '',
 'hashtags': [],
 'mentions': ['25073877'],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweedt_orid_tweetID': 1255352014042738688,
 'retweetedFrom_id': '25073877',
 'retweetedFrom_name': 'realDonaldTrump',
 'media': [],
 'retweet': True,
 'FavCount': 0,
 'Orig_retweet_fav': 201285,
 'source': 'Twitter Web App',
 'retweet_count': 34671,
 'lang': 'en'}

In [15]:
## Tweet with  maximum favorite count
max_FavCount = 0
for tweet in tweets.find():
    if tweet['FavCount']>max_FavCount:
        max_FavCount = tweet['FavCount']
        tweet_id = tweet['id']
print(max_FavCount)

myquery = {'id': tweet_id}
twts = list(tweets.find(myquery))
twts[0]

7996


{'_id': ObjectId('5ea9c4b717dfdac82a91ba8b'),
 'id': 1255099265728352258,
 'user_name': 'Neil King',
 'user_id': '135515077',
 'content': 'This is bonkers:\n\nRoughly half of all U.S. workers stand to earn more in unemployment benefits than they did at their jobs before the coronavirus pandemic brought the economy to a standstill.     https://t.co/sXDmXH0PJ3',
 'created_at': '2020-04-28 11:39:18',
 'location': 'Washington, DC',
 'hashtags': [],
 'mentions': [],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweedt_orid_tweetID': -1,
 'retweetedFrom_id': -1,
 'retweetedFrom_name': 'NA',
 'media': [],
 'retweet': False,
 'FavCount': 7996,
 'Orig_retweet_fav': -1,
 'source': 'TweetDeck',
 'retweet_count': 2280,
 'lang': 'en'}

### Search by word

In [16]:
#Find number of tweets with word
def search_word():
    user_input = input("Please enter a word: ")
    myquery = {"$text":{"$search": user_input}}
    twts = pd.DataFrame(tweets.find(myquery)).drop(['_id','source','media'],axis=1).sort_values(["retweet",
                                "FavCount","retweet_count"],ascending=[True,False,False]).reset_index()
    return twts

In [17]:
search_word().head()

Please enter a word: stimulus


Unnamed: 0,index,id,user_name,user_id,content,created_at,location,hashtags,mentions,in_reply_to_user_id,in_reply_to_status_id,retweedt_orid_tweetID,retweetedFrom_id,retweetedFrom_name,retweet,FavCount,Orig_retweet_fav,retweet_count,lang
0,56,1255153188719624192,Paul Page,19693661,The scale of the stimulus measure means many l...,2020-04-28 15:13:34,Washington | New York,[],[3108351],,,-1,-1,,False,51,-1,18,en
1,99,1255134878347059200,BreakThrough News,1217207858435186689,"Join us this Wed, 4.29 at 8PM ET / 5 PT / 7 CT...",2020-04-28 14:00:48,,[],[],,,-1,-1,,False,22,-1,12,en
2,47,1255218480665722880,Catskill Ranger,1063847518415523840,Over 1 Million people have now tested positive...,2020-04-28 19:33:01,Catskill Park,"[COVID19, NewYork, NewYorkTough, SupportNYPolice]","[21269970, 10615232, 296361085, 232268199, 360...",,,-1,-1,,False,8,-1,6,en
3,3,1255125629017526273,Jessica Amir,292226717,We’ve spoken about record Australian governmen...,2020-04-28 13:24:03,"Sydney, New South Wales","[COVID19, uspoli, Auspol, Ausbiz, ASX]",[],,,-1,-1,,False,8,-1,1,en
4,88,1255520535947051008,Randall French,824510940422893568,US economy shrunk by 4.8% or ~$1.1Tn in the fi...,2020-04-29 15:33:16,#SoCalResists,[],[],,,-1,-1,,False,3,-1,4,en


### Search by a hashtag

In [53]:
# #Find number of tweets with #covid19
def search_hashtag():
    user_input = input("Please enter a hashtag(include #): ")
    myquery = {"hashtags": { "$in" : [user_input]}}

    twts = pd.DataFrame(tweets.find(myquery)).drop(['_id','source','media'],axis=1).sort_values(["retweet",
                                                "FavCount","retweet_count"],ascending=[True,False,False]).reset_index()
    return twts

In [54]:
search_hashtag().head()

Please enter a hashtag(include #): covid19


Unnamed: 0,index,id_str,user_name,user_id,content,created_at,location,hashtags,mentions,in_reply_to_user_id,in_reply_to_status_id,retweetedFrom,retweet,FavCount,retweet_count
0,8,1253689520157048832,Christine Williamson,498397792,New op-ed from Bhaskar Chakravorti of @Fletche...,2020-04-24 14:17:28,Boston,"[lockdown2020, covid19]",[18028551],,,-1,False,2,1
1,2,1254391361517780994,BillyVJ,1201227597071441922,Fascinating piece on #covid19 economy in Ital...,2020-04-26 12:46:20,,[covid19],[],,,-1,False,2,0
2,7,1253700424164655105,Rich Tehrani,5654932,"Tech companies pull back on hiring, flashing a...",2020-04-24 15:00:48,"Trumbull, CT","[pandemictech, CoronaVirus, CoronaVirusOutbrea...",[1250900627351355394],,,-1,False,1,1
3,6,1253837079831814144,Susanne Vuorinen,39520309,@tuuliel Pitkällä tähtäimellä tiukat rajoituks...,2020-04-25 00:03:49,Finland,[covid19],[575655913],575655913.0,1.253602e+18,-1,False,1,0
4,11,1253632148713680898,Sir. Ridwan,1204806232604905472,That's the results u get when u are busy argui...,2020-04-24 10:29:30,kano,"[covid19, twitter]",[],,,-1,False,1,0


### Search by time range

In [20]:
#Search by time range
def search_date():
    start_date = input("Please enter a start date(format:yyyy-mm-dd hh:mm:ss): ")
    end_date = input("Please enter a end date(format:yyyy-mm-dd hh:mm:ss): ")
    twts = pd.DataFrame(tweets.find({"created_at":{ "$gte": start_date, "$lt": end_date }})).drop(['_id','source',
                'media'],axis=1).sort_values(["retweet","FavCount","retweet_count"],ascending=[True,False,False]).reset_index()
    return twts
search_date().head()

Please enter a start date(format:yyyy-mm-dd hh:mm:ss): 2020-04-27
Please enter a end date(format:yyyy-mm-dd hh:mm:ss): 2020-04-28


Unnamed: 0,index,id,user_name,user_id,content,created_at,location,hashtags,mentions,in_reply_to_user_id,in_reply_to_status_id,retweedt_orid_tweetID,retweetedFrom_id,retweetedFrom_name,retweet,FavCount,Orig_retweet_fav,retweet_count,lang
0,532,1254896442595897344,Emily Finn,1187652746,#NEW: At today’s coronavirus task force press ...,2020-04-27 22:13:21,"San Diego, CA",[NEW],[25073877],,,-1,-1,,False,837,-1,180,en
1,295,1254905495921528836,Richard Stengel,807357676300730368,When Trump says we had the greatest economy ev...,2020-04-27 22:49:19,"New York, NY",[],[],,,-1,-1,,False,571,-1,130,en
2,536,1254896183161602048,Andrew Clark,60403024,President Trump oversaw a massive mobilization...,2020-04-27 22:12:19,"Washington, DC",[],[],,,-1,-1,,False,91,-1,40,en
3,433,1254900459225169925,West Wing Reports,20182089,Here's U.S. economic growth since 1940. Trump ...,2020-04-27 22:29:19,White House & elsewhere,[],[],,,-1,-1,,False,81,-1,46,en
4,972,1254864663658954753,CGTN,1115874631,"Dow jumps 350 points, closing above 24,000, un...",2020-04-27 20:07:04,"Beijing, China",[],[],,,-1,-1,,False,76,-1,15,en


### Search by user name

In [22]:
# Search by user_name
def search_user_name():
    user_input = input("Please enter a user_name: ")
    twts = pd.DataFrame(tweets.find({"user_name":user_input})).drop(['_id','source',
                'media'],axis=1).sort_values(["retweet","FavCount","retweet_count"],ascending=[True,False,False]).reset_index()
    return twts
search_user_name().head()

Please enter a user_name: Tom


Unnamed: 0,index,id,user_name,user_id,content,created_at,location,hashtags,mentions,in_reply_to_user_id,in_reply_to_status_id,retweedt_orid_tweetID,retweetedFrom_id,retweetedFrom_name,retweet,FavCount,Orig_retweet_fav,retweet_count,lang
0,0,1255560152599662594,Tom,2355540739,This is bonkers:\n\nRoughly half of all U.S. w...,2020-04-29 18:10:42,,[],[135515077],,,1255099265728352258,135515077,NKingofDC,True,0,7990,2274,en
1,2,1255207171312300034,Tom,1197488167311597568,"""The consequence of the lack of federal leader...",2020-04-28 18:48:04,,[],[16129920],,,1255206920698433536,16129920,maddow,True,0,2506,1224,en
2,1,1255225760668925954,Tom,822567886506098689,Increased COVID-19 testing shows promise for r...,2020-04-28 20:01:56,,[OANN],[1209936918],,,1255218978143842308,1209936918,OANN,True,0,531,143,en


### Search by user_id

In [24]:
# Search by user id
def search_user_id():
    user_input = input("Please enter a user_id: ")
    twts = pd.DataFrame(tweets.find({"user_id":user_input})).drop(['_id','source','media'],axis=1).sort_values(["retweet",
                                                    "FavCount","retweet_count"],ascending=[True,False,False]).reset_index()
    return twts
search_user_id().head()

Please enter a user_id: 2355540739


Unnamed: 0,index,id,user_name,user_id,content,created_at,location,hashtags,mentions,in_reply_to_user_id,in_reply_to_status_id,retweedt_orid_tweetID,retweetedFrom_id,retweetedFrom_name,retweet,FavCount,Orig_retweet_fav,retweet_count,lang
0,0,1255560152599662594,Tom,2355540739,This is bonkers:\n\nRoughly half of all U.S. w...,2020-04-29 18:10:42,,[],[135515077],,,1255099265728352258,135515077,NKingofDC,True,0,7990,2274,en


### Search by two words

In [25]:
#search by two words
def search_by_two_words():
    user_input1 = input("Please enter a word: ")
    user_input2 = input("Please enter a another word: ")
    myquery = {
    "$and" : [
        {"content":{"$regex":user_input1,"$options" :'i'}} , {"content":{"$regex":user_input2,"$options" :'i'}}
        ]
    }
    twts = pd.DataFrame(tweets.find(myquery)).drop(['_id','source','media'],axis=1).sort_values(["retweet",
                                        "FavCount","retweet_count"],ascending=[True,False,False]).reset_index()
    return twts
search_by_two_words().head()

Please enter a word: checks
Please enter a another word: trump


Unnamed: 0,index,id,user_name,user_id,content,created_at,location,hashtags,mentions,in_reply_to_user_id,in_reply_to_status_id,retweedt_orid_tweetID,retweetedFrom_id,retweetedFrom_name,retweet,FavCount,Orig_retweet_fav,retweet_count,lang
0,0,1254929098352328704,Maura Mullaney,307933395,@realDonaldTrump A) Not the greatest economy t...,2020-04-28 00:23:07,North Carolina,[],[25073877],25073877,1254908072067313670,-1,-1,,False,0,-1,0,en


### Top five hashtags used

In [26]:
hashtag_count = {}
for tweet in tweets.find({"hashtags":{"$ne":[]}},{"hashtags":1, "_id":0}):
    ht = tweet["hashtags"]
    for hash in ht:
        hash = hash.lower()
        if hash not in hashtag_count:
            hashtag_count[hash] = 1
        else:
            hashtag_count[hash] += 1
hashtags = pd.DataFrame.from_dict(hashtag_count, orient='index')
hashtags = hashtags.rename(columns={0:"count"})
hashtags = hashtags.sort_values("count", ascending=False)
hashtags.head(5)

Unnamed: 0,count
covid19,229
economy,200
oann,192
coronavirus,177
new,165


In [27]:
client.close()

## Queries based on MongoDB and postgreSQL

In [28]:
#Connect to postgreSQL
conn = psycopg2.connect(host="localhost", port = 5432, database="twitter", user="postgres", password="postgres@329")
cur = conn.cursor()

In [29]:
#Connect to MongoDB
client = pymongo.MongoClient()
db = client["tweet_database"]
tweets = db.tweets_collection

### Original user of maximum retweeted tweet

In [30]:
#Original user of maximum retweeted tweet
twts = pd.DataFrame(tweets.find({"retweet": {"$ne": False}},{"user_id":1, "_id":0}).sort("retweet_count",-1).limit(1))
twts = twts.astype({'user_id': 'int64'})

cur.execute("SELECT user_id, name FROM user_df;")
users = pd.DataFrame(cur.fetchall())
users.columns = [desc[0] for desc in cur.description]

pd.merge(users,twts,on="user_id")

Unnamed: 0,user_id,name
0,856978948965769220,The_War_Economy


### What is the location of Maximum tweets.

In [31]:
#Location of maximum tweets
user_twt_count = pd.DataFrame(tweets.aggregate([
  {
    "$group": {
       "_id": "$user_id",
       "count": { "$sum": 1 }
    }
  }
]))
user_twt_count.columns = ["user_id","count"]
user_twt_count = user_twt_count.astype({'user_id': 'int64'})

In [32]:
cur.execute("SELECT user_id,location FROM user_df where location <> '';")
user_loc = pd.DataFrame(cur.fetchall())
user_loc.columns = [desc[0] for desc in cur.description]

In [33]:
loc_df = pd.merge(user_loc,user_twt_count,on="user_id")
loc_df = loc_df[["location","count"]].groupby("location").count()
loc_df.index[loc_df["count"].argmax()]

'United States'

### Search by keyword, hashtag and threshold on number of followers.

In [34]:
def search_word_hashtag_followers_count():
    input_word = input("Please enter a word: ")
    input_hashtag = input("Please enter a hashtag: ")
    threshold = input("Pleas enter a threshold for followers count of users:")
    
    twts = pd.DataFrame(tweets.find( {
        "$and" : [
            {"$text":{"$search": input_word}},
            {"hashtags": { "$in" : [input_hashtag]}}
        ]
    }))
    twts = twts.drop(["_id","location",'source','media'],axis=1)
    twts = twts.astype({'user_id': 'int64'})
    
    cur.execute("SELECT user_id,followers_count FROM user_df where followers_count > " + threshold + " order by followers_count desc;")
    users = pd.DataFrame(cur.fetchall())
    users.columns = [desc[0] for desc in cur.description]
    
    twts_df = pd.merge(users,twts,on="user_id").sort_values(["followers_count","retweet",
                            "FavCount"],ascending=[False,True,False])
    
    return twts_df

search_word_hashtag_followers_count().head()

Please enter a word: trump
Please enter a hashtag: #coronavirus
Pleas enter a threshold for followers count of users:100


Unnamed: 0,user_id,followers_count,id,user_name,content,created_at,hashtags,mentions,in_reply_to_user_id,in_reply_to_status_id,retweedt_orid_tweetID,retweetedFrom_id,retweetedFrom_name,retweet,FavCount,Orig_retweet_fav,retweet_count,lang
0,479398450,8982,1255020210102382593,Marco Vergari,#Georgia #eateries welcome diners back as more...,2020-04-28 06:25:09,"[Georgia, eateries, shutdowns, USA, Atlanta, O...",[],,,-1,-1,,False,0,-1,0,en
1,479398450,8982,1255387489042382853,Marco Vergari,#Coronavirus likely hammered U.S. #economy in ...,2020-04-29 06:44:36,"[Coronavirus, economy, Washington, WhiteHouse,...",[],,,-1,-1,,False,0,-1,0,en
2,14496320,5932,1254849095451148288,Sarah Daniels❣️,"""... Democrats clearly want to keep the econom...",2020-04-27 19:05:12,[],[27522964],,,1254801472627314691,27522964,vdare,True,0,24,8,en
3,68036718,4559,1255507930813804555,Gregorio Meraz,The huge costo of Ignoring &amp; dismissing th...,2020-04-29 14:43:11,[CoronavirusThreat],[],,,-1,-1,,False,0,-1,0,en
4,92492429,2068,1255156351619260417,JusticeMatters!,@WhiteHouse @realDonaldTrump President @Barack...,2020-04-28 15:26:08,"[COVIDー19, Trump, COVID19, coronavirus, pandemic]","[822215673812119553, 25073877, 813286]",8.222157e+17,1.255127e+18,-1,-1,,False,0,-1,0,en


### Search by keyword, hashtag and a location.

In [36]:
def search_word_hashtag_location():
    input_word = input("Please enter a word: ")
    input_hashtag = input("Please enter a hashtag: ")
    location = input("Pleas enter a location:")
    
    twts = pd.DataFrame(tweets.find( {
        "$and" : [
            {"$text":{"$search": input_word}},
            {"hashtags": { "$in" : [input_hashtag]}}
        ]
    }))
    twts = twts.drop(["_id","location",'source','media'],axis=1)
    twts = twts.astype({'user_id': 'int64'})
    
    cur.execute("SELECT user_id,location FROM user_df where location like '%" + location + "%';")
    users = pd.DataFrame(cur.fetchall())
    users.columns = [desc[0] for desc in cur.description]
    
    twts_df = pd.merge(users,twts,on="user_id").sort_values(["retweet","FavCount",
                                                             "retweet_count"],ascending=[True,False,False]).reset_index()
    
    return twts_df
search_word_hashtag_location()

Please enter a word: trump
Please enter a hashtag: #new
Pleas enter a location:New York


Unnamed: 0,index,user_id,location,id,user_name,content,created_at,hashtags,mentions,in_reply_to_user_id,in_reply_to_status_id,retweedt_orid_tweetID,retweetedFrom_id,retweetedFrom_name,retweet,FavCount,Orig_retweet_fav,retweet_count,lang


In [37]:
#Close the cursor and connection to the database
cur.close()
conn.close()

In [38]:
client.close()

## Misc

In [None]:
hashtag_count = {}
for tweet in tweets.find():
    ht = tweet["hashtags"]
    for hash in ht:
        hash = hash.lower()
        if hash not in hashtag_count:
            hashtag_count[hash] = 1
        else:
            hashtag_count[hash] += 1

hc = sorted(hashtag_count, key=hashtag_count.get, reverse=True)  # Sorting the top 10.
count = 0

for z in hc:
    if count == 10:
        break
    jsonx = {"hashtag": z, "count": str(hashtag_count[z])}
    count += 1
# {k: v for k, v in sorted(hashtag_count.items(), key=lambda item: item[1])}

## Experimenting with text index

Experimented with test index using a version of data we stored to find the difference in query time between a query and query on a column which is a text index.

*MongoDB supports query operations that perform a text search of string content. To perform text search, MongoDB uses a text index and the $text operator.*

*MongoDB provides text indexes to support text search queries on string content. text indexes can include any field whose value is a string or an array of string elements.*

*Use the \\$text query operator to perform text searches on a collection with a text index. \\$text will tokenize the search string using whitespace and most punctuation as delimiters, and perform a logical OR of all such tokens in the search string.*

In [39]:
#Connect to the mongoDB database
client = pymongo.MongoClient()
db1 = client["TWT_DB"]
test = db1["test"]

In [40]:
#Create a test index
# test.create_index([("content",pymongo.TEXT)])

'content_text'

In [41]:
twts1 = test.find({"$text": {"$search": "\"#covid19\"" } } )
twts1 = list(twts1)
twts1[0]

{'_id': ObjectId('5ea66afbfb8f5ff7fd474756'),
 'id_str': '1253752842361483265',
 'user_name': 'Dr. Marcell Vollmer #SocialDistancing #StayHome',
 'user_id': '99674560',
 'content': 'The Great #Lockdown of the economy has been completely unprecedented, both in terms of the speed of the shutdown and its impact on jobs.\n\n #coronavirus #COVID19 #health #COVIDー19 #worklife #motivation #FutureofWork #Leadership #WorkLifeBalance #USA\n\n https://t.co/yoEHiyJcxc',
 'created_at': '2020-04-24 18:29:05',
 'location': 'Munich, Bavaria',
 'hashtags': ['Lockdown',
  'coronavirus',
  'COVID19',
  'health',
  'COVIDー19',
  'worklife',
  'motivation',
  'FutureofWork',
  'Leadership',
  'WorkLifeBalance',
  'USA'],
 'mentions': [],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': -1,
 'media': [],
 'retweet': False,
 'FavCount': 6,
 'source': 'Twitter for iPad',
 'retweet_count': 4}

In [43]:
len(twts1)

396

In [44]:
twts1 = test.find({"$text": {"$search": "covid19"} } )
twts1 = list(twts1)
twts1[0]

{'_id': ObjectId('5ea66afbfb8f5ff7fd474756'),
 'id_str': '1253752842361483265',
 'user_name': 'Dr. Marcell Vollmer #SocialDistancing #StayHome',
 'user_id': '99674560',
 'content': 'The Great #Lockdown of the economy has been completely unprecedented, both in terms of the speed of the shutdown and its impact on jobs.\n\n #coronavirus #COVID19 #health #COVIDー19 #worklife #motivation #FutureofWork #Leadership #WorkLifeBalance #USA\n\n https://t.co/yoEHiyJcxc',
 'created_at': '2020-04-24 18:29:05',
 'location': 'Munich, Bavaria',
 'hashtags': ['Lockdown',
  'coronavirus',
  'COVID19',
  'health',
  'COVIDー19',
  'worklife',
  'motivation',
  'FutureofWork',
  'Leadership',
  'WorkLifeBalance',
  'USA'],
 'mentions': [],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': -1,
 'media': [],
 'retweet': False,
 'FavCount': 6,
 'source': 'Twitter for iPad',
 'retweet_count': 4}

In [45]:
len(twts1)

408

In [46]:
test.find({"$text":{"$search": "covid19"}}).explain()

{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'TWT_DB.test',
  'indexFilterSet': False,
  'parsedQuery': {'$text': {'$search': 'covid19',
    '$language': 'english',
    '$caseSensitive': False,
    '$diacriticSensitive': False}},
  'winningPlan': {'stage': 'TEXT',
   'indexPrefix': {},
   'indexName': 'content_text',
   'parsedTextQuery': {'terms': ['covid19'],
    'negatedTerms': [],
    'phrases': [],
    'negatedPhrases': []},
   'textIndexVersion': 3,
   'inputStage': {'stage': 'TEXT_MATCH',
    'inputStage': {'stage': 'FETCH',
     'inputStage': {'stage': 'OR',
      'inputStage': {'stage': 'IXSCAN',
       'keyPattern': {'_fts': 'text', '_ftsx': 1},
       'indexName': 'content_text',
       'isMultiKey': True,
       'isUnique': False,
       'isSparse': False,
       'isPartial': False,
       'indexVersion': 2,
       'direction': 'backward',
       'indexBounds': {}}}}}},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 408,
  'exe

In [47]:
#Connect to the mongoDB database
client = pymongo.MongoClient()
db = client["TWT_DB"]
tweets = db["tweets"]

In [48]:
tweets.find({"content":{"$regex":"covid19","$options" :'i'}}).explain()

{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'TWT_DB.tweets',
  'indexFilterSet': False,
  'parsedQuery': {'content': {'$regex': 'covid19', '$options': 'i'}},
  'winningPlan': {'stage': 'COLLSCAN',
   'filter': {'content': {'$regex': 'covid19', '$options': 'i'}},
   'direction': 'forward'},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 413,
  'executionTimeMillis': 21,
  'totalKeysExamined': 0,
  'totalDocsExamined': 15429,
  'executionStages': {'stage': 'COLLSCAN',
   'filter': {'content': {'$regex': 'covid19', '$options': 'i'}},
   'nReturned': 413,
   'executionTimeMillisEstimate': 0,
   'works': 15431,
   'advanced': 413,
   'needTime': 15017,
   'needYield': 0,
   'saveState': 120,
   'restoreState': 120,
   'isEOF': 1,
   'direction': 'forward',
   'docsExamined': 15429},
  'allPlansExecution': []},
 'serverInfo': {'host': 'DESKTOP-T0J0HGC',
  'port': 27017,
  'version': '4.2.6',
  'gitVersion': '20364840b8f1af16917e4c23c1b5f5efd8b3

In [49]:
tweets.find({"content":{"$regex":"trump","$options" :'i'}}).explain()

{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'TWT_DB.tweets',
  'indexFilterSet': False,
  'parsedQuery': {'content': {'$regex': 'trump', '$options': 'i'}},
  'winningPlan': {'stage': 'COLLSCAN',
   'filter': {'content': {'$regex': 'trump', '$options': 'i'}},
   'direction': 'forward'},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 2240,
  'executionTimeMillis': 16,
  'totalKeysExamined': 0,
  'totalDocsExamined': 15429,
  'executionStages': {'stage': 'COLLSCAN',
   'filter': {'content': {'$regex': 'trump', '$options': 'i'}},
   'nReturned': 2240,
   'executionTimeMillisEstimate': 0,
   'works': 15431,
   'advanced': 2240,
   'needTime': 13190,
   'needYield': 0,
   'saveState': 120,
   'restoreState': 120,
   'isEOF': 1,
   'direction': 'forward',
   'docsExamined': 15429},
  'allPlansExecution': []},
 'serverInfo': {'host': 'DESKTOP-T0J0HGC',
  'port': 27017,
  'version': '4.2.6',
  'gitVersion': '20364840b8f1af16917e4c23c1b5f5efd8b352f

In [52]:
test.find({"$text":{"$search":"trump"}}).explain()

{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'TWT_DB.test',
  'indexFilterSet': False,
  'parsedQuery': {'$text': {'$search': 'trump',
    '$language': 'english',
    '$caseSensitive': False,
    '$diacriticSensitive': False}},
  'winningPlan': {'stage': 'TEXT',
   'indexPrefix': {},
   'indexName': 'content_text',
   'parsedTextQuery': {'terms': ['trump'],
    'negatedTerms': [],
    'phrases': [],
    'negatedPhrases': []},
   'textIndexVersion': 3,
   'inputStage': {'stage': 'TEXT_MATCH',
    'inputStage': {'stage': 'FETCH',
     'inputStage': {'stage': 'OR',
      'inputStage': {'stage': 'IXSCAN',
       'keyPattern': {'_fts': 'text', '_ftsx': 1},
       'indexName': 'content_text',
       'isMultiKey': True,
       'isUnique': False,
       'isSparse': False,
       'isPartial': False,
       'indexVersion': 2,
       'direction': 'backward',
       'indexBounds': {}}}}}},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 1702,
  'execut

In [51]:
client.close()