# Queries for the search application

In [21]:
import pandas as pd
import json
import pymongo
import io
import pprint
import itertools

In [24]:
#Connect to the mongoDB database
client = pymongo.MongoClient()
db = client["tweet_database"]
tweets = db.tweets_collection_final

In [25]:
#Document(row) sample from the mongoDB database 
myquery = {}
mydoc = list(tweets.find(myquery))
mydoc[0]

{'_id': ObjectId('5ea9c38017dfdac82a91830c'),
 'id': 1255560533723484161,
 'user_name': 'Rich Justice',
 'user_id': '866210244',
 'content': 'The record-long U.S. economic expansion is over after almost 11 years, with what’s likely to be the deepest recession in at least eight decades now under way https://t.co/ZOWYUBHaKS',
 'created_at': '2020-04-29 18:12:13',
 'location': 'Pennsylvania',
 'hashtags': [],
 'mentions': ['564111558'],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweedt_orid_tweetID': 1255554460182630401,
 'retweetedFrom_id': '564111558',
 'retweetedFrom_name': 'bpolitics',
 'media': [],
 'retweet': True,
 'FavCount': 0,
 'Orig_retweet_fav': 5,
 'source': 'Twitter for iPad',
 'retweet_count': 11,
 'lang': 'en'}

## Lets learn about our data

In [26]:
## Original vs retweet count
tweet_type = {'original': 0, 'retweets': 0}

for tweet in tweets.find():
    if tweet['retweet'] == True:
        tweet_type['original'] += 1
    else:
        tweet_type['retweets'] += 1
tweet_type

{'original': 12275, 'retweets': 4623}

In [27]:
## Type of media in our dataset
type_count = {"text": 0, "Only_image":0, "both":0}
for tweet in tweets.find():
    md = tweet["media"]
    if len(md) != 0:
        if tweet["content"] == "":
            type_count["image"] += 1 # Only Image
        else:
            type_count["both"] += 1 # Image and text
    else:
        type_count["text"] += 1 # Only text

type_count

{'text': 15685, 'Only_image': 0, 'both': 1213}

In [28]:
## Number of distince user in database
distinct_users = set()

for tweet in tweets.find():
    distinct_users.add(tweet['user_id'])

print("{} distinct user out of {} total user".format(len(distinct_users), tweets.count()))

14937 distinct user out of 16898 total user


  import sys


In [52]:
## Tweet with  maximum retweet count
max_retweet_count = 0
for tweet in tweets.find():
    if tweet['retweet_count']>max_retweet_count:
        max_retweet_count = tweet['retweet_count']
        tweet_id = tweet['id']

myquery = {'id': tweet_id}
twts = list(tweets.find(myquery))
print('number of retweet:\n', max_retweet_count)
print('Conten of the tweet:\n', twts[0]['content'])
print('Creater of the original tweet:\n', twts[0]['retweetedFrom_name'])
print('Likes on the original tweet:\n', twts[0]['Orig_retweet_fav'])

number of retweet:
 34671
Conten of the tweet:
 The only reason the U.S. has reported one million cases of CoronaVirus is that our Testing is sooo much better than any other country in the World. Other countries are way behind us in Testing, and therefore show far fewer cases!
Creater of the original tweet:
 realDonaldTrump
Likes on the original tweet:
 201285


In [53]:
twts[0]

{'_id': ObjectId('5ea9c40717dfdac82a919ee3'),
 'id': 1255401632264830979,
 'user_name': 'The_War_Economy',
 'user_id': '856978948965769220',
 'content': 'The only reason the U.S. has reported one million cases of CoronaVirus is that our Testing is sooo much better than any other country in the World. Other countries are way behind us in Testing, and therefore show far fewer cases!',
 'created_at': '2020-04-29 07:40:48',
 'location': '',
 'hashtags': [],
 'mentions': ['25073877'],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweedt_orid_tweetID': 1255352014042738688,
 'retweetedFrom_id': '25073877',
 'retweetedFrom_name': 'realDonaldTrump',
 'media': [],
 'retweet': True,
 'FavCount': 0,
 'Orig_retweet_fav': 201285,
 'source': 'Twitter Web App',
 'retweet_count': 34671,
 'lang': 'en'}

In [54]:
## Tweet with  maximum favorite count
max_FavCount = 0
for tweet in tweets.find():
    if tweet['FavCount']>max_FavCount:
        max_FavCount = tweet['FavCount']
        tweet_id = tweet['id']

myquery = {'id': tweet_id}
twts = list(tweets.find(myquery))
print('number of favorite:\n', max_FavCount)
print('Conten of the tweet:\n', twts[0]['content'])
print('Creater of tweet:\n', twts[0]['user_name'])

number of favorite:
 7996
Conten of the tweet:
 This is bonkers:

Roughly half of all U.S. workers stand to earn more in unemployment benefits than they did at their jobs before the coronavirus pandemic brought the economy to a standstill.     https://t.co/sXDmXH0PJ3
Creater of tweet:
 Neil King


## Search by word

In [55]:
#Find number of tweets with word
myquery = {"content":{"$regex":"covid19","$options" :'i'}}
tweets.count_documents(myquery)

368

In [56]:
#Tweets
twts = list(tweets.find(myquery))
twts[0]

{'_id': ObjectId('5ea9c38117dfdac82a91836a'),
 'id': 1255558980862173185,
 'user_name': 'Story Partners',
 'user_id': '122488212',
 'content': 'How will we return to work in a safe and sustainable way? \n\nSuzanne Clark from the U.S. Chamber of Commerce answers this question in the latest issue of #InsideStory.\n\n#PublicAffairs #COVID19 #Coronavirus\n\nhttps://t.co/QwdkaAYAqo https://t.co/yirEqWFkg8',
 'created_at': '2020-04-29 18:06:02',
 'location': 'Washington, D.C.',
 'hashtags': ['InsideStory', 'PublicAffairs', 'COVID19', 'Coronavirus'],
 'mentions': [],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweedt_orid_tweetID': -1,
 'retweetedFrom_id': -1,
 'retweetedFrom_name': 'NA',
 'media': ['https://t.co/yirEqWFkg8'],
 'retweet': False,
 'FavCount': 0,
 'Orig_retweet_fav': -1,
 'source': 'Sprout Social',
 'retweet_count': 0,
 'lang': 'en'}

## Search by a hashtag

In [57]:
# #Find number of tweets with #covid19 (Exact string put the specified string between the ^(Starts with) and $(Ends with))
# myquery = {"hashtags":{"$regex":"^covid19$","$options" :'i'}}
#Search using a hashtag.
myquery = {"content":{"$regex":"#covid19","$options" :'i'}}
twts = list(tweets.find(myquery))
twts[0]

{'_id': ObjectId('5ea9c38117dfdac82a91836a'),
 'id': 1255558980862173185,
 'user_name': 'Story Partners',
 'user_id': '122488212',
 'content': 'How will we return to work in a safe and sustainable way? \n\nSuzanne Clark from the U.S. Chamber of Commerce answers this question in the latest issue of #InsideStory.\n\n#PublicAffairs #COVID19 #Coronavirus\n\nhttps://t.co/QwdkaAYAqo https://t.co/yirEqWFkg8',
 'created_at': '2020-04-29 18:06:02',
 'location': 'Washington, D.C.',
 'hashtags': ['InsideStory', 'PublicAffairs', 'COVID19', 'Coronavirus'],
 'mentions': [],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweedt_orid_tweetID': -1,
 'retweetedFrom_id': -1,
 'retweetedFrom_name': 'NA',
 'media': ['https://t.co/yirEqWFkg8'],
 'retweet': False,
 'FavCount': 0,
 'Orig_retweet_fav': -1,
 'source': 'Sprout Social',
 'retweet_count': 0,
 'lang': 'en'}

In [58]:
len(twts)

322

## Search by time range

In [61]:
# Search by Time range
twts = tweets.find({"created_at":{ "$gte": "2020-04-29 18:05:00", "$lt": "2020-04-29 18:06:10" }})
twts = list(twts)
twts[0]

{'_id': ObjectId('5ea9c38117dfdac82a918368'),
 'id': 1255558995890376705,
 'user_name': 'Guido Cuelli',
 'user_id': '301155730',
 'content': 'The U.S. economy contracted at 4.8% in the first-quarter as stringent measures to slow the spread of the coronavirus almost shut down the country https://t.co/N8WAhBdGcj https://t.co/X89f212gkw',
 'created_at': '2020-04-29 18:06:06',
 'location': '',
 'hashtags': [],
 'mentions': ['1652541'],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweedt_orid_tweetID': 1255542383372054528,
 'retweetedFrom_id': '1652541',
 'retweetedFrom_name': 'Reuters',
 'media': [],
 'retweet': True,
 'FavCount': 0,
 'Orig_retweet_fav': 22,
 'source': 'Twitter for Android',
 'retweet_count': 8,
 'lang': 'en'}

In [62]:
len(twts)

13

## Search by user name

In [66]:
# Search by user_name
twts = tweets.find({"user_name":"Guido Cuelli"})
twts = list(twts)
twts[0]

{'_id': ObjectId('5ea9c38117dfdac82a918368'),
 'id': 1255558995890376705,
 'user_name': 'Guido Cuelli',
 'user_id': '301155730',
 'content': 'The U.S. economy contracted at 4.8% in the first-quarter as stringent measures to slow the spread of the coronavirus almost shut down the country https://t.co/N8WAhBdGcj https://t.co/X89f212gkw',
 'created_at': '2020-04-29 18:06:06',
 'location': '',
 'hashtags': [],
 'mentions': ['1652541'],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweedt_orid_tweetID': 1255542383372054528,
 'retweetedFrom_id': '1652541',
 'retweetedFrom_name': 'Reuters',
 'media': [],
 'retweet': True,
 'FavCount': 0,
 'Orig_retweet_fav': 22,
 'source': 'Twitter for Android',
 'retweet_count': 8,
 'lang': 'en'}

In [67]:
len(twts)

1

## Search by user_id

In [69]:
# Search by user id
twts = tweets.find({"user_id":"301155730"})
twts = list(twts)
twts[0]

{'_id': ObjectId('5ea9c38117dfdac82a918368'),
 'id': 1255558995890376705,
 'user_name': 'Guido Cuelli',
 'user_id': '301155730',
 'content': 'The U.S. economy contracted at 4.8% in the first-quarter as stringent measures to slow the spread of the coronavirus almost shut down the country https://t.co/N8WAhBdGcj https://t.co/X89f212gkw',
 'created_at': '2020-04-29 18:06:06',
 'location': '',
 'hashtags': [],
 'mentions': ['1652541'],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweedt_orid_tweetID': 1255542383372054528,
 'retweetedFrom_id': '1652541',
 'retweetedFrom_name': 'Reuters',
 'media': [],
 'retweet': True,
 'FavCount': 0,
 'Orig_retweet_fav': 22,
 'source': 'Twitter for Android',
 'retweet_count': 8,
 'lang': 'en'}

In [70]:
len(twts)

1

## Search by two words

In [71]:
#search by two words
twts = tweets.find( {
    "$and" : [
        {"content":{"$regex":"covid19","$options" :'i'}} , {"content":{"$regex":"#trump","$options" :'i'}}
    ]
} )
twts = list(twts)
twts[0]

{'_id': ObjectId('5ea9c39817dfdac82a9188bb'),
 'id': 1255526469939404801,
 'user_name': 'mnpact/Dave Mindeman',
 'user_id': '18503647',
 'content': '#Covid19 #TrumpEconomy  The U.S. economy shrank at a 4.8% annual rate last quarter',
 'created_at': '2020-04-29 15:56:51',
 'location': 'Apple Valley, MN',
 'hashtags': ['Covid19', 'TrumpEconomy'],
 'mentions': [],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweedt_orid_tweetID': -1,
 'retweetedFrom_id': -1,
 'retweetedFrom_name': 'NA',
 'media': [],
 'retweet': False,
 'FavCount': 0,
 'Orig_retweet_fav': -1,
 'source': 'Twitter Web App',
 'retweet_count': 0,
 'lang': 'en'}

In [72]:
len(twts)

9

## Top five hashtags used

In [74]:
hashtag_count = {}
for tweet in tweets.find({"hashtags":{"$ne":[]}},{"hashtags":1, "_id":0}):
    ht = tweet["hashtags"]
    for hash in ht:
        hash = hash.lower()
        if hash not in hashtag_count:
            hashtag_count[hash] = 1
        else:
            hashtag_count[hash] += 1
hashtags = pd.DataFrame.from_dict(hashtag_count, orient='index')
hashtags = hashtags.rename(columns={0:"count"})
hashtags = hashtags.sort_values("count", ascending=False)
hashtags.head(5)

Unnamed: 0,count
covid19,229
economy,200
oann,192
coronavirus,177
new,165


## Misc

In [75]:
#???????
myquery = {"location":"New York, NY"}

tweets.count_documents(myquery)

146

In [76]:
hashtag_count = {}
for tweet in tweets.find():
    ht = tweet["hashtags"]
    for hash in ht:
        hash = hash.lower()
        if hash not in hashtag_count:
            hashtag_count[hash] = 1
        else:
            hashtag_count[hash] += 1

hc = sorted(hashtag_count, key=hashtag_count.get, reverse=True)  # Sorting the top 10.
count = 0

for z in hc:
    if count == 10:
        break
    jsonx = {"hashtag": z, "count": str(hashtag_count[z])}
    count += 1
# {k: v for k, v in sorted(hashtag_count.items(), key=lambda item: item[1])}

## Experimenting with text index

In [28]:
#Connect to the mongoDB database
client = pymongo.MongoClient()
db1 = client["TWT_DB"]
test = db1["test"]

In [29]:
#Create a test index
db1.test.create_index([("content",pymongo.TEXT)])

'content_text'

In [30]:
twts1 = db1.test.find({"$text": {"$search": "\"#covid19\"" } } )
twts1 = list(twts1)
twts1[0]

{'_id': ObjectId('5ea66afbfb8f5ff7fd474756'),
 'id_str': '1253752842361483265',
 'user_name': 'Dr. Marcell Vollmer #SocialDistancing #StayHome',
 'user_id': '99674560',
 'content': 'The Great #Lockdown of the economy has been completely unprecedented, both in terms of the speed of the shutdown and its impact on jobs.\n\n #coronavirus #COVID19 #health #COVIDー19 #worklife #motivation #FutureofWork #Leadership #WorkLifeBalance #USA\n\n https://t.co/yoEHiyJcxc',
 'created_at': '2020-04-24 18:29:05',
 'location': 'Munich, Bavaria',
 'hashtags': ['Lockdown',
  'coronavirus',
  'COVID19',
  'health',
  'COVIDー19',
  'worklife',
  'motivation',
  'FutureofWork',
  'Leadership',
  'WorkLifeBalance',
  'USA'],
 'mentions': [],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': -1,
 'media': [],
 'retweet': False,
 'FavCount': 6,
 'source': 'Twitter for iPad',
 'retweet_count': 4}

In [31]:
len(twts1)

396

In [58]:
twts1 = db1.test.find({"$text": {"$search": "covid19"} } )
twts1 = list(twts1)
twts1[0]

{'_id': ObjectId('5ea66afbfb8f5ff7fd474756'),
 'id_str': '1253752842361483265',
 'user_name': 'Dr. Marcell Vollmer #SocialDistancing #StayHome',
 'user_id': '99674560',
 'content': 'The Great #Lockdown of the economy has been completely unprecedented, both in terms of the speed of the shutdown and its impact on jobs.\n\n #coronavirus #COVID19 #health #COVIDー19 #worklife #motivation #FutureofWork #Leadership #WorkLifeBalance #USA\n\n https://t.co/yoEHiyJcxc',
 'created_at': '2020-04-24 18:29:05',
 'location': 'Munich, Bavaria',
 'hashtags': ['Lockdown',
  'coronavirus',
  'COVID19',
  'health',
  'COVIDー19',
  'worklife',
  'motivation',
  'FutureofWork',
  'Leadership',
  'WorkLifeBalance',
  'USA'],
 'mentions': [],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': -1,
 'media': [],
 'retweet': False,
 'FavCount': 6,
 'source': 'Twitter for iPad',
 'retweet_count': 4}

In [56]:
len(twts1)

408

In [44]:
db1.test.find({"$text":{"$search": "covid19"}}).explain()

{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'TWT_DB.test',
  'indexFilterSet': False,
  'parsedQuery': {'$text': {'$search': 'covid19',
    '$language': 'english',
    '$caseSensitive': False,
    '$diacriticSensitive': False}},
  'winningPlan': {'stage': 'TEXT',
   'indexPrefix': {},
   'indexName': 'content_text',
   'parsedTextQuery': {'terms': ['covid19'],
    'negatedTerms': [],
    'phrases': [],
    'negatedPhrases': []},
   'textIndexVersion': 3,
   'inputStage': {'stage': 'TEXT_MATCH',
    'inputStage': {'stage': 'FETCH',
     'inputStage': {'stage': 'OR',
      'inputStage': {'stage': 'IXSCAN',
       'keyPattern': {'_fts': 'text', '_ftsx': 1},
       'indexName': 'content_text',
       'isMultiKey': True,
       'isUnique': False,
       'isSparse': False,
       'isPartial': False,
       'indexVersion': 2,
       'direction': 'backward',
       'indexBounds': {}}}}}},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 408,
  'exe

In [36]:
#Connect to the mongoDB database
client = pymongo.MongoClient()
db = client["TWT_DB"]
tweets = db["tweets"]

In [50]:
db.tweets.find({"content":{"$regex":"covid19","$options" :'i'}}).explain()

{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'TWT_DB.tweets',
  'indexFilterSet': False,
  'parsedQuery': {'content': {'$regex': 'covid19', '$options': 'i'}},
  'winningPlan': {'stage': 'COLLSCAN',
   'filter': {'content': {'$regex': 'covid19', '$options': 'i'}},
   'direction': 'forward'},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 413,
  'executionTimeMillis': 11,
  'totalKeysExamined': 0,
  'totalDocsExamined': 15429,
  'executionStages': {'stage': 'COLLSCAN',
   'filter': {'content': {'$regex': 'covid19', '$options': 'i'}},
   'nReturned': 413,
   'executionTimeMillisEstimate': 0,
   'works': 15431,
   'advanced': 413,
   'needTime': 15017,
   'needYield': 0,
   'saveState': 120,
   'restoreState': 120,
   'isEOF': 1,
   'direction': 'forward',
   'docsExamined': 15429},
  'allPlansExecution': []},
 'serverInfo': {'host': 'DESKTOP-T0J0HGC',
  'port': 27017,
  'version': '4.2.6',
  'gitVersion': '20364840b8f1af16917e4c23c1b5f5efd8b3

In [59]:
client.close()

In [23]:
#search by two words
twts = tweets.find( {
    "$and" : [
        {"content":{"$regex":"covid19","$options" :'i'}} , {"content":{"$regex":"#trump","$options" :'i'}}
    ]
} )
twts = list(twts)
twts

[{'_id': ObjectId('5ea77cf6c9283bc56a26e378'),
  'id_str': '1254804431931047939',
  'user_name': 'TTDINK50⭐️⭐️⭐️',
  'user_id': '4857963897',
  'content': 'Food shortage hapenning!\n carne y escases en USA! y el mundo sera afectado!\n#NoMoreLockDown\n#OPENAMERICANOW\n#economy\n#Trump2020NowMoreThanEver\n#AlertaCOVID19SV\n#QuedateEnCasa\n#ElSalvadorEnCuarentena\n#economiaelsalvador\n#4TNosDejaSinComida\n#cordonsanitario\n\nhttps://t.co/3ZACV0sS0u',
  'created_at': '2020-04-27 16:07:44',
  'location': 'USA',
  'hashtags': ['NoMoreLockDown', 'OPENAMERICANOW', 'economy'],
  'mentions': ['999825601610461184'],
  'in_reply_to_user_id': None,
  'in_reply_to_status_id': None,
  'retweetedFrom': '999825601610461184',
  'media': [],
  'retweet': True,
  'FavCount': -1,
  'source': 'Twitter Web App',
  'retweet_count': 1},
 {'_id': ObjectId('5ea77d26c9283bc56a26eeb9'),
  'id_str': '1254579387464212487',
  'user_name': 'J Cassidy',
  'user_id': '718527880284610561',
  'content': "@realDonaldTrump 