# Queries for the search application

In [1]:
import pandas as pd
import json
import pymongo
import io
import pprint

In [2]:
#Connect to the mongoDB database
client = pymongo.MongoClient()
db = client["TWT_DB"]
tweets = db["tweets"]

In [13]:
#Document(row) sample from the mongoDB database 
myquery = {}
mydoc = list(tweets.find(myquery))
mydoc[0]

{'_id': ObjectId('5ea66a14fb8f5ff7fd471280'),
 'id_str': '1254639872641576960',
 'user_name': 'Xcntrik',
 'user_id': '123628682',
 'content': 'Many U.S. businesses unlikely to seek government aid: NABE survey https://t.co/9bPT4YQN5I https://t.co/0B3ekpqymK',
 'created_at': '2020-04-27 05:13:50',
 'location': 'Southeast Texas',
 'hashtags': [],
 'mentions': ['1652541'],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': '1652541',
 'media': ['https://t.co/0B3ekpqymK'],
 'retweet': True,
 'FavCount': -1,
 'source': 'Twitter for Android',
 'retweet_count': 7}

## Search by word

In [4]:
#Find number of tweets with word
myquery = {"content":{"$regex":"19","$options" :'i'}}
tweets.count_documents(myquery)

2564

In [5]:
#Tweets
twts = list(tweets.find(myquery))
twts[0]

{'_id': ObjectId('5ea66a14fb8f5ff7fd471288'),
 'id_str': '1254639272209952769',
 'user_name': 'Sue Zoo',
 'user_id': '260503264',
 'content': 'US economy faces historic shock with 16% joblessness possible. It hasn’t been this high since 1940, one term president \u2066@realDonaldTrump\u2069!  https://t.co/gzCKuKi35j',
 'created_at': '2020-04-27 05:11:27',
 'location': '',
 'hashtags': [],
 'mentions': ['394782575'],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': '394782575',
 'media': [],
 'retweet': True,
 'FavCount': -1,
 'source': 'Twitter Web App',
 'retweet_count': 1}

## Search by exact phrase(can be a word or hashtag)

In [6]:
#Find number of tweets with #covid19 (Exact string put the specified string between the ^(Starts with) and $(Ends with))
myquery = {"hashtags":{"$regex":"^covid19$","$options" :'i'}}
tweets.count_documents(myquery)

222

In [7]:
#Tweets
twts = list(tweets.find(myquery))
twts[0]

{'_id': ObjectId('5ea66a16fb8f5ff7fd471300'),
 'id_str': '1254628238032203776',
 'user_name': 'Toshitaka Kage',
 'user_id': '181232582',
 'content': 'Fauci says #COVID19 testing needs to be doubled before U.S. reopens economy https://t.co/B576qDECyl https://t.co/A0RRjKEw8m',
 'created_at': '2020-04-27 04:27:36',
 'location': '',
 'hashtags': ['COVID19'],
 'mentions': ['487118986'],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': '487118986',
 'media': ['https://t.co/A0RRjKEw8m'],
 'retweet': True,
 'FavCount': -1,
 'source': 'Twitter Web App',
 'retweet_count': 27}

In [22]:
#Types of tweets
tweet_type = {'original': 0, 'retweets': 0}

for tweet in tweets.find():
    if tweet['retweet'] == True:
        tweet_type['original'] += 1
    else:
        tweet_type['retweets'] += 1
tweet_type

{'original': 12123, 'retweets': 3306}

In [11]:
#Types of tweets
count_retweet = tweets.count_documents({"retweet":True})
count_original = tweets.count_documents({"retweet":False})
"Number of original tweet:" + str(count_original) + " and number of retweets:" + str(count_retweet)

'Number of original tweet:3306 and number of retweets:12123'

In [24]:
hashtag_count = {}
for tweet in tweets.find():
    ht = tweet["hashtags"]
    for hash in ht:
        hash = hash.lower()
        if hash not in hashtag_count:
            hashtag_count[hash] = 1
        else:
            hashtag_count[hash] += 1

hc = sorted(hashtag_count, key=hashtag_count.get, reverse=True)  # Sorting the top 10.
count = 0

for z in hc:
    if count == 10:
        break
    jsonx = {"hashtag": z, "count": str(hashtag_count[z])}
    count += 1

In [25]:
{k: v for k, v in sorted(hashtag_count.items(), key=lambda item: item[1])}

{'builds': 1,
 'state': 1,
 'wanon': 1,
 'foxnews': 1,
 'thegreatawakening': 1,
 'liberallogic': 1,
 'democratslie': 1,
 'trendingnow': 1,
 'fakenewsmedia': 1,
 '4moreyears': 1,
 '8yearsforever': 1,
 'irgc': 1,
 'psychoanalysis': 1,
 'philosophy': 1,
 'theory': 1,
 'baudrillard': 1,
 'deleuze': 1,
 'jung': 1,
 'lacan': 1,
 'mcluhan': 1,
 'reich': 1,
 'hegel': 1,
 'spinoza': 1,
 'aesthetics': 1,
 'difference': 1,
 'wethepeoplewill': 1,
 'yemen': 1,
 'somalia': 1,
 'war': 1,
 'wankers': 1,
 'coronaviru': 1,
 'traveltuesday': 1,
 'california': 1,
 'fechadocombolsonaro': 1,
 'reopentn': 1,
 'takedapharmaceutical': 1,
 'stock': 1,
 'fintech': 1,
 'ico': 1,
 'income': 1,
 'wealth': 1,
 'cuckoocortez': 1,
 'dumptrump': 1,
 'mining': 1,
 'metals': 1,
 'bailout': 1,
 'hyperinflation': 1,
 'dimecoin': 1,
 'dimecoinnetwork': 1,
 'dime': 1,
 'globalbuzz': 1,
 'tpp': 1,
 'parisclimateagreement': 1,
 'wherearethetests': 1,
 'shelterinplace': 1,
 'entrepreneurs': 1,
 'healthcare': 1,
 'stayhomesaveli

In [32]:
#Search using a hashtag.
myquery = {"content":{"$regex":".#covid19.","$options" :'i'}}
twts = tweets.find(myquery)

In [34]:
for tweet in twts:
    print(tweet)

{'_id': ObjectId('5ea66a14fb8f5ff7fd4712a9'), 'id_str': '1254636770051928071', 'user_name': 'Pamela Moore #WWG1WGA #DIGITALSOLDIERS', 'user_id': '1188695886781698050', 'content': 'Met virtually with our terrific @AmChamPH Board on how U.S. firms are supporting the Philippine economy, workforce, and #COVID19 response. We’re all in this together, and great to \u200bsee U.S. firms contributing in so many ways. #FriendsPartnersAllies #goodbUSinessPH', 'created_at': '2020-04-27 05:01:30', 'location': 'Broken Arrow, OK', 'hashtags': [], 'mentions': ['441540997', '4745918652'], 'in_reply_to_user_id': None, 'in_reply_to_status_id': None, 'retweetedFrom': '441540997', 'media': [], 'retweet': True, 'FavCount': -1, 'source': 'Twitter for Android', 'retweet_count': 118}
{'_id': ObjectId('5ea66a16fb8f5ff7fd471300'), 'id_str': '1254628238032203776', 'user_name': 'Toshitaka Kage', 'user_id': '181232582', 'content': 'Fauci says #COVID19 testing needs to be doubled before U.S. reopens economy https://t

In [20]:
#Connect to the mongoDB database
client = pymongo.MongoClient()
db = client["TWT_DB"]
tweets = db["test"]

In [25]:
#Create a test index
db.tweets.create_index([("content",pymongo.TEXT)])

'content_text'

In [57]:
twts = db.tweets.find( { "$text": { "$search": "\"#covid19\"" } } )
twts = list(twts)
twts

[{'_id': ObjectId('5ea66afbfb8f5ff7fd474756'),
  'id_str': '1253752842361483265',
  'user_name': 'Dr. Marcell Vollmer #SocialDistancing #StayHome',
  'user_id': '99674560',
  'content': 'The Great #Lockdown of the economy has been completely unprecedented, both in terms of the speed of the shutdown and its impact on jobs.\n\n #coronavirus #COVID19 #health #COVIDー19 #worklife #motivation #FutureofWork #Leadership #WorkLifeBalance #USA\n\n https://t.co/yoEHiyJcxc',
  'created_at': '2020-04-24 18:29:05',
  'location': 'Munich, Bavaria',
  'hashtags': ['Lockdown',
   'coronavirus',
   'COVID19',
   'health',
   'COVIDー19',
   'worklife',
   'motivation',
   'FutureofWork',
   'Leadership',
   'WorkLifeBalance',
   'USA'],
  'mentions': [],
  'in_reply_to_user_id': None,
  'in_reply_to_status_id': None,
  'retweetedFrom': -1,
  'media': [],
  'retweet': False,
  'FavCount': 6,
  'source': 'Twitter for iPad',
  'retweet_count': 4},
 {'_id': ObjectId('5ea66afbfb8f5ff7fd47474c'),
  'id_str': '

In [58]:
len(twts)

396

In [67]:
twts = db.tweets.find( { "$text": { "$search": "U.S."} } )
twts = list(twts)
twts

[{'_id': ObjectId('5ea66aecfb8f5ff7fd47442a'),
  'id_str': '1253789808247349248',
  'user_name': 'Bud Babbit',
  'user_id': '1251491717850808320',
  'content': '@IngrahamAngle New York is also responsible for 8% of the U.S. GDP.  It’s economy is the same size as Canada’s.  If NY was a country it’s total economy would sit 11th on the list....repeat, of countries.   Not shocking that it’s population created such a covid problem.',
  'created_at': '2020-04-24 20:55:59',
  'location': '',
  'hashtags': [],
  'mentions': ['50769180'],
  'in_reply_to_user_id': 50769180,
  'in_reply_to_status_id': 1253787025909723139,
  'retweetedFrom': -1,
  'media': [],
  'retweet': False,
  'FavCount': 4,
  'source': 'Twitter for iPhone',
  'retweet_count': 0},
 {'_id': ObjectId('5ea66a53fb8f5ff7fd472275'),
  'id_str': '1254413742210719744',
  'user_name': 'lindokuhle shezi',
  'user_id': '2819505970',
  'content': '@Oscar_T_Hamese That’s the reason why white people still own 80% of our economy, it’s that 

In [76]:
twts = db.tweets.runcommand( { "$text": { "$search": "\"U.S.\" \"#covid19""} } )
twts = list(twts)
twts

SyntaxError: EOL while scanning string literal (<ipython-input-76-bbd02dd4672c>, line 1)

In [83]:
# Search by Time range
twts = db.tweets.find({"created_at":{ "$gte": "2020-04-27 05:13:10", "$lt": "2020-04-27 05:13:50" }})
twts = list(twts)
twts

[{'_id': ObjectId('5ea66a14fb8f5ff7fd471281'),
  'id_str': '1254639834473230336',
  'user_name': 'Dave Talltree, son of Wymo of Mokuleia',
  'user_id': '1098317762626088961',
  'content': 'Many U.S. businesses unlikely to seek government aid: NABE survey https://t.co/9bPT4YQN5I https://t.co/0B3ekpqymK',
  'created_at': '2020-04-27 05:13:41',
  'location': 'Keiki o ka ʻĀina ',
  'hashtags': [],
  'mentions': ['1652541'],
  'in_reply_to_user_id': None,
  'in_reply_to_status_id': None,
  'retweetedFrom': '1652541',
  'media': ['https://t.co/0B3ekpqymK'],
  'retweet': True,
  'FavCount': -1,
  'source': 'Twitter Web App',
  'retweet_count': 7},
 {'_id': ObjectId('5ea66a14fb8f5ff7fd471282'),
  'id_str': '1254639809068519424',
  'user_name': 'Howard',
  'user_id': '2337724718',
  'content': 'U.S.: bailout small businesses by giving grants to 5% of them\n\nJapan: bailout small businesses by giving 100% of them all the payroll money they need\nhttps://t.co/10BjEDjjn3',
  'created_at': '2020-04

## Misc

In [15]:
#???????
myquery = {"location":"New York, NY"}

tweets.count_documents(myquery)

104