# Queries for the search application

In [1]:
import pandas as pd
import json
import pymongo
import io
import pprint
import psycopg2

## PostgreSQL

In [24]:
#connect to postgreSQL database
conn = psycopg2.connect(host="localhost", port = 5432, database="twitter", user="postgres", password="postgres@329")
cur = conn.cursor()

### About user data

In [12]:
#Row sample from the database 
cur.execute("select * from user_df limit 5")
user = pd.DataFrame(cur.fetchall())
user.columns = [desc[0] for desc in cur.description]
user

Unnamed: 0,user_id,user_id_str,name,screen_name,location,description,protected,followers_count,friends_count,listed_count,created_at,favourites_count,verified,statuses_count
0,498137972,498137972,Diana Dumitru,anaiduza,,Dubito ergo...,False,396,2099,225,Mon Feb 20 18:40:31 +0000 2012,4259,False,80987
1,65466158,65466158,Pedro da Costa,pdacosta,"Washington, DC",Federal Reserve & economy watcher at Market Ne...,False,136283,28095,4398,Thu Aug 13 20:59:18 +0000 2009,82454,True,239352
2,1399854920,1399854920,Kobe,BryantBWild,The Wolfpack,"Fall In Order To Grow, Lose In Order To Gain. ...",False,886,635,1,Fri May 03 14:16:58 +0000 2013,11986,False,42034
3,940032448057896960,940032448057896960,JR Thatcher 📈,JrThatcher,"West TX, USA",Graduate Student | B.S. Economics | Finance | ...,False,1145,535,13,Mon Dec 11 01:36:15 +0000 2017,178003,False,97937
4,808545,808545,Bruce Reyes-Chow 🗽,breyeschow,"San Francisco, CA",he/him/his\n#WarrenDemocrat\n@fpcpaloalto @brc...,False,10005,1748,562,Sat Mar 03 15:46:53 +0000 2007,25727,False,60731


In [25]:
## Types of user accounts (Verified vs non-verified)
#Row sample from the database 
cur.execute("SELECT count(CASE WHEN verified THEN 1 END) FROM user_df;")
verified = cur.fetchall()
cur.execute("SELECT count(CASE WHEN not verified THEN 0 END) FROM user_df;")
non_verified = cur.fetchall()

"Number of verified user accounts:" + str(verified[0][0]) + " and non_verified account:" + str(non_verified[0][0])

'Number of verified user accounts:935 and non_verified account:15863'

### Queries based on only user data

In [18]:
# Top N users and their location based on
# Status_count
# Favourites_count
# Followers_count

cur.execute("select user_id, name, location, statuses_count from user_df order by statuses_count desc limit 5")
top_n_statuses_count = pd.DataFrame(cur.fetchall())
top_n_statuses_count.columns = [desc[0] for desc in cur.description]
top_n_statuses_count

cur.execute("select user_id, name, location, favourites_count from user_df order by favourites_count desc limit 5")
top_n_favourites_count = pd.DataFrame(cur.fetchall())
top_n_favourites_count.columns = [desc[0] for desc in cur.description]
top_n_favourites_count

cur.execute("select user_id, name, location, followers_count from user_df order by followers_count desc limit 5;")
top_n_followers_count = pd.DataFrame(cur.fetchall())
top_n_followers_count.columns = [desc[0] for desc in cur.description]
top_n_followers_count

Unnamed: 0,user_id,name,location,followers_count
0,807095,The New York Times,New York City,46425645
1,1652541,Reuters,Around the world,21866712
2,1652541,Reuters,Around the world,21866711
3,1652541,Reuters,Around the world,21866710
4,1652541,Reuters,Around the world,21866710


In [23]:
#Close the cursor and connection to the database
cur.close()
conn.close()

## MongoDB

In [2]:
#Connect to the mongoDB database
client = pymongo.MongoClient()
db = client["tweet_database"]
tweets = db.tweets_collection

In [28]:
#Document(row) sample from the mongoDB database 
myquery = {}
mydoc = list(tweets.find(myquery))
mydoc[0]

{'_id': ObjectId('5ea9c38017dfdac82a918317'),
 'id': 1255560342383603721,
 'user_name': 'Lean Consultancy',
 'user_id': '990868256360751104',
 'content': '@Growth_Lean &gt;&gt; U.S. pending home sales fall sharply in March https://t.co/8Lud9zsELG #lean https://t.co/k6PzFSfL1S',
 'created_at': '2020-04-29 18:11:27',
 'location': 'Europe',
 'hashtags': ['lean'],
 'mentions': ['990868256360751104'],
 'in_reply_to_user_id': 990868256360751104,
 'in_reply_to_status_id': None,
 'retweedt_orid_tweetID': -1,
 'retweetedFrom_id': -1,
 'retweetedFrom_name': 'NA',
 'media': ['https://t.co/k6PzFSfL1S'],
 'retweet': False,
 'FavCount': 0,
 'Orig_retweet_fav': -1,
 'source': 'Twibble.io',
 'retweet_count': 0,
 'lang': 'en'}

### About data

In [9]:
## Types of tweets
count_retweet = tweets.count_documents({"retweet":True})
count_original = tweets.count_documents({"retweet":False})
"Number of original tweet:" + str(count_original) + " and number of retweets:" + str(count_retweet)

'Number of original tweet:3306 and number of retweets:12123'

In [10]:
## Type of media in our dataset
type_count = {"text": 0, "Only_image":0, "both":0}
for tweet in tweets.find():
    md = tweet["media"]
    if len(md) != 0:
        if tweet["content"] == "":
            type_count["image"] += 1 # Only Image
        else:
            type_count["both"] += 1 # Image and text
    else:
        type_count["text"] += 1 # Only text

type_count

{'text': 14738, 'Only_image': 0, 'both': 691}

In [15]:
## Number of distint user in database
distinct_users = set()

for tweet in tweets.find():
    distinct_users.add(tweet['user_id'])

print("{} distinct user out of {} total user".format(len(distinct_users), tweets.count()))

14040 distinct user out of 15429 total user


  import sys


In [16]:
## Tweet with  maximum retweet count
max_retweet_count = 0
for tweet in tweets.find():
    if tweet['retweet_count']>max_retweet_count:
        max_retweet_count = tweet['retweet_count']
        tweet_id = tweet['id_str']
print(max_retweet_count)

myquery = {'id_str': tweet_id}
twts = list(tweets.find(myquery))
twts[0]

12676


{'_id': ObjectId('5ea77dc7c9283bc56a270ebc'),
 'id_str': '1254112304754556929',
 'user_name': 'J',
 'user_id': '785827332',
 'content': "3 companies with ties to the Trump admin. received millions under the PPP program. Another got a loan from a bank that once employed its board's chair. https://t.co/GcgqwGPe43",
 'created_at': '2020-04-25 18:17:28',
 'location': 'The World',
 'hashtags': [],
 'mentions': ['14173315'],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': '14173315',
 'media': [],
 'retweet': True,
 'FavCount': -1,
 'source': 'Twitter for iPhone',
 'retweet_count': 12676}

In [17]:
## Tweet with  maximum favorite count
max_FavCount = 0
for tweet in tweets.find():
    if tweet['FavCount']>max_FavCount:
        max_FavCount = tweet['FavCount']
        tweet_id = tweet['id_str']
print(max_FavCount)

myquery = {'id_str': tweet_id}
twts = list(tweets.find(myquery))
twts[0]

4708


{'_id': ObjectId('5ea77daec9283bc56a2709fb'),
 'id_str': '1254168393638449153',
 'user_name': 'POLITICO',
 'user_id': '9300262',
 'content': 'Dr. Anthony Fauci says the U.S. should at least double coronavirus testing in the coming weeks before easing into reopening the economy https://t.co/9LrRsvl1qX',
 'created_at': '2020-04-25 22:00:21',
 'location': 'Washington, D.C.',
 'hashtags': [],
 'mentions': [],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': -1,
 'media': [],
 'retweet': False,
 'FavCount': 4708,
 'source': 'Hootsuite Inc.',
 'retweet_count': 1752}

### Search by word

In [24]:
#Find number of tweets with word
def search_word():
    user_input = input("Please enter a word: ")
    myquery = {"$text":{"$search": user_input}}
    twts = pd.DataFrame(tweets.find(myquery)).drop(['_id','source','media'],axis=1).sort_values(["retweet",
                                "FavCount","retweet_count"],ascending=[True,False,False]).reset_index()
    return twts

In [25]:
search_word().head()

Please enter a word: covid


Unnamed: 0,index,id,user_name,user_id,content,created_at,location,hashtags,mentions,in_reply_to_user_id,in_reply_to_status_id,retweedt_orid_tweetID,retweetedFrom_id,retweetedFrom_name,retweet,FavCount,Orig_retweet_fav,retweet_count,lang
0,77,1255218978143842308,One America News,1209936918,Increased COVID-19 testing shows promise for r...,2020-04-28 19:34:59,,[OANN],[],,,-1,-1,,False,531,-1,143,en
1,862,1255266382285082625,steve hilton,2779389582,"""reopen pre-K, K-12 schools...Switzerland is p...",2020-04-28 22:43:21,California,[],"[49457533, 851526166100029442]",,,-1,-1,,False,124,-1,66,en
2,685,1255438440814055424,Reuters,1652541,The U.S. economy likely contracted in the firs...,2020-04-29 10:07:03,Around the world,[],[],,,-1,-1,,False,86,-1,44,en
3,0,1255324865684529155,#TestAndTrace EVERYWHERE NOW 🐇,281877818,27/The U.S. wasn't just bad at testing for COV...,2020-04-29 02:35:45,"San Francisco, CA",[],[],281877818.0,1.255325e+18,-1,-1,,False,55,-1,18,en
4,726,1255468651907616769,Reuters Business,15110357,The U.S. economy likely contracted in the firs...,2020-04-29 12:07:06,Around the world,[],[],,,-1,-1,,False,38,-1,22,en


### Search by a hashtag

In [30]:
# #Find number of tweets with #covid19
def search_hashtag():
    user_input = input("Please enter a hashtag(include #): ")
    myquery = {"content":{"$regex":user_input,"$options" :'i'}}
    twts = pd.DataFrame(tweets.find(myquery)).drop(['_id','source','media'],axis=1).sort_values(["retweet",
                                                "FavCount","retweet_count"],ascending=[True,False,False]).reset_index()
    return twts

In [31]:
search_hashtag().head()

Please enter a hashtag(include #): #trump


Unnamed: 0,id,user_name,user_id,content,created_at,location,hashtags,mentions,in_reply_to_user_id,in_reply_to_status_id,retweedt_orid_tweetID,retweetedFrom_id,retweetedFrom_name,retweet,FavCount,Orig_retweet_fav,retweet_count,lang
32,1255488644762214401,K. Sennholz MD,17194090,"So basically, REPUBLICANS CRASHED THE ECONOMY ...",2020-04-29 13:26:33,United States,[TrumpLiedPeopleDied],[],,,-1,-1,,False,24,-1,11,en
4,1255541044135038976,Dempsey O'Dwyer,984453637773647877,"According to a new NPR/Marist poll, the majori...",2020-04-29 16:54:46,United States,"[coronavirus, unemployment, TrumpIsAnIdiot]",[],,,-1,-1,,False,5,-1,2,en
11,1255520013437591555,M.Mahdi Abbasi,1223499106900946944,سقوط ۴.۸ درصدی اقتصاد آمریکا تو ۳ ماهه اول سال...,2020-04-29 15:31:12,,[TrumpIsTheWORSTPresidentEVER],[],,,-1,-1,,False,4,-1,1,fa
36,1255483052907802625,REMOVEallPOS#trumpublicans,1151842300944928768,And it's #trumpsFAULT! @realDonaldTrump's abso...,2020-04-29 13:04:20,Somewhere in Iowa,[trumpsFAULT],[25073877],,,-1,-1,,False,3,-1,3,en
39,1255480003871027200,Jose Enriquez,3565028294,Much of U.S. economy still plugging along desp...,2020-04-29 12:52:13,,"[coronavirus, COVID19, COVID19, MAGA, KAG2020,...",[25073877],,,-1,-1,,False,2,-1,1,en


### Search by time range

In [48]:
#Search by time range
def search_date():
    start_date = input("Please enter a start date(format:yyyy-mm-dd hh:mm:ss): ")
    end_date = input("Please enter a end date(format:yyyy-mm-dd hh:mm:ss): ")
    twts = pd.DataFrame(tweets.find({"created_at":{ "$gte": start_date, "$lt": end_date }})).drop(['_id','source',
                'media'],axis=1).sort_values(["retweet","FavCount","retweet_count"],ascending=[True,False,False]).reset_index()
    return twts
search_date().head()

Please enter a start date(format:yyyy-mm-dd hh:mm:ss): 2020-04-27
Please enter a end date(format:yyyy-mm-dd hh:mm:ss): 2020-04-28


Unnamed: 0,id,user_name,user_id,content,created_at,location,hashtags,mentions,in_reply_to_user_id,in_reply_to_status_id,retweedt_orid_tweetID,retweetedFrom_id,retweetedFrom_name,retweet,FavCount,Orig_retweet_fav,retweet_count,lang
532,1254896442595897344,Emily Finn,1187652746,#NEW: At today’s coronavirus task force press ...,2020-04-27 22:13:21,"San Diego, CA",[NEW],[25073877],,,-1,-1,,False,837,-1,180,en
295,1254905495921528836,Richard Stengel,807357676300730368,When Trump says we had the greatest economy ev...,2020-04-27 22:49:19,"New York, NY",[],[],,,-1,-1,,False,571,-1,130,en
536,1254896183161602048,Andrew Clark,60403024,President Trump oversaw a massive mobilization...,2020-04-27 22:12:19,"Washington, DC",[],[],,,-1,-1,,False,91,-1,40,en
433,1254900459225169925,West Wing Reports,20182089,Here's U.S. economic growth since 1940. Trump ...,2020-04-27 22:29:19,White House & elsewhere,[],[],,,-1,-1,,False,81,-1,46,en
972,1254864663658954753,CGTN,1115874631,"Dow jumps 350 points, closing above 24,000, un...",2020-04-27 20:07:04,"Beijing, China",[],[],,,-1,-1,,False,76,-1,15,en


### Search by user name

In [7]:
# Search by user_name
def search_user_name():
    user_input = input("Please enter a user_name: ")
    twts = pd.DataFrame(tweets.find({"user_name":user_input})).drop(['_id','source',
                'media'],axis=1).sort_values(["retweet","FavCount","retweet_count"],ascending=[True,False,False])
    return twts
search_user_name().head()

Please enter a user_name: Tom


Unnamed: 0,id,user_name,user_id,content,created_at,location,hashtags,mentions,in_reply_to_user_id,in_reply_to_status_id,retweedt_orid_tweetID,retweetedFrom_id,retweetedFrom_name,retweet,FavCount,Orig_retweet_fav,retweet_count,lang
0,1255560152599662594,Tom,2355540739,This is bonkers:\n\nRoughly half of all U.S. w...,2020-04-29 18:10:42,,[],[135515077],,,1255099265728352258,135515077,NKingofDC,True,0,7990,2274,en
2,1255207171312300034,Tom,1197488167311597568,"""The consequence of the lack of federal leader...",2020-04-28 18:48:04,,[],[16129920],,,1255206920698433536,16129920,maddow,True,0,2506,1224,en
1,1255225760668925954,Tom,822567886506098689,Increased COVID-19 testing shows promise for r...,2020-04-28 20:01:56,,[OANN],[1209936918],,,1255218978143842308,1209936918,OANN,True,0,531,143,en


### Search by user_id

In [10]:
# Search by user id
def search_user_id():
    user_input = input("Please enter a user_id: ")
    twts = pd.DataFrame(tweets.find({"user_id":user_input})).drop(['_id','source','media'],axis=1).sort_values(["retweet",
                                                    "FavCount","retweet_count"],ascending=[True,False,False]).reset_index()
    return twts
search_user_id().head()

Please enter a user_id: 2355540739


Unnamed: 0,id,user_name,user_id,content,created_at,location,hashtags,mentions,in_reply_to_user_id,in_reply_to_status_id,retweedt_orid_tweetID,retweetedFrom_id,retweetedFrom_name,retweet,FavCount,Orig_retweet_fav,retweet_count,lang
0,1255560152599662594,Tom,2355540739,This is bonkers:\n\nRoughly half of all U.S. w...,2020-04-29 18:10:42,,[],[135515077],,,1255099265728352258,135515077,NKingofDC,True,0,7990,2274,en


### Search by two words

In [34]:
#search by two words
def search_by_two_words():
    user_input1 = input("Please enter a word: ")
    user_input2 = input("Please enter a another word: ")
    myquery = {
    "$and" : [
        {"content":{"$regex":user_input1,"$options" :'i'}} , {"content":{"$regex":user_input2,"$options" :'i'}}
        ]
    }
    twts = pd.DataFrame(tweets.find(myquery)).drop(['_id','source','media'],axis=1).sort_values(["retweet",
                                        "FavCount","retweet_count"],ascending=[True,False,False]).reset_index()
    return twts
search_by_two_words().head()

Please enter a word: trump
Please enter a another word: covid


Unnamed: 0,id,user_name,user_id,content,created_at,location,hashtags,mentions,in_reply_to_user_id,in_reply_to_status_id,retweedt_orid_tweetID,retweetedFrom_id,retweetedFrom_name,retweet,FavCount,Orig_retweet_fav,retweet_count,lang
136,1255109060141240320,Coronavirus War Room,1240382538461392901,"BLOOMBERG: ""Virus Testing Shortages Undermine ...",2020-04-28 12:18:13,United States,"[Coronavirus, COVIDー19]",[25073877],,,-1,-1,,False,88,-1,48,en
45,1255393881962639360,The Hindu,20751449,"Leaders of the U.S., the U.K. and Germany beli...",2020-04-29 07:10:00,"Chennai, India","[China, DonaldTrump, COVID19Pandemic]",[],,,-1,-1,,False,45,-1,9,en
191,1254875937277186049,RandomAssHoleOnTheInterWeb,1214722400497131520,@EvulGeenyus @inktomi544 @KwikWarren 90% of CO...,2020-04-27 20:51:52,"Los Angeles, California",[],"[882251606464614401, 22839624, 842432472]",8.822516e+17,1.254865e+18,-1,-1,,False,25,-1,12,en
87,1255174631385976834,Thomas Higbee,918538904290111488,"Always remember, trump said Covid-19 was a hoa...",2020-04-28 16:38:46,MI,[NoHoax],[],,,-1,-1,,False,19,-1,7,en
13,1255527538677383172,George Galloway,15484198,COVID19 PERSPECTIVE | Lets not lose grip on re...,2020-04-29 16:01:06,"London, England",[MOATS],"[22000269, 1155475035123081217, 83638901, 6464...",,,-1,-1,,False,18,-1,6,en


In [17]:
len(twts)

9

### Top five hashtags used

In [40]:
hashtag_count = {}
for tweet in tweets.find({"hashtags":{"$ne":[]}},{"hashtags":1, "_id":0}):
    ht = tweet["hashtags"]
    for hash in ht:
        hash = hash.lower()
        if hash not in hashtag_count:
            hashtag_count[hash] = 1
        else:
            hashtag_count[hash] += 1
hashtags = pd.DataFrame.from_dict(hashtag_count, orient='index')
hashtags = hashtags.rename(columns={0:"count"})
hashtags = hashtags.sort_values("count", ascending=False)
hashtags.head(5)

Unnamed: 0,count
covid19,229
economy,200
oann,192
coronavirus,177
new,165


In [None]:
client.close()

## Queries based on MongoDB and postgreSQL

In [14]:
#Connect to postgreSQL
conn = psycopg2.connect(host="localhost", port = 5432, database="twitter", user="postgres", password="postgres@329")
cur = conn.cursor()

In [3]:
#Connect to MongoDB
client = pymongo.MongoClient()
db = client["tweet_database"]
tweets = db.tweets_collection

### Original user of maximum retweeted tweet

In [33]:
#Original user of maximum retweeted tweet
twts = pd.DataFrame(tweets.find({"retweet": {"$ne": False}},{"user_id":1, "_id":0}).sort("retweet_count",-1).limit(1))
twts = twts.astype({'user_id': 'int64'})

cur.execute("SELECT user_id, name FROM user_df;")
users = pd.DataFrame(cur.fetchall())
users.columns = [desc[0] for desc in cur.description]

pd.merge(users,twts,on="user_id")

Unnamed: 0,user_id,name
0,856978948965769220,The_War_Economy


### What is the location of Maximum tweets.

In [5]:
#Location of maximum tweets
user_twt_count = pd.DataFrame(tweets.aggregate([
  {
    "$group": {
       "_id": "$user_id",
       "count": { "$sum": 1 }
    }
  }
]))
user_twt_count.columns = ["user_id","count"]
user_twt_count = user_twt_count.astype({'user_id': 'int64'})

In [6]:
cur.execute("SELECT user_id,location FROM user_df where location <> '';")
user_loc = pd.DataFrame(cur.fetchall())
user_loc.columns = [desc[0] for desc in cur.description]

In [7]:
loc_df = pd.merge(user_loc,user_twt_count,on="user_id")
loc_df = loc_df[["location","count"]].groupby("location").count()
loc_df.index[loc_df["count"].argmax()]

'United States'

### Search by keyword, hashtag and threshold on number of followers.

In [36]:
def search_word_hashtag_followers_count():
    input_word = input("Please enter a word: ")
    input_hashtag = input("Please enter a hashtag: ")
    threshold = input("Pleas enter a threshold for followers count of users:")
    
    twts = pd.DataFrame(tweets.find( {
        "$and" : [
            {"$text":{"$search": input_word}},
            {"content":{"$regex":input_hashtag,"$options" :'i'}}
        ]
    }))
    twts = twts.drop(["_id","location",'source','media'],axis=1)
    twts = twts.astype({'user_id': 'int64'})
    
    cur.execute("SELECT user_id,followers_count FROM user_df where followers_count > " + threshold + " order by followers_count desc;")
    users = pd.DataFrame(cur.fetchall())
    users.columns = [desc[0] for desc in cur.description]
    
    twts_df = pd.merge(users,twts,on="user_id").sort_values(["followers_count","retweet",
                            "FavCount"],ascending=[False,True,False])
    
    return twts_df

search_word_hashtag_followers_count().head()

Please enter a word: economy
Please enter a hashtag: #trump
Pleas enter a threshold for followers count of users:1000


Unnamed: 0,user_id,followers_count,id,user_name,content,created_at,hashtags,mentions,in_reply_to_user_id,in_reply_to_status_id,retweedt_orid_tweetID,retweetedFrom_id,retweetedFrom_name,retweet,FavCount,Orig_retweet_fav,retweet_count,lang
0,17194090,85643,1255488644762214401,K. Sennholz MD,"So basically, REPUBLICANS CRASHED THE ECONOMY ...",2020-04-29 13:26:33,[TrumpLiedPeopleDied],[],,,-1,-1,,False,24,-1,11,en
1,63144098,25532,1255489158484758528,Dante Boykin,"So basically, REPUBLICANS CRASHED THE ECONOMY ...",2020-04-29 13:28:35,[],[17194090],,,1255488644762214401,17194090,MtnMD,True,0,24,11,en
2,984453637773647877,20241,1255541044135038976,Dempsey O'Dwyer,"According to a new NPR/Marist poll, the majori...",2020-04-29 16:54:46,"[coronavirus, unemployment, TrumpIsAnIdiot]",[],,,-1,-1,,False,5,-1,2,en
3,1151842300944928768,19388,1255483052907802625,REMOVEallPOS#trumpublicans,And it's #trumpsFAULT! @realDonaldTrump's abso...,2020-04-29 13:04:20,[trumpsFAULT],[25073877],,,-1,-1,,False,3,-1,3,en
4,1151842300944928768,19388,1255541058513129475,REMOVEallPOS#trumpublicans,And it's #trumpsFAULT! @realDonaldTrump's abso...,2020-04-29 16:54:49,[trumpsFAULT],"[1151842300944928768, 25073877]",,,1255483052907802625,1151842300944928768,Jeff66923777,True,0,3,3,en
5,25843801,17816,1255481103508520960,"Larry Underwood: ""Taking A Stand Against Trump...",#TrumpDepression U.S. economy shrank at 4.8% r...,2020-04-29 12:56:35,"[TrumpDepression, coronavirus]",[],,,-1,-1,,False,1,-1,1,en
6,25843801,17816,1255483947280216067,"Larry Underwood: ""Taking A Stand Against Trump...",#Coronavirus live updates: The #TrumpPandemic ...,2020-04-29 13:07:53,"[Coronavirus, TrumpPandemic]",[],,,-1,-1,,False,0,-1,0,en
7,62048968,16084,1255500164791447554,Hilary Sontag,And it's #trumpsFAULT! @realDonaldTrump's abso...,2020-04-29 14:12:20,[trumpsFAULT],"[1151842300944928768, 25073877]",,,1255483052907802625,1151842300944928768,Jeff66923777,True,0,3,3,en
8,182559944,16066,1255491643173752832,Robert Raymond,"So basically, REPUBLICANS CRASHED THE ECONOMY ...",2020-04-29 13:38:28,[],[17194090],,,1255488644762214401,17194090,MtnMD,True,0,24,11,en
9,519728191,14462,1255525447821144064,Jasper #Ω.,U.S. Economy Shrinks 4.8 Percent in the First ...,2020-04-29 15:52:47,[TrumpIsTheWORSTPresidentEVER],[],,,-1,-1,,False,0,-1,0,en


### Search by keyword, hashtag and a location.

In [39]:
def search_word_hashtag_location():
    input_word = input("Please enter a word: ")
    input_hashtag = input("Please enter a hashtag: ")
    location = input("Pleas enter a location:")
    
    twts = pd.DataFrame(tweets.find( {
        "$and" : [
            {"$text":{"$search": input_word}},
            {"content":{"$regex":input_hashtag,"$options" :'i'}}
        ]
    }))
    twts = twts.drop(["_id","location",'source','media'],axis=1)
    twts = twts.astype({'user_id': 'int64'})
    
    cur.execute("SELECT user_id,location FROM user_df where location like '%" + location + "%';")
    users = pd.DataFrame(cur.fetchall())
    users.columns = [desc[0] for desc in cur.description]
    
    twts_df = pd.merge(users,twts,on="user_id").sort_values(["retweet","FavCount",
                                                             "retweet_count"],ascending=[True,False,False]).reset_index()
    
    return twts_df
search_word_hashtag_location()

Please enter a word: economy
Please enter a hashtag: #trump
Pleas enter a location:New York


Unnamed: 0,index,user_id,location,id,user_name,content,created_at,hashtags,mentions,in_reply_to_user_id,in_reply_to_status_id,retweedt_orid_tweetID,retweetedFrom_id,retweetedFrom_name,retweet,FavCount,Orig_retweet_fav,retweet_count,lang
0,0,379297386,New York City,1255272718376800258,tatianyc.oeuvre,@tatianycoeuvre #Trump Wants You To Die For th...,2020-04-28 23:08:32,[Trump],"[954497565336178688, 379297386]",,,1255268879032614915,954497565336178688,korol_koshek,True,0,2,1,en


In [13]:
#Close the cursor and connection to the database
cur.close()
conn.close()

In [42]:
client.close()

## Misc

In [21]:
hashtag_count = {}
for tweet in tweets.find():
    ht = tweet["hashtags"]
    for hash in ht:
        hash = hash.lower()
        if hash not in hashtag_count:
            hashtag_count[hash] = 1
        else:
            hashtag_count[hash] += 1

hc = sorted(hashtag_count, key=hashtag_count.get, reverse=True)  # Sorting the top 10.
count = 0

for z in hc:
    if count == 10:
        break
    jsonx = {"hashtag": z, "count": str(hashtag_count[z])}
    count += 1
# {k: v for k, v in sorted(hashtag_count.items(), key=lambda item: item[1])}

## Experimenting with text index

In [12]:
#Connect to the mongoDB database
client = pymongo.MongoClient()
db1 = client["TWT_DB"]
test = db1["test"]

In [29]:
#Create a test index
test.create_index([("content",pymongo.TEXT)])

'content_text'

In [30]:
twts1 = test.find({"$text": {"$search": "\"#covid19\"" } } )
twts1 = list(twts1)
twts1[0]

{'_id': ObjectId('5ea66afbfb8f5ff7fd474756'),
 'id_str': '1253752842361483265',
 'user_name': 'Dr. Marcell Vollmer #SocialDistancing #StayHome',
 'user_id': '99674560',
 'content': 'The Great #Lockdown of the economy has been completely unprecedented, both in terms of the speed of the shutdown and its impact on jobs.\n\n #coronavirus #COVID19 #health #COVIDー19 #worklife #motivation #FutureofWork #Leadership #WorkLifeBalance #USA\n\n https://t.co/yoEHiyJcxc',
 'created_at': '2020-04-24 18:29:05',
 'location': 'Munich, Bavaria',
 'hashtags': ['Lockdown',
  'coronavirus',
  'COVID19',
  'health',
  'COVIDー19',
  'worklife',
  'motivation',
  'FutureofWork',
  'Leadership',
  'WorkLifeBalance',
  'USA'],
 'mentions': [],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': -1,
 'media': [],
 'retweet': False,
 'FavCount': 6,
 'source': 'Twitter for iPad',
 'retweet_count': 4}

In [31]:
len(twts1)

396

In [58]:
twts1 = test.find({"$text": {"$search": "covid19"} } )
twts1 = list(twts1)
twts1[0]

{'_id': ObjectId('5ea66afbfb8f5ff7fd474756'),
 'id_str': '1253752842361483265',
 'user_name': 'Dr. Marcell Vollmer #SocialDistancing #StayHome',
 'user_id': '99674560',
 'content': 'The Great #Lockdown of the economy has been completely unprecedented, both in terms of the speed of the shutdown and its impact on jobs.\n\n #coronavirus #COVID19 #health #COVIDー19 #worklife #motivation #FutureofWork #Leadership #WorkLifeBalance #USA\n\n https://t.co/yoEHiyJcxc',
 'created_at': '2020-04-24 18:29:05',
 'location': 'Munich, Bavaria',
 'hashtags': ['Lockdown',
  'coronavirus',
  'COVID19',
  'health',
  'COVIDー19',
  'worklife',
  'motivation',
  'FutureofWork',
  'Leadership',
  'WorkLifeBalance',
  'USA'],
 'mentions': [],
 'in_reply_to_user_id': None,
 'in_reply_to_status_id': None,
 'retweetedFrom': -1,
 'media': [],
 'retweet': False,
 'FavCount': 6,
 'source': 'Twitter for iPad',
 'retweet_count': 4}

In [56]:
len(twts1)

408

In [44]:
test.find({"$text":{"$search": "covid19"}}).explain()

{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'TWT_DB.test',
  'indexFilterSet': False,
  'parsedQuery': {'$text': {'$search': 'covid19',
    '$language': 'english',
    '$caseSensitive': False,
    '$diacriticSensitive': False}},
  'winningPlan': {'stage': 'TEXT',
   'indexPrefix': {},
   'indexName': 'content_text',
   'parsedTextQuery': {'terms': ['covid19'],
    'negatedTerms': [],
    'phrases': [],
    'negatedPhrases': []},
   'textIndexVersion': 3,
   'inputStage': {'stage': 'TEXT_MATCH',
    'inputStage': {'stage': 'FETCH',
     'inputStage': {'stage': 'OR',
      'inputStage': {'stage': 'IXSCAN',
       'keyPattern': {'_fts': 'text', '_ftsx': 1},
       'indexName': 'content_text',
       'isMultiKey': True,
       'isUnique': False,
       'isSparse': False,
       'isPartial': False,
       'indexVersion': 2,
       'direction': 'backward',
       'indexBounds': {}}}}}},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 408,
  'exe

In [36]:
#Connect to the mongoDB database
client = pymongo.MongoClient()
db = client["TWT_DB"]
tweets = db["tweets"]

In [50]:
tweets.find({"content":{"$regex":"covid19","$options" :'i'}}).explain()

{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'TWT_DB.tweets',
  'indexFilterSet': False,
  'parsedQuery': {'content': {'$regex': 'covid19', '$options': 'i'}},
  'winningPlan': {'stage': 'COLLSCAN',
   'filter': {'content': {'$regex': 'covid19', '$options': 'i'}},
   'direction': 'forward'},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 413,
  'executionTimeMillis': 11,
  'totalKeysExamined': 0,
  'totalDocsExamined': 15429,
  'executionStages': {'stage': 'COLLSCAN',
   'filter': {'content': {'$regex': 'covid19', '$options': 'i'}},
   'nReturned': 413,
   'executionTimeMillisEstimate': 0,
   'works': 15431,
   'advanced': 413,
   'needTime': 15017,
   'needYield': 0,
   'saveState': 120,
   'restoreState': 120,
   'isEOF': 1,
   'direction': 'forward',
   'docsExamined': 15429},
  'allPlansExecution': []},
 'serverInfo': {'host': 'DESKTOP-T0J0HGC',
  'port': 27017,
  'version': '4.2.6',
  'gitVersion': '20364840b8f1af16917e4c23c1b5f5efd8b3

In [13]:
tweets.find({"content":{"$regex":"trump","$options" :'i'}}).explain()

{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'TWT_DB.tweets',
  'indexFilterSet': False,
  'parsedQuery': {'content': {'$regex': 'trump', '$options': 'i'}},
  'winningPlan': {'stage': 'COLLSCAN',
   'filter': {'content': {'$regex': 'trump', '$options': 'i'}},
   'direction': 'forward'},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 2240,
  'executionTimeMillis': 37,
  'totalKeysExamined': 0,
  'totalDocsExamined': 15429,
  'executionStages': {'stage': 'COLLSCAN',
   'filter': {'content': {'$regex': 'trump', '$options': 'i'}},
   'nReturned': 2240,
   'executionTimeMillisEstimate': 0,
   'works': 15431,
   'advanced': 2240,
   'needTime': 13190,
   'needYield': 0,
   'saveState': 120,
   'restoreState': 120,
   'isEOF': 1,
   'direction': 'forward',
   'docsExamined': 15429},
  'allPlansExecution': []},
 'serverInfo': {'host': 'DESKTOP-T0J0HGC',
  'port': 27017,
  'version': '4.2.6',
  'gitVersion': '20364840b8f1af16917e4c23c1b5f5efd8b352f

In [14]:
test.find({"content":{"$regex":"trump","$options" :'i'}}).explain()

{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'TWT_DB.test',
  'indexFilterSet': False,
  'parsedQuery': {'content': {'$regex': 'trump', '$options': 'i'}},
  'winningPlan': {'stage': 'COLLSCAN',
   'filter': {'content': {'$regex': 'trump', '$options': 'i'}},
   'direction': 'forward'},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 2240,
  'executionTimeMillis': 18,
  'totalKeysExamined': 0,
  'totalDocsExamined': 15429,
  'executionStages': {'stage': 'COLLSCAN',
   'filter': {'content': {'$regex': 'trump', '$options': 'i'}},
   'nReturned': 2240,
   'executionTimeMillisEstimate': 0,
   'works': 15431,
   'advanced': 2240,
   'needTime': 13190,
   'needYield': 0,
   'saveState': 120,
   'restoreState': 120,
   'isEOF': 1,
   'direction': 'forward',
   'docsExamined': 15429},
  'allPlansExecution': []},
 'serverInfo': {'host': 'DESKTOP-T0J0HGC',
  'port': 27017,
  'version': '4.2.6',
  'gitVersion': '20364840b8f1af16917e4c23c1b5f5efd8b352f8'

In [33]:
client.close()