# Part 1: Data Ingestion
Ingest the data: figure out a way to put the data in a structure so that you can query it as described in Part 2.


## Loading and Understanding the Data

In [9]:
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm
import time

In [10]:
small_tsv = '../data/Copy of correct_twitter_201904.tsv'
large_tsv = '../data/Copy of correct_twitter_202102.tsv'

In [38]:
df = pd.read_csv(large_tsv, sep="\t")
df.head()

  df = pd.read_csv(large_tsv, sep="\t")


Unnamed: 0,id,event,ts1,ts2,from_stream,directly_from_stream,from_search,directly_from_search,from_quote_search,directly_from_quote_search,...,retweeted,retweeted_author_id,retweeted_handle,retweeted_follower_count,mentioned_author_ids,mentioned_handles,hashtags,urls,media_keys,place_id
0,1358322479136178177,britney_202102,2022-03-01 09:06:52.403595-05:00,2022-03-01 09:06:52.403595-05:00,True,True,False,False,False,False,...,1.358249e+18,93259370.0,briebxrries,1002.0,,,,,,
1,1358323016736796677,britney_202102,2022-01-05 08:34:04.477789-05:00,2022-03-01 09:06:52.398778-05:00,True,True,False,False,False,False,...,1.358189e+18,17525170.0,Variety,2669599.0,,,['FreeBritney'],,,
2,1358322996696465409,britney_202102,2022-03-01 09:06:52.399008-05:00,2022-03-01 09:06:52.399008-05:00,True,True,False,False,False,False,...,1.358201e+18,403576600.0,Johnnerkell,2650.0,,,,,,
3,1358322976769286151,britney_202102,2022-03-01 09:06:52.399205-05:00,2022-03-01 09:06:52.399205-05:00,True,True,False,False,False,False,...,1.358238e+18,4429004000.0,PopCrave,880695.0,,,['FramingBritney'],,,
4,1368923802260889606,britney_202102,2022-03-01 07:26:05.505986-05:00,2022-03-01 07:26:05.505986-05:00,True,True,False,False,False,False,...,,,,,,,,,,


In [51]:
df['datetime'] = pd.to_datetime(df['ts1'], errors="coerce")

In [55]:
set(dates)

{'2022-01-04', '2022-01-05', '2022-01-22', '2022-03-01'}

In [41]:
df.iloc[0].id

1358322479136178177

In [42]:
df.iloc[0]['ts1']

'2022-03-01 09:06:52.403595-05:00'

In [43]:
df.iloc[0]['text']

'RT @briebxrries: what is joe biden and kamala harris’ plans to finally make justin timberlake pay for his crimes against janet jackson and…'

In [102]:
print(f"{df['place_id'].isna().sum()} out of {len(df)} tweets have null Place ID")

87016 out of 88037 tweets have null Place ID


In [63]:
for col in df.columns:
    print(col)

id
event
ts1
 ts2
from_stream
directly_from_stream
from_search
directly_from_search
from_quote_search
directly_from_quote_search
from_convo_search
directly_from_convo_search
from_timeline_search
directly_from_timeline_search
text
lang
author_id
author_handle
created_at
conversation_id
possibly_sensitive
reply_settings
source
author_follower_count
retweet_count
reply_count
like_count
quote_count
replied_to
replied_to_author_id
replied_to_handle
replied_to_follower_count
quoted
quoted_author_id
quoted_handle
quoted_follower_count
retweeted
retweeted_author_id
retweeted_handle
retweeted_follower_count
mentioned_author_ids
mentioned_handles
hashtags
urls
media_keys
place_id


## Connect to MongoDB

In [11]:
import pymongo
from pymongo import MongoClient
import urllib.parse
import os
username=urllib.parse.quote_plus('root')
password=urllib.parse.quote_plus('password')

In [12]:
client = MongoClient('mongodb',
                     username=os.environ['MONGO_INITDB_ROOT_USERNAME'],
                     password=os.environ['MONGO_INITDB_ROOT_PASSWORD'],
                     authMechanism='SCRAM-SHA-256')

try:
   # The ismaster command is cheap and does not require auth.
   client.admin.command('ismaster')
except ConnectionFailure:
   print("Server not available")

In [5]:
client.server_info()

{'version': '7.0.12',
 'gitVersion': 'b6513ce0781db6818e24619e8a461eae90bc94fc',
 'modules': [],
 'allocator': 'tcmalloc',
 'javascriptEngine': 'mozjs',
 'sysInfo': 'deprecated',
 'versionArray': [7, 0, 12, 0],
 'openssl': {'running': 'OpenSSL 3.0.2 15 Mar 2022',
  'compiled': 'OpenSSL 3.0.2 15 Mar 2022'},
 'buildEnvironment': {'distmod': 'ubuntu2204',
  'distarch': 'aarch64',
  'cc': '/opt/mongodbtoolchain/v4/bin/gcc: gcc (GCC) 11.3.0',
  'ccflags': '-Werror -include mongo/platform/basic.h -ffp-contract=off -fasynchronous-unwind-tables -g2 -Wall -Wsign-compare -Wno-unknown-pragmas -Winvalid-pch -gdwarf-5 -fno-omit-frame-pointer -fno-strict-aliasing -O2 -march=armv8.2-a -mtune=generic -Wno-unused-local-typedefs -Wno-unused-function -Wno-deprecated-declarations -Wno-unused-const-variable -Wno-unused-but-set-variable -Wno-missing-braces -fstack-protector-strong -gdwarf64 -Wa,--nocompress-debug-sections -Wimplicit-fallthrough=5',
  'cxx': '/opt/mongodbtoolchain/v4/bin/g++: g++ (GCC) 11.3.

## Create Database & Collection

In [13]:
db = client['nio']
tweets_col = db['tweets']

## Load Data into the database

In [13]:
num_lines = sum(1 for _ in open(large_tsv, 'r'))

with open(large_tsv, 'r') as fIn:
    docs=[]
    
    for line_ind, line in tqdm(enumerate(fIn.readlines()), total=num_lines):
        if line_ind==0:
            col_headers=line.split("\t")
        else:
            doc = {}
            
            for col_ind, col_val in enumerate(line.split("\t")):
               doc[col_headers[col_ind]] = col_val
            
            docs.append(doc)

print("Bulk Inserting Documents.")
result = tweets_col.insert_many(docs)
print(result.inserted_ids[:10])

  0%|          | 0/843854 [00:00<?, ?it/s]

[ObjectId('6681b23d34f58518da1d647d'), ObjectId('6681b23d34f58518da1d647e'), ObjectId('6681b23d34f58518da1d647f'), ObjectId('6681b23d34f58518da1d6480'), ObjectId('6681b23d34f58518da1d6481'), ObjectId('6681b23d34f58518da1d6482'), ObjectId('6681b23d34f58518da1d6483'), ObjectId('6681b23d34f58518da1d6484'), ObjectId('6681b23d34f58518da1d6485'), ObjectId('6681b23d34f58518da1d6486')]


## Create Text Index on `text` field

In [18]:
tweets_col.create_index({"text":"text"})

'text_text'

# Part 2 - Querying the Data
Construct functionality that allows you to query the data. If we search for a term, like “music,” we would like to know some subset of the following:
* How many tweets were posted containing the term on each day?
* How many unique users posted a tweet containing the term?
* How many likes did tweets containing the term get, on average?
* Where (in terms of place IDs) did the tweets come from?
* What times of day were the tweets posted at? 
* Which user posted the most tweets containing the term?

In [30]:
def get_tweets(term):
    q = {"$text":
         {"$search": term},
    }

    print("retreiving...")
    start_t = time.time()
    results = list(tweets_col.find(q, {"text":1, "id":1,"ts1":1, "place_id": 1, "author_id":1, "author_handle":1, "like_count":1}))
    print(f"Query took {time.time()-start_t} seconds.")

    print("counting...")
    results_df = pd.DataFrame(results).set_index('id')
    results_df['datetime'] = pd.to_datetime(results_df['ts1'])
    results_df['like_count'] = pd.to_numeric(results_df['like_count'], downcast='integer', errors='coerce')

    return results_df

In [35]:
term_df.loc['1131594960443199488']

KeyError: '1131594960443199488'

### How many tweets were posted containing the term on each day?

In [31]:
def tweet_count_per_day(term_df):
    counts_df = term_df.groupby([term_df['datetime'].dt.date]).count()

    print("serializing...")
    data = []
    
    for ind, row in counts_df.iterrows():
        record = {'date':ind.strftime("%Y-%m-%d"),
                  'count':int(row['text'])}
        data.append(record)

    return data

In [32]:
term_df = get_tweets("loneliness")
tweet_count_per_day(term_df)

retreiving...
Query took 0.011831045150756836 seconds.
counting...
serializing...


[{'date': '2022-01-04', 'count': 1},
 {'date': '2022-01-05', 'count': 6},
 {'date': '2022-03-01', 'count': 530}]

In [33]:
term_df.head()

Unnamed: 0_level_0,_id,ts1,text,author_id,author_handle,like_count,datetime
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1360054061203140609,6681b24b34f58518da2300ec,2022-03-01 07:55:32.673859-05:00,"""My loneliness is killing me"" -Britney Spears ...",1411800175,RachelMComedy,749,2022-03-01 07:55:32.673859-05:00
1363033764117377029,6681b24b34f58518da22671e,2022-03-01 07:53:58.711658-05:00,"From ""my loneliness is killing me"" to ""my lo...",1033227162524893184,RedRoomAlumna,1,2022-03-01 07:53:58.711658-05:00
1360054114453970946,6681b24f34f58518da252849,2022-03-01 08:26:31.664896-05:00,"RT @RachelMComedy: ""My loneliness is killing m...",196659829,ElleCuhBong,0,2022-03-01 08:26:31.664896-05:00
1360054345069379592,6681b24f34f58518da25281a,2022-03-01 08:26:31.659370-05:00,"RT @RachelMComedy: ""My loneliness is killing m...",47706419,Krakoan_Lorax,0,2022-03-01 08:26:31.659370-05:00
1360055360330084352,6681b24f34f58518da252777,2022-03-01 08:26:28.561586-05:00,"RT @RachelMComedy: ""My loneliness is killing m...",3050023319,treydayway,0,2022-03-01 08:26:28.561586-05:00


### How many unique users posted a tweet containing the term?

In [10]:
author_vc = term_df['author_id'].value_counts()
author_vc.shape

(354479,)

### How many likes did tweets containing the term get, on average?

In [11]:
term_df['like_count'].mean()

7.897923772542023

### Where (in terms of place IDs) did the tweets come from?

In [12]:
test_df = term_df

if 'place_id' not in test_df.columns:
    print("no place_id")
    place_ids = []
else:
    place_ids = list(test_df['place_id'].unique())

print(len(place_ids))

no place_id
0


### What times of day were the tweets posted at?

In [14]:
term_df.set_index('datetime').between_time('09:00:00', '10:00:00').shape

(188536, 5)

In [15]:
morning_df = term_df.set_index('datetime').between_time('05:00:00', '12:00:00')
afternoon_df = term_df.set_index('datetime').between_time('12:00:00','17:00:00')
evening_df = term_df.set_index('datetime').between_time('17:00:00','23:59:59')
overnight_df = term_df.set_index('datetime').between_time('00:00:00','05:00:00')

### Which user posted the most tweets containing the term?

In [16]:
term_df['author_handle'].value_counts().index[0]

'britneyplaylist'

# Putting it into a method

In [10]:
def query_term(term):
    start_t = time.time()
    term_df = get_tweets(term)
    time_to_query = time.time() - start_t
    # How many tweets were posted containing the term on each day?
    daily_counts = tweet_count_per_day(term_df)

    # How many unique users posted a tweet containing the term?
    author_vc = term_df['author_id'].value_counts()
    unique_users = len(author_vc)

    # How many likes did tweets containing the term get, on average?
    avg_likes_per_tweet = round(term_df['like_count'].mean(), 3)

    # Where (in terms of place IDs) did the tweets come from?
    place_ids = []
    if 'place_id' in term_df.columns:
        place_ids = list(term_df['place_id'].unique())

    # What times of day were the tweets posted at?
    term_df = term_df.set_index('datetime')
    morning_df = term_df.between_time('05:00:00', '12:00:00')
    afternoon_df = term_df.between_time('12:00:00','17:00:00')
    evening_df = term_df.between_time('17:00:00','23:59:59')
    overnight_df = term_df.between_time('00:00:00','05:00:00')

    times_of_day = {'morning':len(morning_df),
                    'afternoon':len(afternoon_df),
                    'evening':len(evening_df),
                    'overnight':len(overnight_df)}

    results = {'term':term,
               'time_to_complete_query':time_to_query,
               'counts_by_day':daily_counts,
               'users':unique_users,
               'avg_likes_per_tweet':avg_likes_per_tweet,
               'place_ids':place_ids,
               'times_of_day':times_of_day}
    return results

In [18]:
result = query_term("music")
result

retreiving...
Query took 0.45240306854248047 seconds.
counting...
serializing...


{'term': 'music',
 'time_to_complete_query': 0.6354808807373047,
 'counts_by_day': [{'date': '2022-01-04', 'count': 276},
  {'date': '2022-01-05', 'count': 1219},
  {'date': '2022-01-22', 'count': 14},
  {'date': '2022-03-01', 'count': 24513}],
 'users': 18532,
 'avg_likes_per_tweet': 112.733,
 'place_ids': [],
 'times_of_day': {'morning': 25732,
  'afternoon': 9,
  'evening': 281,
  'overnight': 0}}

In [16]:
type(result['counts_by_day'][0]['count'])

numpy.int64

In [12]:
import json

In [19]:
json.dumps(result)

'{"term": "music", "time_to_complete_query": 0.6354808807373047, "counts_by_day": [{"date": "2022-01-04", "count": 276}, {"date": "2022-01-05", "count": 1219}, {"date": "2022-01-22", "count": 14}, {"date": "2022-03-01", "count": 24513}], "users": 18532, "avg_likes_per_tweet": 112.733, "place_ids": [], "times_of_day": {"morning": 25732, "afternoon": 9, "evening": 281, "overnight": 0}}'