# Part 1: Data Ingestion
Ingest the data: figure out a way to put the data in a structure so that you can query it as described in Part 2.


## Loading and Understanding the Data

In [2]:
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm
import time

In [3]:
small_tsv = '../data/Copy of correct_twitter_201904.tsv'
large_tsv = '../data/Copy of correct_twitter_202102.tsv'

In [7]:
df = pd.read_csv(large_tsv, 
                 sep="\t",
                 # nrows=100 # comment out to load entire dataset
                )
df.head()

  df = pd.read_csv(large_tsv,


Unnamed: 0,id,event,ts1,ts2,from_stream,directly_from_stream,from_search,directly_from_search,from_quote_search,directly_from_quote_search,...,retweeted,retweeted_author_id,retweeted_handle,retweeted_follower_count,mentioned_author_ids,mentioned_handles,hashtags,urls,media_keys,place_id
0,1358322479136178177,britney_202102,2022-03-01 09:06:52.403595-05:00,2022-03-01 09:06:52.403595-05:00,True,True,False,False,False,False,...,1.358249e+18,93259370.0,briebxrries,1002.0,,,,,,
1,1358323016736796677,britney_202102,2022-01-05 08:34:04.477789-05:00,2022-03-01 09:06:52.398778-05:00,True,True,False,False,False,False,...,1.358189e+18,17525170.0,Variety,2669599.0,,,['FreeBritney'],,,
2,1358322996696465409,britney_202102,2022-03-01 09:06:52.399008-05:00,2022-03-01 09:06:52.399008-05:00,True,True,False,False,False,False,...,1.358201e+18,403576600.0,Johnnerkell,2650.0,,,,,,
3,1358322976769286151,britney_202102,2022-03-01 09:06:52.399205-05:00,2022-03-01 09:06:52.399205-05:00,True,True,False,False,False,False,...,1.358238e+18,4429004000.0,PopCrave,880695.0,,,['FramingBritney'],,,
4,1368923802260889606,britney_202102,2022-03-01 07:26:05.505986-05:00,2022-03-01 07:26:05.505986-05:00,True,True,False,False,False,False,...,,,,,,,,,,


In [32]:
df['datetime'] = pd.to_datetime(df['ts1'], errors="coerce")
df[df['datetime'].isna()][['ts1',' ts2', 'created_at','text']]

Unnamed: 0,ts1,ts2,created_at,text
568585,363406372,sunset1912,,1.3595108025171395e+18
595222,819646999939207168,ClickySound,,
641041,718955223750840320,MashedUpMovies,,
819200,913671438,juniorfonseca,,


# Data Quality Observations

## 4 `ts1` values do not fit the general schema

In [25]:
df['datetime'] = pd.to_datetime(df['ts1'], errors="coerce")
df[df['datetime'].isna()][['ts1',' ts2', 'text']]

Unnamed: 0,ts1,ts2,text
568585,363406372,sunset1912,1.3595108025171395e+18
595222,819646999939207168,ClickySound,
641041,718955223750840320,MashedUpMovies,
819200,913671438,juniorfonseca,


### Action Taken
* Each record is ingested into mongodb to faithfully represent the source data.
* The API code will be responsible for filtering out invalid data from its queries.
    * *This is handled in `db.get_tweets()`*

## `ts1` and `ts2` values matched 87.3% of the time

In [29]:
matched_ts = (df['ts1'] == df[' ts2']).sum()
print(f"`ts1` and `ts2` values matched {matched_ts / len(df)}% ({matched_ts}/{len(df)}) of the time.")

`ts1` and `ts2` values matched 0.8739065659252953% (739089/845730) of the time.


## Action Taken
* When filtering for a range of time, I only consider the `ts1` column.

## 99.52% of `place_id` values are null

In [31]:
place_id_inv = df['place_id'].isna().sum()
print(f"{round(place_id_inv / len(df), 4)*100}% ({df['place_id'].isna().sum()}/{len(df)}) of all tweets have null Place ID")

99.52% (841697/845730) of all tweets have null Place ID


### Action Taken
* This data quality issue seems out of place, so I would first want to further investigate the root cause by learning more about how the dataset was assembled.
* For now, the code is primarily designed such that it will return an empty list when the `place_id` is null and checked that it does work in rare cases when it is not.

## Connect to MongoDB

In [11]:
import pymongo
from pymongo import MongoClient
import urllib.parse
import os

In [12]:
client = MongoClient('mongodb',
                     username=os.environ['MONGO_INITDB_ROOT_USERNAME'],
                     password=os.environ['MONGO_INITDB_ROOT_PASSWORD'],
                     authMechanism='SCRAM-SHA-256')

try:
   # The ismaster command is cheap and does not require auth.
   client.admin.command('ismaster')
except ConnectionFailure:
   print("Server not available")

In [5]:
client.server_info()

{'version': '7.0.12',
 'gitVersion': 'b6513ce0781db6818e24619e8a461eae90bc94fc',
 'modules': [],
 'allocator': 'tcmalloc',
 'javascriptEngine': 'mozjs',
 'sysInfo': 'deprecated',
 'versionArray': [7, 0, 12, 0],
 'openssl': {'running': 'OpenSSL 3.0.2 15 Mar 2022',
  'compiled': 'OpenSSL 3.0.2 15 Mar 2022'},
 'buildEnvironment': {'distmod': 'ubuntu2204',
  'distarch': 'aarch64',
  'cc': '/opt/mongodbtoolchain/v4/bin/gcc: gcc (GCC) 11.3.0',
  'ccflags': '-Werror -include mongo/platform/basic.h -ffp-contract=off -fasynchronous-unwind-tables -g2 -Wall -Wsign-compare -Wno-unknown-pragmas -Winvalid-pch -gdwarf-5 -fno-omit-frame-pointer -fno-strict-aliasing -O2 -march=armv8.2-a -mtune=generic -Wno-unused-local-typedefs -Wno-unused-function -Wno-deprecated-declarations -Wno-unused-const-variable -Wno-unused-but-set-variable -Wno-missing-braces -fstack-protector-strong -gdwarf64 -Wa,--nocompress-debug-sections -Wimplicit-fallthrough=5',
  'cxx': '/opt/mongodbtoolchain/v4/bin/g++: g++ (GCC) 11.3.

## Create Database & Collection

In [13]:
db = client['nio']
tweets_col = db['tweets']

## Load Data into the database

In [13]:
num_lines = sum(1 for _ in open(large_tsv, 'r'))

with open(large_tsv, 'r') as fIn:
    docs=[]
    
    for line_ind, line in tqdm(enumerate(fIn.readlines()), total=num_lines):
        if line_ind==0:
            col_headers=line.split("\t")
        else:
            doc = {}
            
            for col_ind, col_val in enumerate(line.split("\t")):
               doc[col_headers[col_ind]] = col_val
            
            docs.append(doc)

print("Bulk Inserting Documents.")
result = tweets_col.insert_many(docs)
print(result.inserted_ids[:10])

  0%|          | 0/843854 [00:00<?, ?it/s]

[ObjectId('6681b23d34f58518da1d647d'), ObjectId('6681b23d34f58518da1d647e'), ObjectId('6681b23d34f58518da1d647f'), ObjectId('6681b23d34f58518da1d6480'), ObjectId('6681b23d34f58518da1d6481'), ObjectId('6681b23d34f58518da1d6482'), ObjectId('6681b23d34f58518da1d6483'), ObjectId('6681b23d34f58518da1d6484'), ObjectId('6681b23d34f58518da1d6485'), ObjectId('6681b23d34f58518da1d6486')]


## Create Text Index on `text` field

In [18]:
tweets_col.create_index({"text":"text"})

'text_text'