In [1]:
# standard library
from collections import namedtuple
import os

# pydata
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# bson-numpy libraries
import numpy as np
from bson import CodecOptions
from bson.raw_bson import RawBSONDocument
from pymongo import MongoClient
import bsonnumpy

# other libraries
import maya

In [2]:
# get mongodb params (using configparser)
mlab_uri = os.environ['MLAB_URI']
mlab_collection = os.environ['MLAB_COLLECTION']

In [3]:
codec_options=CodecOptions(document_class=RawBSONDocument)
client = MongoClient(mlab_uri)
db = client.get_default_database()
collection = db.get_collection(
    mlab_collection,
    codec_options=codec_options
)

---

## Get original and retweeted separately

### Original

In [37]:
# we only pull the fields defined here
dtype_original = np.dtype(
    [
        ('full_text', 'S400'),
        ('created_at', 'S64'),
    ]
)

In [38]:
ndarray_original = bsonnumpy.sequence_to_ndarray(
    (doc.raw for doc in collection.find({"retweeted_status": {"$exists": False}})), 
    dtype_original, 
    collection.count(),
)

In [39]:
print(ndarray_original)

[ (b'Still. API keys in ipython notebooks! Yikes', b'Sat Dec 16 21:23:32 +0000 2017')
 (b'Fixing up a project I worked on in April 2016. Good job me for documenting sorta well', b'Sat Dec 16 21:22:47 +0000 2017')
 (b'https://t.co/6Ubtx6S1O8\n#bookmark', b'Sat Dec 16 20:45:00 +0000 2017')
 ...,
 (b'Starting up a new public account.', b'Wed Dec 26 18:49:53 +0000 2012')
 (b'Was also logging to AWS CloudWatch. Best practices sorta', b'Sat Dec 16 21:45:38 +0000 2017')
 (b'Got my local dev environment all set up. @droneio builds are also passing. Can finally start working on the business logic! #dockercompose #devops', b'Sun Dec 17 01:06:06 +0000 2017')]


In [40]:
tweets_original = pd.DataFrame(ndarray_original)

In [48]:
tweets_original['full_text'] = tweets_original['full_text'].str.decode('utf-8')
tweets_original['created_at'] = tweets_original['created_at'].str.decode('utf-8')

In [49]:
tweets_original.head()

Unnamed: 0,full_text,created_at
0,Still. API keys in ipython notebooks! Yikes,Sat Dec 16 21:23:32 +0000 2017
1,Fixing up a project I worked on in April 2016....,Sat Dec 16 21:22:47 +0000 2017
2,https://t.co/6Ubtx6S1O8\n#bookmark,Sat Dec 16 20:45:00 +0000 2017
3,VSCode November release \nhttps://t.co/U4nzPLE8QI,Sat Dec 16 20:23:18 +0000 2017
4,@TooManyNickLees It does for this project I'm ...,Sat Dec 16 20:08:26 +0000 2017


---

### Retweeted

In [42]:
# we only pull the fields defined here
dtype_rt = np.dtype(
    [
        ('retweeted_status', np.dtype([('full_text', 'S400')])),
        ('created_at', 'S64'),
    ]
)

In [43]:
ndarray_rt = bsonnumpy.sequence_to_ndarray(
    (doc.raw for doc in collection.find({"retweeted_status": {"$exists": True}})), 
    dtype_rt, 
    collection.count(),
)

In [44]:
tweets_rt = pd.DataFrame(ndarray_rt)

In [46]:
tweets_rt['full_text'] = tweets_rt['retweeted_status'].map(lambda value: value[0].decode('utf-8'))
tweets_rt['created_at'] = tweets_rt['created_at'].str.decode('utf-8')

In [47]:
tweets_rt.head()

Unnamed: 0,retweeted_status,created_at,full_text
0,(b'A list of things that might be useful when ...,Sat Dec 16 20:29:52 +0000 2017,A list of things that might be useful when ans...
1,(b'OUT: import pdb; pdb.set_trace()\nIN: break...,Sat Dec 16 20:08:52 +0000 2017,OUT: import pdb; pdb.set_trace()\nIN: breakpoi...
2,"(b'Great news, Guido has pronounced dicts to r...",Sat Dec 16 16:46:55 +0000 2017,"Great news, Guido has pronounced dicts to reta..."
3,(b'First ever PyCon happening in Pakistan http...,Sat Dec 16 13:46:57 +0000 2017,First ever PyCon happening in Pakistan https:/...
4,(b'#python news: \xf0\x9f\x98\x80 @gvanrossum...,Sat Dec 16 05:14:45 +0000 2017,#python news: 😀 @gvanrossum just pronounced t...


---

## Combine Together

In [50]:
len(tweets_original) + len(tweets_rt)

2569

In [61]:
tweets = pd.concat([tweets_rt, tweets_original])

In [62]:
len(tweets)

2569

In [63]:
tweets['retweeted'] = tweets['retweeted_status'].notna()
tweets = tweets.drop(columns=['retweeted_status'])

In [64]:
tweets.retweeted.value_counts()

False    1571
True      998
Name: retweeted, dtype: int64

In [65]:
tweets.describe(include='all')

Unnamed: 0,created_at,full_text,retweeted
count,2569,2569,2569
unique,2569,2562,2
top,Fri May 26 05:03:19 +0000 2017,Git branching done right with Gitflow &amp; im...,False
freq,1,2,1571


---

## Add additional information

* Parse datetime and convert to correct timezone
* Add `chars` field

In [66]:
# create helper function (which we can probably move into sivtools)

ConvertTZArgs = namedtuple("ConvertTZArgs", ["dt_col", "to_timezone"])

def convert_timezone(row, *args):
    # get datetime
    dt_col = args[0].dt_col
    to_timezone = args[0].to_timezone
    dt = row[dt_col]
    
    dt = maya.parse(dt).datetime(to_timezone=to_timezone)
    return dt

In [67]:
tweets['created_at'] = tweets.apply(
    convert_timezone,
    axis=1, 
    args=(ConvertTZArgs('created_at', 'US/Central'),)
)

In [68]:
tweets['chars'] = tweets['full_text'].str.len()

In [69]:
tweets.head()

Unnamed: 0,created_at,full_text,retweeted,chars
0,2017-12-16 14:29:52-06:00,A list of things that might be useful when ans...,True,166
1,2017-12-16 14:08:52-06:00,OUT: import pdb; pdb.set_trace()\nIN: breakpoi...,True,90
2,2017-12-16 10:46:55-06:00,"Great news, Guido has pronounced dicts to reta...",True,246
3,2017-12-16 07:46:57-06:00,First ever PyCon happening in Pakistan https:/...,True,62
4,2017-12-15 23:14:45-06:00,#python news: 😀 @gvanrossum just pronounced t...,True,137


In [70]:
tweets.tail()

Unnamed: 0,created_at,full_text,retweeted,chars
1566,2012-12-27 07:39:59-06:00,Fifth anniversary of #Benazir's assassination....,False,67
1567,2012-12-26 13:30:56-06:00,3D printers are bringing us into the age of St...,False,114
1568,2012-12-26 12:49:53-06:00,Starting up a new public account.,False,33
1569,2017-12-16 15:45:38-06:00,Was also logging to AWS CloudWatch. Best pract...,False,56
1570,2017-12-16 19:06:06-06:00,Got my local dev environment all set up. @dron...,False,146


In [71]:
tweets.describe(include='all')

Unnamed: 0,created_at,full_text,retweeted,chars
count,2569,2569,2569,2569.0
unique,2569,2562,2,
top,2016-12-10 00:05:07-06:00,Git branching done right with Gitflow &amp; im...,False,
freq,1,2,1571,
mean,,,,100.330479
std,,,,36.794551
min,,,,2.0
25%,,,,74.0
50%,,,,101.0
75%,,,,130.0


---

## Output cleaned data set

In [72]:
tweets.to_pickle('cleaned_tweets_data.pkl')