In [1]:
# standard library
from collections import namedtuple
import os

# pydata
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# bson-numpy libraries
import numpy as np
from bson import CodecOptions
from bson.raw_bson import RawBSONDocument
from pymongo import MongoClient
import bsonnumpy

# other libraries
import maya

In [54]:
# get mongodb params (using configparser)
mlab_uri = os.environ['MLAB_URI']
mlab_collection = os.environ['MLAB_COLLECTION']

In [55]:
codec_options=CodecOptions(document_class=RawBSONDocument)
client = MongoClient(mlab_uri)
db = client.get_default_database()
collection = db.get_collection(
    mlab_collection,
    codec_options=codec_options
)

https://docs.scipy.org/doc/numpy/reference/generated/numpy.dtype.html#numpy.dtype

In [56]:
# we only pull the fields defined here
dtype = np.dtype(
    [   ('id_str', 'S20'),
        ('full_text', 'S400'),
        ('created_at', 'S64'),
        ('retweeted', 'b'),
    ]
)

In [57]:
ndarray = bsonnumpy.sequence_to_ndarray(
    (doc.raw for doc in collection.find()), 
    dtype, 
    collection.count(),
)

In [58]:
print(ndarray)

[ (b'942143177330487296', b'Still. API keys in ipython notebooks! Yikes', b'Sat Dec 16 21:23:32 +0000 2017', 0)
 (b'942142988574175232', b'Fixing up a project I worked on in April 2016. Good job me for documenting sorta well', b'Sat Dec 16 21:22:47 +0000 2017', 0)
 (b'942133480699260929', b'https://t.co/6Ubtx6S1O8\n#bookmark', b'Sat Dec 16 20:45:00 +0000 2017', 0)
 ...,
 (b'942148741343186945', b'Was also logging to AWS CloudWatch. Best practices sorta', b'Sat Dec 16 21:45:38 +0000 2017', 0)
 (b'942199189223673857', b'Got my local dev environment all set up. @droneio builds are also passing. Can finally start working on the business logic! #dockercompose #devops', b'Sun Dec 17 01:06:06 +0000 2017', 0)
 (b'942281643926093824', b'RT @crossphd: Pushpin \xe2\x80\x94 An Open Source Library That Turns REST APIs into Realtime APIs - An intro to evented APIs and how to build them wit\xe2\x80\xa6', b'Sun Dec 17 06:33:45 +0000 2017', 1)]


In [59]:
tweets = pd.DataFrame(ndarray)

In [60]:
tweets['id_str'] = tweets['id_str'].str.decode('utf-8')
tweets['full_text'] = tweets['full_text'].str.decode('utf-8')
tweets['created_at'] = tweets['created_at'].str.decode('utf-8')

In [61]:
tweets.head()

Unnamed: 0,id_str,full_text,created_at,retweeted
0,942143177330487296,Still. API keys in ipython notebooks! Yikes,Sat Dec 16 21:23:32 +0000 2017,0
1,942142988574175232,Fixing up a project I worked on in April 2016....,Sat Dec 16 21:22:47 +0000 2017,0
2,942133480699260929,https://t.co/6Ubtx6S1O8\n#bookmark,Sat Dec 16 20:45:00 +0000 2017,0
3,942129673416343552,RT @dbader_org: A list of things that might be...,Sat Dec 16 20:29:52 +0000 2017,1
4,942128018516037633,VSCode November release \nhttps://t.co/U4nzPLE8QI,Sat Dec 16 20:23:18 +0000 2017,0


In [62]:
tweets['retweeted'].value_counts()

0    1774
1     795
Name: retweeted, dtype: int64

In [63]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2569 entries, 0 to 2568
Data columns (total 4 columns):
id_str        2569 non-null object
full_text     2569 non-null object
created_at    2569 non-null object
retweeted     2569 non-null int8
dtypes: int8(1), object(3)
memory usage: 62.8+ KB


### Parse datetime and convert to correct timezone

In [64]:
# create helper function (which we can probably move into sivtools)

ConvertTZArgs = namedtuple("ConvertTZArgs", ["dt_col", "to_timezone"])

def convert_timezone(row, *args):
    # get datetime
    dt_col = args[0].dt_col
    to_timezone = args[0].to_timezone
    dt = row[dt_col]
    
    dt = maya.parse(dt).datetime(to_timezone=to_timezone)
    return dt

In [65]:
tweets['created_at'] = tweets.apply(
    convert_timezone,
    axis=1, 
    args=(ConvertTZArgs('created_at', 'US/Central'),)
)

In [66]:
tweets['chars'] = tweets['full_text'].str.len()

In [67]:
tweets.head()

Unnamed: 0,id_str,full_text,created_at,retweeted,chars
0,942143177330487296,Still. API keys in ipython notebooks! Yikes,2017-12-16 15:23:32-06:00,0,43
1,942142988574175232,Fixing up a project I worked on in April 2016....,2017-12-16 15:22:47-06:00,0,85
2,942133480699260929,https://t.co/6Ubtx6S1O8\n#bookmark,2017-12-16 14:45:00-06:00,0,33
3,942129673416343552,RT @dbader_org: A list of things that might be...,2017-12-16 14:29:52-06:00,1,140
4,942128018516037633,VSCode November release \nhttps://t.co/U4nzPLE8QI,2017-12-16 14:23:18-06:00,0,48


In [68]:
tweets.tail()

Unnamed: 0,id_str,full_text,created_at,retweeted,chars
2564,284018476102848512,3D printers are bringing us into the age of St...,2012-12-26 13:30:56-06:00,0,114
2565,284008146480009216,Starting up a new public account.,2012-12-26 12:49:53-06:00,0,33
2566,942148741343186945,Was also logging to AWS CloudWatch. Best pract...,2017-12-16 15:45:38-06:00,0,56
2567,942199189223673857,Got my local dev environment all set up. @dron...,2017-12-16 19:06:06-06:00,0,146
2568,942281643926093824,RT @crossphd: Pushpin — An Open Source Library...,2017-12-17 00:33:45-06:00,1,140


In [69]:
tweets.describe(include='all')

Unnamed: 0,id_str,full_text,created_at,retweeted,chars
count,2569.0,2569,2569,2569.0,2569.0
unique,2569.0,2562,2569,,
top,8.657150296218746e+17,RT @PythonQnA: Proper way to declare custom ex...,2016-12-10 00:05:07-06:00,,
freq,1.0,2,1,,
mean,,,,0.309459,103.27248
std,,,,0.462361,33.810651
min,,,,0.0,2.0
25%,,,,0.0,79.0
50%,,,,0.0,107.0
75%,,,,1.0,135.0


---

## Get original and retweeted separately

How are they different?

### Original

In [72]:
# we only pull the fields defined here
dtype_original = np.dtype(
    [   ('id_str', 'S20'),
        ('full_text', 'S400'),
        ('created_at', 'S64'),
        ('retweeted', 'b'),
    ]
)

In [73]:
ndarray_original = bsonnumpy.sequence_to_ndarray(
    (doc.raw for doc in collection.find({"retweeted_status": {"$exists": False}})), 
    dtype_original, 
    collection.count(),
)

In [74]:
print(ndarray_original)

[ (b'942143177330487296', b'Still. API keys in ipython notebooks! Yikes', b'Sat Dec 16 21:23:32 +0000 2017', 0)
 (b'942142988574175232', b'Fixing up a project I worked on in April 2016. Good job me for documenting sorta well', b'Sat Dec 16 21:22:47 +0000 2017', 0)
 (b'942133480699260929', b'https://t.co/6Ubtx6S1O8\n#bookmark', b'Sat Dec 16 20:45:00 +0000 2017', 0)
 ...,
 (b'284008146480009216', b'Starting up a new public account.', b'Wed Dec 26 18:49:53 +0000 2012', 0)
 (b'942148741343186945', b'Was also logging to AWS CloudWatch. Best practices sorta', b'Sat Dec 16 21:45:38 +0000 2017', 0)
 (b'942199189223673857', b'Got my local dev environment all set up. @droneio builds are also passing. Can finally start working on the business logic! #dockercompose #devops', b'Sun Dec 17 01:06:06 +0000 2017', 0)]


In [75]:
tweets_original = pd.DataFrame(ndarray_original)

In [76]:
tweets_original['id_str'] = tweets_original['id_str'].str.decode('utf-8')
tweets_original['full_text'] = tweets_original['full_text'].str.decode('utf-8')
tweets_original['created_at'] = tweets_original['created_at'].str.decode('utf-8')

In [82]:
tweets_original.head()

Unnamed: 0,id_str,full_text,created_at,retweeted
0,942143177330487296,Still. API keys in ipython notebooks! Yikes,Sat Dec 16 21:23:32 +0000 2017,0
1,942142988574175232,Fixing up a project I worked on in April 2016....,Sat Dec 16 21:22:47 +0000 2017,0
2,942133480699260929,https://t.co/6Ubtx6S1O8\n#bookmark,Sat Dec 16 20:45:00 +0000 2017,0
3,942128018516037633,VSCode November release \nhttps://t.co/U4nzPLE8QI,Sat Dec 16 20:23:18 +0000 2017,0
4,942124280183250954,@TooManyNickLees It does for this project I'm ...,Sat Dec 16 20:08:26 +0000 2017,0


In [83]:
tweets_original['retweeted'].value_counts()

0    1571
Name: retweeted, dtype: int64

In [84]:
tweets_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1571 entries, 0 to 1570
Data columns (total 4 columns):
id_str        1571 non-null object
full_text     1571 non-null object
created_at    1571 non-null object
retweeted     1571 non-null int8
dtypes: int8(1), object(3)
memory usage: 38.4+ KB


---

### Retweeted

In [111]:
# we only pull the fields defined here
dtype_rt = np.dtype(
    [   ('id_str', 'S20'),
        ('retweeted_status', np.dtype([('full_text', 'S400')])),
        ('created_at', 'S64'),
        ('retweeted', 'b'),
    ]
)

In [112]:
ndarray_rt = bsonnumpy.sequence_to_ndarray(
    (doc.raw for doc in collection.find({"retweeted_status": {"$exists": True}})), 
    dtype_rt, 
    collection.count(),
)

In [None]:
tweets_rt = pd.DataFrame(ndarray_rt)

In [168]:
tweets_rt['id_str'] = tweets_rt['id_str'].str.decode('utf-8')
tweets_rt['full_text'] = tweets_rt['retweeted_status'].map(lambda value: value[0].decode('utf-8'))
tweets_rt['created_at'] = tweets_rt['created_at'].str.decode('utf-8')

In [169]:
tweets_rt.head()

Unnamed: 0,id_str,retweeted_status,created_at,retweeted,full_text
0,942129673416343552,(b'A list of things that might be useful when ...,Sat Dec 16 20:29:52 +0000 2017,1,A list of things that might be useful when ans...
1,942124389352566785,(b'OUT: import pdb; pdb.set_trace()\nIN: break...,Sat Dec 16 20:08:52 +0000 2017,1,OUT: import pdb; pdb.set_trace()\nIN: breakpoi...
2,942073565154893824,"(b'Great news, Guido has pronounced dicts to r...",Sat Dec 16 16:46:55 +0000 2017,1,"Great news, Guido has pronounced dicts to reta..."
3,942028275249041408,(b'First ever PyCon happening in Pakistan http...,Sat Dec 16 13:46:57 +0000 2017,1,First ever PyCon happening in Pakistan https:/...
4,941899375424524289,(b'#python news: \xf0\x9f\x98\x80 @gvanrossum...,Sat Dec 16 05:14:45 +0000 2017,1,#python news: 😀 @gvanrossum just pronounced t...


In [170]:
tweets_rt['retweeted'].value_counts()

1    795
0    203
Name: retweeted, dtype: int64

In [171]:
tweets_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1571 entries, 0 to 1570
Data columns (total 4 columns):
id_str        1571 non-null object
full_text     1571 non-null object
created_at    1571 non-null object
retweeted     1571 non-null int8
dtypes: int8(1), object(3)
memory usage: 38.4+ KB


In [173]:
tweets_rt[tweets_rt['retweeted'] == 0]

Unnamed: 0,id_str,retweeted_status,created_at,retweeted,full_text
794,838811334296862721,(b'#TruvenHealth and #IBMWatson announced our ...,Mon Mar 06 17:59:41 +0000 2017,0,#TruvenHealth and #IBMWatson announced our 100...
795,838784428520407040,(b'Happy Birthday Toronto! #OTD in 1834 the to...,Mon Mar 06 16:12:46 +0000 2017,0,Happy Birthday Toronto! #OTD in 1834 the town ...
796,838784352343379968,(b'Is an MBA helpful for people who want to wo...,Mon Mar 06 16:12:28 +0000 2017,0,Is an MBA helpful for people who want to work ...
797,838696318340063232,(b'Challenges of using Twitter as a data sourc...,Mon Mar 06 10:22:39 +0000 2017,0,Challenges of using Twitter as a data source: ...
798,838626694273634305,(b'Getting Started with #DeepLearning in #Pyth...,Mon Mar 06 05:45:59 +0000 2017,0,Getting Started with #DeepLearning in #Python:...
799,838619112326709248,(b'A step by step guide for getting Python 3.6...,Mon Mar 06 05:15:51 +0000 2017,0,A step by step guide for getting Python 3.6 &a...
800,838618194499751938,"(b'Introducing #IBMQ quantum systems, building...",Mon Mar 06 05:12:12 +0000 2017,0,"Introducing #IBMQ quantum systems, building th..."
801,838618092985008128,(b'IBM is launching IBM Q \xe2\x80\x94 an effo...,Mon Mar 06 05:11:48 +0000 2017,0,IBM is launching IBM Q — an effort to turn its...
802,838599397789298688,(b'10 Scifi\xc2\xa0Curse Words for All Occasio...,Mon Mar 06 03:57:31 +0000 2017,0,10 Scifi Curse Words for All Occasions https:/...
803,838027902302228480,(b'A mythical full stack developer https://t.c...,Sat Mar 04 14:06:36 +0000 2017,0,A mythical full stack developer https://t.co/K...


## Looks like these are RTs. The better flag is if they have a retweeted_status indicator. Let's do them separately, combine and flag the actual retweets before we start doing analysis