In [1]:
!pip install folium --quiet

In [2]:
import pandas as pd
from pymongo import MongoClient

%matplotlib inline

# Connect to the MongoDB server and select the `twitter` database.
mongo_client = MongoClient(host='this-mongo.cc', port=27016)
database_reference = mongo_client['twitter']

In [3]:
# Handle to the collection that every query below runs against.
collection_reference = database_reference['instructor_test_group']

In [4]:
# Collection.count() is deprecated since PyMongo 3.7 and removed in 4.0.
# count_documents({}) with an empty filter counts every document.
collection_reference.count_documents({})

20000

## The Aggregation Pipeline

A call to the aggregation framework defines a pipeline, the **aggregation pipeline** (shown in the figure below), in which the output of each step becomes the input to the next step. Each step performs a single operation on its input documents and emits the transformed documents as its output.

![](https://www.evernote.com/l/AAGxerRxKLZNFrjqxlYK2HPz1R11tr95FFkB/image.png)

### Useful Aggregation Pipeline Operations

- `$project` // Specify fields to be placed in the output document (projected).
- `$match` // Select documents to be processed, similar to find().
- `$limit` // Limit the number of documents to be passed to the next step.
- `$skip` // Skip a specified number of documents.
- `$unwind` // Expand an array, generating one output document for each array entry.
- `$group` // Group documents by a specified key.
- `$sort` // Sort documents.
- `$geoNear` // Select documents near a geospatial location.
- `$out` // Write the results of the pipeline to a collection (new in v2.6).
- `$redact` // Control access to certain data (new in v2.6).

In [None]:
def dictionary_to_datestring(x):
    """Build a "month-day-year" string from a mapping of date parts.

    Parameters
    ----------
    x : mapping
        Must provide the keys 'month', 'day' and 'year'.

    Returns
    -------
    str
        The three values joined by hyphens in month, day, year order.
        Values are rendered with their default formatting (no zero padding).

    Raises
    ------
    KeyError
        If any of the three keys is missing from a dict input.
    """
    parts = [x[key] for key in ('month', 'day', 'year')]
    return "{}-{}-{}".format(*parts)

In [None]:
# NOTE(review): `date_to_id` and `group_by_date` are pipeline stages that are
# not defined in any visible cell; they must already exist in the kernel
# before this cell runs. TODO: confirm where they are defined.
cursor = collection_reference.aggregate([date_to_id, group_by_date])

# One row per aggregation result. The `_id` of each result is turned into a
# datestring, parsed to a timestamp, and used as the sorted index.
daily_tweets = (
    pd.DataFrame(list(cursor))
    .assign(
        date=lambda frame: pd.to_datetime(
            frame['_id'].apply(dictionary_to_datestring)
        )
    )
    .drop('_id', axis=1)
    .sort_values('date')
    .set_index('date')
)
daily_tweets.plot()

## Tweet Locations

In [None]:
# Filter: keep documents whose `geo` field is not null.
nonnull_geo = {'geo': {'$ne': None}}
# Projection: return only the `geo` field of each matching document.
just_geo = {'geo': 1}

cursor = collection_reference.find(nonnull_geo, just_geo)
# Cursor.count() is deprecated since PyMongo 3.7 and removed in 4.0; count the
# matching documents on the collection with the same filter instead.
collection_reference.count_documents(nonnull_geo)

In [None]:
# Issue the query in this cell instead of draining the `cursor` created in the
# previous one. A PyMongo cursor is exhausted after a single pass, so
# re-running this cell would otherwise silently produce an empty DataFrame.
geo_tweets = pd.DataFrame(list(collection_reference.find(nonnull_geo, just_geo)))

In [None]:
def parse_geo_from_tweets(tweets):
    """Expand the `geo` column of `tweets` into its own DataFrame.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Must contain a 'geo' column; each entry becomes one row of the result.

    Returns
    -------
    pandas.DataFrame
        A new frame built from the list of 'geo' entries, with a fresh
        default index (the index of `tweets` is not carried over).
    """
    geo_records = tweets['geo'].tolist()
    return pd.DataFrame(geo_records)

In [None]:
geo = parse_geo_from_tweets(geo_tweets)
# Preview a few rows. The fixed seed makes the preview identical on every run,
# and the size is capped at len(geo) because sample() raises when asked for
# more rows than exist (sampling without replacement).
geo.sample(min(5, len(geo)), random_state=42)

In [None]:
import folium

# Initial center of the map view, passed to folium as `location`.
starting_loc = [34.0689, -118.4452]
la_map = folium.Map(
    location=starting_loc,
    zoom_start=13,
)

In [None]:
# Drop one marker on the map for every entry in the `coordinates` column.
for marker_location in geo['coordinates'].values:
    marker = folium.Marker(marker_location)
    marker.add_to(la_map)

In [None]:
# Bare last expression: the notebook renders the map as this cell's output.
la_map