In [None]:
from pymongo import MongoClient

## Flexibly Structured Data

Creating a MongoDB database

In [None]:
# Connect to MongoDB; MongoClient() with no arguments targets localhost on the default port.
client = MongoClient()

# Select the local 'nobel' database; MongoDB creates it lazily on first write.
db = client['nobel']

# NOTE(review): `response` is not defined in any visible cell — presumably an
# HTTP response whose JSON payload holds the documents; define it before this
# line. The key 'collection_name' looks like a placeholder for a real collection
# name (e.g. 'prizes' or 'laureates', which later cells query) — TODO confirm.
documents = response.json()['collection_name']

# Insert the documents; the collection is created on the fly by the first insert.
db['collection_name'].insert_many(documents)

Accessing data from MongoDB

In [None]:
# Option 1: dictionary-style access
# The client behaves like a dictionary of databases
db = client['nobel']

# A database behaves like a dictionary of collections
prizes_colection = db['prizes']

# Option 2: dot notation
# Databases are attributes of the client
db = client.nobel

# Collections are attributes of a database
prizes_colection = db.prizes

Count Documents

In [None]:
# An empty filter document matches every document in a collection.
# NOTE(review): the name `filter` shadows Python's built-in filter() from here on.
filter = {}

# Count the documents in each collection that match the filter (here: all of them)
n_prizes = db.prizes.count_documents(filter)
n_laureates = db.laureates.count_documents(filter)

Fetch documents

In [None]:
# NOTE: a notebook cell only displays the value of its last expression, so the
# first three calls below run but their results are not shown.

# List the names of all databases managed by the client
client.list_database_names()

# List the names of all collections inside the database
db.list_collection_names()

# Fetch a single document matching `filter` to inspect its structure
db.prizes.find_one(filter)

# Get the top-level field names of one document
db.prizes.find_one().keys()

Find Documents

In [None]:
# Count the laureates whose gender is recorded as 'female'
filter_document = {'gender': 'female'}
db.laureates.count_documents(filter_document)

# Query operators

# $in: <list> matches any of the listed values
# Count the laureates who died in France OR the USA
filter_document = {'diedCountry': {'$in': ['France', 'USA']}}
db.laureates.count_documents(filter_document)

# $ne: <value> not equal (also matches documents where the field is absent)
# Count the laureates who did not die in France
filter_document = {'diedCountry': {'$ne': 'France'}}
db.laureates.count_documents(filter_document)

# $gt: <value> greater than
# $gte: <value> greater than or equal
# $lt: <value> less than
# $lte: <value> less than or equal
# Strings compare lexicographically, so the lower bound must sort before the
# upper bound: this counts countries after 'Belgium' up to and including 'USA'.
# (With the bounds reversed the range is empty and the count is always 0.)
filter_document = {'diedCountry': {'$gt': 'Belgium', '$lte': 'USA'}}
db.laureates.count_documents(filter_document)


# Example
# Save a filter for laureates who died in the USA and were not born there
criteria = {
    'diedCountry': 'USA',
    'bornCountry': {"$ne": 'USA'},
}
count = db.laureates.count_documents(criteria)
print(count)

Dot notation: reach into substructure

In [None]:
# Dot notation reaches into nested substructure: count the laureates that have
# a prize affiliation whose name is exactly 'University of California'
criteria = {'prizes.affiliations.name': 'University of California'}
db.laureates.count_documents(criteria)

Some fields in MongoDB documents do not have to be present

In [None]:
# Count the laureates whose documents have no bornCountry field at all
criteria = {'bornCountry' : {'$exists' : False}}
db.laureates.count_documents(criteria)

# Count the laureates with more than one prize: 'prizes.1' addresses the element
# at index 1 (the second one) of the prizes array, so it only exists when the
# array holds at least two prizes
criteria = {'prizes.1' : {'$exists' : True}}
db.laureates.count_documents(criteria)

## Working with Distinct Values and Sets

Find distinct values of one field in a document

In [None]:
# Collect the distinct values of the gender field across all laureates
db.laureates.distinct('gender')

# Dot notation works here too: distinct category values found inside the prizes substructure
db.laureates.distinct('prizes.category')

Find distinct with filter

In [None]:
# Distinct prize categories of laureates that have at least one prize with share '4'
# (presumably a prize split four ways — TODO confirm the meaning of `share`)
db.laureates.distinct('prizes.category', {'prizes.share' : '4'})

# Distinct prize categories of laureates that won more than one prize
# ('prizes.1' exists only when there is a second prize)
db.laureates.distinct('prizes.category', {'prizes.1' : {'$exists' : True}})

# Distinct affiliation countries found on the prizes of laureates born in the USA
db.laureates.distinct('prizes.affiliations.country', {'bornCountry': 'USA'})

Element Match

In [None]:
# $elemMatch matches documents whose array field contains at least one element
# satisfying ALL of the given conditions at once. Here: count the laureates with
# at least one prize that is both in physics and has share '1'.
db.laureates.count_documents(
    {'prizes': {'$elemMatch': {'category': 'physics', 'share': '1'}}}
)

In [None]:
def _elem_match_since_1945(share_condition):
    """Build a laureates filter matching a single prize element by share.

    The matched prize must be outside physics, chemistry and medicine, must
    satisfy ``share_condition`` on its ``share`` field, and must have a year
    of "1945" or later (years are stored as strings, so the comparison is
    lexicographic).
    """
    return {
        "prizes": {"$elemMatch": {
            "category": {"$nin": ["physics", "chemistry", "medicine"]},
            "share": share_condition,
            "year": {"$gte": "1945"},
        }}}


# Filter for laureates with an unshared prize (share equal to "1")
unshared = _elem_match_since_1945("1")

# Filter for laureates with a shared prize (share different from "1")
shared = _elem_match_since_1945({"$ne": "1"})

# Ratio of laureates with unshared prizes to laureates with shared prizes
n_unshared = db.laureates.count_documents(unshared)
n_shared = db.laureates.count_documents(shared)
ratio = n_unshared / n_shared
print(ratio)

Regex

In [None]:
from bson.regex import Regex

# count_documents() takes the filter document as its first argument; passing a
# field name first is the distinct() signature and makes count_documents() fail.

# Case-insensitive match ('i' option): count laureates whose bornCountry contains 'poland'
db.laureates.count_documents({'bornCountry': {'$regex': 'poland', '$options': 'i'}})

# Anchored prefix match with a literal '('; the raw string keeps '\(' from being
# read as an (invalid) Python string escape
db.laureates.count_documents({'bornCountry': Regex(r'^Poland \(now')})

## Get Only What You Need, and Fast

Projection

In [None]:
# Projection as a dict: 1 includes a field, 0 excludes it.
# _id is returned by default, so it has to be excluded explicitly.
# The field path is 'prizes.affiliations' (as used by the other queries in this
# notebook); a misspelled path silently projects nothing.
docs = db.laureates.find(
    filter={},
    projection={'prizes.affiliations': 1,
                '_id': 0})

print(list(docs))

# Projection as a list: every listed field is included (_id is still returned)
docs = db.laureates.find(
    filter={'gender': 'org'},
    projection=['firstname', 'bornCountry'])

In [None]:
# Laureates whose first name starts with "G" and whose surname starts with "S";
# the projection requests only those two name fields
docs = db.laureates.find(
    filter={
        "firstname": {"$regex": "^G"},
        "surname": {"$regex": "^S"},
    },
    projection=["firstname", "surname"],
)

# Build "<firstname> <surname>" for every returned document
full_names = [
    laureate["firstname"] + " " + laureate["surname"]
    for laureate in docs
]

# Show the resulting names
print(full_names)

Sorting

In [None]:
from operator import itemgetter

# Client-side sort: fetch the year of every physics prize, then order the list in Python
docs = list(db.prizes.find(filter={'category': 'physics'}, projection=['year']))
docs.sort(key=itemgetter('year'))
print(list(map(itemgetter('year'), docs)))

# Server-side sort: let MongoDB order the results (1 = ascending, -1 = descending)
docs = db.prizes.find(
    filter={'category': 'physics'},
    projection=['year'],
    sort=[('year', 1)],
)
print(list(map(itemgetter('year'), docs)))

# Sorting on several fields: year ascending first, then category descending
docs = db.prizes.find(
    filter={'category': 'physics'},
    projection=['category', 'year'],
    sort=[('year', 1), ('category', -1)],
)
print(list(map(itemgetter('year'), docs)))

Indexing

In [None]:
# Single-field index on year in ascending order (1)
db.prizes.create_index([('year', 1)])

# Example
# Compound index model: category ascending, then year descending
index_model = [('category', 1), ('year', -1)]
db.prizes.create_index(index_model)

# For each category (in alphabetical order), look up the prize with the highest
# year that has a laureate share of "1", and emit one "category: year" line for it
report_lines = []
for category in sorted(db.prizes.distinct("category")):
    doc = db.prizes.find_one(
        filter={'category': category, "laureates.share": "1"},
        sort=[('year', -1)],
    )
    report_lines.append("{category}: {year}\n".format(**doc))
report = "".join(report_lines)

print(report)

Limits

In [None]:
# find() returns a cursor; the options below shape what it yields when iterated.

# Return at most 3 matching documents
db.prizes.find({'laureates.share' : '3'}, limit=3)

# Skip the first 3 matching documents, then return at most the next 3
db.prizes.find({'laureates.share' : '3'}, limit=3, skip=3)

# The same kind of options as chained cursor methods: sort by year ascending, then keep the first 3
db.prizes.find({'laureates.share' : '3'}).sort([('year',1)]).limit(3)

## Aggregation Pipelines: Let the Server Do It For You

Aggregation Examples

In [None]:
# Baseline with find(): the prize years of up to 3 laureates born in the USA
cursor = db.laureates.find(
    filter={'bornCountry': 'USA'},
    projection={'prizes.year':1},
    limit=3
)

# The same query as an aggregation pipeline: each find() argument becomes a
# stage ($match = filter, $project = projection, $limit = limit)
cursor = db.laureates.aggregate([
    {'$match' : {'bornCountry': 'USA'}},
    {'$project' : {'prizes.year': 1}},
    {'$limit' : 3}
])

Count aggregation

In [None]:
# Count the USA-born laureates inside the pipeline: $match keeps them and
# $count emits a single document holding the total under the given field name.
# The Collection method is aggregate(); `aggregation` is not a method and
# calling it raises.
db.laureates.aggregate([
    {'$match': {'bornCountry': 'USA'}},
    {'$count': 'n_USA-born-laureates'}
])

In [None]:
# Project a boolean `solo_winner` for every laureate: the aggregation $in
# expression is true when '1' appears among the values of '$prizes.share'
db.laureates.aggregate([
    {"$project" : {'solo_winner' : {'$in' : ['1', '$prizes.share']}}}
])

GroupBy

In [None]:
# Equivalent of distinct('bornCountry'): one group per bornCountry value.
# Field paths in an aggregation expression need the '$' prefix; without it the
# string is a constant and every document falls into a single group.
db.laureates.aggregate([
    {'$group': {'_id': '$bornCountry'}}
])

# Total number of prizes: compute the size of each laureate's prizes array,
# then sum the sizes into one group (_id None = a single group for all documents)
db.laureates.aggregate([
    {"$project": {"n_prizes": {"$size": "$prizes"}}},
    {"$group": {"_id": None, "n_prizes_total": {'$sum': '$n_prizes'}}}
])

In [None]:
# Total number of prizes held by organizations, computed as the sum of the
# sizes of their "prizes" arrays.
keep_organizations = {"$match": {"gender": "org"}}
size_of_prizes = {"$project": {"n_prizes": {"$size": "$prizes"}}}
sum_of_sizes = {"$group": {"_id": None, "n_prizes_total": {"$sum": "$n_prizes"}}}

pipeline = [keep_organizations, size_of_prizes, sum_of_sizes]

org_prize_totals = list(db.laureates.aggregate(pipeline))
print(org_prize_totals)

In [None]:
from collections import OrderedDict

# Categories that were awarded in 1901, de-duplicated and in sorted order
original_categories = list(set(db.prizes.distinct("category", {"year": "1901"})))
original_categories.sort()

pipeline = [
    # Keep only prizes in one of the 1901 categories, and only the fields needed below.
    {"$match": {"category": {"$in": original_categories}}},
    {"$project": {"category": 1, "year": 1}},

    # One group per year, gathering the set of categories awarded that year.
    {"$group": {"_id": "$year", "categories": {"$addToSet": "$category"}}},

    # The 1901 categories that are absent from this year's set.
    {"$project": {"missing": {"$setDifference": [original_categories, "$categories"]}}},

    # Drop years where nothing is missing ("missing.0" exists only for a non-empty array).
    {"$match": {"missing.0": {"$exists": True}}},

    # Latest year first; "_id" holds the year after the $group stage.
    {"$sort": OrderedDict([("_id", -1)])},
]

# Print one "year: category, category, ..." line per year with missing categories
for doc in db.prizes.aggregate(pipeline):
    missing = ", ".join(sorted(doc["missing"]))
    print("{year}: {missing}".format(year=doc["_id"], missing=missing))