In [1]:
import requests
from pymongo import MongoClient

client = MongoClient()
print(client)
db = client["nobel"]

for collection_name in ["prizes", "laureates"]:
    response = requests.get("http://api.nobelprize.org/v1/{}.json".format(collection_name[:-1]))
    documents = response.json()[collection_name]
    db[collection_name].insert_many(documents)

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)


# Get Only What You Need, and Fast

## Projection

In [69]:
#Projection is reducing multidimensional data. With a table of data
# it's about selecting columns. With MongoDB, it's about selecting substructure.

# For each field that we want to include in the projection, value of 1 is given.
# Fields that we don't include in the dictionary are not included in the projection.
# But the "_id" is alwats included in a projection by default. We should assign
# 0 in the projection dictionary to leave it out.

docs = db.laureates.find(filter={}, projection={"prizes.affiliations":"1", "_id":0})
print(list(docs)[:3])
print()
docs = db.laureates.find(filter={"gender": "org"}, projection=["bornCountry", "firstname"])
print(list(docs)[:3]) # Organizations have no bornCountry fields and rather thatn
# raising an error, MongoDB returns the documents without those fields.

# When a projection doesn't involve excluding fields, the pymongo driver accepts
# lists.

n_prizes = 0
docs = db.laureates.find({}, ["prizes"])
for doc in docs:
    n_prizes += len(doc['prizes'])
print(n_prizes)
docs = db.laureates.find({}, ["prizes"])
sum([len(doc["prizes"]) for doc in docs])

[{'prizes': [{'affiliations': [{'name': 'Munich University', 'city': 'Munich', 'country': 'Germany'}]}]}, {'prizes': [{'affiliations': [{'name': 'Leiden University', 'city': 'Leiden', 'country': 'the Netherlands'}]}]}, {'prizes': [{'affiliations': [{'name': 'Amsterdam University', 'city': 'Amsterdam', 'country': 'the Netherlands'}]}]}]

[{'_id': ObjectId('614dc045a86e98712247b188'), 'firstname': 'Institute of International Law'}, {'_id': ObjectId('614dc045a86e98712247b191'), 'firstname': 'Permanent International Peace Bureau'}, {'_id': ObjectId('614dc045a86e98712247b196'), 'firstname': 'International Committee of the Red Cross'}]
2886


2886

### Shares of the 1903 Prize in Physics


In [78]:
db.laureates.find_one({"prizes": {"$elemMatch": {"category": "physics", "year": "1903"}}}, 
                      {"firstname":1,"surname":1, "prizes.share": 1, "_id":0})

{'firstname': 'Henri', 'surname': 'Becquerel', 'prizes': [{'share': '2'}]}

### Rounding up the G.S. crew

In [83]:
docs = db.laureates.find({"firstname": {"$regex": "^G"}, "surname": {"$regex": "^S"}}, projection=["firstname", "surname"])
full_names = [doc["firstname"] + " " + doc["surname"] for doc in docs]
print(full_names)

['Glenn T. Seaborg', 'George D. Snell', 'Gustav Stresemann', 'George Bernard Shaw', 'Giorgos Seferis', 'George J. Stigler', 'George F. Smoot', 'George E. Smith', 'George P. Smith', 'Gregg Semenza', 'Glenn T. Seaborg', 'George D. Snell', 'Gustav Stresemann', 'George Bernard Shaw', 'Giorgos Seferis', 'George J. Stigler', 'George F. Smoot', 'George E. Smith', 'George P. Smith', 'Gregg Semenza', 'Glenn T. Seaborg', 'George D. Snell', 'Gustav Stresemann', 'George Bernard Shaw', 'Giorgos Seferis', 'George J. Stigler', 'George F. Smoot', 'George E. Smith', 'George P. Smith', 'Gregg Semenza']


### Doing our share of data validation


In [112]:
prizes = db.prizes.find({}, ["laureates.share"])

for prize in prizes:
    total_share = 0
    try:
        for laureate in prize["laureates"]:
            total_share += 1 / float(laureate["share"])
    except:
        pass 
    print(total_share)  

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


## Sorting

In [144]:
# To sort, you can use Python's built in fucntion itemgetter

docs = list(db.prizes.find({"category":"physics"}, ["year"]))
print([doc["year"] for doc in docs][:5])

from operator import itemgetter
docs = sorted(docs, key=itemgetter("year"))
print([doc["year"] for doc in docs][:5])

docs = sorted(docs, key=itemgetter("year"), reverse=True)
print([doc["year"] for doc in docs][:5])

# Also Mongo can do that.

cursor = db.prizes.find({"category":"physics"}, ["year"], sort=[("year",1)])
print([doc["year"] for doc in cursor][:5])

cursor = db.prizes.find({"category":"physics"}, ["year"], sort=[("year",-1)])
print([doc["year"] for doc in cursor][:5])

# In sort key field a list is used to sort by others.

for doc in db.prizes.find({"year": {"$gt": "1966", "$lt": "1970"}}, ["category", "year"],
                         sort=[("year", 1), ("category", -1)]):
    print("{year} {category}".format(**doc)) # ** dictionary unpacking.

['2020', '2019', '2018', '2017', '2016']
['1901', '1901', '1901', '1902', '1902']
['2020', '2020', '2020', '2019', '2019']
['1901', '1901', '1901', '1902', '1902']
['2020', '2020', '2020', '2019', '2019']
1967 physics
1967 physics
1967 physics
1967 peace
1967 peace
1967 peace
1967 medicine
1967 medicine
1967 medicine
1967 literature
1967 literature
1967 literature
1967 chemistry
1967 chemistry
1967 chemistry
1968 physics
1968 physics
1968 physics
1968 peace
1968 peace
1968 peace
1968 medicine
1968 medicine
1968 medicine
1968 literature
1968 literature
1968 literature
1968 chemistry
1968 chemistry
1968 chemistry
1969 physics
1969 physics
1969 physics
1969 peace
1969 peace
1969 peace
1969 medicine
1969 medicine
1969 medicine
1969 literature
1969 literature
1969 literature
1969 economics
1969 economics
1969 economics
1969 chemistry
1969 chemistry
1969 chemistry


### What the sort?

In [148]:
docs = list(db.laureates.find(
    {"born": {"$gte": "1900"}, "prizes.year": {"$gte": "1954"}},
    {"born": 1, "prizes.year": 1, "_id": 0},
    sort=[("prizes.year", 1), ("born", -1)]))
for doc in docs[:5]:
    print(doc)

{'born': '1950-12-14', 'prizes': [{'year': '1954'}, {'year': '1981'}]}
{'born': '1950-12-14', 'prizes': [{'year': '1954'}, {'year': '1981'}]}
{'born': '1950-12-14', 'prizes': [{'year': '1954'}, {'year': '1981'}]}
{'born': '1916-08-25', 'prizes': [{'year': '1954'}]}
{'born': '1916-08-25', 'prizes': [{'year': '1954'}]}


### Sorting together: MongoDB + Python

In [172]:
sample_prize = {'_id': ('5bc56145f35b634065ba1bd5'),
 'category': 'physics',
 'laureates': [{'firstname': 'Antoine Henri',
   'id': '4',
   'motivation': '"in recognition of the extraordinary services he has rendered by his discovery of spontaneous radioactivity"',
   'share': '2',
   'surname': 'Becquerel'},
  {'firstname': 'Pierre',
   'id': '5',
   'motivation': '"in recognition of the extraordinary services they have rendered by their joint researches on the radiation phenomena discovered by Professor Henri Becquerel"',
   'share': '4',
   'surname': 'Curie'},
  {'firstname': 'Marie',
   'id': '6',
   'motivation': '"in recognition of the extraordinary services they have rendered by their joint researches on the radiation phenomena discovered by Professor Henri Becquerel"',
   'share': '4',
   'surname': 'Curie, née Sklodowska'}],
 'year': '1903'}

def all_laureates(prize):
    sorted_laureates = sorted(prize["laureates"], key= itemgetter("surname"))
    surnames = [laureate["surname"] for laureate in sorted_laureates]
    all_names = " and ".join(surnames)
    return all_names

print(all_laureates(sample_prize))

docs = db.prizes.find(filter={"category":"physics"}, projection=["year", "laureates.firstname", "laureates.surname"],
                     sort=[("year", 1)])

for doc in docs:
    try:
        print("{year}: {names}".format(year=doc["year"], names=all_laureates(doc)))
    except:
        pass

Becquerel and Curie and Curie, née Sklodowska
1901: Röntgen
1901: Röntgen
1901: Röntgen
1902: Lorentz and Zeeman
1902: Lorentz and Zeeman
1902: Lorentz and Zeeman
1903: Becquerel and Curie and Curie
1903: Becquerel and Curie and Curie
1903: Becquerel and Curie and Curie
1904: Rayleigh
1904: Rayleigh
1904: Rayleigh
1905: Lenard
1905: Lenard
1905: Lenard
1906: Thomson
1906: Thomson
1906: Thomson
1907: Michelson
1907: Michelson
1907: Michelson
1908: Lippmann
1908: Lippmann
1908: Lippmann
1909: Braun and Marconi
1909: Braun and Marconi
1909: Braun and Marconi
1910: van der Waals
1910: van der Waals
1910: van der Waals
1911: Wien
1911: Wien
1911: Wien
1912: Dalén
1912: Dalén
1912: Dalén
1913: Kamerlingh Onnes
1913: Kamerlingh Onnes
1913: Kamerlingh Onnes
1914: von Laue
1914: von Laue
1914: von Laue
1915: Bragg and Bragg
1915: Bragg and Bragg
1915: Bragg and Bragg
1917: Barkla
1917: Barkla
1917: Barkla
1918: Planck
1918: Planck
1918: Planck
1919: Stark
1919: Stark
1919: Stark
1920: Guillaume

### Gap years

In [175]:
original_categories = db.prizes.distinct("category", {"year": "1901"})
print(original_categories)

docs = db.prizes.find(filter={}, projection={"year":1,"category":1,"_id":0},
                     sort = [("year", 1), ("category",-1)])
for doc in docs:
    print(doc)

['chemistry', 'literature', 'peace', 'physics', 'medicine']
{'year': '1901', 'category': 'physics'}
{'year': '1901', 'category': 'physics'}
{'year': '1901', 'category': 'physics'}
{'year': '1901', 'category': 'peace'}
{'year': '1901', 'category': 'peace'}
{'year': '1901', 'category': 'peace'}
{'year': '1901', 'category': 'medicine'}
{'year': '1901', 'category': 'medicine'}
{'year': '1901', 'category': 'medicine'}
{'year': '1901', 'category': 'literature'}
{'year': '1901', 'category': 'literature'}
{'year': '1901', 'category': 'literature'}
{'year': '1901', 'category': 'chemistry'}
{'year': '1901', 'category': 'chemistry'}
{'year': '1901', 'category': 'chemistry'}
{'year': '1902', 'category': 'physics'}
{'year': '1902', 'category': 'physics'}
{'year': '1902', 'category': 'physics'}
{'year': '1902', 'category': 'peace'}
{'year': '1902', 'category': 'peace'}
{'year': '1902', 'category': 'peace'}
{'year': '1902', 'category': 'medicine'}
{'year': '1902', 'category': 'medicine'}
{'year': '19

{'year': '2018', 'category': 'economics'}
{'year': '2018', 'category': 'chemistry'}
{'year': '2018', 'category': 'chemistry'}
{'year': '2018', 'category': 'chemistry'}
{'year': '2019', 'category': 'physics'}
{'year': '2019', 'category': 'physics'}
{'year': '2019', 'category': 'physics'}
{'year': '2019', 'category': 'peace'}
{'year': '2019', 'category': 'peace'}
{'year': '2019', 'category': 'peace'}
{'year': '2019', 'category': 'medicine'}
{'year': '2019', 'category': 'medicine'}
{'year': '2019', 'category': 'medicine'}
{'year': '2019', 'category': 'literature'}
{'year': '2019', 'category': 'literature'}
{'year': '2019', 'category': 'literature'}
{'year': '2019', 'category': 'economics'}
{'year': '2019', 'category': 'economics'}
{'year': '2019', 'category': 'economics'}
{'year': '2019', 'category': 'chemistry'}
{'year': '2019', 'category': 'chemistry'}
{'year': '2019', 'category': 'chemistry'}
{'year': '2020', 'category': 'physics'}
{'year': '2020', 'category': 'physics'}
{'year': '2020

## What are indexes?


In [178]:
%%timeit
docs = list(db.prizes.find({"year":"1901"}))

1.94 ms ± 40.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [180]:
%%timeit
docs = list(db.prizes.find({}, sort=[("year", 1)]))

28.2 ms ± 964 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [181]:
db.prizes.create_index([("year", 1)])

'year_1'

In [182]:
%%timeit
docs = list(db.prizes.find({"year":"1901"}))

733 µs ± 9.22 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [183]:
%%timeit
docs = list(db.prizes.find({}, sort=[("year", 1)]))

22.7 ms ± 125 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [196]:
db.prizes.create_index([("category", 1), ("year",1)])

print(list(db.prizes.find({"category": "economics"}, {"year":1,"_id":0})))

# by indexing by category and year, mongo doesn't need to examin the collection
# itself to execute the query. The query is "covered" by the index.

print(db.prizes.find_one({"category": "economics"},{"year": 1, "_id":0}, 
                    sort=[("year", 1)]))

# some tools to troubleshoot query performance.
print(db.laureates.index_information()) # which indexes exist for a collection

db.laureates.find({"firstname": "Marie"}, {"bornCounry":1, "_id":0}).explain()
# details about the query



[{'year': '1969'}, {'year': '1969'}, {'year': '1969'}, {'year': '1970'}, {'year': '1970'}, {'year': '1970'}, {'year': '1971'}, {'year': '1971'}, {'year': '1971'}, {'year': '1972'}, {'year': '1972'}, {'year': '1972'}, {'year': '1973'}, {'year': '1973'}, {'year': '1973'}, {'year': '1974'}, {'year': '1974'}, {'year': '1974'}, {'year': '1975'}, {'year': '1975'}, {'year': '1975'}, {'year': '1976'}, {'year': '1976'}, {'year': '1976'}, {'year': '1977'}, {'year': '1977'}, {'year': '1977'}, {'year': '1978'}, {'year': '1978'}, {'year': '1978'}, {'year': '1979'}, {'year': '1979'}, {'year': '1979'}, {'year': '1980'}, {'year': '1980'}, {'year': '1980'}, {'year': '1981'}, {'year': '1981'}, {'year': '1981'}, {'year': '1982'}, {'year': '1982'}, {'year': '1982'}, {'year': '1983'}, {'year': '1983'}, {'year': '1983'}, {'year': '1984'}, {'year': '1984'}, {'year': '1984'}, {'year': '1985'}, {'year': '1985'}, {'year': '1985'}, {'year': '1986'}, {'year': '1986'}, {'year': '1986'}, {'year': '1987'}, {'year': 

{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'nobel.laureates',
  'indexFilterSet': False,
  'parsedQuery': {'firstname': {'$eq': 'Marie'}},
  'winningPlan': {'stage': 'PROJECTION',
   'transformBy': {'bornCounry': 1, '_id': 0},
   'inputStage': {'stage': 'COLLSCAN',
    'filter': {'firstname': {'$eq': 'Marie'}},
    'direction': 'forward'}},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 3,
  'executionTimeMillis': 51,
  'totalKeysExamined': 0,
  'totalDocsExamined': 2865,
  'executionStages': {'stage': 'PROJECTION',
   'nReturned': 3,
   'executionTimeMillisEstimate': 0,
   'works': 2867,
   'advanced': 3,
   'needTime': 2863,
   'needYield': 0,
   'saveState': 22,
   'restoreState': 22,
   'isEOF': 1,
   'invalidates': 0,
   'transformBy': {'bornCounry': 1, '_id': 0},
   'inputStage': {'stage': 'COLLSCAN',
    'filter': {'firstname': {'$eq': 'Marie'}},
    'nReturned': 3,
    'executionTimeMillisEstimate': 0,
    'works': 2867,
    'adv

### High-share categories

In [198]:
%%timeit
db.prizes.distinct("category", {"laureates.share": {"$gt": "3"}})

5.43 ms ± 87.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [199]:
%%timeit
db.prizes.create_index([("laureates.share", 1), ("category", 1)])
db.prizes.distinct("category", {"laureates.share": {"$gt": "3"}})

1.09 ms ± 36.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### Recently single?


In [210]:
index_model = [("category",1),("year", -1)]
db.prizes.create_index(index_model)

report = ""

for category in sorted(db.prizes.distinct("category")):
    doc = db.prizes.find_one({"category": category, "laureates.share": "1"}, sort = [("year", -1)])
    report += "{category}: {year}\n".format(**doc)
print(report)

chemistry: 2011
economics: 2017
literature: 2020
medicine: 2016
peace: 2020
physics: 1992



### Born and affiliated


In [235]:
from collections import Counter

db.laureates.create_index([("bornCountry",1)])

n_born_and_affiliated = {country: db.laureates.count_documents({"bornCountry": country,
                                                               "prizes.affiliations.country": country}) for country in db.laureates.distinct("bornCountry")}

five_most_common = Counter(n_born_and_affiliated).most_common(5)
print(five_most_common)

[('USA', 744), ('United Kingdom', 174), ('Germany', 111), ('France', 78), ('Japan', 54)]


## Limits

In [273]:
for doc in db.prizes.find({}, ["laureates.share"]):
    try:
        share_is_three = [laureates["share"] == "3" for laureates in doc["laureates"]]
        assert all(share_is_three) or not any(share_is_three)
    except:
        pass
for doc in db.prizes.find({"laureates.share":"3"}, limit=3): # limitted to prevent longer results
    print("{year} {category}".format(**doc))
print() 
for doc in db.prizes.find({"laureates.share":"3"}, limit=3, skip=3): # you can skip some of them
    print("{year} {category}".format(**doc))
    
# You can also chain methods.

print()
for doc in db.prizes.find({"laureates.share":"3"}).limit(3):
    print("{year} {category}".format(**doc))
print()
for doc in db.prizes.find({"laureates.share":"3"}).skip(3).limit(3):
    print("{year} {category}".format(**doc))
print()
for doc in db.prizes.find({"laureates.share":"3"}).skip(3).limit(3).sort([("year", 1)]): 
    print("{year} {category}".format(**doc)) # sort("year",1), sort("year")
    
# You cannot use cursor methods with find_one.

2019 chemistry
2017 chemistry
2016 chemistry

2015 chemistry
2014 chemistry
2013 chemistry

2019 chemistry
2017 chemistry
2016 chemistry

2015 chemistry
2014 chemistry
2013 chemistry

1945 medicine
1945 medicine
1945 medicine


### Setting a new limit?


In [277]:
list(db.prizes.find({"category": "economics"}, {"year": 1, "_id": 0}).sort("year")
     .limit(3).limit(5))

[{'year': '1969'},
 {'year': '1969'},
 {'year': '1969'},
 {'year': '1970'},
 {'year': '1970'}]

### The first five prizes with quarter shares


In [278]:
from pprint import pprint

filter = {"laureates.share": "4"}
projection = ["category", "year", "laureates.motivation"]

cursor = db.prizes.find(filter, projection).sort("year", -1).limit(5)
pprint(list(cursor))

[{'_id': ObjectId('615308abc13d1667d2ffbefe'),
  'category': 'physics',
  'laureates': [{'motivation': '"for the discovery that black hole formation '
                               'is a robust prediction of the general theory '
                               'of relativity"'},
                {'motivation': '"for the discovery of a supermassive compact '
                               'object at the centre of our galaxy"'},
                {'motivation': '"for the discovery of a supermassive compact '
                               'object at the centre of our galaxy"'}],
  'year': '2020'},
 {'_id': ObjectId('614e53de176e1d398b7dda2f'),
  'category': 'physics',
  'laureates': [{'motivation': '"for the discovery that black hole formation '
                               'is a robust prediction of the general theory '
                               'of relativity"'},
                {'motivation': '"for the discovery of a supermassive compact '
                               'object at

### Pages of particle-prized people

In [295]:
def get_particle_laureates(page_number=1, page_size=3):
    if page_number < 1:
        raise ValueError("Pages are natural numbers (starting from 1).")
    particle_laureates = list(db.laureates.find(
            {"prizes.motivation": {"$regex": "particle"}},
            ["firstname", "surname", "prizes"])
        .sort([("prizes.year", 1), ("surname", 1)])
        .skip(page_size * (page_number - 1))
        .limit(page_size))
    return particle_laureates
pages = [get_particle_laureates(page_number=page) for page in range(1,9)]
pprint(pages[0])

[{'_id': ObjectId('614dc045a86e98712247afdb'),
  'firstname': 'C.T.R.',
  'prizes': [{'affiliations': [{'city': 'Cambridge',
                                'country': 'United Kingdom',
                                'name': 'University of Cambridge'}],
              'category': 'physics',
              'motivation': '"for his method of making the paths of '
                            'electrically charged particles visible by '
                            'condensation of vapour"',
              'share': '2',
              'year': '1927'}],
  'surname': 'Wilson'},
 {'_id': ObjectId('614e53de176e1d398b7ddcd7'),
  'firstname': 'C.T.R.',
  'prizes': [{'affiliations': [{'city': 'Cambridge',
                                'country': 'United Kingdom',
                                'name': 'University of Cambridge'}],
              'category': 'physics',
              'motivation': '"for his method of making the paths of '
                            'electrically charged particles visi