In [1]:
import requests
from pymongo import MongoClient

client = MongoClient()
print(client)
db = client["nobel"]

for collection_name in ["prizes", "laureates"]:
    response = requests.get("http://api.nobelprize.org/v1/{}.json".format(collection_name[:-1]))
    documents = response.json()[collection_name]
    db[collection_name].insert_many(documents)

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)


In [2]:
while 1:
    if db.prizes.count_documents({'overallMotivation': {"$exists": True}}) != 0:
        db.prizes.find_one_and_delete({'overallMotivation': {"$exists": True}})
    else:
        break

# Aggregation Pipelines: Let the Server Do It For You

## Intro to Aggregation

In [3]:
cursor = db.laureates.find(filter={"bornCountry": "USA"}, projection={"prizes.year": 1},
                          limit=3)
for doc in cursor:
    print(doc["prizes"])
# An aggregation pipeline contains stages. It is a list, a sequence of stages.

cursor = db.laureates.aggregate([{"$match": {"bornCountry": "USA"}},
                                 {"$project": {"prizes.year": 1}},
                                 {"$limit": 3}])
for doc in cursor:
    print(doc["prizes"])
    
# sorting and skipping are also available as pipleine stages.

from collections import OrderedDict

print(list(db.laureates.aggregate([
    {"$match": {"bornCountry": "USA"}},
    {"$project": {"prizes.year": 1, "_id": 0}},
    {"$sort": OrderedDict([("prizes.year",1)])},
    {"$skip": 1},
    {"$limit": 3}
])))

print(list(db.laureates.aggregate([
    {"$match": {"bornCountry": "USA"}},
    {"$count": "n_USA-born-laureates"}
])))

[{'year': '1923'}]
[{'year': '1927'}]
[{'year': '1936'}]
[{'year': '1923'}]
[{'year': '1927'}]
[{'year': '1936'}]
[{'prizes': [{'year': '1906'}]}, {'prizes': [{'year': '1906'}]}, {'prizes': [{'year': '1906'}]}]
[{'n_USA-born-laureates': 3906}]


### Sequencing stages

In [4]:
cursor = (db.laureates.find(
    projection={"firstname": 1, "prizes.year": 1, "_id": 0},
    filter={"gender": "org"})
 .limit(3).sort("prizes.year", -1))

project_stage = {"$project": {"firstname": 1, "prizes.year": 1, "_id": 0}}
match_stage = {"$match": {"gender": "org"}}
limit_stage = {"$limit": 3}
sort_stage = {"$sort": {"prizes.year": -1}}

for i in cursor:
    print(i)
    
a =db.laureates.aggregate([
    {"$match": {"gender": "org"}},
    {"$project": {"firstname": 1, "prizes.year": 1, "_id": 0}},
    {"$sort": {"prizes.year": -1}},
    {"$limit": 3}
])
for i in a:
    print(i)

{'firstname': 'World Food Programme', 'prizes': [{'year': '2020'}]}
{'firstname': 'World Food Programme', 'prizes': [{'year': '2020'}]}
{'firstname': 'World Food Programme', 'prizes': [{'year': '2020'}]}
{'firstname': 'World Food Programme', 'prizes': [{'year': '2020'}]}
{'firstname': 'World Food Programme', 'prizes': [{'year': '2020'}]}
{'firstname': 'World Food Programme', 'prizes': [{'year': '2020'}]}


### Aggregating a few individuals' country data

In [5]:
cursor = (db.laureates.find(
    {"gender": {"$ne": "org"}},
    ["bornCountry", "prizes.affiliations.country"]
).limit(3))

for i in cursor:
    print(i)
    
pipeline = [
    {"$match": {"gender": {"$ne": "org"}}},
    {"$project": {"bornCountry":1, "prizes.affiliations.country":1}},
    {"$limit": 3}
]
for i in db.laureates.aggregate(pipeline):
    print("{bornCountry}: {prizes}".format(**i))

{'_id': ObjectId('614dc045a86e98712247afbb'), 'bornCountry': 'Prussia (now Germany)', 'prizes': [{'affiliations': [{'country': 'Germany'}]}]}
{'_id': ObjectId('614dc045a86e98712247afbc'), 'bornCountry': 'the Netherlands', 'prizes': [{'affiliations': [{'country': 'the Netherlands'}]}]}
{'_id': ObjectId('614dc045a86e98712247afbd'), 'bornCountry': 'the Netherlands', 'prizes': [{'affiliations': [{'country': 'the Netherlands'}]}]}
Prussia (now Germany): [{'affiliations': [{'country': 'Germany'}]}]
the Netherlands: [{'affiliations': [{'country': 'the Netherlands'}]}]
the Netherlands: [{'affiliations': [{'country': 'the Netherlands'}]}]


### Passing the aggregation baton to Python


In [6]:
from collections import OrderedDict
from itertools import groupby
from operator import itemgetter

original_categories = set(db.prizes.distinct("category", {"year": "1901"}))

# Save an pipeline to collect original-category prizes
pipeline = [
    {"$match": {"category": {"$in": list(original_categories)}}},
    {"$project": {"year": 1, "category": 1}},
    {"$sort": OrderedDict([("year", -1)])}
]
cursor = db.prizes.aggregate(pipeline)
for key, group in groupby(cursor, key=itemgetter("year")):
    missing = original_categories - {doc["category"] for doc in group}
    if missing:
        print("{year}: {missing}".format(year=key, missing=", ".join(sorted(missing))))

2019: physics
2018: physics
2003: chemistry
2002: chemistry
2000: physics
1995: physics
1994: physics
1993: chemistry
1972: peace
1967: peace
1966: peace
1956: peace
1955: peace
1948: peace
1943: literature, peace
1939: peace
1935: literature
1934: physics
1933: chemistry
1932: peace
1931: physics
1928: peace
1925: medicine
1924: chemistry, peace
1923: peace
1921: medicine
1919: chemistry
1918: literature, medicine, peace
1917: chemistry, medicine
1916: chemistry, medicine, peace, physics
1915: medicine, peace
1914: literature, peace


### Aggregation Operators and Grouping

In [7]:
print(db.laureates.aggregate([
    {"$project": {"n_prizes": {"$size": "$prizes"}}}
]).next())
# Expressin object assigns the fielsd n-prizes to the size of the prizes array.

print(db.laureates.aggregate([
    {"$project": {"solo_winner": {"$in": ["1", "$prizes.share"]}}}
]).next())

# A group stage takes an expression object that must map the "_id" field.
# "_id" field must be unique.

print(list(db.laureates.aggregate([
    {"$project": {"n_prizes": {"$size": "$prizes"}}},
    {"$group": {"_id": None, "n_prizes_total": {"$sum":"$n_prizes"}}}
])))

{'_id': ObjectId('614dc045a86e98712247afbb'), 'n_prizes': 1}
{'_id': ObjectId('614dc045a86e98712247afbb'), 'solo_winner': True}
[{'_id': None, 'n_prizes_total': 13468}]


In [8]:
list(db.prizes.aggregate([
    {"$project": {"allThree": {"$setEquals": ["$laureates.share", ["3"]]},
                  "noneThree": {"$not": {"$setIsSubset": [["3"], "$laureates.share"]}}}},
    {"$match": {"$nor": [{"allThree": True}, {"noneThree": True}]}}]))

[]

### Organizing prizes

In [9]:
pipeline = [
    {"$match": {"gender": "org"}},
    {"$project": {"n_prizes": {"$size": "$prizes"}}},
    {"$group": {"_id": None, "n_prizes_total": {"$sum": "$n_prizes"}}}
]
print(list(db.laureates.aggregate(pipeline)))

[{'_id': None, 'n_prizes_total': 392}]


### Gap years, aggregated

In [11]:
from collections import OrderedDict

original_categories = sorted(set(db.prizes.distinct("category", {"year": "1901"})))
pipeline = [
    {"$match": {"category": {"$in": original_categories}}},
    {"$project": {"category": 1, "year": 1}},
    
    # Collect the set of category values for each prize year.
    {"$group": {"_id": "$year", "categories": {"$addToSet": "$category"}}},
    
    # Project categories *not* awarded (i.e., that are missing this year).
    {"$project": {"missing": {"$setDifference": [original_categories, "$categories"]}}},
    
    # Only include years with at least one missing category
    {"$match": {"missing.0": {"$exists": True}}},
    
    # Sort in reverse chronological order. Note that "_id" is a distinct year at this stage.
    {"$sort": OrderedDict([("_id", -1)])},
]
for doc in db.prizes.aggregate(pipeline):
    print("{year}: {missing}".format(year=doc["_id"],missing=", ".join(sorted(doc["missing"]))))

2019: physics
2018: physics
2003: chemistry
2002: chemistry
2000: physics
1995: physics
1994: physics
1993: chemistry
1972: peace
1967: peace
1966: peace
1956: peace
1955: peace
1948: peace
1943: literature, peace
1939: peace
1935: literature
1934: physics
1933: chemistry
1932: peace
1931: physics
1928: peace
1925: medicine
1924: chemistry, peace
1923: peace
1921: medicine
1919: chemistry
1918: literature, medicine, peace
1917: chemistry, medicine
1916: chemistry, medicine, peace, physics
1915: medicine, peace
1914: literature, peace


## Zoom into Array Fields

In [87]:
# Number of laureates for each prize

print(list(db.prizes.aggregate([
    {"$project": {"n_laureates":{"$size": "$laureates"}, "category":1, "_id":0}},
    {"$group": {"_id": "$category", "n_laureates": {"$sum": "$n_laureates"}}},
    {"$sort": {"n_laureates": -1}}
])))
print()
# unwind stage

print(list(db.prizes.aggregate([
    {"$unwind": "$laureates"},
    {"$project": {"_id": 0, "year": 1, "category": 1, "laureates.surname":1, "laureates.share":1}},
    {"$limit": 3}
])))

# normalization

list(db.prizes.aggregate([
    {"$unwind": "$laureates"},
    {"$project": {"year":1, "category": 1, "laureates.id":1}},
    {"$group": {"_id": {"$concat": ["$category", ":","$year"]},
               "laureate_ids": {"$addToSet": "$laureates.id"}}},
    {"$limit": 5}
]))

print(list(db.prizes.aggregate([
    {"$project": {"n_laureates":{"$size": "$laureates"}, "category":1, "_id":0}},
    {"$group": {"_id": "$category", "n_laureates": {"$sum": "$n_laureates"}}},
    {"$sort": {"n_laureates": -1}}
])))# instead of this
print()
print(list(db.prizes.aggregate([
    {"$unwind": "$laureates"},
    {"$group": {"_id": "$category", "n_laureates": {"$sum":1}}},
    {"$sort": {"n_laureates":-1}}
]))) # You can do this
print()

# lookup pulls documents from another collection via what's termed a left outer join.
print(list(db.prizes.aggregate([
    {"$match": {"category": "economics"}},
    {"$unwind": "$laureates"},
    {"$lookup": {"from":"laureates", "foreignField": "id",
                    "localField": "laureates.id", "as":"laureate_bios"}},
    {"$unwind": "$laureate_bios"},
    {"$group": {"_id":None,
               "bornCountries":
               {"$addToSet": "$laureate_bios.bornCountry"}}}
])))
print()
bornCountries = db.laureates.distinct(
"bornCountry", {"prizes.category":"economics"})
print(bornCountries)

[{'_id': 'medicine', 'n_laureates': 3108}, {'_id': 'physics', 'n_laureates': 2842}, {'_id': 'chemistry', 'n_laureates': 2506}, {'_id': 'peace', 'n_laureates': 1890}, {'_id': 'literature', 'n_laureates': 1638}, {'_id': 'economics', 'n_laureates': 1204}]

[{'year': '2020', 'category': 'chemistry', 'laureates': {'surname': 'Charpentier', 'share': '2'}}, {'year': '2020', 'category': 'chemistry', 'laureates': {'surname': 'Doudna', 'share': '2'}}, {'year': '2020', 'category': 'economics', 'laureates': {'surname': 'Milgrom', 'share': '2'}}]
[{'_id': 'medicine', 'n_laureates': 3108}, {'_id': 'physics', 'n_laureates': 2842}, {'_id': 'chemistry', 'n_laureates': 2506}, {'_id': 'peace', 'n_laureates': 1890}, {'_id': 'literature', 'n_laureates': 1638}, {'_id': 'economics', 'n_laureates': 1204}]

[{'_id': 'medicine', 'n_laureates': 3108}, {'_id': 'physics', 'n_laureates': 2842}, {'_id': 'chemistry', 'n_laureates': 2506}, {'_id': 'peace', 'n_laureates': 1890}, {'_id': 'literature', 'n_laureates': 163

### Here and elsewhere


In [111]:
key_ac = "prizes.affiliations.country"
key_bc = "bornCountry"
pipeline = [
   {"$project": {key_bc: 1, key_ac: 1}},
    {"$unwind": "$prizes"},
    {"$unwind": "$prizes.affiliations"},
    {"$match": {key_ac: {"$in": db.laureates.distinct(key_ac)}}},
    {"$project": {"affilCountrySameAsBorn": {
        "$gte": [{"$indexOfBytes": ["$"+key_ac, "$"+key_bc]}, 0]}}},
    {"$group": {"_id": "$affilCountrySameAsBorn",
                "count": {"$sum": 1}}},
]
for doc in db.laureates.aggregate(pipeline):
    print(doc)

{'_id': True, 'count': 7000}
{'_id': False, 'count': 3696}


In [109]:
key_ac = "prizes.affiliations.country"
key_bc = "bornCountry"
pipeline = [
    {"$project": {key_bc: 1, key_ac: 1}},

    # Ensure a single prize affiliation country per pipeline document
    {"$unwind": "$prizes"},
    {"$unwind": "$prizes.affiliations"},

    # Ensure values in the list of distinct values (so not empty)
    {"$match": {key_ac: {"$in": db.laureates.distinct(key_ac)}}},
    {"$project": {"affilCountrySameAsBorn": {
        "$gte": [{"$indexOfBytes": ["$"+key_ac, "$"+key_bc]}, 0]}}},

    # Count by "$affilCountrySameAsBorn" value (True or False)
    {"$group": {"_id": "$affilCountrySameAsBorn",
                "count": {"$sum": 1}}},
]
for doc in db.laureates.aggregate(pipeline): print(doc)

{'_id': True, 'count': 7000}
{'_id': False, 'count': 3696}


### Countries of birth by prize category


In [112]:
pipeline = [
    {"$unwind": "$laureates"},
    {"$lookup": {
        "from": "laureates", "foreignField": "id",
        "localField": "laureates.id", "as": "laureate_bios"}},
    {"$unwind": "$laureate_bios"},
    {"$project": {"category": 1,
                  "bornCountry": "$laureate_bios.bornCountry"}},
    {"$group": {"_id": "$category",
                "bornCountries": {"$addToSet": "$bornCountry"}}},
    {"$project": {"category": 1,
                  "nBornCountries": {"$size": "$bornCountries"}}},
    {"$sort": {"nBornCountries": -1}},
]
for doc in db.prizes.aggregate(pipeline): print(doc)

{'_id': 'literature', 'nBornCountries': 57}
{'_id': 'peace', 'nBornCountries': 53}
{'_id': 'chemistry', 'nBornCountries': 48}
{'_id': 'medicine', 'nBornCountries': 44}
{'_id': 'physics', 'nBornCountries': 43}
{'_id': 'economics', 'nBornCountries': 21}


## Something Extra: $addFields to Aid Analysis


In [139]:
# Add fields without using project. Use $addFields for this.
# cond is a ternary operator. It evaluates the first expression and if it's
# true returns the value of the next expression. If not it returns the third
# expression.

docs =list(db.laureates.aggregate([
    {"$match": {"died": {"$gt": "1700"}, "born":{"$gt":"1700"}}},
    {"$addFields":{"bornArray":{"$split": ["$born","-"]},
                   "diedArray":{"$split": ["$died", "-"]}}},
    {"$addFields": {"born": {"$cond":[
        {"$in": ["00", "$bornArray"]},
        {"$concat":[{"$arrayElemAt":["$bornArray", 0]},"-01-01"]},
        "$born"
    ]}}},
    {"$project":{"died":{"$dateFromString":{"dateString":"$died"}},
                 "born":{"$dateFromString":{"dateString":"$born"}}}},
    {"$project":{"years":{"$floor":{"$divide":[
        {"$subtract":["$died","$born"]},
        31557600000]}}}},
    #{"$bucket":{"groupBy":"$years","boundaries":list(range(30,120,10))}}
]))
docs

[{'years': 77.0},
 {'years': 74.0},
 {'years': 78.0},
 {'years': 55.0},
 {'years': 46.0},
 {'years': 66.0},
 {'years': 76.0},
 {'years': 84.0},
 {'years': 83.0},
 {'years': 78.0},
 {'years': 75.0},
 {'years': 63.0},
 {'years': 67.0},
 {'years': 85.0},
 {'years': 64.0},
 {'years': 68.0},
 {'years': 72.0},
 {'years': 80.0},
 {'years': 79.0},
 {'years': 81.0},
 {'years': 67.0},
 {'years': 89.0},
 {'years': 83.0},
 {'years': 77.0},
 {'years': 76.0},
 {'years': 77.0},
 {'years': 85.0},
 {'years': 91.0},
 {'years': 81.0},
 {'years': 88.0},
 {'years': 71.0},
 {'years': 69.0},
 {'years': 90.0},
 {'years': 79.0},
 {'years': 94.0},
 {'years': 82.0},
 {'years': 74.0},
 {'years': 73.0},
 {'years': 82.0},
 {'years': 82.0},
 {'years': 81.0},
 {'years': 85.0},
 {'years': 76.0},
 {'years': 83.0},
 {'years': 53.0},
 {'years': 57.0},
 {'years': 81.0},
 {'years': 89.0},
 {'years': 58.0},
 {'years': 79.0},
 {'years': 72.0},
 {'years': 76.0},
 {'years': 74.0},
 {'years': 65.0},
 {'years': 70.0},
 {'years':

### "...it's the life in your years"

In [138]:
from operator import itemgetter

print(max(docs, key=itemgetter("years")))
print(min(docs, key=itemgetter("years")))

{'_id': ObjectId('614dc045a86e98712247b168'), 'years': 103.0}
{'_id': ObjectId('614dc045a86e98712247b1bd'), 'years': 39.0}


### How many prizes were awarded to immigrants?


In [179]:
list(db.laureates.aggregate([
    {"$match": {"gender": {"$ne": "org"}}},
    {"$project": {"bornCountry": 1, "prizes.affiliations.country": 1}},
    {"$unwind": "$prizes"},
    {"$addFields": {"bornCountryInAffiliations": {"$in": ["$bornCountry", "$prizes.affiliations.country"]}}},
    # see https://docs.mongodb.com/manual/reference/operator/aggregation/in/
    {"$match": {"bornCountryInAffiliations": False}},
    {"$count": "awardedElsewhere"}
]))

    


[{'awardedElsewhere': 6566}]

### Refinement: filter out "unaffiliated" people

In [184]:
pipeline = [
    {"$match": {"gender": {"$ne": "org"}}},
    {"$project": {"bornCountry": 1, "prizes.affiliations.country": 1}},
    {"$unwind": "$prizes"},
    {"$addFields": {"bornCountryInAffiliations": {"$in": ["$bornCountry", "$prizes.affiliations.country"]}}},
    {"$match": {"bornCountryInAffiliations": False}},
    {"$count": "awardedElsewhere"},
]

added_stage = {"$match": {"prizes.affiliations.country": {"$in": db.laureates.distinct("prizes.affiliations.country")}}}
pipeline.insert(3, added_stage)
print(list(db.laureates.aggregate(pipeline)))

[{'awardedElsewhere': 3388}]
