# Mongo Queries

In [1]:
import pymongo
import numpy

#Mongo setup code. Our database is called testdb and our content is in three collections.
mongoclient = pymongo.MongoClient("mongodb://localhost:27017/")
dblp = mongoclient["testdb"]
articles_collection = dblp["articles"]
proceeding_collection = dblp["proceedings"]
inproceeding_collection = dblp["inproceedings"]

In [2]:
#Query wrapper class for the benchmarking code. The 'col' parameter is the collection on which to aggregate.
class Query:
    def __init__(self, q, col):
        self.q = q
        self.col = col
    def run_query(self):
        return self.col.aggregate(self.q)

#### E1: Who is the publisher of the PODS conference proceedings?

In [4]:
e1 = Query([
    {
        #Match only the documents that have 'PODS' as their booktitle field.
        "$match": {
            "booktitle": "PODS"
        }
    },
    {
        #Limit the result to only one.
        "$limit": 1
    },
    {
        #Only keep the publisher field.
        "$project": {
            "_id": 0,
            "publisher": "$publisher"
        }
    }
], proceeding_collection)

for doc in e1.run_query():
    print(doc)

{'publisher': 'ACM'}


#### E2: What are the titles of the articles that Martin Grohe wrote in the Theory of Computing Systems journal? (Sort in alphabetic order)


In [11]:
e2 = Query([
    {
        #Match on only the documents where 'Martin Grohe' is an author.
        "$match": {
            "author": "Martin Grohe"
        }
    },
    {
        #Match the results to only documents where the journal is 'Theory Comput. Syst.'.
        "$match": {
            "journal": "Theory Comput. Syst."
        }
    },
    {
        #Only keep the titles.
        "$project": {
            "_id": 0,
            "title": "$title"
        }
    },
    {
        #Sort the titles alphabetically.
        "$sort": {
            "title": 1
        }
    }
], articles_collection)

for doc in e2.run_query():
    print(doc)


{'title': 'Database Query Processing Using Finite Cursor Machines.'}
{'title': 'Learnability and Definability in Trees and Similar Structures.'}
{'title': 'Tight Lower and Upper Bounds for the Complexity of Canonical Colour Refinement.'}


#### M1: How many articles were published in the SIGMOD conference proceedings this year?

In [6]:
m1 = Query([
    {
        #Match on documents where the booktitle field contains the string 'SIGMOD'. 
        #The regexmatch is case-insensitive.
        "$match": 
        {
            "$expr": 
            {
                "$regexMatch": {
                    "input": "$booktitle",
                    "regex": "SIGMOD"
                }
            }
        }
    },
    {
        #Match the results when the year is '2022'
        "$match": {
            "year": "2022"
        }
    },
    {
        #Count the amount of results and name this count 'article_count'.
        "$count": "article_count"
    }
], inproceeding_collection)

for doc in m1.run_query():
    print(doc)


{'article_count': 282}


#### M2: How many articles were published in the oldest journal, and what is its title?

In [10]:
m2 = Query([
    {
        #Set the year value of each document that has it as 'NaN' to the MAX value, so that when sorted in descending order it will be found last, which makes them not be considered.
        "$set":
        {
            "year":
            {
                "$cond": [{"$eq": ['$year', numpy.NaN]}, "MAXVAL", '$year']
            }
        }
    },
    {
        #Group the documents by journal, count the amount of documents per group and set the year value per group to the year value of the oldest arcticle in that group.
        "$group": 
        {
            "_id": "$journal",
            "count": {"$sum": 1},
            "year": {"$min": "$year"}
        }
    },
    {
        #Sort the results in ascending order.
        "$sort": {"year": 1}
    },
    {
        #take the oldest journal.
        "$limit": 1
    },
    {
        #Rename it to say 'journal'.
        "$project": {
            "_id": 0,
            "journal": "$_id",
            "count": 1,
            "year": 1
        }
    }
    
], articles_collection)


for doc in m2.run_query():
    print(doc)




{'count': 4864, 'year': '1936', 'journal': 'J. Symb. Log.'}


#### M3: What was the median amount of articles published for each year of the CIDR conference?

In [14]:
m3 = Query([
    {
        #Match the booktitles to the 'CIDR' conference.
        "$match": {
            "booktitle": "CIDR"
        }
    },
    {
        #Group the documents by year and count them per year.
        "$group": {
            "_id": "$year",
            "articles_year": { "$sum": 1 }
        }
    },
    {
        #Sort them in ascending counts.
        "$sort": {
            "articles_year": 1
        }
    },
    {
        #Group them by booktitle and add the counts per year to an array.
        "$group": 
        {
            "_id": "$booktitle", 
            "count_array": 
            {
                "$push": "$articles_year" 
            }
        }
    },
    {
        #Take the array size as a variable.
        "$project": {
        "_id": 1,
        "count_array": 1,
        "size": { "$size": [ "$count_array" ] }
        }
    },
    {
        #Calculate the middle in the array of counts.
        "$project": {
            "_id": 1,
            "count_array": 1,
            "mid": { "$trunc": { "$divide": ["$size", 2] } }
            }
    },
    {
        #Take the element in the middle of the array.
        "$project": {
            "_id": 0,
            "median": {
                "$arrayElemAt": ["$count_array", "$mid"]
            }
        }
    }
], inproceeding_collection)
for doc in m3.run_query():
    print(doc)

{'median': 47}


#### M4: In which year did the SIGMOD conference have the most papers with over 10 authors?

In [15]:
m4 = Query([
    {
        #Match on any documents where the booktitle field contains the 'SIGMOD' string.
        "$match": {
            "$expr": 
            {
                "$regexMatch": {
                    "input": "$booktitle",
                    "regex": "SIGMOD"
                }
            }
        }
    },
    {
        #Check if the author array has more than 10 elements by seeing of the element with index 10 exists.
        "$match": {
            "author.10": {"$exists": "true"}
        }
    },
    {
        #Group the documents by year.
        "$group": {
            "_id": "$year",
            "count": { "$sum": 1 }
        }
    },
    {
        #Sort the counts in descending order.
        "$sort": {
            "count": -1
        }
    },
    {
        #take year with the highest count.
        "$limit": 1
    }
], inproceeding_collection)

for doc in m4.run_query():
    print(doc)

{'_id': '2020', 'count': 13}


#### M5: Who were the most frequent editors for the PODS conference? How many times were they an editor?

In [17]:
m5 = Query([
    {
        #Remove the documents with no booktitle.
      "$match": 
      {
          "booktitle": { "$ne": numpy.NaN }
      }
    },
    {
        #Match the results to documents containing 'PODS' in the booktitle.
        "$match": {
            "$expr": 
            {
                "$regexMatch": {
                    "input": "$booktitle",
                    "regex": "PODS",
                    "options": "i"
                }
            }
        }
    },
    {
        #Unwind the editor array - Any document with an array in the editor field will be unwinded so that for every value in the editor array a document exists.
        #Example: [name: "John Doe", editor: [1,2,3]] becomes [name "John Doe", editor: 1], [name "John Doe", editor: 2], [name "John Doe", editor: 3]
        "$unwind": "$editor"
    },
    {
        #Group the documents by editor. and count how many documents exists for each editor.
        "$group": {
            "_id": "$editor",
            "count": { "$sum": 1 },
        }
    },
    {
        #Group the documents by count. and push their name into an array.
        "$group": {
            "_id": "$count",
            "names": {
                "$push": "$_id"
            }
        }
    },
    {
        #Sort the counts in descending order.
        "$sort": {
            "_id": -1
        }
    },
    {
        #Take the document of the highest count. Here, the 'names' array contains all editors who have been the most frequent editors.
        "$limit": 1
    },
    {
        #Keep the names and rename the _id field to amount.
        "$project": {
            "names": 1,
            "amount": "$_id",
            "_id": 0
        }
    }
    
], proceeding_collection)
for doc in m5.run_query():
    print(doc)

{'names': ['Maurizio Lenzerini', 'Leonid Libkin', 'Josep Lluís Larriba-Pey'], 'amount': 3}


In [4]:
m6 = Query([
  {
    #Unwind the editor field. See m5 for explanation.
    "$unwind": "$editor"
  },
  {
    #Remove the documents that have no editor.
      "$match": 
      {
          "editor": { "$ne": numpy.NaN }
      }
  },
  {
    #Group the documents by editor and push all the conferences they've written for into an array. 
    "$group": {
      "_id": "$editor",
      "booktitles": {
        "$push": "$booktitle"
      }
    }
  },
  {
    #Peform a lookup from the inproceedings collection where the editors are used as local field and the foreign field is are the authors.
    "$lookup": {
      "from": "inproceedings",
      "localField": "_id",
      "foreignField": "author",
      "as": "inproceedings"
    }
  },
  {
    #unwind the found documents.
    "$unwind": "$inproceedings"
  },
  {
    #Push the booktitles of the documents that were looked up into the booktitle array.
    "$group": {
      "_id": "$_id",
      "booktitles": {
        "$push": "$inproceedings.booktitle"
      }
    }
  },
  {
    #count the amount of publications for both collections by checking the size of the booktitle array.
    "$project": {
        "_id": 1,
        "booktitles": 1,
        "publication_count": 
        {
        "$size": "$booktitles"
        }
      }
  },
  {
    #sort the counts in descending order.
    "$sort": {
      "publication_count": -1
    }
  },
  {
    #Limit to only the academic with the highest publication count.
    "$limit": 1
  },
  {
    #Unwind the booktitle array.
    "$unwind": "$booktitles"
  },
  {
    #Group the documents by booktitle.
    "$group": {
      "_id": "$booktitles",
      "count": {"$sum": 1}
    }
  },
  {
    #count how many different groups there are.
    "$count": "distinct_booktitles"
  }

  ], proceeding_collection)
for doc in m6.run_query():
    print(doc)

{'distinct_booktitles': 173}


#### H1: For each researcher that published to the ICDT conference in 2020: Who was their most frequently occurring co-author (conference & journal)? How many times did they collaborate?

In [6]:
H1 = inproceeding_collection.aggregate([
  {
    "$match": {
      "booktitle": "ICDT"
    }
  },
  {
    "$match": {
      "year": "2020"
    }
  },
  { "$unwind": "$author" },
  {
    "$group": {
      "_id": "$author",
      "authors": { "$addToSet": "$author" }
    }
  },
])

for doc in H1:
    print(doc)

{'_id': 'Evgenij Thorstensen', 'authors': ['Evgenij Thorstensen']}
{'_id': 'Gonzalo Navarro', 'authors': ['Gonzalo Navarro']}
{'_id': 'Peter Lindner 0001', 'authors': ['Peter Lindner 0001']}
{'_id': 'Henrik Forssell', 'authors': ['Henrik Forssell']}
{'_id': 'Diego Figueira', 'authors': ['Diego Figueira']}
{'_id': 'Jan Van den Bussche', 'authors': ['Jan Van den Bussche']}
{'_id': nan, 'authors': [nan]}
{'_id': 'Mohammad Sadoghi', 'authors': ['Mohammad Sadoghi']}
{'_id': 'Leopoldo E. Bertossi', 'authors': ['Leopoldo E. Bertossi']}
{'_id': 'Dimitri Surinx', 'authors': ['Dimitri Surinx']}
{'_id': 'Bas Ketsman', 'authors': ['Bas Ketsman']}
{'_id': 'Dan Suciu', 'authors': ['Dan Suciu']}
{'_id': 'Heba Aamer', 'authors': ['Heba Aamer']}
{'_id': 'Johannes Doleschal', 'authors': ['Johannes Doleschal']}
{'_id': 'Juan L. Reutter', 'authors': ['Juan L. Reutter']}
{'_id': 'Yu Chen', 'authors': ['Yu Chen']}
{'_id': 'Ke Yi 0001', 'authors': ['Ke Yi 0001']}
{'_id': 'Alejandro Grez', 'authors': ['Alejan