# Mongo Queries

In [1]:
import pymongo
import numpy

mongoclient = pymongo.MongoClient("mongodb://localhost:27017/")
dblp = mongoclient["testdb"]
articles_collection = dblp["articles"]
proceeding_collection = dblp["proceedings"]
inproceeding_collection = dblp["inproceedings"]

In [3]:
class Query:
    def __init__(self, q, col):
        self.q = q
        self.col = col
    def run_query(self):
        return self.col.aggregate(self.q)

#### E1: Who is the publisher of the PODS conference proceedings?

In [63]:
#Test comment (Nico)

#Query E1
doc = proceeding_collection.find({"booktitle": "PODS"}, {"publisher" : 1})

print(doc[0].get('publisher'))

e1_old = [
    {
        "$match": {
            "booktitle": "PODS"
        }
    },
    {
        "$project": {
            "_id": 0,
            "publisher": "$publisher"
        }
    },
    {
        "$limit": 1
    }
]

e1 = Query([
    {
        "$match": {
            "booktitle": "PODS"
        }
    },
    {
        "$project": {
            "_id": 0,
            "publisher": "$publisher"
        }
    },
    {
        "$limit": 1
    }
], proceeding_collection)

for doc in e1.run_query():
    print(doc)

ACM
{'publisher': 'ACM'}


#### E2: What are the titles of the articles that Martin Grohe wrote in the Theory of Computing Systems journal? (Sort in alphabetic order)


In [64]:
docs = articles_collection.find({"author" : "Martin Grohe", "journal": "Theory Comput. Syst."}, {"title": 1}).sort("title")
for doc in docs:
    print(doc.get('title'))

e2_old = articles_collection.aggregate([
    {
        "$match": {
            "author": "Martin Grohe"
        }
    },
    {
        "$match": {
            "journal": "Theory Comput. Syst."
        }
    },
    {
        "$project": {
            "_id": 0,
            "title": "$title"
        }
    }
])

e2 = Query([
    {
        "$match": {
            "author": "Martin Grohe"
        }
    },
    {
        "$match": {
            "journal": "Theory Comput. Syst."
        }
    },
    {
        "$project": {
            "_id": 0,
            "title": "$title"
        }
    }
], articles_collection)

for doc in e2.run_query():
    print(doc)


Database Query Processing Using Finite Cursor Machines.
Learnability and Definability in Trees and Similar Structures.
Tight Lower and Upper Bounds for the Complexity of Canonical Colour Refinement.
{'title': 'Learnability and Definability in Trees and Similar Structures.'}
{'title': 'Tight Lower and Upper Bounds for the Complexity of Canonical Colour Refinement.'}
{'title': 'Database Query Processing Using Finite Cursor Machines.'}


#### M1: How many articles were published in the SIGMOD conference proceedings this year?

In [65]:
count = inproceeding_collection.count_documents({"booktitle": {"$regex": "SIGMOD"}, "year": "2022"})
miauw = inproceeding_collection.aggregate([
    {
        "$match": 
        {
            "$expr": 
            {
                "$regexMatch": {
                    "input": "$booktitle",
                    "regex": "SIGMOD"
                }
            }
        }
    },
    {
        "$match": {
            "year": "2022"
        }
    },
    {
        "$count": "article_count"
    }
])

m1 = Query([
    {
        "$match": 
        {
            "$expr": 
            {
                "$regexMatch": {
                    "input": "$booktitle",
                    "regex": "SIGMOD"
                }
            }
        }
    },
    {
        "$match": {
            "year": "2022"
        }
    },
    {
        "$count": "article_count"
    }
], inproceeding_collection)

for doc in m1.run_query():
    print(doc)


{'article_count': 282}


In [None]:
[
    {
        "$set":
        {
            "year":
            {
                "$cond": [{"$eq": ['$year', numpy.NaN]}, "MAXVAL", '$year']
            }
        }
    },
    {
        "$group": 
        {
            "_id": "$journal",
            "count": {"$sum": 1},
            "year": {"$min": "$year"}
        }
    },
    {
        "$sort": {"year": 1}
    },
    {
        "$limit": 1
    }
]

#### M2: How many articles were published in the oldest journal, and what is its title?

In [67]:
m2 = Query([
    {
        "$set":
        {
            "year":
            {
                "$cond": [{"$eq": ['$year', numpy.NaN]}, "MAXVAL", '$year']
            }
        }
    },
    {
        "$group": 
        {
            "_id": "$journal",
            "count": {"$sum": 1},
            "year": {"$min": "$year"}
        }
    },
    {
        "$sort": {"year": 1}
    },
    {
        "$limit": 1
    }
], articles_collection)


for doc in m2.run_query():
    print(doc)




{'_id': 'J. Symb. Log.', 'count': 4864, 'year': '1936'}


In [17]:
q2 = articles_collection.aggregate([
    {
        "$match": {
            "journal": { "$ne": numpy.NaN },
            "journal": articles_collection.find({ "journal": { "$ne": numpy.NaN } }).sort("year", 1).limit(1)[0]["journal"]
        }
    },
    {
        "$group": {
            "_id": "$journal",
            "count": { "$sum": 1 }
        }
    }
])
for doc in q2:
    print(doc)

{'_id': 'J. Symb. Log.', 'count': 4864}


#### M3: What was the median amount of articles published for each year of the CIDR conference?

In [4]:
m3_old = inproceeding_collection.aggregate([
    {
        "$match": {
            "booktitle": "CIDR"
        }
    },
    {
        "$group": {
            "_id": "$year",
            "num_records": { "$sum": 1 }
        }
    },
    {
        "$sort": {
            "num_records": 1
        }
    },
    {
        "$group": 
        {
            "_id": "$booktitle", 
            "valueArray": 
            {
                "$push": "$num_records" 
            }
        }
    },
    {
        "$project": {
        "_id": 1,
        "valueArray": 1,
        "size": { "$size": [ "$valueArray" ] }
        }
    },
    {
        "$project": {
            "_id": 1,
            "valueArray": 1,
            "mid": { "$trunc": { "$divide": ["$size", 2] } }
            }
    },
    {
        "$project": {
            "_id": 0,
            "median": {
                "$arrayElemAt": ["$valueArray", "$mid"]
            }
        }
    }
])

m3 = Query([
    {
        "$match": {
            "booktitle": "CIDR"
        }
    },
    {
        "$group": {
            "_id": "$year",
            "num_records": { "$sum": 1 }
        }
    },
    {
        "$sort": {
            "num_records": 1
        }
    },
    {
        "$group": 
        {
            "_id": "$booktitle", 
            "valueArray": 
            {
                "$push": "$num_records" 
            }
        }
    },
    {
        "$project": {
        "_id": 1,
        "valueArray": 1,
        "size": { "$size": [ "$valueArray" ] }
        }
    },
    {
        "$project": {
            "_id": 1,
            "valueArray": 1,
            "mid": { "$trunc": { "$divide": ["$size", 2] } }
            }
    },
    {
        "$project": {
            "_id": 0,
            "median": {
                "$arrayElemAt": ["$valueArray", "$mid"]
            }
        }
    }
], inproceeding_collection)
for doc in m3.run_query():
    print(doc)

{'_id': None, 'valueArray': [27, 27, 39, 39, 42, 43, 47, 49, 50, 60, 64, 74], 'mid': 6.0}


#### M4: In which year did the SIGMOD conference have the most papers with over 10 authors?

In [24]:
m4_old = articles_collection.aggregate([
    {
        "$match": {
            "journal": "SIGMOD Rec."
        }
    },
    {
        "$match": {
            "author.1": {"$exists": "true"}
        }
    },
    {
        "$group": {
            "_id": "$year",
            "count": { "$sum": 1 }
        }
    },
    {
        "$sort": {
            "count": -1
        }
    },
    {
        "$limit": 1
    }
])

m4 = Query([
    {
        "$match": {
            "$expr": 
            {
                "$regexMatch": {
                    "input": "$booktitle",
                    "regex": "SIGMOD"
                }
            }
        }
    },
    {
        "$match": {
            "author.10": {"$exists": "true"}
        }
    },
    
    {
        "$group": {
            "_id": "$year",
            "count": { "$sum": 1 }
        }
    },
    {
        "$sort": {
            "count": -1
        }
    },
    {
        "$limit": 1
    }
], inproceeding_collection)

for doc in m4.run_query():
    print(doc)

{'_id': '2020', 'count': 13}


#### M5: Who were the most frequent editors for the PODS conference? How many times were they an editor?

In [58]:
m5_old = [
    {
        "$match": {
            "$expr": 
            {
                "$regexMatch": {
                    "input": "$booktitle",
                    "regex": "PODS"
                }
            }
        }
    },
    {
        "$unwind": "$editor"
    },
    {
        "$group": {
            "_id": "$editor",
            "count": { "$sum": 1 }
        }
    },
    {
        "$sort": {
            "count": -1
        }
    },
    {
        "$limit": 2
    }
]

m5 = Query([
    {
      "$match": 
      {
          "booktitle": { "$ne": numpy.NaN }
      }
    },
    {
        "$match": {
            "$expr": 
            {
                "$regexMatch": {
                    "input": "$booktitle",
                    "regex": "PODS",
                    "options": "i"
                }
            }
        }
    },
    {
        "$unwind": "$editor"
    },
    {
        "$group": {
            "_id": "$editor",
            "count": { "$sum": 1 },
        }
    },
    {
        "$sort": {
            "count": -1
        }
    },
    {
        "$group": {
            "_id": "$count",
            "names": {
                "$push": "$_id"
            }
        }
    },
    {
        "$sort": {
            "_id": -1
        }
    },
    {
        "$limit": 1
    },
    {
        "$project": {
            "names": 1,
            "amount": "$_id",
            "_id": 0
        }
    }
    
], proceeding_collection)
for doc in m5.run_query():
    print(doc)

{'names': ['Maurizio Lenzerini', 'Josep Lluís Larriba-Pey', 'Leonid Libkin'], 'amount': 3}


In [72]:
m6 = Query([
  {
    "$unwind": "$editor"
  },
  {
      "$match": 
      {
          "editor": { "$ne": numpy.NaN }
      }
  },
  {
    "$group": {
      "_id": "$editor",
      "booktitles": {
        "$push": "$booktitle"
      },
      "editorCount": {
        "$sum": 1
      }
    }
  },
  {
    "$lookup": {
      "from": "inproceedings",
      "localField": "_id",
      "foreignField": "author",
      "as": "inproceedings"
    }
  },
  {
    "$unwind": "$inproceedings"
  },
  {
    "$group": {
      "_id": "$_id",
      "booktitles": {
        "$push": "$inproceedings.booktitle"
      }
    }
  },
  {
    "$project": {
        "_id": 1,
        "booktitles": 1,
        "publication_count": 
        {
        "$size": "$booktitles"
        }
      }
  },
  {
    "$sort": {
      "publication_count": -1
    }
  },
  {
    "$limit": 1
  },
  {
    "$unwind": "$booktitles"
  },
  {
    "$group": {
      "_id": "$booktitles",
      "count": {"$sum": 1}
    }
  },
  {
    "$count": "distinct_booktitles"
  }

  ], proceeding_collection)
for doc in m6.run_query():
    print(doc)

{'distinct_booktitles': 173}


#### H1: For each researcher that published to the ICDT conference in 2020: Who was their most frequently occurring co-author (conference & journal)? How many times did they collaborate?

In [6]:
H1 = inproceeding_collection.aggregate([
  {
    "$match": {
      "booktitle": "ICDT"
    }
  },
  {
    "$match": {
      "year": "2020"
    }
  },
  { "$unwind": "$author" },
  {
    "$group": {
      "_id": "$author",
      "authors": { "$addToSet": "$author" }
    }
  },
])

for doc in H1:
    print(doc)

{'_id': 'Evgenij Thorstensen', 'authors': ['Evgenij Thorstensen']}
{'_id': 'Gonzalo Navarro', 'authors': ['Gonzalo Navarro']}
{'_id': 'Peter Lindner 0001', 'authors': ['Peter Lindner 0001']}
{'_id': 'Henrik Forssell', 'authors': ['Henrik Forssell']}
{'_id': 'Diego Figueira', 'authors': ['Diego Figueira']}
{'_id': 'Jan Van den Bussche', 'authors': ['Jan Van den Bussche']}
{'_id': nan, 'authors': [nan]}
{'_id': 'Mohammad Sadoghi', 'authors': ['Mohammad Sadoghi']}
{'_id': 'Leopoldo E. Bertossi', 'authors': ['Leopoldo E. Bertossi']}
{'_id': 'Dimitri Surinx', 'authors': ['Dimitri Surinx']}
{'_id': 'Bas Ketsman', 'authors': ['Bas Ketsman']}
{'_id': 'Dan Suciu', 'authors': ['Dan Suciu']}
{'_id': 'Heba Aamer', 'authors': ['Heba Aamer']}
{'_id': 'Johannes Doleschal', 'authors': ['Johannes Doleschal']}
{'_id': 'Juan L. Reutter', 'authors': ['Juan L. Reutter']}
{'_id': 'Yu Chen', 'authors': ['Yu Chen']}
{'_id': 'Ke Yi 0001', 'authors': ['Ke Yi 0001']}
{'_id': 'Alejandro Grez', 'authors': ['Alejan

In [45]:
m6 = proceeding_collection.aggregate([
  {
    "$unwind": "$editor"
  },
  {
      "$match": 
      {
          "editor": { "$ne": numpy.NaN }
      }
  },
  {
    "$group": {
      "_id": "$editor",
      "editorCount": {
        "$sum": 1
      }
    }
  },
  {
    "$lookup": {
      "from": "inproceedings",
      "localField": "_id",
      "foreignField": "author",
      "as": "inproceedings"
    }
  },
  {
    "$project": {
      "_id": 0,
      "name": "$_id",
      "editorCount": 1,
      "authorCount": {
        "$size": "$inproceedings"
      },
      
      "totalCount": {
        "$add": [
          "$editorCount",
          {
            "$size": "$inproceedings"
          }
        ]
      }
    }
  },
  {
    "$sort": {
      "totalCount": -1
    }
  },
  {
    "$limit": 1
  },
  {
    "$project": {
      "_id": 0,
      "name": "$name",
      "conf": proceeding_collection.find({"author": "$name"}).distinct("booktitle").count("booktitle")
    }
  }
  

])
for doc in m6:
    print(doc)

OperationFailure: Invalid $project :: caused by :: Cannot do exclusion on field conf in inclusion projection, full error: {'ok': 0.0, 'errmsg': 'Invalid $project :: caused by :: Cannot do exclusion on field conf in inclusion projection', 'code': 31254, 'codeName': 'Location31254'}

In [None]:
[
    {
        "$unwind": "$editor"
    },
    {
        "$match": {
            "editor": { "$ne": numpy.NaN }
        }
    },
    {
        "$group": {
            "_id": "$editor",
            "count": { "$sum": 1 }
        }
    },
    
    {
        "$lookup": 
        {
            "from": "inproceedings",
            "localField": "_id",
            "foreignField": "author",
            "as": "inproceedings"
        }
    },
    {
        "$project": {
            "_id": 1,
            "count": 1,
            "inproceedings_count": { "$size": "$inproceedings" }
        }
    },
    {
        "$addFields": {
            "total_count": { "$sum": ["$count", "$inproceedings_count"] }
        }
    },
    {
        "$sort": {
            "total_count": -1
        }
    },
    {
        "$limit": 10
    }
]