In [1]:
!pip install pymongo




In [2]:
import pymongo

In [3]:
import requests

url = "https://dst-de.s3.eu-west-3.amazonaws.com/mongo_fr/books.json"
response = requests.get(url)


with open("books.json", "wb") as f:
    f.write(response.content)

print("Download complete.")


Download complete.


In [30]:
from pymongo import MongoClient

from pprint import pprint

client = MongoClient(
    host="127.0.0.1",
    port = 27017,
        # auth line 1,
    # auth line 2
)

In [5]:
client = MongoClient("mongodb://localhost:27017/")
print(client.list_database_names())


['admin', 'config', 'employees', 'local', 'sample_training']


In [6]:
from pymongo import MongoClient

client = MongoClient("mongodb://datascientest:dst123@localhost:27017/?authSource=admin")
print(client.list_database_names())



['admin', 'config', 'employees', 'local', 'sample_training']


In [7]:
db = client["sample_training"]

collections = db.list_collection_names()
print("Collections in 'sample_training' database:", collections)

Collections in 'sample_training' database: ['books']


In [8]:
db = client["sample_training"]
collection = db["books"]  

document = collection.find_one()

print("One document from the 'books' collection:", document)

One document from the 'books' collection: {'_id': 1, 'title': 'Unlocking Android', 'isbn': '1933988673', 'pageCount': 416, 'publishedDate': datetime.datetime(2009, 4, 1, 7, 0), 'thumbnailUrl': 'https://s3.amazonaws.com/AKIAJC5RLADLUMVRPFDQ.book-thumb-images/ableson.jpg', 'shortDescription': "Unlocking Android: A Developer's Guide provides concise, hands-on instruction for the Android operating system and development tools. This book teaches important architectural concepts in a straightforward writing style and builds on this with practical and useful examples throughout.", 'longDescription': "Android is an open source mobile phone platform based on the Linux operating system and developed by the Open Handset Alliance, a consortium of over 30 hardware, software and telecom companies that focus on open standards for mobile devices. Led by search giant, Google, Android is designed to deliver a better and more open and cost effective mobile experience.    Unlocking Android: A Developer's 

In [9]:
document = collection.find_one()


pprint(document)

{'_id': 1,
 'authors': ['W. Frank Ableson', 'Charlie Collins', 'Robi Sen'],
 'categories': ['Open Source', 'Mobile'],
 'isbn': '1933988673',
 'longDescription': 'Android is an open source mobile phone platform based on '
                    'the Linux operating system and developed by the Open '
                    'Handset Alliance, a consortium of over 30 hardware, '
                    'software and telecom companies that focus on open '
                    'standards for mobile devices. Led by search giant, '
                    'Google, Android is designed to deliver a better and more '
                    'open and cost effective mobile experience.    Unlocking '
                    "Android: A Developer's Guide provides concise, hands-on "
                    'instruction for the Android operating system and '
                    'development tools. This book teaches important '
                    'architectural concepts in a straightforward writing style '
                    

In [10]:
document_count = collection.count_documents({})


print(f"Number of documents in the 'books' collection: {document_count}")

Number of documents in the 'books' collection: 431


In [11]:
books_over_400_pages = collection.count_documents({"pageCount": {"$gt": 400}})

print(f"Number of books with more than 400 pageCount: {books_over_400_pages}")

Number of books with more than 400 pageCount: 157


In [12]:
books_over_400_pages = collection.count_documents({"pageCount": {"$gt": 400}})
print(f"Books with more than 400 pages: {books_over_400_pages}")


published_books = collection.count_documents({"status": "PUBLISH"})
print(f"Books that are published: {published_books}")



Books with more than 400 pages: 157
Books that are published: 363


In [13]:
query = {
    "$or": [
        {"shortDescription": {"$regex": "Android", "$options": "i"}}, 
        {"longDescription": {"$regex": "Android", "$options": "i"}}   
    ]
}


android_books_count = collection.count_documents(query)


print(f"Number of books with 'Android' in their description: {android_books_count}")

Number of books with 'Android' in their description: 4


In [14]:
pipeline = [
    {
        "$group": {
            "_id": None,  
            "categories": {"$addToSet": "$categories"} 
        }
    },
    {
        "$project": {
            "first_category": {"$arrayElemAt": ["$categories", 0]},  
            "second_category": {"$arrayElemAt": ["$categories", 1]}  
        }
    }
]


result = collection.aggregate(pipeline)


for doc in result:
    print(f"First Category: {doc['first_category']}, Second Category: {doc['second_category']}")

First Category: ['Microsoft', '.NET'], Second Category: ['Java', 'Client-Server', 'Internet']


In [15]:
query = {
    "$or": [
        {"longDescription": {"$regex": "Python", "$options": "i"}},  
        {"longDescription": {"$regex": "Java", "$options": "i"}},   
        {"longDescription": {"$regex": "C\\+\\+", "$options": "i"}},  
        {"longDescription": {"$regex": "Scala", "$options": "i"}}    
    ]
}


language_books_count = collection.count_documents(query)


print(f"Number of books containing Python, Java, C++, or Scala in their long description: {language_books_count}")

Number of books containing Python, Java, C++, or Scala in their long description: 131


In [16]:
pipeline = [
 
    {
        "$match": {
            "pageCount": {"$gt": 0},
            "categories": {"$ne": []},
            "categories": {"$ne": None},
        }
    },
  
    {
        "$unwind": "$categories"
    },
  
    {
        "$group": {
            "_id": "$categories",
            "max_pages": {"$max": "$pageCount"},
            "min_pages": {"$min": "$pageCount"},
            "avg_pages": {"$avg": "$pageCount"}
        }
    },
    
    {
        "$project": {
            "_id": 0,
            "category": "$_id",
            "max_pages": 1,
            "min_pages": 1,
            "avg_pages": {"$ifNull": ["$avg_pages", 0]} 
        }
    }
]


result = collection.aggregate(pipeline)

for doc in result:
    print(f"Category: {doc['category']}, Max Pages: {doc['max_pages']}, Min Pages: {doc['min_pages']}, Avg Pages: {doc['avg_pages']:.2f}")

Category: internet, Max Pages: 400, Min Pages: 400, Avg Pages: 400.00
Category: Theory, Max Pages: 591, Min Pages: 280, Avg Pages: 420.43
Category: Programming, Max Pages: 570, Min Pages: 280, Avg Pages: 434.60
Category: S, Max Pages: 350, Min Pages: 350, Avg Pages: 350.00
Category: , Max Pages: 368, Min Pages: 200, Avg Pages: 284.00
Category: Mobile, Max Pages: 416, Min Pages: 416, Avg Pages: 416.00
Category: XML, Max Pages: 680, Min Pages: 304, Avg Pages: 482.00
Category: Java, Max Pages: 1088, Min Pages: 180, Avg Pages: 513.94
Category: Object-Technology Programming, Max Pages: 200, Min Pages: 200, Avg Pages: 200.00
Category: Microsoft .NET, Max Pages: 848, Min Pages: 300, Avg Pages: 464.34
Category: Microsoft, Max Pages: 925, Min Pages: 344, Avg Pages: 572.38
Category: SOA, Max Pages: 250, Min Pages: 250, Avg Pages: 250.00
Category: Object-Oriented Programming, Max Pages: 512, Min Pages: 350, Avg Pages: 408.50
Category: Computer Graphics, Max Pages: 840, Min Pages: 328, Avg Pages: 

In [17]:
import datetime

pipeline = [
   
    {
        "$match": {
            "publishedDate": {"$gt": datetime.datetime(2009, 4, 1, 7, 0)}
        }
    },
    
    {
        "$project": {
            "_id": 0,  
            "title": 1,  
            "authors": 1,  
            "publishedDate": 1,  
            "year": {"$year": "$publishedDate"}, 
            "month": {"$month": "$publishedDate"},  
            "day": {"$dayOfMonth": "$publishedDate"}  
        }
    },
    
    {
        "$limit": 20
    }
]


result = collection.aggregate(pipeline)


for doc in result:
    print(f"Title: {doc['title']}, Year: {doc['year']}, Month: {doc['month']}, Day: {doc['day']}")


Title: Android in Action, Second Edition, Year: 2011, Month: 1, Day: 14
Title: Specification by Example, Year: 2011, Month: 6, Day: 3
Title: Flex 4 in Action, Year: 2010, Month: 11, Day: 15
Title: Flex on Java, Year: 2010, Month: 10, Day: 15
Title: Griffon in Action, Year: 2012, Month: 6, Day: 4
Title: OSGi in Depth, Year: 2011, Month: 12, Day: 12
Title: Hello! Flex 4, Year: 2009, Month: 11, Day: 1
Title: Brownfield Application Development in .NET, Year: 2010, Month: 4, Day: 16
Title: MongoDB in Action, Year: 2011, Month: 12, Day: 12
Title: jQuery in Action, Second Edition, Year: 2010, Month: 6, Day: 1
Title: Website Owner's Manual, Year: 2009, Month: 10, Day: 1
Title: ASP.NET 4.0 in Practice, Year: 2011, Month: 5, Day: 15
Title: Hello! Python, Year: 2012, Month: 2, Day: 13
Title: iOS in Practice, Year: 2013, Month: 11, Day: 1
Title: The Quick Python Book, Second Edition, Year: 2010, Month: 1, Day: 1
Title: Spring Dynamic Modules in Action, Year: 2010, Month: 9, Day: 4
Title: SQL Serve

In [18]:
import datetime

pipeline = [
   
    {
        "$match": {
            "publishedDate": {"$gt": datetime.datetime(2009, 12, 31)}
        }
    },
  
    {
        "$project": {
            "_id": 0,  
            "title": 1, 
            "publishedDate": 1,  
            "authors": 1,  
            "author_1": {"$arrayElemAt": ["$authors", 0]},  
            "author_2": {"$arrayElemAt": ["$authors", 1]},  
            "author_3": {"$arrayElemAt": ["$authors", 2]},  
            "author_4": {"$arrayElemAt": ["$authors", 3]},  
  
        }
    },
   
    {
        "$sort": {"publishedDate": 1}  
    },
 
    {
        "$limit": 20
    }
]


result = collection.aggregate(pipeline)


for doc in result:
    print(f"Title: {doc['title']}, Published Date: {doc['publishedDate']}, "
          f"Author 1: {doc.get('author_1', 'N/A')}, "
          f"Author 2: {doc.get('author_2', 'N/A')}, "
          f"Author 3: {doc.get('author_3', 'N/A')}, "
          f"Author 4: {doc.get('author_4', 'N/A')}")

Title: The Quick Python Book, Second Edition, Published Date: 2010-01-01 08:00:00, Author 1: Naomi R. Ceder, Author 2: N/A, Author 3: N/A, Author 4: N/A
Title: DSLs in Boo: Domain-Specific Languages in .NET, Published Date: 2010-01-01 08:00:00, Author 1: Oren Eini writing as Ayende Rahien, Author 2: N/A, Author 3: N/A, Author 4: N/A
Title: Brownfield Application Development in .NET, Published Date: 2010-04-16 07:00:00, Author 1: Kyle Baley, Author 2: Donald Belcham, Author 3: N/A, Author 4: N/A
Title: jQuery in Action, Second Edition, Published Date: 2010-06-01 07:00:00, Author 1: Bear Bibeault, Author 2: Yehuda Katz, Author 3: N/A, Author 4: N/A
Title: ASP.NET MVC 2 in Action, Published Date: 2010-06-01 07:00:00, Author 1: Jeffrey Palermo, Author 2: Ben Scheirman, Author 3: Jimmy Bogard, Author 4: Eric Hexter
Title: PowerShell in Practice, Published Date: 2010-06-08 07:00:00, Author 1: Richard Siddaway, Author 2: N/A, Author 3: N/A, Author 4: N/A
Title: JUnit in Action, Second Edition

In [19]:


pipeline = [
    
    {
        "$match": {
            "publishedDate": {"$gt": datetime.datetime(2009, 12, 31)}
        }
    },
    
    {
        "$project": {
            "_id": 0,  
            "title": 1,  
            "authors": 1,  
            "first_author": {"$arrayElemAt": ["$authors", 0]}  
        }
    },
     
    {
        "$group": {
            "_id": "$first_author",  
            "publication_count": {"$sum": 1}  
        }
    },
   
    {
        "$sort": {"publication_count": -1}  
    },

    {
        "$limit": 10
    }
]


result = collection.aggregate(pipeline)


for doc in result:
    print(f"Author: {doc['_id']}, Publications: {doc['publication_count']}")


Author: None, Publications: 10
Author: Don Jones, Publications: 5
Author: Pete Brown, Publications: 2
Author: Timothy Binkley-Jones, Publications: 2
Author: Christopher R. Mitchell, Publications: 2
Author: Richard Siddaway, Publications: 2
Author: W. Frank Ableson, Publications: 2
Author: Jeffrey Palermo, Publications: 2
Author: Arnaud Cogoluegnes, Publications: 2
Author: Rob Crowther, Publications: 2


In [20]:
pipeline = [
   
    {
        "$project": {
            "_id": 0, 
            "num_authors": {"$size": "$authors"}  
        }
    },
   
    {
        "$group": {
            "_id": "$num_authors",  
            "book_count": {"$sum": 1}  
        }
    },

    {
        "$sort": {"_id": 1}  
    }
]

result = collection.aggregate(pipeline)


for doc in result:
    print(f"Number of authors: {doc['_id']}, Number of books: {doc['book_count']}")


Number of authors: 0, Number of books: 37
Number of authors: 1, Number of books: 206
Number of authors: 2, Number of books: 105
Number of authors: 3, Number of books: 9
Number of authors: 4, Number of books: 47
Number of authors: 5, Number of books: 16
Number of authors: 6, Number of books: 6
Number of authors: 7, Number of books: 2
Number of authors: 8, Number of books: 3


In [21]:
pipeline = [
   
    {
        "$unwind": "$authors"
    },
   
    {
        "$project": {
            "_id": 0,  
            "author": {
                "$cond": {"if": {"$gt": [{"$type": "$authors"}, "missing"]}, "then": "$authors", "else": None}
            }
        }
    },
   
    {
        "$group": {
            "_id": "$author",  
            "count": {"$sum": 1}  
        }
    },

    {
        "$sort": {"count": -1}   
    },

    {
        "$limit": 20
    }
]


result = collection.aggregate(pipeline)


for doc in result:
    print(f"Author: {doc['_id']}, Occurrences: {doc['count']}")


Author: , Occurrences: 59
Author: Vikram Goyal, Occurrences: 12
Author: Don Jones, Occurrences: 6
Author: Richard Siddaway, Occurrences: 6
Author: Jon Skeet, Occurrences: 5
Author: Yehuda Katz, Occurrences: 5
Author: Gavin King, Occurrences: 5
Author: Christian Bauer, Occurrences: 5
Author: Greg Low, Occurrences: 4
Author: Erik Hatcher, Occurrences: 4
Author: Craig Walls, Occurrences: 4
Author: Kalen Delaney, Occurrences: 4
Author: Kimberly L. Tripp, Occurrences: 3
Author: Christopher Allen, Occurrences: 3
Author: Matthew Scarpino, Occurrences: 3
Author: Jeffrey Palermo, Occurrences: 3
Author: Paul S. Randal, Occurrences: 3
Author: Daniel Minoli, Occurrences: 3
Author: David A. Black, Occurrences: 3
Author: Jeffery Hicks, Occurrences: 3
