In [None]:
# Homework 1 - Samuel Dobrossy and Lothaire Lemarquis

# Data importation process

import json
import pymongo
import pandas as pd
import tqdm

# Connect to the MongoDB server listening on localhost:27017.
client = pymongo.MongoClient('localhost', 27017)

# Creation of the database
# (only a handle is obtained here; nothing is written to the server yet)

my_db = client["Homework_1"]

# Creation of the article collection

collection = my_db["pubmed_cleaned"]

# Load the whole JSON export into memory.
# The encoding is passed explicitly: without it open() falls back to the
# platform default (e.g. cp1252 on Windows), which corrupts or rejects
# non-ASCII characters in a UTF-8 JSON file.
# NOTE(review): the absolute Windows path ties the notebook to one machine;
# consider a configurable data directory.
with open(r"C:\NoSQL-master\data\Chap3\pubmed_cleaned\pubmed_cleaned.json", encoding="utf-8") as file:
    file_data = json.load(file)

print(collection)

In [None]:
from bson import ObjectId

# Conversion of IDs into ObjectId
# Only ids still in the extended-JSON form {"$oid": "..."} are converted.
# The isinstance check makes the cell safe to re-run: once an id has been
# converted it is an ObjectId, and `'$oid' in ObjectId(...)` raises TypeError.
for document in file_data:
    raw_id = document.get('_id')
    if isinstance(raw_id, dict) and '$oid' in raw_id:
        document['_id'] = ObjectId(raw_id['$oid'])

# Insertion of the data into the collection
# Insert only when the collection is still empty, so that re-running the cell
# does not fail on duplicate _id values.
if collection.find_one({}, {"_id": 1}) is None:
    collection.insert_many(file_data)
else:
    print("Collection already contains documents; import skipped.")


In [None]:
from bson import ObjectId
from pymongo import MongoClient

# NOTE(review): this cell repeats the import step of the previous cell.
# It is written to be idempotent so that running both cells in sequence
# neither raises nor inserts the data twice.

# Conversion of IDs into ObjectId
# Ids that were already converted are ObjectId instances, not dicts, and are
# skipped (testing `'$oid' in` an ObjectId would raise TypeError).
for document in file_data:
    raw_id = document.get('_id')
    if isinstance(raw_id, dict) and '$oid' in raw_id:
        document['_id'] = ObjectId(raw_id['$oid'])

# Insertion of the data into the collection
# Skipped when the collection already holds documents, to avoid duplicate-key
# errors on the second insert.
if collection.find_one({}, {"_id": 1}) is None:
    collection.insert_many(file_data)
else:
    print("Collection already contains documents; import skipped.")

In [None]:
# Fetch a single document (empty filter, so any document qualifies) to get a
# look at the fields stored in the collection.

document = collection.find_one({})
print(document)

In [None]:
from datetime import datetime

# Remove every article whose "date" is strictly earlier than 1 January 2019.
# NOTE(review): the comparison value is a datetime, so this presumably only
# matches documents whose "date" is stored as a date type — TODO confirm how
# the field is stored after the JSON import.

date_prior = datetime(2019, 1, 1)
before_2019_filter = {"date": {"$lt": date_prior}}
result = collection.delete_many(before_2019_filter)
print(f"Number of deleted articles: {result.deleted_count}")


In [None]:
# Count the articles whose "authors" field contains the substring "name ml".
# NOTE(review): the result is reported as the number of single-author
# articles; whether this pattern really isolates them depends on how the
# "authors" field is formatted — TODO confirm against the data.

name_ml_filter = {"authors": {"$regex": "name ml"}}
mono_author = collection.count_documents(name_ml_filter)
print(f"Number of articles with only one author : {mono_author}")

In [None]:
# Tally the articles that list exactly one or exactly two authors.
# The "authors" value is split on ", " and the number of pieces is taken as
# the number of authors.

nb_1_author = 0
nb_2_authors = 0

for document in collection.find():
    num_authors = len(document['authors'].split(', '))
    # A boolean adds as 0 or 1, so each counter only moves on a match.
    nb_1_author += num_authors == 1
    nb_2_authors += num_authors == 2

# Results

print("Number of articles with only one author :", nb_1_author)
print("Number of articles with two authors :", nb_2_authors)

In [None]:
# Pick the document with the greatest _id (descending sort, first hit).

final_paper = collection.find_one(sort=[('_id', -1)])

# Print the main fields of that document, one per line.

print("Last inserted document :")
for label, field in (("Title", "title"), ("Authors", "authors"), ("Date", "date")):
    print(f"{label} :", final_paper[field])

In [None]:
from pymongo import MongoClient

# Connect to the database and select the collection.
# The articles were loaded into "Homework_1"."pubmed_cleaned" at the top of
# the notebook; the query has to target that same collection, otherwise it
# runs against a collection this notebook never fills.

client = MongoClient('localhost', 27017)
db = client['Homework_1']
collection = db['pubmed_cleaned']

# Find articles with null meshwords
# (the filter accepts both a null value and an empty string)

query = {"meshwords": {"$in": [None, ""]}}
null_meshwords_articles = collection.find(query)

# Results

print("Articles with null meshwords:")
for article in null_meshwords_articles:
    print("Title:", article['title'])

In [None]:
from pymongo import MongoClient

# Work on the database and collection that the import cells populated
# ("Homework_1"."pubmed_cleaned"); any other name points at data this
# notebook never loads.
client = MongoClient('localhost', 27017)
db = client['Homework_1']
collection = db['pubmed_cleaned']
keyword = "machine learning"

# Count the number of articles containing the keyword in the meshwords, title or abstract
# The match is case-insensitive ("i" option) and an article is counted once
# even if several fields match.
# NOTE(review): the keyword is inserted into the regex as-is; escape it if it
# may ever contain regex metacharacters.

searched_fields = ("meshwords", "abstract", "title")
nombre_articles = collection.count_documents({
    "$or": [
        {field: {"$regex": f".*{keyword}.*", "$options": "i"}}
        for field in searched_fields
    ]
})

print(f"Number of articles containing the selected key word '{keyword}': {nombre_articles}")

In [None]:
# Count the articles in which both the "affiliations" and the "meshwords"
# fields are present. Several conditions in one filter document are combined
# with a logical AND.
# NOTE(review): this tests for the presence of the fields, not for non-empty
# values — confirm that this is the intended meaning of "at least one".
both_fields_present = {
    "affiliations": {"$exists": True},
    "meshwords": {"$exists": True},
}
article_count = collection.count_documents(both_fields_present)

print(f"Number of articles with at least one affiliation and meshwords: {article_count}")

In [None]:
# Count the articles whose "date" compares greater than the string "year 2020".
# NOTE(review): the comparison value here is a string, whereas the pre-2019
# deletion earlier in the notebook compares "date" with a datetime — the two
# cannot both fit the stored type; TODO confirm how "date" is stored.

after_2020_filter = {"date": {"$gt": "year 2020"}}
article_count = collection.count_documents(after_2020_filter)

print(f"Number of articles with a publishing date after 2020: {article_count}")

In [None]:
# Count, then list, the articles whose "authors" field mentions the country.
# One shared filter is used for both queries so that the printed count, its
# label and the listing below always refer to the same country.
# NOTE(review): the search runs on "authors", not "affiliations" — presumably
# the affiliation text is embedded in that field; verify against the data.
country = "Hungary"
country_filter = {"authors": {"$regex": country}}

article_count = collection.count_documents(country_filter)
print(f"Number of articles with at least one affiliation from {country}:", article_count)

article_country = collection.find(country_filter)
for article in article_country:
    print("ID:", article["_id"], "Title :", article["title"])

In [None]:
# Duplicate detection: if the number of distinct "pmid" values differs from
# the total number of documents, duplicates are reported.

total_documents = collection.count_documents({})
distinct_pmids = collection.distinct("pmid")

duplicates = total_documents != len(distinct_pmids)

# Results

print(
    "Duplicates have been detected."
    if duplicates
    else "No duplicates have been detected."
)


In [None]:
# Remove every article whose abstract starts with an "R".
# The regex is anchored at the start of the string and is case-sensitive.
# A single delete_many call replaces fetching all matching documents and
# deleting them one by one: same result, one round trip, nothing loaded
# into memory.

r_abstract_filter = {"abstract": {"$regex": "^R"}}
collection.delete_many(r_abstract_filter)

# Check how many articles are left in the collection

article_count = collection.count_documents({})
print(f"{article_count} articles remaining in collection.")

In [None]:
# List the articles whose abstract starts (after optional whitespace) with a
# double quote followed by "R", printing the title and the abstract of each.
quoted_r_filter = {"abstract": {"$regex": "^\\s*\"R"}}
articles_filtres = collection.find(quoted_r_filter)
for article in articles_filtres:
    for field in ("title", "abstract"):
        print(f"{field} :", article[field])

In [None]:
# Delete the articles whose abstract starts (after optional whitespace) with a
# double quote followed by "R", reporting the match count before and after.
quoted_r_filter = {"abstract": {"$regex": "^\\s*\"R"}}

count_before = collection.count_documents(quoted_r_filter)
print("init count: ", count_before)

collection.delete_many(quoted_r_filter)

count_after = collection.count_documents(quoted_r_filter)
print("after the deleting process : ", count_after)

In [None]:
# Collect the pmid of every article that has a non-empty "authors" field.
# NOTE(review): the stated goal is "each author has at least one affiliation",
# but the code only checks that "authors" exists and is non-empty; no
# affiliation is inspected here — TODO confirm the intended criterion.

articles = collection.find({"authors": {"$exists": True}})

liste_articles = []
for article in articles:
    if len(article.get("authors", [])) > 0:
        liste_articles.append(article["pmid"])

print(liste_articles)

In [None]:
# Draw repeated random samples from the collection and record, for each
# sample, the average of the "team_size" field, to see how that statistic
# behaves from one sample to the next.

# Number of samples to draw and number of documents per sample

num_samples = 500
sample_size = 100  # Adjust sample size as needed

# One average per sample is appended to this list

statistics = []

# Only "team_size" is needed, so the pipeline projects just that field.
# NOTE(review): documents without a "team_size" field count as 0 in the
# average; no other cell of this notebook creates that field — TODO confirm
# it exists in the data.
sampling_pipeline = [
    {"$sample": {"size": sample_size}},
    {"$project": {"_id": 0, "team_size": 1}},
]

for _ in range(num_samples):
    # Sample documents randomly from the collection
    sample = collection.aggregate(sampling_pipeline)

    team_sizes = [doc.get("team_size", 0) for doc in sample]

    # An empty collection yields an empty sample; stop instead of dividing
    # by zero, since further draws would be empty as well.
    if not team_sizes:
        print("The collection returned an empty sample; resampling stopped.")
        break

    # Average team size of this sample
    statistics.append(sum(team_sizes) / len(team_sizes))

# Show statistics
print(statistics)