In [9]:
import os
from dotenv import load_dotenv
from pymongo import MongoClient
# Load environment variables
dotenv_path = r'C:\Users\Soko\Documents\GitHub\VUACode\.env'
load_dotenv(dotenv_path)
mongodb_uri = os.getenv('MONGODB_URI')

# Connect to MongoDB
client = MongoClient(mongodb_uri)
db = client['twinning_papers']
collection = db['papers']


In [22]:
from transformers import pipeline
from tqdm import tqdm
# Global Variables

CLASSIFIER = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device="cuda:0")
INPUT_LABELS = [
    "Genetic Factors",
    "Hormonal Factors",
    "Epidemiological Factors",
    "Methodologies in DZ twinning",
    "Comparative Studies",
    "Reproductive Traits",
    "Animal Models in Twinning Research",
    "Twinning Rates"]

BATCH_SIZE = 256

def fetch_documents(collection):
    print("Fetching documents from the collection...")
    documents = collection.find({"classification": {"$exists": False}})
    all_docs = []
    if documents is not None:
        for doc in documents:
            doc_text = f"{doc['title']} {doc['abstract']}"
            all_docs.append((doc["_id"], doc_text))
    return all_docs


def classify_document(document, doc_id):
    result = CLASSIFIER(document, INPUT_LABELS)
    processed_result = process_classification_result(result['labels'], result['scores'])
    return doc_id, processed_result


def classify_documents(collection, all_docs):
    print("Classifying the documents...")

    total_docs = len(all_docs)
    total_batches = (total_docs + BATCH_SIZE - 1) // BATCH_SIZE

    results = []
    for i in tqdm(range(0, total_docs, BATCH_SIZE), desc="Classifying", total=total_batches):
        batch = all_docs[i:i + BATCH_SIZE]
        classified_batch = CLASSIFIER([doc[1] for doc in batch], INPUT_LABELS)
        for doc, result in zip(batch, classified_batch):
            doc_id = doc[0]
            processed_result = process_classification_result(result['labels'], result['scores'])
            results.append((doc_id, processed_result))

    print("Updating documents with classification data...")
    for i, (doc_id, update_data) in enumerate(tqdm(results, desc="Updating Documents"), 1):
        update_document(doc_id, update_data, collection)


def process_classification_result(labels, scores):
    classification = {}
    for label, score in zip(labels, scores):
        new_label = label.lower().replace(" ", "_")
        classification[new_label] = round(score, 4)

    return {"classification": classification}


def update_document(doc_id, update_data, collection):
    collection.update_one({"_id": doc_id},
                          {"$set": update_data})

def main():
    all_docs = fetch_documents(collection)
    classify_documents(collection, all_docs)


if __name__ == "__main__":
    main()


Fetching documents from the collection...
Classifying the documents...


Classifying: 100%|██████████| 18/18 [13:28<00:00, 44.94s/it]


Updating documents with classification data...


Updating Documents: 100%|██████████| 4548/4548 [07:20<00:00, 10.31it/s]


In [21]:
documents_to_update = collection.find({"classification": {"$exists": True}})
print(f"Number of documents found for updating: {documents_to_update.count()}")
from pymongo import errors

for doc in documents_to_update:
    try:
        doc_id = doc["_id"]
        result = collection.update_one({"_id": doc_id}, {"$unset": {"classification": ""}})
        if result.modified_count:
            print(f"Removed classification from document ID: {doc_id}")
        else:
            print(f"No changes made for document ID: {doc_id}")
    except errors.PyMongoError as e:
        print(f"Error updating document ID: {doc_id}. Error: {e}")


  print(f"Number of documents found for updating: {documents_to_update.count()}")


Number of documents found for updating: 100
Removed classification from document ID: 656d48da38303de27998efec
Removed classification from document ID: 656d48db38303de27998efed
Removed classification from document ID: 656d48dc38303de27998efee
Removed classification from document ID: 656d48dc38303de27998efef
Removed classification from document ID: 656d48dc38303de27998eff0
Removed classification from document ID: 656d48dd38303de27998eff1
Removed classification from document ID: 656d48dd38303de27998eff2
Removed classification from document ID: 656d48dd38303de27998eff3
Removed classification from document ID: 656d48de38303de27998eff4
Removed classification from document ID: 656d48de38303de27998eff5
Removed classification from document ID: 656d48de38303de27998eff6
Removed classification from document ID: 656d48de38303de27998eff7
Removed classification from document ID: 656d48df38303de27998eff8
Removed classification from document ID: 656d48df38303de27998eff9
Removed classification from docu