In [1]:
import os
from dotenv import load_dotenv
from pymongo import MongoClient
# Load environment variables
dotenv_path = r'C:\Users\Soko\Documents\GitHub\VUACode\.env'
load_dotenv(dotenv_path)
mongodb_uri = os.getenv('MONGODB_URI')

# Connect to MongoDB
client = MongoClient(mongodb_uri)
db = client['twinning_papers']
collection = db['papers']


In [2]:
from transformers import pipeline
# Global Variables

CLASSIFIER = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device="cuda:0")
INPUT_LABELS = [
    "Genetic Factors",
    "Hormonal Factors",
    "Epidemiological Factors",
    "Methodologies in DZ twinning",
    "Comparative Studies",
    "Reproductive Traits",
    "Animal Models in Twinning Research",
    "Twinning Rates"]

BATCH_SIZE = 128

def fetch_documents(collection, limit=10):
    print("Fetching documents from the collection...")
    documents = collection.find({"classification": {"$exists": False}}).limit(limit)
    all_docs = []
    if documents is not None:
        for doc in documents:
            doc_text = f"{doc['title']} {doc['abstract']}"
            all_docs.append((doc["_id"], doc_text))
    return all_docs


def classify_document(document, doc_id):
    result = CLASSIFIER(document, INPUT_LABELS, multi_label=True)
    processed_result = process_classification_result(result['labels'], result['scores'])
    return doc_id, processed_result


def classify_documents(collection, all_docs):
    print("Classifying the documents...")

    total_docs = len(all_docs)
    total_batches = (total_docs + BATCH_SIZE - 1) // BATCH_SIZE

    results = []
    for i in range(0, total_docs, BATCH_SIZE):
        batch_num = i // BATCH_SIZE + 1
        print(f"Processing batch {batch_num}/{total_batches}...")
        batch = all_docs[i:i + BATCH_SIZE]
        classified_batch = CLASSIFIER([doc[1] for doc in batch], INPUT_LABELS, multi_label=True)
        for doc, result in zip(batch, classified_batch):
            doc_id = doc[0]
            processed_result = process_classification_result(result['labels'], result['scores'])
            results.append((doc_id, processed_result))

    print("Updating documents with classification data...")
    for i, (doc_id, update_data) in enumerate(results, 1):
        print(f"Updating document {i}/{total_docs}...")
        update_document(doc_id, update_data, collection)


def process_classification_result(labels, scores):
    classification = {}
    for label, score in zip(labels, scores):
        new_label = label.lower().replace(" ", "_")
        classification[new_label] = round(score, 4)

    return {"classification": classification}


def update_document(doc_id, update_data, collection):
    collection.update_one({"_id": doc_id},
                          {"$set": update_data})

def main():
    all_docs = fetch_documents(collection)
    classify_documents(collection, all_docs)


if __name__ == "__main__":
    main()





Downloading model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]




To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBartForSequenceClassification: ['model.encoder.version', 'model.decoder.version']
- This IS expected if you are initializing TFBartForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBartForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBartForSequenceClassification were initialized from the PyTor

Downloading tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Fetching documents from the collection...
Classifying the documents...
Processing batch 1/1...


ValueError: Unknown attribute 'device' is encountered while parsing the device spec: '/device:GPU:cuda:0'.

In [4]:
import torch

def check_cuda_compatibility():
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        print("CUDA is available.")
        print(f"Number of GPUs available: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        return True
    else:
        print("CUDA is not available.")
        return False

if __name__ == "__main__":
    cuda_compatible = check_cuda_compatibility()
    if cuda_compatible:
        print("You can run the script with CUDA.")
    else:
        print("You cannot run the script with CUDA. Check your CUDA installation or GPU compatibility.")


CUDA is not available.
You cannot run the script with CUDA. Check your CUDA installation or GPU compatibility.
