In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and the sequence-classification model from the same
# checkpoint id, so the two can never point at different checkpoints.
CHECKPOINT = "ncbi/MedCPT-Cross-Encoder"
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT)

# Candidate categories. Each key is the category name returned to callers;
# each value is the natural-language description that classify_question
# pairs with the incoming question. Insertion order is preserved.
labels = dict(
    business="Questions directly related to the biopharma industry — including drug development, clinical trials, disease treatment protocols, regulatory affairs, medical devices, healthcare data, or pharmaceutical R&D.",
    greeting="This message is just a greeting like hello or hi.",
    other="This message is casual, off-topic, or not related to pharma or greetings.",
)

def classify_question(question: str):
    """Pick the category in ``labels`` whose description best matches *question*.

    The question is paired with every label description, each pair is scored
    by the module-level ``model``, and the scores are normalised with a
    softmax taken across the pairs, so the values sum to 1 over the
    categories. The winning category and the full ranking are printed.

    Args:
        question: Free-text user message to categorise.

    Returns:
        The key of ``labels`` with the highest probability.
    """
    category_names = list(labels)
    candidate_pairs = [[question, labels[name]] for name in category_names]

    # Tokenization involves no autograd, so it can sit outside no_grad.
    batch = tokenizer(
        candidate_pairs,
        truncation=True,
        padding=True,
        return_tensors="pt",
        max_length=512,
    )

    with torch.no_grad():
        # NOTE(review): squeeze(dim=1) assumes the checkpoint emits a single
        # logit per pair, i.e. logits shaped (n_pairs, 1) - confirm.
        scores = model(**batch).logits.squeeze(dim=1)
        probabilities = F.softmax(scores, dim=0).tolist()

    # Rank categories from most to least probable; ties keep label order.
    ranking = sorted(
        zip(category_names, probabilities),
        key=lambda entry: entry[1],
        reverse=True,
    )

    predicted_label, _ = ranking[0]
    print(f"Predicted Category: {predicted_label}\n")

    for label, prob in ranking:
        print(f"{label.ljust(8)}: {prob:.4f}")

    return predicted_label


In [None]:

# Example usage

# Mixed sample inputs: regulatory / clinical-trial questions, plus short
# greetings, a thank-you, and deliberately noisy or misspelled text.
# NOTE(review): not referenced by the driver loop visible in this file, which
# iterates confusing_queries instead.
queries = [
    "What does UK regulatory guidance say about the appropriate length of a clinical trial informed consent form?",
    "What clinical trials are currently enrolling patients in the United States for Waldenstrom’s macroglobulinemia?",
    "What were the 3 most recently approved drugs for esophageal cancer in EU?",
    "Is there recent precedent for oncology drugs to receive accelerated approval from FDA based on a single arm trial?",
    "What is the process for applying for FDA approval of a companion diagnostic?",
    "What sites in California enroll the most chronic lymphocytic leukemia patients in clinical trials? Name five of the highest performing principle investigators",
    "What clinical development plan was followed leading to the initial regulatory approval of Zanubrutinib? How many dose levels were explored to determine optimal biological dose?",
    "biomedical pharma is a great domain wher biomedice is food and biomedice is air and and and food",
    "hi",
    "hello how are you buysiness",
    "thank you for choosing us",
    "does optum deals with biopharma"
    
]
# Domain-specific questions (FDA/EMA regulation, pharmacology, trial design,
# CRO/CDMO operations).
# NOTE(review): presumably these are expected to classify as "business" -
# confirm; the list is not used by the code visible in this file.
biopharma_queries = [
    "What are the current FDA guidelines for accelerated drug approvals in oncology?",
    "Can you explain the mechanism of action for GLP-1 receptor agonists used in diabetes treatment?",
    "How does biosimilar approval differ from a generic drug approval in the US and EU?",
    "What are the pharmacokinetic differences between immediate-release and extended-release formulations of metformin?",
    "What are the common adverse events reported in Phase III trials for monoclonal antibodies targeting PD-1?",
    "What is the typical CRO engagement model for a global Phase IIb trial?",
    "How is companion diagnostic development integrated into targeted cancer therapies?",
    "What are the latest developments in mRNA vaccine platforms beyond COVID-19?",
    "How do pharmacovigilance teams assess risk-benefit during post-marketing surveillance?",
    "What are the key patent cliffs expected in the biologics market between 2025 and 2030?",
    "What’s the impact of EMA’s new transparency rules on clinical trial data disclosure?",
    "How do CDMO partnerships evolve during late-stage drug development?",
    "Can you compare the efficacy endpoints used in biosimilar vs originator trials?",
    "What role does real-world evidence play in regulatory submissions for rare diseases?",
    "What are the typical inclusion/exclusion criteria in immuno-oncology basket trials?"
]

# Questions that mention pharma or medical terms but are really about
# entertainment, costumes, pets, gifts and similar everyday topics.
# NOTE(review): presumably these are expected to classify as "other" -
# confirm; the list is not used by the code visible in this file.
non_business_queries = [
    "Is there a movie about a pharmaceutical company doing secret experiments?",
    "What’s the best font for a medical-themed PowerPoint presentation?",
    "Can I name my dog Pfizer?",
    "Are there any pharma-themed escape rooms in Bangalore?",
    "What’s that song in the Tylenol commercial?",
    "Can I cosplay as a clinical trial volunteer for Comic-Con?",
    "Which TV shows are set in hospitals but aren't too gory?",
    "What perfume smells like a hospital?",
    "Is there a bio-pharma museum in Europe?",
    "What’s the most expensive-looking lab coat for Halloween?",
    "Do any dating apps match people by blood type?",
    "What pharma logos would look good as tattoos?",
    "How do I make a biotech-themed birthday cake?",
    "Which pharmaceutical brand has the best jingle?",
    "Are there any anime characters named after medicines?"
]


# Opinionated, speculative or conversational questions about the pharma and
# biotech industry - borderline cases between the categories.
# This is the list the driver loop at the end of the file passes to
# classify_question.
confusing_queries = [
    "What are some cool biotech startups working on futuristic stuff?",
    "Is it true that pharma companies are hiding cancer cures?",
    "Why do clinical trials take so long even with AI nowadays?",
    "How do I get a job at a pharmaceutical company in Europe?",
    "What are some conspiracy theories related to the vaccine industry?",
    "Why does the FDA take forever to approve things?",
    "What are people saying about mRNA on Reddit lately?",
    "How is AI being hyped in drug discovery without results?",
    "Can you believe how expensive insulin is in the US?",
    "Do you think pharma is doing enough for rare diseases?",
    "What if CRISPR was open-sourced — would it change the game?",
    "Should I invest in biotech ETFs or wait for a downturn?",
    "Do big pharma companies actually innovate or just acquire?",
    "I heard pharma CEOs make millions — is that ethical?",
    "Is pharma more about profit than patients nowadays?"
]


In [None]:
# Classify every entry of confusing_queries in order (each call also prints
# its own per-question report), then show the collected top labels.
result = [classify_question(query) for query in confusing_queries]
print(result)