In [79]:
from sentence_transformers import SentenceTransformer, util
from category_examples import category_examples

# The category names are simply the keys of the imported example mapping.
research_fields = [*category_examples]

# Build two parallel lists: every example title, and the category it came
# from, both in the mapping's iteration order.
paper_titles = [
    title
    for titles_in_field in category_examples.values()
    for title in titles_in_field
]
paper_labels = [
    field_name
    for field_name, titles_in_field in category_examples.items()
    for _ in range(len(titles_in_field))
]

# Sentence-embedding model used for both the reference titles and queries.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed all reference titles once up front so classification only has to
# embed the incoming text.
example_embeddings = model.encode(paper_titles, convert_to_tensor=True)

# 5. Classify function
def classify_text(text):
    """Assign `text` to the category whose example title is most similar.

    The text is embedded with the module-level `model`, compared against
    `example_embeddings` by cosine similarity, and each category is scored
    by the highest similarity among its own examples.

    Args:
        text: The text to classify. Only row 0 of the similarity matrix is
            read, so this is meant to be a single string rather than a batch.

    Returns:
        A `(best_field, similarity_scores)` tuple, where `similarity_scores`
        maps each label in `paper_labels` to its best cosine similarity and
        `best_field` is the label with the highest such score.

    Raises:
        ValueError: If `paper_labels` is empty, because there is then no
            category to pick (raised by `max` on the empty score dict).
    """
    input_embedding = model.encode(text, convert_to_tensor=True)
    cosine_scores = util.cos_sim(input_embedding, example_embeddings)

    # Pull the single query row out as plain Python floats once, instead of
    # indexing the tensor and calling .item() for every example.
    example_scores = cosine_scores[0].tolist()

    similarity_scores = {}
    for label, score in zip(paper_labels, example_scores):
        # Cosine similarity can be negative, so the running maximum must be
        # seeded with -inf; seeding with 0 would report 0.0 for any category
        # whose best match is below zero.
        previous_best = similarity_scores.get(label, float("-inf"))
        similarity_scores[label] = max(previous_best, score)

    best_field = max(similarity_scores, key=similarity_scores.get)
    return best_field, similarity_scores

In [82]:
# Demo: classify a blob of concatenated paper titles and list every
# category's best similarity, highest first.
text = (
    "Dynamic single cell transcriptomics defines kidney FGF23/KL bioactivity "
    "and novel segment-specific inflammatory targets.. "
    "Phosphorus-independent role of FGF23 in erythropoiesis and iron homeostasis. "
    "Challenges and opportunities for conceiving genetically diverse sickle cell mice. "
    "The klotho F/C AD risk haplotype drives distinct phenotypes in a novel mouse model. "
    "Induced hypophosphatemia causes genomic reprogramming across the osteolineage "
    "as detected by single-cell RNAseq"
)
label, scores = classify_text(text)
print("Predicted Category:", label)

# Stable descending sort: categories with equal scores keep dict order.
ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
for k, v in ranked:
    print(f"{k}: {v:.3f}")

Predicted Category: Biology
Biology: 0.437
Public Policy: 0.101
Chemistry: 0.098
Computer Science: 0.098
Economics: 0.079
Engineering: 0.065
Physics: 0.060
