# BEFORE RUNNING THIS FILE:<br>
- Note that this notebook is set up for use in Google Colab. To run it locally, adjust the Google Drive mount and the file paths accordingly.
- Ensure you have the following files inside **\<your google drive root folder\>/Colab Notebooks**:
<br> - **small_talk_train.csv**
<br> - **symptoms.csv**

In [1]:
from google.colab import drive
import random
from __future__ import annotations
from dataclasses import dataclass, field
from typing import List, Dict, Set, Tuple, Optional, Any
import re
import sys
import json
from collections import defaultdict, deque
from pathlib import Path
import os
from datasets import load_dataset
import spacy
from spacy.training import Example
import random
from sklearn.model_selection import train_test_split
from spacy.util import minibatch
# Install the pinned `datasets` release used by this notebook.
# NOTE(review): `datasets` is already imported earlier in this cell, so a
# freshly installed version presumably only takes effect after a runtime
# restart - confirm, or move the install above the imports.
!pip install datasets==3.6.0
# Interactive Hugging Face login (prompts for an access token).
!hf auth login

# Mount Google Drive at /content/drive; the CSV inputs and the saved models
# referenced later in the notebook are read from / written to this mount.
drive.mount('/content/drive')



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
The token `425-read` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `425-read`
Mounte

In [2]:
# ----------------------------
# Intent Classification
# ----------------------------

# spaCy is optional: record whether it could be imported so the classifier
# can fall back to keyword rules when it is missing.
try:
    import spacy
except ImportError:
    SPACY_AVAILABLE = False
else:
    SPACY_AVAILABLE = True

class IntentClassifier:
    """Detects if user input is symptom-related or off-topic.

    Two strategies are available:

    * spaCy-based: inspects entities, verb/child dependencies and body-part
      lemmas of the parsed text, then falls back to the keyword rules.
    * rule-based: keyword matching against ``medical_terms`` and
      ``off_topic_patterns``.

    The spaCy strategy is used only when spaCy is importable and the
    ``en_core_web_sm`` model loads; otherwise the rules are used.
    """

    def __init__(self, use_spacy: bool = True):
        """Set up the keyword lists and, if requested, load the spaCy model.

        Args:
            use_spacy: Try to use spaCy for classification. Silently downgraded
                to rule-based detection when spaCy or its model is unavailable.
        """
        self.use_spacy = use_spacy and SPACY_AVAILABLE
        self.nlp = None

        if self.use_spacy:
            try:
                self.nlp = spacy.load("en_core_web_sm")
            except OSError:
                print("Warning: spaCy model not found, falling back to rule-based intent detection")
                self.use_spacy = False

        # Medical keywords for rule-based fallback
        self.medical_terms = {
            'feel', 'feeling', 'pain', 'hurt', 'hurts', 'ache', 'aches', 'aching',
            'sick', 'ill', 'unwell', 'symptom', 'symptoms', 'doctor', 'fever',
            'cough', 'nausea', 'dizzy', 'vomit', 'diarrhea', 'tired', 'fatigue',
            'headache', 'sore', 'swollen', 'rash', 'itchy', 'bleeding', 'chest',
            'stomach', 'throat', 'breathe', 'breathing', 'experienced', 'experiencing',
            'suffering', 'diagnosed', 'started', 'began', 'having'
        }

        # Common off-topic patterns
        self.off_topic_patterns = {
            'weather', 'joke', 'time', 'date', 'hello', 'hi', 'hey',
            'thanks', 'thank you', 'bye', 'goodbye', 'how are you',
            'what can you do', 'who are you', 'your name', 'help me with',
            'tell me about', 'what is', 'how to', 'recipe', 'news'
        }

    def _mentions_off_topic(self, text_lower: str) -> bool:
        """Return True if any off-topic pattern occurs as a whole word/phrase.

        Word boundaries matter here: a plain substring test would find 'hi'
        inside "shivering" or 'time' inside "sometimes" and wrongly flag the
        message as off-topic.
        """
        return any(
            re.search(r"\b" + re.escape(pattern) + r"\b", text_lower)
            for pattern in self.off_topic_patterns
        )

    def is_symptom_related_spacy(self, text: str) -> bool:
        """Use spaCy NLP to detect medical intent.

        Requires ``self.nlp`` to be loaded (i.e. ``self.use_spacy`` is True).
        Falls back to :meth:`is_symptom_related_rules` when no spaCy signal fires.
        """
        doc = self.nlp(text)

        # Check for health/medical entities (if any medical NER)
        # NOTE(review): these labels only fire if the loaded model's NER emits
        # them - confirm that en_core_web_sm does.
        medical_ents = {'DISEASE', 'SYMPTOM'}
        if any(ent.label_ in medical_ents for ent in doc.ents):
            return True

        # Check for medical-related verbs and their objects
        health_verbs = {'feel', 'hurt', 'ache', 'experience', 'have', 'get', 'suffer'}
        for token in doc:
            if token.lemma_ in health_verbs:
                # Check if it has medical-related dependencies
                for child in token.children:
                    if child.pos_ in {'NOUN', 'ADJ'} and child.lemma_ in self.medical_terms:
                        return True

        # Check for body parts (strong signal for medical)
        body_parts = {'head', 'chest', 'stomach', 'throat', 'nose', 'ear', 'eye',
                     'arm', 'leg', 'back', 'neck', 'skin', 'body', 'abdomen'}
        if any(token.lemma_ in body_parts for token in doc):
            return True

        # Fallback to rule-based for this text
        return self.is_symptom_related_rules(text)

    def is_symptom_related_rules(self, text: str) -> bool:
        """Rule-based fallback for intent detection.

        Decision order:
            1. Any medical term present -> symptom-related. Terms are matched
               as substrings on purpose so inflected forms ("coughing",
               "vomited") count; this can also hit unrelated words that merely
               contain a term.
            2. Otherwise, an off-topic word/phrase present -> not symptom-related.
            3. Otherwise, messages longer than three words are assumed to be
               symptom-related.
        """
        text_lower = text.lower()

        # A medical term always wins, even next to an off-topic phrase
        # (e.g., "tell me about my fever").
        if any(term in text_lower for term in self.medical_terms):
            return True

        # Explicit off-topic keywords, matched on word boundaries
        if self._mentions_off_topic(text_lower):
            return False

        # Default: if message is substantial (>3 words), assume medical context
        # (We're a symptom checker, so benefit of doubt)
        return len(text.split()) > 3

    def classify(self, text: str) -> str:
        """
        Returns:
            'symptom' - medical/symptom related
            'off_topic' - not related to symptoms
        """

        if self.use_spacy:
            is_medical = self.is_symptom_related_spacy(text)
        else:
            is_medical = self.is_symptom_related_rules(text)

        return 'symptom' if is_medical else 'off_topic'

#### The current problem with solely using spaCy's core EN model

In [3]:
# Run the classifier on a literal symptom, a figure of speech and a greeting
# to show how it labels each of them.
c = IntentClassifier()
for s in (
    "i got chills in my bones",
    "i feel on top of the world",
    "hi",
):
    print(c.classify(s))

symptom
symptom
off_topic


### Preprocessing

In [4]:
# Label templates shared by every example: one score per textcat category.
cats_report_symptom = {"report_symptom": 1.0, "small_talk": 0.0}
cats_small_talk = {"report_symptom": 0.0, "small_talk": 1.0}

# Figurative speech that could be mistaken for symptoms; labelled small talk.
training_data_fl = [
    {"text": sentence, "cats": cats_small_talk}
    for sentence in (
        "I got chills just thinking about it!",
        "That movie gave me goosebumps!",
        "It sends shivers down my spine every time.",
        "I feel it in my bones.",
        "I got butterflies in my stomach",
        "My heart skipped a beat when I saw it.",
        "This song gives me a headache.",
        "My brain just froze.",
        "It broke my heart.",
        "I hit a nerve.",
        "dying of laughter right now!",
        "It made my skin crawl just hearing that story.",
        "My heart is pounding just thinking about it.",
        "That cracked me up so bad!",
        "This news makes my head spin.",
        "That situation gives me a headache just thinking about it.",
        "He's giving me a heart attack with those jokes!",
        "My stomach turns when I hear that name.",
        "That story sent chills all over my body.",
        "I nearly had a heart attack when I saw the bill!",
        "i'm coughing up a lung",
        "you're going to give me an aneurysm",
        "i've got cabin fever",
        "I'm feeling under the weather",
        "As pale as a ghost",
        "you're a pain in my butt",
        "this is driving me crazy",
        "i have a heavy heart",
        "i feel so lonely",
        "im sick of this",
    )
]

# Symptom mentions that are not a current report by the user (someone else's
# symptoms, negated statements, family history); labelled small talk.
training_data_not_first_person = [
    {"text": sentence, "cats": cats_small_talk}
    for sentence in (
        "my friend has covid",
        "jenny is down with a fever",
        "I did not sneeze last week.",
        "didnt feel anything over the past week",
        "i have a family history of sinus",
        "My friend had a cold last week.",
        "She said her head was hurting yesterday.",
        "Dad caught a cold during the trip.",
        "He's been coughing for days now.",
        "Grandma used to get dizzy sometimes.",
        "My coworker had stomach pain after lunch.",
        "They told me their throat felt sore.",
        "Mom said she couldn't sleep because of back pain.",
        "A patient I met complained about headaches.",
        "My brother was sneezing a lot last night.",
        "She caught the flu earlier this month.",
        "He had a terrible migraine last weekend.",
        "My sister felt nauseous after dinner.",
        "They were shivering from the fever.",
        "I heard my neighbor has a sore throat.",
        "Mom was feeling dizzy yesterday morning.",
        "Her kid vomited all night but is fine now.",
        "He said his chest was hurting earlier today.",
        "My uncle had back pain for a while.",
        "The teacher had to leave because she felt unwell.",
    )
]

#### Prebuilt dataset analysis

In [5]:
import pandas as pd
import ast

# Flatten every dialogue in the small-talk CSV into individual utterances,
# label each one as small talk, and save the prepared table back to Drive.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/small_talk_train.csv")
data = []
for raw_dialog in df['dialog']:
    # each `dialog` cell is parsed as a Python literal and iterated utterance by utterance
    data.extend(
        {"text": utterance.strip(), "cats": cats_small_talk}
        for utterance in ast.literal_eval(raw_dialog)
    )

smalltalk_df = pd.DataFrame(data)
smalltalk_df.to_csv("/content/drive/MyDrive/Colab Notebooks/small_talk_prepared.csv", index=False)

In [6]:
# Absolute path, like every other Drive path in this notebook, so the cell
# does not depend on the current working directory.
symptoms_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/symptoms.csv")

# Sentence templates a user might wrap around a symptom name.
common_phrasings = [
    "i got {symptom}",
    "i've got {symptom}",
    "i am having {symptom}",
    "i'm experiencing {symptom}",
    "i feel {symptom}",
    "i'm down with {symptom}",
    "i caught {symptom}",
    "i developed {symptom}",
    "i'm suffering from {symptom}",
    "i'm dealing with {symptom}",
    "lately i've had {symptom}",
    "been feeling {symptom} lately",
    "my {symptom} is getting worse",
    "it feels like {symptom}",
    "having trouble with {symptom}",
    "i keep getting {symptom}",
    "having issues like {symptom}",
    "there's this {symptom} that won't go away"
]

# Normalise the symptom names once; both passes below reuse this list.
symptom_names = [raw.strip().lower() for raw in symptoms_df['symptoms'].dropna()]

training_data_symptoms = []

for symptom in symptom_names:
    # Randomly select 2–4 phrasings for each symptom
    selected_phrasings = random.sample(common_phrasings, k=random.randint(2, 4))

    for phrasing in selected_phrasings:
        text = phrasing.format(symptom=symptom)
        training_data_symptoms.append({"text": text, "cats": cats_report_symptom})

# Optional: add the plain symptom itself (useful for terse inputs)
training_data_symptoms.extend(
    {"text": symptom, "cats": cats_report_symptom} for symptom in symptom_names
)

print(f"✅ Generated {len(training_data_symptoms)} symptom training examples.")
print(random.sample(training_data_symptoms, 5))

✅ Generated 1497 symptom training examples.
[{'text': 'ear pain', 'cats': {'report_symptom': 1.0, 'small_talk': 0.0}}, {'text': "i've got jaundice", 'cats': {'report_symptom': 1.0, 'small_talk': 0.0}}, {'text': "i'm experiencing redness in or around nose", 'cats': {'report_symptom': 1.0, 'small_talk': 0.0}}, {'text': 'been feeling swelling of scrotum lately', 'cats': {'report_symptom': 1.0, 'small_talk': 0.0}}, {'text': 'symptoms of the face', 'cats': {'report_symptom': 1.0, 'small_talk': 0.0}}]


In [7]:

# Load the "roskoN/dailydialog" dataset from the Hugging Face Hub; its
# `train` split is used below as a source of off-topic conversations.
ds = load_dataset("roskoN/dailydialog")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/581 [00:00<?, ?B/s]

dailydialog.py: 0.00B [00:00, ?B/s]

full/train/0000.parquet:   0%|          | 0.00/3.67M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/340k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/337k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11118 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
# Take the first 1200 training conversations, flatten each one into a single
# space-joined string, and label all of them as small talk.
off_topic_texts = []
for conv in ds['train']:
    if len(off_topic_texts) >= 1200:
        break
    off_topic_texts.append(" ".join(conv['utterances']))

off_topic_train_data = []
for text in off_topic_texts:
    off_topic_train_data.append({"text": text, "cats": cats_small_talk})

print(len(off_topic_texts))


1200


### Hyperparameters

In [9]:
# Number of training records per `nlp.update` call in the training loop.
BATCH_SIZE = 32
# Dropout rate passed to `nlp.update(..., drop=DROPOUT)`.
DROPOUT = 0.2
# Number of passes over the training data; also embedded in the saved model's folder name.
NUM_EPOCHS = 4

In [10]:

# First pass at the combined dataset (hand-written lists + generated symptom
# examples + DailyDialog conversations), shuffled in place; used for the
# label-balance check. A later cell rebuilds this variable.
training_data_combined = [
    *training_data_fl,
    *training_data_not_first_person,
    *training_data_symptoms,
    *off_topic_train_data,
]
random.shuffle(training_data_combined)
print(len(training_data_combined))

2752


In [11]:
# Label balance: an example counts as small talk when its small_talk score is
# strictly higher than its report_symptom score; everything else counts as a
# symptom report.
total = len(training_data_combined)
count_small_talk = sum(
    1
    for example in training_data_combined
    if example["cats"].get("small_talk", 0.0) > example["cats"].get("report_symptom", 0.0)
)
count_report_symptom = total - count_small_talk

ratio_small_talk = count_small_talk / total
ratio_report_symptom = count_report_symptom / total

print(f"Total examples: {total}")
print(f"Small talk: {count_small_talk} ({ratio_small_talk:.2%})")
print(f"Report symptom: {count_report_symptom} ({ratio_report_symptom:.2%})")


Total examples: 2752
Small talk: 1255 (45.60%)
Report symptom: 1497 (54.40%)


In [12]:
# Final dataset: the hand-written lists, the first 500 prepared small-talk
# utterances, every generated symptom example and the first 500 DailyDialog
# conversations.
training_data_csv = smalltalk_df.to_dict(orient="records")
training_data_combined = [
    *training_data_fl,
    *training_data_not_first_person,
    *training_data_csv[:500],
    *training_data_symptoms,
    *off_topic_train_data[:500],
]
# NOTE(review): this shuffle is unseeded, so the fixed random_state below does
# not by itself make the split reproducible across runs.
random.shuffle(training_data_combined)

# Hold out 20% of the data, then halve the hold-out into validation and test.
train_data, temp_data = train_test_split(
    training_data_combined,
    test_size=0.2,
    random_state=42,
)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
)

#### Training

In [13]:
# Require a GPU, then build a blank English pipeline whose only component is
# a text categorizer with the two intent labels.
spacy.require_gpu()
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
for label in ("report_symptom", "small_talk"):
    textcat.add_label(label)

# initialize() returns the optimizer that the training loop passes to nlp.update
optimizer = nlp.initialize()

def evaluate(model, data):
    """Return the accuracy of ``model`` on ``data``.

    Args:
        model: Callable that takes a text and returns an object with a ``cats``
            mapping of label -> score (e.g. a spaCy pipeline with a textcat).
        data: Sequence of records shaped like ``{"text": str, "cats": {label: score}}``.

    Returns:
        Fraction of records whose highest-scoring predicted label equals the
        highest-scoring gold label, or 0.0 when ``data`` is empty (instead of
        raising ZeroDivisionError).
    """
    total = len(data)
    if total == 0:
        return 0.0

    correct = 0
    for record in data:
        predicted = model(record["text"]).cats
        gold = record["cats"]
        # compare the arg-max label on both sides
        if max(predicted, key=predicted.get) == max(gold, key=gold.get):
            correct += 1
    return correct / total

# Train for NUM_EPOCHS passes: reshuffle, update the model batch by batch,
# then report the accumulated loss and the validation accuracy.
for epoch in range(NUM_EPOCHS):
    random.shuffle(train_data)
    losses = {}
    print(f"\n===== Epoch {epoch+1}/{NUM_EPOCHS} =====")

    # consecutive chunks of BATCH_SIZE records (the last one may be shorter)
    for batch in minibatch(train_data, size=BATCH_SIZE):
        examples = []
        for record in batch:
            doc = nlp.make_doc(record["text"])
            examples.append(Example.from_dict(doc, {"cats": record["cats"]}))
        nlp.update(examples, sgd=optimizer, losses=losses, drop=DROPOUT)

    print("Training loss:", losses)
    val_acc = evaluate(nlp, val_data)
    print(f"Validation accuracy: {val_acc:.5f}")



===== Epoch 1/4 =====
Training loss: {'textcat': 2.494079191362599}
Validation accuracy: 0.98824

===== Epoch 2/4 =====
Training loss: {'textcat': 0.16252449844718964}
Validation accuracy: 0.99608

===== Epoch 3/4 =====
Training loss: {'textcat': 0.0350021783644795}
Validation accuracy: 0.99608

===== Epoch 4/4 =====
Training loss: {'textcat': 0.03346457203384953}
Validation accuracy: 0.99608


In [14]:
# Save the trained pipeline to Google Drive; the folder name embeds the epoch
# count so the later cells can locate the same model.
nlp.to_disk(f"/content/drive/MyDrive/Colab Notebooks/intent_model_e{NUM_EPOCHS}_v1")

In [15]:
# modified intent classifier
class IntentClassifier:
    """Detects if user input is symptom-related or off-topic.

    Prefers a fine-tuned spaCy text-categorizer loaded from ``model_path``.
    When that model is missing it loads ``en_core_web_sm``; when the loaded
    pipeline has no ``textcat`` component, or spaCy is unavailable, it uses
    keyword rules instead.
    """

    def __init__(self, model_path: str | None = "models/intent_classifier", use_spacy: bool = True):
        """Load the model (if possible) and set up the keyword lists.

        Args:
            model_path: Directory of a saved spaCy pipeline. ``None`` or a
                non-existent path selects the base ``en_core_web_sm`` model.
            use_spacy: Try to use spaCy. Downgraded to rule-based detection
                when spaCy or the requested model cannot be loaded.
        """
        self.use_spacy = use_spacy and SPACY_AVAILABLE
        self.nlp = None

        if self.use_spacy:
            try:
                if model_path and Path(model_path).exists():
                    print(f"Loading fine-tuned spaCy model from {model_path}...")
                    self.nlp = spacy.load(model_path)
                else:
                    print("Fine-tuned model not found, using base model en_core_web_sm")
                    self.nlp = spacy.load("en_core_web_sm")
            except OSError:
                print("Warning: spaCy model not found, falling back to rule-based intent detection")
                self.use_spacy = False

        # Medical keywords for rule-based fallback
        self.medical_terms = {
            'feel', 'feeling', 'pain', 'hurt', 'hurts', 'ache', 'aches', 'aching',
            'sick', 'ill', 'unwell', 'symptom', 'symptoms', 'doctor', 'fever',
            'cough', 'nausea', 'dizzy', 'vomit', 'diarrhea', 'tired', 'fatigue',
            'headache', 'sore', 'swollen', 'rash', 'itchy', 'bleeding', 'chest',
            'stomach', 'throat', 'breathe', 'breathing', 'experienced', 'experiencing',
            'suffering', 'diagnosed', 'started', 'began', 'having'
        }

        # Common off-topic patterns
        self.off_topic_patterns = {
            'weather', 'joke', 'time', 'date', 'hello', 'hi', 'hey',
            'thanks', 'thank you', 'bye', 'goodbye', 'how are you',
            'what can you do', 'who are you', 'your name', 'help me with',
            'tell me about', 'what is', 'how to', 'recipe', 'news'
        }

        # Greetings (allow these)
        self.greetings = {'hello', 'hi', 'hey', 'greetings'}

    def is_greeting(self, text: str) -> bool:
        """Check if text is just a greeting.

        A greeting is a message of at most two words, one of which is in
        ``self.greetings``. Surrounding punctuation is ignored so "hi!" and
        "hey, there" still count.
        """
        words = [word.strip(".,!?;:") for word in text.lower().split()]
        return len(words) <= 2 and any(g in words for g in self.greetings)

    def is_symptom_related_rules(self, text: str) -> bool:
        """Rule-based fallback for intent detection.

        Decision order:
            1. Any medical term present (substring match, so inflected forms
               such as "coughing" count) -> symptom-related.
            2. Otherwise, an off-topic word/phrase present (matched on word
               boundaries so 'hi' does not fire inside "shivering") -> not
               symptom-related.
            3. Otherwise, messages longer than three words are assumed to be
               symptom-related.
        """
        text_lower = text.lower()

        if any(term in text_lower for term in self.medical_terms):
            return True

        mentions_off_topic = any(
            re.search(r"\b" + re.escape(pattern) + r"\b", text_lower)
            for pattern in self.off_topic_patterns
        )
        if mentions_off_topic:
            return False

        # We're a symptom checker, so give substantial messages the benefit of doubt
        return len(text.split()) > 3

    def is_symptom_related_spacy(self, text: str) -> bool:
        """Use fine-tuned spaCy textcat model to detect medical intent.

        Returns True/False when the model is confident (score > 0.7) and
        defers to :meth:`is_symptom_related_rules` when it is not, or when no
        pipeline with a ``textcat`` component is loaded.
        """
        # Check the pipeline before running it: without textcat there are no
        # category scores to read.
        if self.nlp is None or "textcat" not in self.nlp.pipe_names:
            return self.is_symptom_related_rules(text)

        cats = self.nlp(text).cats
        # Example categories: {'report_symptom': 0.92, 'small_talk': 0.04, 'greetings': 0.04}

        if cats.get("report_symptom", 0.0) > 0.7:
            return True
        if cats.get("small_talk", 0.0) > 0.7 or cats.get("greetings", 0.0) > 0.7:
            return False

        # fallback if uncertain
        return self.is_symptom_related_rules(text)

    def classify(self, text: str) -> str:
        """
        Returns one of: 'greetings', 'report_symptom', 'small_talk', 'off_topic'

        Short greetings are recognised by rule first. With a textcat model the
        best-scoring label is returned when its score exceeds 0.6, otherwise
        'off_topic'. Without one, the keyword rules decide between
        'report_symptom' and 'off_topic'.
        """
        if self.is_greeting(text):
            return 'greetings'

        if not self.use_spacy or "textcat" not in self.nlp.pipe_names:
            is_medical = self.is_symptom_related_rules(text)
            return 'report_symptom' if is_medical else 'off_topic'

        cats = self.nlp(text).cats
        if not cats:
            return "off_topic"
        best_label = max(cats, key=cats.get)
        return best_label if cats[best_label] > 0.6 else "off_topic"


In [17]:
# Smoke-test the fine-tuned model on a handful of short inputs and print the
# predicted intent for each.
clf = IntentClassifier(model_path=f"/content/drive/MyDrive/Colab Notebooks/intent_model_e{NUM_EPOCHS}_v1")
tests = [
    "sore throat",
    "you has a fever",
    "hey there",
    "hows the weather",
    "weather is nice today",
    "my dog vomited the other day",
    "dizzy",
    "i have herpetitis",
]

for t in tests:
    print(f"{t!r} -> {clf.classify(t)}")


Loading fine-tuned spaCy model from /content/drive/MyDrive/Colab Notebooks/intent_model_e4_v1...
'sore throat' -> report_symptom
'you has a fever' -> small_talk
'hey there' -> greetings
'hows the weather' -> small_talk
'weather is nice today' -> report_symptom
'my dog vomited the other day' -> small_talk
'dizzy' -> report_symptom
'i have herpetitis' -> report_symptom


In [18]:
# Exporting the model: downloading the model folder directly from Google Drive corrupts it (Drive compresses the download), so zip it here and download the archive instead.

import shutil
# Zip the saved model folder into /content; the call returns the archive path.
shutil.make_archive(
    base_name=f"/content/intent_model_e{NUM_EPOCHS}_v1_backup",
    format='zip',
    root_dir=f'/content/drive/MyDrive/Colab Notebooks/intent_model_e{NUM_EPOCHS}_v1',
)


'/content/intent_model_e4_v1_backup.zip'