<a href="https://colab.research.google.com/github/Nimrat4/fewshotlearning/blob/main/fsl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Necessary imports
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

# Download the NLTK 'punkt' resource
nltk.download('punkt')

# Load the dataset (ensure the file path is correct)
# Note: Replace '/content/mtsamples.csv' with the actual path to your dataset file
dataset = pd.read_csv('/content/mtsamples.csv')

# Drop only the rows that are missing a field this notebook actually uses.
# A bare dropna() would also throw away every note that merely has a null in
# some other column; the subset is the same one the few-shot sampling cell uses.
dataset = dataset.dropna(subset=['transcription', 'medical_specialty'])

# Remove rows whose 'transcription' is empty or whitespace-only
is_blank = dataset['transcription'].str.strip().eq('')
dataset = dataset[~is_blank]

# Sanity checks: both counts below are expected to be 0 after the cleaning above
transcriptions = dataset['transcription']
print(f"Remaining None values: {transcriptions.isnull().sum()}")
print(f"Remaining empty string values: {transcriptions.str.strip().eq('').sum()}")

# Display the first few rows to confirm
print(dataset.head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Remaining None values: 0
Remaining empty string values: 0
   Sno                                        description  \
0    0   A 23-year-old white female presents with comp...   
1    1           Consult for laparoscopic gastric bypass.   
2    2           Consult for laparoscopic gastric bypass.   
3    3                             2-D M-Mode. Doppler.     
4    4                                 2-D Echocardiogram   

             medical_specialty                                sample_name  \
0         Allergy / Immunology                         Allergic Rhinitis    
1                   Bariatrics   Laparoscopic Gastric Bypass Consult - 2    
2                   Bariatrics   Laparoscopic Gastric Bypass Consult - 1    
3   Cardiovascular / Pulmonary                    2-D Echocardiogram - 1    
4   Cardiovascular / Pulmonary                    2-D Echocardiogram - 2    

                                       transcription  \
0  SUBJECTIVE:,  This 23-year-old white female pr...   


In [None]:
# Tokenization
# Fetch the NLTK resources: 'punkt' and the 'punkt_tab' data package
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Replace every transcription string with its list of word tokens
tokenized = dataset['transcription'].apply(word_tokenize)
dataset['transcription'] = tokenized

# Stemming
sb = SnowballStemmer('english')


def stem_it(text):
    """Return a new list with the Snowball (English) stem of each token in `text`.

    `text` is iterated once; the order of the tokens is preserved.
    """
    stems = []
    for word in text:
        stems.append(sb.stem(word))
    return stems


# Swap each token list for its stemmed counterpart
stemmed = dataset['transcription'].apply(stem_it)
dataset['transcription'] = stemmed

# Length-based stand-in for stopword removal: tokens of 2 characters or fewer are dropped
def stopword_removal(text):
    """Return the tokens of `text` that are longer than two characters.

    The filter looks only at token length (no stopword list is consulted) and
    keeps the surviving tokens in their original order.
    """
    kept = []
    for word in text:
        if len(word) <= 2:
            continue
        kept.append(word)
    return kept

# Drop the short tokens from every token list
filtered = dataset['transcription'].apply(stopword_removal)

# Join tokens back into a single space-separated string after processing
joined = filtered.map(' '.join)

# Keep only the first 512 characters of each string.
# Note: this caps characters, which is not the same thing as a 512-token limit.
dataset['transcription'] = joined.str.slice(stop=512)

# Display the first few rows to confirm
print(dataset.head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


   Sno                                        description  \
0    0   A 23-year-old white female presents with comp...   
1    1           Consult for laparoscopic gastric bypass.   
2    2           Consult for laparoscopic gastric bypass.   
3    3                             2-D M-Mode. Doppler.     
4    4                                 2-D Echocardiogram   

             medical_specialty                                sample_name  \
0         Allergy / Immunology                         Allergic Rhinitis    
1                   Bariatrics   Laparoscopic Gastric Bypass Consult - 2    
2                   Bariatrics   Laparoscopic Gastric Bypass Consult - 1    
3   Cardiovascular / Pulmonary                    2-D Echocardiogram - 1    
4   Cardiovascular / Pulmonary                    2-D Echocardiogram - 2    

                                       transcription  \
0  subject this 23-year-old white femal present w...   
1  past medic histori has difficulti climb stair ...   
2 

In [None]:
from transformers import DistilBertTokenizerFast
from sklearn.model_selection import train_test_split

# Step 1: Load and clean the dataset. This cell re-reads the CSV into its own
# `mtsamples` frame and keeps only rows with a non-blank transcription and a label.
mtsamples = pd.read_csv('/content/mtsamples.csv')
mtsamples = mtsamples.dropna(subset=['transcription', 'medical_specialty'])
mtsamples = mtsamples[mtsamples['transcription'].str.strip() != '']

# Step 2: Few-shot sampling - at most `target_samples_per_class` notes per specialty.
# Each group is sampled on its own and the pieces are concatenated. This yields the
# same rows, in the same order, as groupby(...).apply(...).reset_index(drop=True),
# but avoids the pandas warning that groupby.apply emits on current releases.
target_samples_per_class = 10
few_shot_samples = pd.concat(
    [
        group.sample(n=min(len(group), target_samples_per_class), random_state=42)
        for _, group in mtsamples.groupby('medical_specialty')
    ],
    ignore_index=True,  # fresh 0..n-1 index, so index label == row position in `encodings`
)

# Step 3: Initialize the tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Step 4: Tokenize the texts and prepare them for the model
encodings = tokenizer(list(few_shot_samples['transcription']), truncation=True, padding=True, max_length=512)

# Step 5: Convert `medical_specialty` labels to numerical format
label_mapping = {label: idx for idx, label in enumerate(few_shot_samples['medical_specialty'].unique())}
few_shot_samples['label'] = few_shot_samples['medical_specialty'].map(label_mapping)

# Step 6: Split into training and testing sets (80/20 split)
# NOTE: train_test_split shuffles the rows. Only input_ids and labels are split here,
# so any other per-row field (for example encodings['attention_mask']) has to be
# picked with the same row order; train_labels.index / test_labels.index hold the
# original row positions of each split.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    encodings['input_ids'], few_shot_samples['label'], test_size=0.2, random_state=42
)

# Step 7: Confirm label mapping and encoded input samples
print("Label Mapping:", label_mapping)
print("Sample Encoded Input:", train_texts[0][:10])  # Display first 10 tokens of the first sample



  few_shot_samples = mtsamples.groupby('medical_specialty').apply(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Label Mapping: {' Allergy / Immunology': 0, ' Autopsy': 1, ' Bariatrics': 2, ' Cardiovascular / Pulmonary': 3, ' Chiropractic': 4, ' Consult - History and Phy.': 5, ' Cosmetic / Plastic Surgery': 6, ' Dentistry': 7, ' Dermatology': 8, ' Diets and Nutritions': 9, ' Discharge Summary': 10, ' ENT - Otolaryngology': 11, ' Emergency Room Reports': 12, ' Endocrinology': 13, ' Gastroenterology': 14, ' General Medicine': 15, ' Hematology - Oncology': 16, ' Hospice - Palliative Care': 17, ' IME-QME-Work Comp etc.': 18, ' Lab Medicine - Pathology': 19, ' Letters': 20, ' Nephrology': 21, ' Neurology': 22, ' Neurosurgery': 23, ' Obstetrics / Gynecology': 24, ' Office Notes': 25, ' Ophthalmology': 26, ' Orthopedic': 27, ' Pain Management': 28, ' Pediatrics - Neonatal': 29, ' Physical Medicine - Rehab': 30, ' Podiatry': 31, ' Psychiatry / Psychology': 32, ' Radiology': 33, ' Rheumatology': 34, ' SOAP / Chart / Progress Notes': 35, ' Sleep Medicine': 36, ' Speech - Language': 37, ' Surgery': 38, ' Ur

In [None]:
!pip install datasets
from datasets import Dataset

import torch

# Check if GPU is available and set device accordingly
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Step 1: Convert tokenized texts to Hugging Face Dataset format
#
# train_test_split shuffled the rows, so the attention masks must be selected with the
# same row order as the split. Slicing encodings['attention_mask'] by position
# ([:len(train_texts)] / [len(train_texts):]) would attach masks that belong to other
# sequences. The label Series kept their original index through the split, and that
# index is the row position inside `encodings` (few_shot_samples has a 0..n-1 index).
train_rows = list(train_labels.index)
test_rows = list(test_labels.index)
all_input_ids = encodings['input_ids']
all_attention_masks = encodings['attention_mask']

# Guard against silent misalignment: every split row must map back to its own encoding
assert all(all_input_ids[row] == ids for row, ids in zip(train_rows, train_texts)), \
    "train rows are not aligned with encodings"
assert all(all_input_ids[row] == ids for row, ids in zip(test_rows, test_texts)), \
    "test rows are not aligned with encodings"

train_dataset = Dataset.from_dict({
    'input_ids': train_texts,
    'attention_mask': [all_attention_masks[row] for row in train_rows],
    'labels': list(train_labels)
})
test_dataset = Dataset.from_dict({
    'input_ids': test_texts,
    'attention_mask': [all_attention_masks[row] for row in test_rows],
    'labels': list(test_labels)
})

print("Training and Testing Datasets prepared.")


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

import inspect

# Step 2: Load the model with the number of labels equal to the number of unique medical specialties
num_labels = len(label_mapping)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels).to(device)

# Step 3: Set up training arguments
# The "when to evaluate" option was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Pick whichever keyword the
# installed TrainingArguments accepts so the cell runs on both old and new versions.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = 'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_strategy="epoch",  # Log the training loss every epoch so it is not reported as "No log"
    report_to="none",  # Disables logging to WandB
    **{_eval_strategy_key: "epoch"},  # Evaluate at the end of each epoch
)

print("Model and Training Arguments set up.")


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and Training Arguments set up.




In [None]:
# Step 4: Collect the Trainer inputs (model, arguments, train/eval datasets) and build it
trainer_inputs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': test_dataset,
}
trainer = Trainer(**trainer_inputs)

# Step 5: Run the training loop, then report that it finished
trainer.train()
print("Training completed.")


Epoch,Training Loss,Validation Loss
1,No log,3.701222
2,No log,3.70994
3,No log,3.706327


Training completed.


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 6: Make predictions on the test set
predictions = trainer.predict(test_dataset)

# Extract predicted labels (index of the highest score along axis 1)
predicted_labels = predictions.predictions.argmax(axis=1)

# Calculate accuracy, precision, recall, and F1 score.
# zero_division=0 scores a class with an undefined metric (e.g. one the model never
# predicts) as 0. That is the value scikit-learn falls back to anyway, but stating it
# explicitly avoids an UndefinedMetricWarning for each such class.
accuracy = accuracy_score(test_labels, predicted_labels)
precision = precision_score(test_labels, predicted_labels, average='weighted', zero_division=0)
recall = recall_score(test_labels, predicted_labels, average='weighted', zero_division=0)
f1 = f1_score(test_labels, predicted_labels, average='weighted', zero_division=0)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
