# **Importing Libraries**

In [1]:
!pip install transformers datasets torch
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, pipeline
import torch

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

# **Loading Medical Transcription dataset**

In [2]:
file_path = "/content/mtsamples.csv"
df = pd.read_csv(file_path)

print(df.head())

   Unnamed: 0                                        description  \
0           0   A 23-year-old white female presents with comp...   
1           1           Consult for laparoscopic gastric bypass.   
2           2           Consult for laparoscopic gastric bypass.   
3           3                             2-D M-Mode. Doppler.     
4           4                                 2-D Echocardiogram   

             medical_specialty                                sample_name  \
0         Allergy / Immunology                         Allergic Rhinitis    
1                   Bariatrics   Laparoscopic Gastric Bypass Consult - 2    
2                   Bariatrics   Laparoscopic Gastric Bypass Consult - 1    
3   Cardiovascular / Pulmonary                    2-D Echocardiogram - 1    
4   Cardiovascular / Pulmonary                    2-D Echocardiogram - 2    

                                       transcription  \
0  SUBJECTIVE:,  This 23-year-old white female pr...   
1  PAST MEDICAL 

# **Preprocessing the data**

In [3]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
description,0
medical_specialty,0
sample_name,0
transcription,33
keywords,1068


In [4]:
df = df[["transcription"]].dropna()
df = df.rename(columns={"transcription": "text"})  # Rename for compatibility

# Convert to list format
train_texts = df["text"].tolist()

# Print an example
print("Sample text:", train_texts[0])

Sample text: SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without 

# **Tokenizing the data**

In [5]:


model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# GPT-2 does not have a pad token, so set it manually
tokenizer.pad_token = tokenizer.eos_token

# Split dataset into 90% training, 10% validation
train_texts, val_texts = train_test_split(df["text"].tolist(), test_size=0.1, random_state=42)

# Convert to Hugging Face dataset format
dataset = DatasetDict({
    "train": Dataset.from_dict({"text": train_texts}),
    "validation": Dataset.from_dict({"text": val_texts}),
})

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Convert datasets to PyTorch tensors
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/4469 [00:00<?, ? examples/s]

Map:   0%|          | 0/497 [00:00<?, ? examples/s]

In [6]:
"""

model_name = "gpt2"  # Pretrained GPT-2 model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# GPT-2 doesn't have a padding token, so we set it manually
tokenizer.pad_token = tokenizer.eos_token

# Tokenize function for dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Convert text into Hugging Face Dataset format
from datasets import Dataset

dataset = Dataset.from_dict({"text": train_texts})

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Print tokenized sample
print(tokenized_dataset[0])


# Convert tokenized dataset into a format suitable for PyTorch
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])"""


'\n\nmodel_name = "gpt2"  # Pretrained GPT-2 model\ntokenizer = AutoTokenizer.from_pretrained(model_name)\n\n# GPT-2 doesn\'t have a padding token, so we set it manually\ntokenizer.pad_token = tokenizer.eos_token\n\n# Tokenize function for dataset\ndef tokenize_function(examples):\n    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)\n\n# Convert text into Hugging Face Dataset format\nfrom datasets import Dataset\n\ndataset = Dataset.from_dict({"text": train_texts})\n\n# Apply tokenization\ntokenized_dataset = dataset.map(tokenize_function, batched=True)\n\n# Print tokenized sample\nprint(tokenized_dataset[0])\n\n\n# Convert tokenized dataset into a format suitable for PyTorch\ntokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])'

In [7]:




# Data collator for causal language modeling (Next-word prediction)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # False because we're doing causal (next-word) prediction
)


# **Fine-Tune GPT-2 on MT Samples**

In [8]:


# Load GPT-2 model for Causal Language Modeling
model = AutoModelForCausalLM.from_pretrained(model_name)

# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-mtsamples",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    num_train_epochs=3,  # Adjust based on available compute power
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    report_to="tensorboard"
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
     train_dataset=tokenized_dataset["train"],   # ✅ Train dataset
    eval_dataset=tokenized_dataset["validation"],  # ✅ Validation dataset
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start Training
trainer.train()


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,2.6073,2.430962
2,2.4094,2.302479
3,2.3206,2.270233


TrainOutput(global_step=6705, training_loss=2.506302610393072, metrics={'train_runtime': 2673.6315, 'train_samples_per_second': 5.015, 'train_steps_per_second': 2.508, 'total_flos': 3503142273024000.0, 'train_loss': 2.506302610393072, 'epoch': 3.0})

# **Save & Test the Model**

In [9]:
model.save_pretrained("./gpt2-mtsamples-model")
tokenizer.save_pretrained("./gpt2-mtsamples-model")

('./gpt2-mtsamples-model/tokenizer_config.json',
 './gpt2-mtsamples-model/special_tokens_map.json',
 './gpt2-mtsamples-model/vocab.json',
 './gpt2-mtsamples-model/merges.txt',
 './gpt2-mtsamples-model/added_tokens.json',
 './gpt2-mtsamples-model/tokenizer.json')

# **Test the model**

In [19]:
# Load fine-tuned model


fine_tuned_model = AutoModelForCausalLM.from_pretrained("./gpt2-mtsamples-model")
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./gpt2-mtsamples-model")

# Create text generation pipeline
generator = pipeline("text-generation", model=fine_tuned_model, tokenizer=fine_tuned_tokenizer)

# Test the model with a prompt
prompt = "has a history of"
generated_text = generator(prompt, max_length=15, num_return_sequences=1)

print("Generated Text:", generated_text[0]['generated_text'])


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Generated Text: has a history of alcohol use disorder as well as tobacco exposure and/or
