## **Train a model to generate coherent and contextually relevant text based on a given prompt.**

## Install Required Libraries

In [None]:
pip install transformers datasets torch pandas



## Loading and Inspecting the Dataset

In [None]:
import pandas as pd

# Load the dataset
data = pd.read_csv('/content/annotated_coding_data.csv')

# Display the dataset
print(data.head())

                                       clinical_note   icd10_codes  \
0  Patient is a 25-year-old female diagnosed with...  J45.40;J30.2   
1  Patient is a Child diagnosed with asthma. Admi...        J45.40   
2  Patient is a 45-year-old male diagnosed with h...           I10   
3  Patient is a 30-year-old female diagnosed with...        J45.40   
4  Patient is a 30-year-old female diagnosed with...       M23.221   

           cpt_codes  
0  29881;D0120;D1110  
1              99213  
2              90832  
3        D0120;D1110  
4              97110  


## Concatenating Codes for Generation

In [None]:
# Convert 'icd10_codes' and 'cpt_codes' to strings
data['icd10_codes'] = data['icd10_codes'].astype(str)
data['cpt_codes'] = data['cpt_codes'].astype(str)

# Combine ICD-10 and CPT codes into a single string
data['codes'] = data['icd10_codes'] + ' ' + data['cpt_codes']

# Create the prompt by appending the codes to the clinical note
data['prompt'] = data['clinical_note'] + '###' + ' ' + data['codes']

# Display the updated dataset
print(data[['prompt']].head())

                                              prompt
0  Patient is a 25-year-old female diagnosed with...
1  Patient is a Child diagnosed with asthma. Admi...
2  Patient is a 45-year-old male diagnosed with h...
3  Patient is a 30-year-old female diagnosed with...
4  Patient is a 30-year-old female diagnosed with...


## Saving the Preprocessed Data

In [None]:
import json

# Function to convert a row to JSON
def row_to_json(row):
    return {
        "prompt": row['prompt']
    }

# Convert the DataFrame to JSONL
with open('preprocessed_coding_data.jsonl', 'w') as f:
    for _, row in data.iterrows():
        json.dump(row_to_json(row), f)
        f.write('\n')

## Fine-Tuning GPT-2

We'll use Hugging Face's transformers library to fine-tune GPT-2 on our sample dataset.

In [None]:
#Loading the Dataset
from datasets import load_dataset

# Load the dataset from the JSONL file
dataset = load_dataset('json', data_files='preprocessed_coding_data.jsonl', split='train')

# Inspect the dataset
print(dataset)
print(dataset[0])

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['prompt'],
    num_rows: 100
})
{'prompt': 'Patient is a 25-year-old female diagnosed with asthma. Performed knee arthroscopy. Scheduled follow-up appointment.### J45.40;J30.2 29881;D0120;D1110'}


## Initializing the Tokenizer and Model

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the pre-trained GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Since GPT-2 doesn't have a pad token, set it to eos_token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

## Tokenizing the Dataset

In [None]:
def tokenize_function(examples):
    # Tokenize the prompt
    tokenized = tokenizer(
        examples['prompt'],
        padding='max_length',
        truncation=True,
        max_length=512
    )
    # Set labels equal to input_ids for language modeling
    tokenized['labels'] = tokenized['input_ids'].copy()
    return tokenized

In [None]:
# Apply the tokenization to the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Remove the original 'prompt' column to save memory
tokenized_dataset = tokenized_dataset.remove_columns(['prompt'])

# Set the format for PyTorch, including 'labels'
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

## Setting Up Training Arguments

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt2_medbilling",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Adjust based on your GPU
    save_steps=10,
    save_total_limit=2,
    logging_steps=5,
    prediction_loss_only=True,
)

## Initializing the Trainer

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

## Fine-Tuning the Model

In [None]:
# Start the training
trainer.train()

Step,Training Loss
5,3.6582
10,0.3498
15,0.2181
20,0.1683
25,0.1431
30,0.1297
35,0.1281
40,0.0913
45,0.1084
50,0.0993


TrainOutput(global_step=300, training_loss=0.1338209815820058, metrics={'train_runtime': 3680.7419, 'train_samples_per_second': 0.082, 'train_steps_per_second': 0.082, 'total_flos': 78387609600000.0, 'train_loss': 0.1338209815820058, 'epoch': 3.0})

In [None]:
#Save the model and tokenizer
model.save_pretrained("./gpt2_medbilling")
tokenizer.save_pretrained("./gpt2_medbilling")

('./gpt2_medbilling/tokenizer_config.json',
 './gpt2_medbilling/special_tokens_map.json',
 './gpt2_medbilling/vocab.json',
 './gpt2_medbilling/merges.txt',
 './gpt2_medbilling/added_tokens.json')

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the fine-tuned model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2_medbilling")
model = GPT2LMHeadModel.from_pretrained("./gpt2_medbilling")
model.config.pad_token_id = tokenizer.eos_token_id

In [None]:
import torch

def generate_codes(clinical_note, max_length=50):
    # Create the prompt
    prompt = clinical_note + "###"

    # Encode the input
    inputs = tokenizer.encode(prompt, return_tensors='pt')

    # Generate output
    outputs = model.generate(
        inputs,
        max_length=inputs.shape[1] + max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=True
    )

    # Decode the output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract the codes
    codes = generated_text.split("###")[-1].strip()

    return codes

In [None]:
# Example clinical note
new_clinical_note = "Patient is a 45-year-old male diagnosed with asthma. Administered nasal corticosteroids."

# Generate codes
suggested_codes = generate_codes(new_clinical_note)

print("Suggested Codes:", suggested_codes)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Suggested Codes: J45.9 99195


In [None]:
# Load the fine-tuned model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2_medbilling")
model = GPT2LMHeadModel.from_pretrained("./gpt2_medbilling")

In [None]:
# Ensure the model is in evaluation mode
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# Example clinical note
new_clinical_note = "Patient is a 45-year-old male diagnosed with allergic rhinitis. Initiated insulin therapy."

In [None]:
# Tokenize the input
inputs = tokenizer(new_clinical_note, return_tensors='pt', padding=True, truncation=True)

In [None]:
# Generate codes with adjusted parameters
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=50,            # Maximum length of generated codes
        num_return_sequences=1,   # Number of sequences to return
        pad_token_id=tokenizer.eos_token_id,  # Padding token ID
        temperature=0.5,          # Adjust the temperature for randomness
        top_k=50,                 # Only consider the top k tokens
        top_p=0.95                # Nucleus sampling
    )

In [None]:
# Decode the output
generated_codes = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
print("Suggested Codes:", generated_codes)

Suggested Codes: Patient is a 45-year-old male diagnosed with allergic rhinitis. Initiated insulin therapy.### J18.9 99214
