## Importing libraries

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset,DataLoader
from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import DataCollatorForTokenClassification, Trainer, TrainingArguments

In [2]:
train_df=pd.read_csv('/kaggle/input/medic-notes/train.csv')

In [3]:
train_df.head(2)

Unnamed: 0,ID,Note,json
0,1,**Clinical Notes**\n\n**Patient Information:**...,"{""patient_info"": {""age"": 41, ""gender"": ""Male""}..."
1,2,Clinical Notes:\n\nPatient: 56-year-old male\n...,"{""patient_info"": {""age"": 56, ""gender"": ""Male""}..."


In [4]:
train_df.Note[1], train_df.json[1]

('Clinical Notes:\n\nPatient: 56-year-old male\nChief Complaint: Allergies\n\nHistory of Present Illness:\nThe patient presents with complaints of allergies. He reports experiencing a runny nose, frequent sneezing, and itchy eyes. The patient also mentions having blurred vision and wheezing. These symptoms have been persistent and bothersome, prompting him to seek medical attention. No known recent changes in environment or new exposures reported.\n\nPast Medical History:\n- Seasonal allergies (longstanding)\n- Hypertension (well-controlled on medication)\n\nMedications:\n- Lisinopril 10mg daily for hypertension\n\nVital Signs:\nTemperature: 36.6Â°C (normal)\nRespiratory Rate: 13 breaths/min (normal)\nGlucose Level: 99.0 mg/dL (normal)\n\nPhysical Examination:\nGeneral: Alert and oriented, in no acute distress\nHEENT: Conjunctival injection noted, nasal mucosa appears erythematous and edematous with clear discharge\nLungs: Wheezing heard bilaterally, no crackles or rhonchi\nCardiovascu

In [5]:
train_df.isnull().sum()

ID      0
Note    0
json    0
dtype: int64

In [6]:
train_df.duplicated().sum()

0

In [7]:
train_df,val_df=train_test_split(train_df, test_size=0.1, random_state=42)

In [8]:
len(train_df),len(val_df)

(9739, 1083)

## Text Pre-processing

In [9]:
# conver to dict
train_data = train_df.to_dict(orient='records')
val_data = val_df.to_dict(orient='records')

In [10]:
train_data[0]

{'ID': 4609,
 'Note': '**Clinical Notes:**\n\n**Patient Information:**\n- Age: 28\n- Gender: Female\n\n**Visit Motivation:**\n- Patient presents with concerns related to potential COVID-19 infection.\n\n**Symptoms:**\n- Fever (Temperature: 39.7Â°C)\n- Cough\n- Fatigue\n- Sore throat\n- Difficulty breathing\n- Joint pain\n- Dizziness\n- Loss of taste and smell\n- Increased thirst\n- Night sweats\n- Swollen lymph nodes\n\n**Vital Signs:**\n- Temperature: 39.7Â°C (elevated)\n- Heart Rate: 89 bpm (within normal range)\n- Respiratory Rate: 17 breaths/min (within normal range)\n- Oxygen Saturation: 93.4% (slightly low, indicating potential respiratory compromise)\n- Glucose Level: 83.4 mg/dL (within normal range)\n\n**Assessment:**\nThe patient exhibits multiple symptoms consistent with a viral infection, particularly COVID-19, including elevated fever, respiratory distress (difficulty breathing and low oxygen saturation), and loss of taste and smell. The presence of swollen lymph nodes and 

In [11]:
# train_data[:2]

In [12]:
train_ds = []
for data in train_data:
    temp_dict = {}
    temp_dict['ID'] = data['ID']
    temp_dict['context'] = data['Note']  # Clinical notes as context
    temp_dict['answers'] = data['json']  # Expected structured JSON output
    temp_dict['question'] = ("Analyze the clinical notes provided in the context and return the answer as a strutured JSON object. You create a JSON object by enclosing key-value pairs within curly braces {}")  

    train_ds.append(temp_dict)


In [13]:
val_ds = []
for data in val_data:
    temp_dict = {}
    temp_dict['ID'] = data['ID']
    temp_dict['context'] = data['Note']  # Clinical notes as context
    temp_dict['answers'] = data['json']  # Expected structured JSON output
    temp_dict['question'] = ("Analyze the clinical notes provided in the context and return the answer  as a strutured JSON object. You create a JSON object by enclosing key-value pairs within curly braces {}")
  
    val_ds.append(temp_dict)

In [14]:
# train_ds[0]

### Tokenization

In [15]:
# tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
tokenizer = T5Tokenizer.from_pretrained('t5-small')

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [16]:
def preprocess(data):
    contexts = []
    questions = []
    answers = []

    for entry in data:
        context = entry['context']
        question = entry['question']
        answer = entry['answers']
        contexts.append(context)
        questions.append(question)
        answers.append(answer)

    return contexts, questions, answers

In [17]:
traincontext, trainquestions, trainanswers = preprocess(train_ds)
validcontext, validquestions, validanswers = preprocess(val_ds)

In [18]:
# trainanswers[200:202],traincontext[200:202]

In [19]:
class QADataset(Dataset):
    def __init__(self, trainquestions, traincontext, trainanswers, tokenizer, max_length=512, target_tokens=512):
        self.question = trainquestions
        self.context = traincontext
        self.answer = trainanswers
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.target_tokens = target_tokens

    def __len__(self):
        return len(self.question)

    def __getitem__(self, idx):
        question = self.question[idx]
        context = self.context[idx]
        answer = self.answer[idx]
        input_text = f"question: {question} context: {context}"
        target_text = answer
        inputs = self.tokenizer(input_text, max_length=self.max_length, truncation=True, padding='max_length', return_tensors="pt")
        targets = self.tokenizer(target_text, max_length=self.target_tokens, truncation=True, padding='max_length', return_tensors="pt")

        # Flatten the tensors to remove extra dimensions
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        target_ids = targets['input_ids'].squeeze()
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': target_ids
        }


In [20]:
traindataset = QADataset(trainquestions,traincontext,trainanswers, tokenizer)
valdataset = QADataset(validquestions,validcontext,validanswers, tokenizer)

In [21]:
# traindataset[0]

## ModelTraining

In [22]:
model = T5ForConditionalGeneration.from_pretrained('t5-small')
# model = BertLMHeadModel.from_pretrained("bert-base-uncased")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [24]:
len(trainanswers)

9739

In [25]:
class CustomTrainer(Trainer):
    def get_train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.per_device_train_batch_size,
            shuffle=True, 
            num_workers=self.args.dataloader_num_workers,
            collate_fn=self.data_collator
        )

In [26]:
training_args = TrainingArguments(
    output_dir='./model',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=70,
    evaluation_strategy="steps",
    eval_steps=70,
    save_steps=70,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    save_total_limit=2,
    learning_rate=5e-5,
    lr_scheduler_type='cosine',
    gradient_accumulation_steps=5,  
    fp16=False, 
    report_to=["none"],
    max_grad_norm=1.0,
    seed=42
    
)



In [27]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=traindataset,
    eval_dataset=valdataset
)

In [28]:
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss,Validation Loss
70,1.9746,0.40222
140,0.3931,0.131392
210,0.1928,0.074631
280,0.1288,0.047031
350,0.093,0.034862
420,0.0725,0.028824
490,0.0594,0.025079
560,0.0528,0.0226
630,0.0461,0.021375
700,0.0423,0.020255


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(devic

TrainOutput(global_step=1215, training_loss=0.191404133274722, metrics={'train_runtime': 3524.9163, 'train_samples_per_second': 13.815, 'train_steps_per_second': 0.345, 'total_flos': 6574904715509760.0, 'train_loss': 0.191404133274722, 'epoch': 4.987684729064039})

In [29]:
eval_results = trainer.evaluate()
print(eval_results)

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


{'eval_loss': 0.01846572384238243, 'eval_runtime': 29.3306, 'eval_samples_per_second': 36.924, 'eval_steps_per_second': 3.716, 'epoch': 4.987684729064039}


In [30]:
model.save_pretrained('./model')
tokenizer.save_pretrained('./model')

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/spiece.model',
 './model/added_tokens.json')

In [31]:
# load model
loadmodel = T5ForConditionalGeneration.from_pretrained('./model')
loadmodel.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [32]:
tokenizer = T5Tokenizer.from_pretrained('./model')

In [33]:
max_tokens = 512
targettokens = 512

In [34]:
def extractor(question, context, tokenizer, max_tokens):
    input_text = f"question: {question} medical context: {context}"
    inputs = tokenizer(input_text, max_length=max_tokens, truncation=True, padding='max_length', return_tensors="pt")
    input_ids = inputs['input_ids'].squeeze()
    attention_mask = inputs['attention_mask'].squeeze()
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
    }

In [35]:
result = []
samplesize = 100
loadmodel.eval()
with torch.no_grad():
    bar = tqdm(zip(validquestions[:samplesize], validcontext[:samplesize]), total=samplesize)
    for question, context in bar:
        # Extract the tokenized input and attention mask
        batch = extractor(question, context, tokenizer, max_tokens)
        input_ids = batch['input_ids'].to(device)  
        attention_mask = batch['attention_mask'].to(device)  
        input_ids = input_ids.view(1, -1)
        attention_mask = attention_mask.view(1, -1)
        outputs = loadmodel.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=targettokens)
        decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if decoded_output == '':
            decoded_output = 'na'

        result.append(decoded_output)

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100/100 [03:28<00:00,  2.09s/it]


In [36]:
a = validanswers[:samplesize]
b = result

In [37]:
result[:2]

['"patient_info": "age": 45, "gender": "Female", "visit_motivation": "Hypertension (High Blood Pressure)", "symptoms": ["headache", "difficulty_breathing", "chest_pain", "dizziness", "sneezing", "blurred_vision", "wheezing", "pale_skin"], "vital_signs": "blood_pressure": "systolic": "value": 136, "unit": "mmHg", "diastolic": "value": 115, "unit": "mmHg", "diastolic": "value": 88, "unit": "bpm", "respiratory_rate": "value": 16, "unit": "breaths/min", "cholesterol_level": "value": 187.5, "unit": "mg/dL"',
 '"patient_info": "age": 18, "gender": "Female", "visit_motivation": "Common Cold", "symptoms": ["cough", "fatigue", "runny_nose", "rash", "sneezing", "blurred_vision", "wheezing", "swollen_lymph_nodes", "anxiety"], "vital_signs": "respiratory_rate": "value": 19, "unit": "breaths/min", "oxygen_saturation": "value": 98.0, "unit": "%"']

In [38]:
result[5]

'"patient_info": "age": 33, "gender": "Male", "visit_motivation": "Allergies", "symptoms": ["cough", "sore_throat", "nausea", "runny_nose", "rash", "sneezing", "itchy_eyes", "loss_of_taste_smell", "weight_loss"], "vital_signs": "blood_pressure": "systolic": "value": 104, "unit": "mmHg", "diastolic": "value": 71, "unit": "mmHg", "temperature": "value": 37.3, "unit": "u00b0C", "respiratory_rate": "value": 16, "unit": "breaths/min"'

In [39]:
# validcontext[5]

## inference

In [40]:
testpred = pd.read_csv('/kaggle/input/medic-notes/test.csv')

In [41]:
testpred1=testpred.Note.tolist()

In [42]:
# testpred1[:2]

In [43]:
def extractor1(context, tokenizer, max_tokens=512):
    # Format the input for T5
    input_text = f'question: "Analyze the clinical notes provided in the context and return the answer as a structured JSON object. You create a JSON object by enclosing key-value pairs within curly braces {{}} context: {context}'
    inputs = tokenizer(input_text, max_length=max_tokens, truncation=True, padding='max_length', return_tensors="pt")
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
    }


In [44]:
result = []
loadmodel.eval()
with torch.no_grad():
    bar = tqdm(testpred1)
    for item in bar:
        batch = extractor1(item,tokenizer)
        input_ids = batch['input_ids'].to(device)
        input_ids = input_ids.view(1,-1)
        attention_mask = batch['attention_mask'].to(device)
        attention_mask = attention_mask.view(1,-1)

        outputs = loadmodel.generate(input_ids=input_ids, attention_mask=attention_mask,
                                 max_length=targettokens)
        decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if decoded_output == '':
            decoded_output = 'na'

        result.append(decoded_output)

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3796/3796 [2:22:09<00:00,  2.25s/it]  


In [45]:
results_df = pd.DataFrame({
    'ID': testpred['ID'],
    'json': result
})

print(results_df.tail(5))

file_path = 'test_submission1.csv'
results_df.to_csv(file_path, index=False)

         ID                                               json
3791  14614  "patient_info": "age": 35, "gender": "Female",...
3792  14615  "patient_info": "age": 59, "gender": "Female",...
3793  14616  "patient_info": "age": 67, "gender": "Female",...
3794  14617  "patient_info": "age": 55, "gender": "Female",...
3795  14618  "patient_info": "age": 21, "gender": "Female",...
