# Lets Start NLP Module

In [18]:
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import DistilBertModel, DistilBertTokenizer
from sklearn import metrics
from tqdm import tqdm
import numpy as np
import re

# print(data.columns)
# Extract 'problems' and 'findings' columns
df= pd.read_csv("indiana_reports.csv")[["Problems","findings"]]
#print df train element by element
print(df.iloc[0])




Problems                                               normal
findings    The cardiac silhouette and mediastinum size ar...
Name: 0, dtype: object


In [19]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

print(device)
#Also print gpu name
if device == 'cuda':
    print(cuda.get_device_name(0))

cuda
NVIDIA GeForce RTX 3060 Laptop GPU


In [20]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_numbers(text):
    text = ''.join([i for i in text if not i.isdigit()])         
    return text

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_username(text):
    url = re.compile(r'@[A-Za-z0-9_]+')
    return url.sub(r'',text)

def pre_process_text(text):
    text = remove_URL(text)
    text = remove_numbers(text)
    text = remove_html(text)
    text = remove_username(text)
    return " ".join(text.split())

In [21]:
df.dropna(subset=['findings'], inplace=True)  # Remove rows with NaN in 'findings'
df = df[df['findings'].str.strip().astype(bool)]  # Remove rows with empty or whitespace 'findings'

df['Problems'] = df['Problems'].str.replace(';', ' ')


In [22]:
print(df['Problems'].value_counts())
print(df['findings'].value_counts())

Problems
normal                                                      1197
No Indexing                                                   87
Lung                                                          79
Calcified Granuloma                                           72
Thoracic Vertebrae                                            59
                                                            ... 
Pulmonary Atelectasis Foreign Bodies Density                   1
Calcified Granuloma Lung Markings                              1
Lung Opacity Markings                                          1
Hernia, Diaphragmatic Bone Diseases, Metabolic Deformity       1
Opacity Granuloma                                              1
Name: count, Length: 1244, dtype: int64
findings
The heart and lungs have XXXX XXXX in the interval. Both lungs are clear and expanded. Heart and mediastinum normal.                                                                                                               

In [23]:
from transformers import T5Tokenizer, AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import GPT2LMHeadModel, GPT2Tokenizer


# Splitting source and target into train and eval sets (80-20 split)

# Assuming df is your DataFrame
source = df['Problems'].tolist()
target = df['findings'].tolist()

source_train, source_eval, target_train, target_eval = train_test_split(
    source, target, test_size=0.2, random_state=42
)

# Convert the data into strings
source_train_str = '\n'.join(source_train)
target_train_str = '\n'.join(target_train)
source_eval_str = '\n'.join(source_eval)
target_eval_str = '\n'.join(target_eval)

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


# Tokenize the source and target texts
tokenized_source = tokenizer(source_train_str,truncation=True, return_tensors="pt")
tokenized_target = tokenizer(target_train_str, truncation=True, return_tensors="pt")

eval_source = tokenizer(source_eval_str, truncation=True, return_tensors="pt")
eval_target = tokenizer(target_eval_str,truncation=True, return_tensors="pt")

print('Tokenized source has shape', tokenized_source['input_ids'].shape)
print('Tokenized target has shape', tokenized_target['input_ids'].shape)

print('Eval source has shape', eval_source['input_ids'].shape)
print('Eval target has shape', eval_target['input_ids'].shape)


Tokenized source has shape torch.Size([1, 1024])
Tokenized target has shape torch.Size([1, 1024])
Eval source has shape torch.Size([1, 1024])
Eval target has shape torch.Size([1, 1024])


In [24]:
print(tokenized_source)
print(tokenized_target)

{'input_ids': tensor([[11265,   198, 11265,  ...,   198, 18257,  4355]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}
{'input_ids': tensor([[  464, 21726,   290,  ...,   914, 15880,    11]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}


In [25]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, source_input_ids, source_attention_mask, target_input_ids, target_attention_mask):
        self.source_input_ids = source_input_ids
        self.source_attention_mask = source_attention_mask
        self.target_input_ids = target_input_ids
        self.target_attention_mask = target_attention_mask

    def __len__(self):
        return len(self.source_input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.source_input_ids[idx],
            'attention_mask': self.source_attention_mask[idx],
            'labels': self.target_input_ids[idx],
            'labels_attention_mask': self.target_attention_mask[idx]
        }

# Assuming you have the tokenized tensors: tokenized_source and tokenized_target
train_dataset = CustomDataset(
    tokenized_source['input_ids'],
    tokenized_source['attention_mask'],
    tokenized_target['input_ids'],
    tokenized_target['attention_mask']
)
eval_dataset = CustomDataset(
    eval_source['input_ids'],
    eval_source['attention_mask'],
    eval_target['input_ids'],
    eval_target['attention_mask']
)


In [26]:
print(train_dataset)
print(eval_dataset)




<__main__.CustomDataset object at 0x000001E5AD775150>
<__main__.CustomDataset object at 0x000001E5AD955990>


# Finetuning

In [27]:
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import Dataset
from transformers import AutoModelForSeq2SeqLM

# Initialize the BioClinicalBERT model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=100,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="epoch",
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Use the created dataset here
    eval_dataset=eval_dataset
)

# Train the model
trainer.train()

# Save the trained model to a directory
model.save_pretrained("/FYP_DATASET/results/")


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 14.162193298339844, 'eval_runtime': 0.1884, 'eval_samples_per_second': 5.308, 'eval_steps_per_second': 5.308, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 14.154274940490723, 'eval_runtime': 0.2043, 'eval_samples_per_second': 4.894, 'eval_steps_per_second': 4.894, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 14.138603210449219, 'eval_runtime': 0.237, 'eval_samples_per_second': 4.219, 'eval_steps_per_second': 4.219, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 14.115185737609863, 'eval_runtime': 0.2006, 'eval_samples_per_second': 4.986, 'eval_steps_per_second': 4.986, 'epoch': 4.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 14.084543228149414, 'eval_runtime': 0.2083, 'eval_samples_per_second': 4.8, 'eval_steps_per_second': 4.8, 'epoch': 5.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 14.046225547790527, 'eval_runtime': 0.214, 'eval_samples_per_second': 4.673, 'eval_steps_per_second': 4.673, 'epoch': 6.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 14.000151634216309, 'eval_runtime': 0.2053, 'eval_samples_per_second': 4.87, 'eval_steps_per_second': 4.87, 'epoch': 7.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 13.946674346923828, 'eval_runtime': 0.2075, 'eval_samples_per_second': 4.82, 'eval_steps_per_second': 4.82, 'epoch': 8.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 13.886106491088867, 'eval_runtime': 0.2116, 'eval_samples_per_second': 4.726, 'eval_steps_per_second': 4.726, 'epoch': 9.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 13.818580627441406, 'eval_runtime': 0.2139, 'eval_samples_per_second': 4.676, 'eval_steps_per_second': 4.676, 'epoch': 10.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 13.744010925292969, 'eval_runtime': 0.2099, 'eval_samples_per_second': 4.765, 'eval_steps_per_second': 4.765, 'epoch': 11.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 13.662920951843262, 'eval_runtime': 0.2143, 'eval_samples_per_second': 4.666, 'eval_steps_per_second': 4.666, 'epoch': 12.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 13.575539588928223, 'eval_runtime': 0.2105, 'eval_samples_per_second': 4.75, 'eval_steps_per_second': 4.75, 'epoch': 13.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 13.480911254882812, 'eval_runtime': 0.2099, 'eval_samples_per_second': 4.765, 'eval_steps_per_second': 4.765, 'epoch': 14.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 13.37917423248291, 'eval_runtime': 0.2144, 'eval_samples_per_second': 4.664, 'eval_steps_per_second': 4.664, 'epoch': 15.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 13.27104663848877, 'eval_runtime': 0.211, 'eval_samples_per_second': 4.739, 'eval_steps_per_second': 4.739, 'epoch': 16.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 13.157652854919434, 'eval_runtime': 0.2119, 'eval_samples_per_second': 4.72, 'eval_steps_per_second': 4.72, 'epoch': 17.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 13.038749694824219, 'eval_runtime': 0.2134, 'eval_samples_per_second': 4.685, 'eval_steps_per_second': 4.685, 'epoch': 18.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 12.914766311645508, 'eval_runtime': 0.2095, 'eval_samples_per_second': 4.774, 'eval_steps_per_second': 4.774, 'epoch': 19.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 12.78376293182373, 'eval_runtime': 0.2143, 'eval_samples_per_second': 4.666, 'eval_steps_per_second': 4.666, 'epoch': 20.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 12.6463623046875, 'eval_runtime': 0.2116, 'eval_samples_per_second': 4.727, 'eval_steps_per_second': 4.727, 'epoch': 21.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 12.501517295837402, 'eval_runtime': 0.2144, 'eval_samples_per_second': 4.664, 'eval_steps_per_second': 4.664, 'epoch': 22.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 12.346344947814941, 'eval_runtime': 0.2143, 'eval_samples_per_second': 4.666, 'eval_steps_per_second': 4.666, 'epoch': 23.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 12.183370590209961, 'eval_runtime': 0.2115, 'eval_samples_per_second': 4.728, 'eval_steps_per_second': 4.728, 'epoch': 24.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 12.01433277130127, 'eval_runtime': 0.2148, 'eval_samples_per_second': 4.655, 'eval_steps_per_second': 4.655, 'epoch': 25.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 11.843124389648438, 'eval_runtime': 0.2144, 'eval_samples_per_second': 4.663, 'eval_steps_per_second': 4.663, 'epoch': 26.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 11.6693754196167, 'eval_runtime': 0.2136, 'eval_samples_per_second': 4.682, 'eval_steps_per_second': 4.682, 'epoch': 27.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 11.487083435058594, 'eval_runtime': 0.218, 'eval_samples_per_second': 4.586, 'eval_steps_per_second': 4.586, 'epoch': 28.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 11.298809051513672, 'eval_runtime': 0.2159, 'eval_samples_per_second': 4.632, 'eval_steps_per_second': 4.632, 'epoch': 29.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 11.102838516235352, 'eval_runtime': 0.2199, 'eval_samples_per_second': 4.548, 'eval_steps_per_second': 4.548, 'epoch': 30.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 10.90517807006836, 'eval_runtime': 0.2103, 'eval_samples_per_second': 4.755, 'eval_steps_per_second': 4.755, 'epoch': 31.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 10.705077171325684, 'eval_runtime': 0.2144, 'eval_samples_per_second': 4.665, 'eval_steps_per_second': 4.665, 'epoch': 32.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 10.503195762634277, 'eval_runtime': 0.2115, 'eval_samples_per_second': 4.727, 'eval_steps_per_second': 4.727, 'epoch': 33.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 10.30024242401123, 'eval_runtime': 0.2135, 'eval_samples_per_second': 4.684, 'eval_steps_per_second': 4.684, 'epoch': 34.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 10.103870391845703, 'eval_runtime': 0.2133, 'eval_samples_per_second': 4.688, 'eval_steps_per_second': 4.688, 'epoch': 35.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 9.90880298614502, 'eval_runtime': 0.2203, 'eval_samples_per_second': 4.538, 'eval_steps_per_second': 4.538, 'epoch': 36.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 9.719947814941406, 'eval_runtime': 0.2169, 'eval_samples_per_second': 4.61, 'eval_steps_per_second': 4.61, 'epoch': 37.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 9.538312911987305, 'eval_runtime': 0.2155, 'eval_samples_per_second': 4.64, 'eval_steps_per_second': 4.64, 'epoch': 38.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 9.367167472839355, 'eval_runtime': 0.2165, 'eval_samples_per_second': 4.62, 'eval_steps_per_second': 4.62, 'epoch': 39.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 9.207033157348633, 'eval_runtime': 0.216, 'eval_samples_per_second': 4.63, 'eval_steps_per_second': 4.63, 'epoch': 40.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 9.060591697692871, 'eval_runtime': 0.221, 'eval_samples_per_second': 4.525, 'eval_steps_per_second': 4.525, 'epoch': 41.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 8.924861907958984, 'eval_runtime': 0.2148, 'eval_samples_per_second': 4.656, 'eval_steps_per_second': 4.656, 'epoch': 42.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 8.799956321716309, 'eval_runtime': 0.2189, 'eval_samples_per_second': 4.569, 'eval_steps_per_second': 4.569, 'epoch': 43.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 8.68592357635498, 'eval_runtime': 0.2178, 'eval_samples_per_second': 4.591, 'eval_steps_per_second': 4.591, 'epoch': 44.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 8.580901145935059, 'eval_runtime': 0.2164, 'eval_samples_per_second': 4.621, 'eval_steps_per_second': 4.621, 'epoch': 45.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 8.483758926391602, 'eval_runtime': 0.2179, 'eval_samples_per_second': 4.59, 'eval_steps_per_second': 4.59, 'epoch': 46.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 8.394822120666504, 'eval_runtime': 0.2208, 'eval_samples_per_second': 4.528, 'eval_steps_per_second': 4.528, 'epoch': 47.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 8.312089920043945, 'eval_runtime': 0.2167, 'eval_samples_per_second': 4.614, 'eval_steps_per_second': 4.614, 'epoch': 48.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 8.235048294067383, 'eval_runtime': 0.2174, 'eval_samples_per_second': 4.599, 'eval_steps_per_second': 4.599, 'epoch': 49.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 8.163315773010254, 'eval_runtime': 0.2193, 'eval_samples_per_second': 4.56, 'eval_steps_per_second': 4.56, 'epoch': 50.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 8.09622859954834, 'eval_runtime': 0.2145, 'eval_samples_per_second': 4.661, 'eval_steps_per_second': 4.661, 'epoch': 51.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 8.033099174499512, 'eval_runtime': 0.2205, 'eval_samples_per_second': 4.536, 'eval_steps_per_second': 4.536, 'epoch': 52.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.973479270935059, 'eval_runtime': 0.2206, 'eval_samples_per_second': 4.533, 'eval_steps_per_second': 4.533, 'epoch': 53.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.916803359985352, 'eval_runtime': 0.2189, 'eval_samples_per_second': 4.569, 'eval_steps_per_second': 4.569, 'epoch': 54.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.862581729888916, 'eval_runtime': 0.2134, 'eval_samples_per_second': 4.685, 'eval_steps_per_second': 4.685, 'epoch': 55.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.810714244842529, 'eval_runtime': 0.2223, 'eval_samples_per_second': 4.499, 'eval_steps_per_second': 4.499, 'epoch': 56.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.761175632476807, 'eval_runtime': 0.2184, 'eval_samples_per_second': 4.58, 'eval_steps_per_second': 4.58, 'epoch': 57.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.713438987731934, 'eval_runtime': 0.2132, 'eval_samples_per_second': 4.691, 'eval_steps_per_second': 4.691, 'epoch': 58.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.666943550109863, 'eval_runtime': 0.2143, 'eval_samples_per_second': 4.666, 'eval_steps_per_second': 4.666, 'epoch': 59.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.622136116027832, 'eval_runtime': 0.218, 'eval_samples_per_second': 4.588, 'eval_steps_per_second': 4.588, 'epoch': 60.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.578116416931152, 'eval_runtime': 0.2153, 'eval_samples_per_second': 4.645, 'eval_steps_per_second': 4.645, 'epoch': 61.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.535039901733398, 'eval_runtime': 0.2093, 'eval_samples_per_second': 4.778, 'eval_steps_per_second': 4.778, 'epoch': 62.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.493044376373291, 'eval_runtime': 0.2094, 'eval_samples_per_second': 4.777, 'eval_steps_per_second': 4.777, 'epoch': 63.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.452003479003906, 'eval_runtime': 0.2133, 'eval_samples_per_second': 4.687, 'eval_steps_per_second': 4.687, 'epoch': 64.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.412368297576904, 'eval_runtime': 0.2144, 'eval_samples_per_second': 4.665, 'eval_steps_per_second': 4.665, 'epoch': 65.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.373501777648926, 'eval_runtime': 0.2143, 'eval_samples_per_second': 4.665, 'eval_steps_per_second': 4.665, 'epoch': 66.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.335488796234131, 'eval_runtime': 0.2093, 'eval_samples_per_second': 4.777, 'eval_steps_per_second': 4.777, 'epoch': 67.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.297863483428955, 'eval_runtime': 0.2174, 'eval_samples_per_second': 4.601, 'eval_steps_per_second': 4.601, 'epoch': 68.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.260910511016846, 'eval_runtime': 0.2171, 'eval_samples_per_second': 4.607, 'eval_steps_per_second': 4.607, 'epoch': 69.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.224499702453613, 'eval_runtime': 0.2193, 'eval_samples_per_second': 4.56, 'eval_steps_per_second': 4.56, 'epoch': 70.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.188416004180908, 'eval_runtime': 0.2124, 'eval_samples_per_second': 4.708, 'eval_steps_per_second': 4.708, 'epoch': 71.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.1524338722229, 'eval_runtime': 0.2232, 'eval_samples_per_second': 4.481, 'eval_steps_per_second': 4.481, 'epoch': 72.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.117585182189941, 'eval_runtime': 0.2182, 'eval_samples_per_second': 4.583, 'eval_steps_per_second': 4.583, 'epoch': 73.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.083512306213379, 'eval_runtime': 0.2148, 'eval_samples_per_second': 4.655, 'eval_steps_per_second': 4.655, 'epoch': 74.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.050227642059326, 'eval_runtime': 0.2228, 'eval_samples_per_second': 4.488, 'eval_steps_per_second': 4.488, 'epoch': 75.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.017221927642822, 'eval_runtime': 0.2221, 'eval_samples_per_second': 4.503, 'eval_steps_per_second': 4.503, 'epoch': 76.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.984407901763916, 'eval_runtime': 0.2126, 'eval_samples_per_second': 4.704, 'eval_steps_per_second': 4.704, 'epoch': 77.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.952232837677002, 'eval_runtime': 0.2197, 'eval_samples_per_second': 4.552, 'eval_steps_per_second': 4.552, 'epoch': 78.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.921313285827637, 'eval_runtime': 0.2166, 'eval_samples_per_second': 4.616, 'eval_steps_per_second': 4.616, 'epoch': 79.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.891088962554932, 'eval_runtime': 0.2143, 'eval_samples_per_second': 4.666, 'eval_steps_per_second': 4.666, 'epoch': 80.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.860811710357666, 'eval_runtime': 0.2143, 'eval_samples_per_second': 4.666, 'eval_steps_per_second': 4.666, 'epoch': 81.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.830954551696777, 'eval_runtime': 0.2203, 'eval_samples_per_second': 4.539, 'eval_steps_per_second': 4.539, 'epoch': 82.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.80138635635376, 'eval_runtime': 0.2263, 'eval_samples_per_second': 4.419, 'eval_steps_per_second': 4.419, 'epoch': 83.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.772146701812744, 'eval_runtime': 0.2187, 'eval_samples_per_second': 4.572, 'eval_steps_per_second': 4.572, 'epoch': 84.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.742445468902588, 'eval_runtime': 0.2164, 'eval_samples_per_second': 4.621, 'eval_steps_per_second': 4.621, 'epoch': 85.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.712815284729004, 'eval_runtime': 0.2186, 'eval_samples_per_second': 4.574, 'eval_steps_per_second': 4.574, 'epoch': 86.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.683825969696045, 'eval_runtime': 0.2223, 'eval_samples_per_second': 4.498, 'eval_steps_per_second': 4.498, 'epoch': 87.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.655313968658447, 'eval_runtime': 0.219, 'eval_samples_per_second': 4.567, 'eval_steps_per_second': 4.567, 'epoch': 88.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.62772274017334, 'eval_runtime': 0.2229, 'eval_samples_per_second': 4.486, 'eval_steps_per_second': 4.486, 'epoch': 89.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.6012282371521, 'eval_runtime': 0.218, 'eval_samples_per_second': 4.588, 'eval_steps_per_second': 4.588, 'epoch': 90.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.574456214904785, 'eval_runtime': 0.2144, 'eval_samples_per_second': 4.663, 'eval_steps_per_second': 4.663, 'epoch': 91.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.548535346984863, 'eval_runtime': 0.2213, 'eval_samples_per_second': 4.518, 'eval_steps_per_second': 4.518, 'epoch': 92.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.522976875305176, 'eval_runtime': 0.2203, 'eval_samples_per_second': 4.539, 'eval_steps_per_second': 4.539, 'epoch': 93.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.497988224029541, 'eval_runtime': 0.2184, 'eval_samples_per_second': 4.579, 'eval_steps_per_second': 4.579, 'epoch': 94.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.474128723144531, 'eval_runtime': 0.2238, 'eval_samples_per_second': 4.468, 'eval_steps_per_second': 4.468, 'epoch': 95.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.450955390930176, 'eval_runtime': 0.2195, 'eval_samples_per_second': 4.557, 'eval_steps_per_second': 4.557, 'epoch': 96.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.430056571960449, 'eval_runtime': 0.2195, 'eval_samples_per_second': 4.556, 'eval_steps_per_second': 4.556, 'epoch': 97.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.410269260406494, 'eval_runtime': 0.2225, 'eval_samples_per_second': 4.495, 'eval_steps_per_second': 4.495, 'epoch': 98.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.3911051750183105, 'eval_runtime': 0.2251, 'eval_samples_per_second': 4.443, 'eval_steps_per_second': 4.443, 'epoch': 99.0}
{'loss': 8.764, 'learning_rate': 1e-05, 'epoch': 100.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.373225688934326, 'eval_runtime': 0.1854, 'eval_samples_per_second': 5.394, 'eval_steps_per_second': 5.394, 'epoch': 100.0}
{'train_runtime': 70.1488, 'train_samples_per_second': 1.426, 'train_steps_per_second': 1.426, 'train_loss': 8.764032592773438, 'epoch': 100.0}


In [28]:
from transformers import AutoModelForSequenceClassification

loaded_model = AutoModelForSequenceClassification.from_pretrained("/FYP_DATASET/results/")
print(loaded_model)


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at /FYP_DATASET/results/ and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)


In [29]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the trained model and tokenizer for BioClinicalBERT
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")


# Get user input
user_input = input("Enter your input text: ")

model_input = user_input.replace(';', ' ')

# Tokenize the user input
tokenized_input = tokenizer.encode(model_input, return_tensors="pt", max_length=512, truncation=True)

output = model.generate(tokenized_input, max_length=150, num_return_sequences=1, early_stopping=True)

# Decode the output tokens to text
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)


print("Processed Input:", model_input)
# Display the generated text
print("Generated output:", decoded_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Processed Input: Mediastinum Aorta, Thoracic Scoliosis
Generated output: Mediastinum Aorta, Thoracic Scoliosis, and the Role of the Biliary System in the Treatment of Gastrointestinal Disorders. Gastroenterology. 2002;116(3):903-912.

[Crossref]

Klein, J.

Klein, J.

Klein, J.

Klein, J.

Klein, J.

Klein, J.

Klein, J.

Klein, J.

Klein, J.

Klein, J.

Klein, J.

Klein, J.

Klein, J.

K
