In [27]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments, OPTForQuestionAnswering, AutoTokenizer
from datasets import Dataset
import gc

torch.cuda.empty_cache()
print(torch.cuda.memory_summary())

# load data set
df = pd.read_csv('trainsetfull.csv').loc[:2000] # only importing 2000 examples
# combine question and answer into a single column field
df['text'] = df.apply(lambda x: f"Q: {x['question']} A: {x['answer']}", axis=1)

# get training and test data sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# convert datasets to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df[['text']])
test_dataset = Dataset.from_pandas(test_df[['text']])

# Load GPT-Neo model (GPT-2's input token limit was too small for dataset)
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-125M')  # Choose the model size here
tokenizer.pad_token = tokenizer.eos_token
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M')
print(torch.cuda.is_available())
device = torch.device('cuda:0')
model = model.to(device)
torch.cuda.empty_cache()

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation = True, padding='max_length', max_length=1150)
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set format for Trainer
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["text"])
tokenized_train_dataset.set_format("torch")
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["text"])
tokenized_test_dataset.set_format("torch")

# Define a function to create labels
def create_labels(batch):
    batch['labels'] = batch['input_ids']  # Use the input_ids as labels
    return batch

# Apply the create_labels function
tokenized_train_dataset = tokenized_train_dataset.map(create_labels)
tokenized_test_dataset = tokenized_test_dataset.map(create_labels)

# 1. increase epoch size
# 2. add a padding to 
# 3. Use a pre-finetuned model


|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 4            |        cudaMalloc retries: 5         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   1517 MiB |  21712 MiB | 449846 GiB | 449844 GiB |
|       from large pool |   1516 MiB |  21709 MiB | 449212 GiB | 449210 GiB |
|       from small pool |      1 MiB |      4 MiB |    634 GiB |    634 GiB |
|---------------------------------------------------------------------------|
| Active memory         |   1517 MiB |  21712 MiB | 449846 GiB | 449844 GiB |
|       from large pool |   1516 MiB |  21709 MiB | 449212 GiB | 449210 GiB |
|       from small pool |      1 MiB |      4 MiB |    634 GiB |    634 GiB |
|---------------------------------------------------------------

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/401 [00:00<?, ? examples/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/401 [00:00<?, ? examples/s]

In [3]:
torch.cuda.empty_cache()

In [None]:
# Define Training Arguments
training_args = TrainingArguments(
    output_dir='./result',
    evaluation_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit = 1,
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,  # Add the eval_dataset
)

# Start training
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.8386,0.888623
2,0.031,0.015732
3,0.0116,0.012511
4,0.0089,0.011588
5,0.0103,0.011373
6,0.0077,0.010998
7,0.0074,0.01079
8,0.0072,0.010722
9,0.007,0.010698
10,0.0068,0.010702


TrainOutput(global_step=8000, training_loss=0.11656168040633201, metrics={'train_runtime': 2223.8634, 'train_samples_per_second': 7.195, 'train_steps_per_second': 3.597, 'total_flos': 9387130060800000.0, 'train_loss': 0.11656168040633201, 'epoch': 10.0})

In [30]:
# Save the model
trainer.save_model("finetuned_gptneo")
tokenizer.save_pretrained("finetuned_gptneo")

('finetuned_gptneo/tokenizer_config.json',
 'finetuned_gptneo/special_tokens_map.json',
 'finetuned_gptneo/vocab.json',
 'finetuned_gptneo/merges.txt',
 'finetuned_gptneo/added_tokens.json')

In [57]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset

tokenizer = GPT2Tokenizer.from_pretrained('finetuned_gptneo')  # Choose the model size here
tokenizer.pad_token = 'eos_token_id'
model = GPTNeoForCausalLM.from_pretrained('finetuned_gptneo')

#Set the model to evaluation mode
model.eval()

# Define a function to generate responses
def generate_response(question, max_length=900):
    # Prepare the input
    input_text = f"Q: {question} A:"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
   # attention_mask = input_ids['attention_mask']
    
    # Generate response
    with torch.no_grad():
        output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, do_sample=True)

    # Decode the output
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the answer from the response
    answer = response.split("A:")[-1].strip()  # Get the text after "A:"
    return answer

# testing
question = """Q: GP: Good morning! What brings you in today?

Patient: Hi, I’ve been feeling really tired and fatigued lately.

GP: I see. I’m going to ask you some questions to gather more information. Can I start with your Medicare card number?

Patient: Sure, it’s 1234 5678 9311.

GP: Great. And your last name?

Patient: Smith.

GP: Thank you, Ms. Smith. And your given name?

Patient: It’s Emily.

GP: And your date of birth?

Patient: It’s March 12, 1985.

GP: Emily, thank you. What is your sex?

Patient: Female.

GP: Perfect. Now, could you provide me with your address?

Patient: I live at 45 Oak Street, Springfield.

GP: Thanks. And what’s your home telephone number?

Patient: It’s 02 1234 5678.

GP: Great. Do you have a work phone number?

Patient: Yes, it’s 02 8765 4321.

GP: And your mobile number?

Patient: It’s 0400 123 056.

GP: Are you currently fasting?

Patient: No, I’m not fasting.

GP: Do you have a pension card, healthcare concession card, or a veteran affairs card?

Patient: I have a veteran affairs card.

GP: Could you please provide me with your veteran affairs card number?

Patient: It’s VA123456.

GP: Thank you. You mentioned you don’t have a pension card or a Repat Gold card, correct?

Patient: That’s right.

GP: Thank you for the information. Now, for billing, do you prefer private, concession, or bulk billing?

Patient: I’d prefer bulk billing, please.

GP: Alright, Emily. Based on your symptoms, I’ll recommend some blood tests to check for any underlying issues. Let’s see what we can find out.

GP’s Notes:
Emily Smith, born on March 12, 1985, resides at 45 Oak Street, Springfield. She is a female patient, with a Medicare card number of 1234 5678 9311 and a veteran affairs card number of VA123456. Her contact numbers include a home phone at 02 1234 5678, a work phone at 02 8765 4321, and a mobile number of 0400 123 456. The patient is not fasting and prefers bulk billing for her tests. The results will be collected by the patient. I will fax the bloodwork results to me upon completion, and my doctor number is 987654, with the surname Johnson and initials J.D. The report will be sent to my office at 20 Clinic Road, Springfield. The tests are not marked as urgent.


*What is the patient's address*\" A:"""
response = generate_response(question)
tokenizer.pad_token = tokenizer.eos_token
#print(f"Question: {question}")
print(f"Response: {response}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Response: 45 Oak Street, Springfield
