In [25]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset

# Load data set
df = pd.read_csv('trainset.csv')

# combine question and answer into a single column field
df['text'] = df.apply(lambda x: f"Q: {x['question']} A: {x['answer']}", axis=1)

# get training and test data sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert datasets to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df[['text']])
test_dataset = Dataset.from_pandas(test_df[['text']])

# Load GPT-Neo model (GPT-2's input token limit was too small for dataset)
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-125M')  # Choose the model size here
tokenizer.pad_token = tokenizer.eos_token
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set format for Trainer
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["text"])
tokenized_train_dataset.set_format("torch")
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["text"])
tokenized_test_dataset.set_format("torch")

# Define a function to create labels
def create_labels(batch):
    batch['labels'] = batch['input_ids']  # Use the input_ids as labels
    return batch

# Apply the create_labels function
tokenized_train_dataset = tokenized_train_dataset.map(create_labels)
tokenized_test_dataset = tokenized_test_dataset.map(create_labels)

# Define Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,  # Add the eval_dataset
)

# Start training
trainer.train()

# Save the model
trainer.save_model("finetuned_gpt3")
tokenizer.save_pretrained("finetuned_gpt3")


Map:   0%|          | 0/3107 [00:00<?, ? examples/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

Map:   0%|          | 0/3107 [00:00<?, ? examples/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.0117,0.010568
2,0.0095,0.009607
3,0.0087,0.008532


('finetuned_gpt3\\tokenizer_config.json',
 'finetuned_gpt3\\special_tokens_map.json',
 'finetuned_gpt3\\vocab.json',
 'finetuned_gpt3\\merges.txt',
 'finetuned_gpt3\\added_tokens.json')

In [211]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset

tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-125M')  # Choose the model size here
model = GPTNeoForCausalLM.from_pretrained('finetuned_gpt3')

#Set the model to evaluation mode
model.eval()

# Define a function to generate responses
def generate_response(question, max_length=1400):
    # Prepare the input
    input_text = f"Q: {question} A:"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Generate response
    with torch.no_grad():
        output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, do_sample=True)

    # Decode the output
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the answer from the response
    answer = response.split("A:")[-1].strip()  # Get the text after "A:"
    return answer

# Test the model with a sample question
question = """Transcript of Patient and GP Consultation

GP: Good morning! What brings you in today?

Patient: Hi, Doctor. I've been feeling really unwell. I'm experiencing extreme thirst and I need to urinate frequently, especially at night. I’ve also unintentionally lost some weight, and I’m feeling hungrier than usual. My vision has been a bit blurred, and I’m experiencing some numbness and tingling in my hands and feet. Plus, I'm constantly fatigued.

GP: I see. That sounds concerning. Let's go through some questions to gather more information, starting with your Medicare Card Number.

Patient: My Medicare Card Number is 1234 56789 1234.

GP: Thank you. And your last name?

Patient: My last name is Smith.

GP: Great. What are your given name/s?

Patient: My given name is James.

GP: What is your sex?

Patient: I'm male.

GP: What is your date of birth?

Patient: I was born on January 15, 1985.

GP: Thank you, John. What is your address?

Patient: I live at 42 Oak Street, Springfield.

GP: Could you please provide your contact numbers? What is your home telephone number?

Patient: My home phone number is (02) 9876 5432.

GP: And your business telephone number?

Patient: My work number is (02) 1234 5678.

GP: Do you have a mobile number?

Patient: Yes, it's 0400 123 456.

GP: Are you fasting?

Patient: No, I am not fasting.

GP: Are you non-fasting?

Patient: Yes, that’s correct.

GP: Are you pregnant?

Patient: No, I’m not pregnant.

GP: Are you receiving any hormone therapy?

Patient: No hormone therapy at this time.

GP: When was your last normal menstrual period (LNMP)?

Patient: I haven't had a period in three years since I had a vasectomy.

GP: Understood. What is your expected date of confinement (EDC)?

Patient: I wouldn’t have one since I'm not pregnant.

GP: Have you had any post-natal complications?

Patient: No, I haven’t been pregnant.

GP: Are you post-menopausal?

Patient: Like I said, I’ve not had period for three years; I would assume so.

GP: Have you received any radiotherapy?

Patient: No, I haven’t.

GP: Do you have an intrauterine contraceptive device (IUCD)?

Patient: No IUCD, just a vasectomy.

GP: Have you experienced any abnormal bleeding?

Patient: No, none at all.

GP: Thank you for that information. I’ll make a note of the following clinical summary. You have symptoms suggestive of possible diabetes or another metabolic issue, considering your extreme thirst, frequent urination, fatigue, and weight loss.

Patient: That sounds like what I've been experiencing.

GP: I'll also need to ask a couple more questions. What is your healthcare card number?

Patient: I don’t have a healthcare card.

GP: Do you need a transfusion?

Patient: Yes, the nurse mentioned I might need one today.

GP: Understood. By what time should the bloodwork be phone/faxed to the lab?

Patient: I believe they said by 2 PM.

GP: And what is the phone/fax number?

Patient: The number is (02) 5555 1234.

GP: Thank you, John. Now, let me ask a couple of follow-up questions. In the past three months, have you been pregnant?

Patient: No, I haven't.

GP: And have you been transfused in the past three months?

Patient: No, I haven’t had a transfusion recently.

GP: All right, let me summarize. Based on your symptoms and the need for a transfusion, I will order some blood tests for you.

---

GP's Notes After Consultation:

1. Date the transfusion is required: October 25, 2023
2. Time the transfusion is required: 2 PM
3. Reason for transfusion/operation: Possible anemia and metabolic concerns.
4. Who should also receive a copy of the report results: The referring doctor, Dr. Jane Doe.
5. Are the bloodwork results to be collected by the patient: Yes, the patient will collect.
6. What is the doctor number of the requesting doctor: 987654.
7. What is the surname of the requesting doctor: Doe.
8. What are the initials of the requesting doctor: J.D.
9. What is the address of the requesting doctor: 50 Main Street, Springfield.
10. Which Hospital/ward is the patient located: Springfield General Hospital, Ward A.
11. Does the patient require a copy of this request form? Yes.
12. Is the private patient in a private hospital or approved day hospital facility? Public patient.
13. Is the private patient in a recognised hospital? Yes.
14. Is the public patient in a recognised hospital? Yes.
15. Is the outpatient of a recognised hospital? Yes.
16. Is the bloodwork request urgent? Yes.
17. Should the bloodwork be phone faxed once complete? Yes, by 2 PM.
18. Is the billing method private? No.
19. Is the billing method concession? No.
20. Is the billing method bulk bill? Yes.
21. Is the billing method veteran affairs? No.
Provide a summary of the Clinical Notes about the patient in around 50 words or less"""

response = generate_response(question)
#print(f"Question: {question}")
print(f"Response: {response}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response: Name: Emily Jane Doe; Date: 15th March 1985.
22. Are there any expected post-natal complications? Yes.
23. What is your estimated due date?


In [178]:
Patient's Medicare Card Number:| 1234 56789 1234 | 1234 5678 987654
Patient's last name: | Smith | Smith
Patient's last name: | James Smith | James
Patient's sex: | Male | Male
Patient's Date of Birth: | January 15, 1985 | January 15, 1985
Reference: | Jane Doe, Patient: Doe.| N/A
What is the Reference: | A reference number: A readable, non-public number. | N/A
What is the Patient's Address: | (02) 68 Elm Street, Springfield. | 42 Oak Street, Springfield.
What is the patient's Tel(Home): | 04 9876 5432. | (02)9876 5432
What is the patient's Tel(Bus): | (02) 5555 1234. | (02) 1234 5678
What is the patients Mobile: | 0400 123 456 | 0400 123 456
Is the Patient Fasting: | Yes | No
Is the Patient Non-Fasting: | No | Yes
Is the Patient Pregnant?: Yes, the patient is pregnant | No 
Does the petient receive Hormone Therapy: | No | No
LNMP A, LNMP A, LNMP A.
A.I.A.A.A.A.A.A...
Yes.
Yes.

SyntaxError: unterminated string literal (detected at line 1) (1828173133.py, line 1)