## Setup

In [None]:
# make sure to use the latest version of the openai python package
!pip install --upgrade openai

Collecting openai
  Downloading openai-1.28.1-py3-none-any.whl (320 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/320.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m163.8/320.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.1/320.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-

# Connect Database

In [None]:
from google.colab import files, drive

drive.mount('/content/drive')

Mounted at /content/drive


# Open AI Connection

In [None]:
import json
import openai
import os
import pandas as pd
from pprint import pprint

client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

# Data Cleaning

In [None]:
# Read in the dataset we'll use for this task.
disease_df = pd.read_csv("Symptoms_Dataset.csv")

disease_df.head()

Unnamed: 0,Disease,Specialist,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,Dermatologist,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,Dermatologist,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,Dermatologist,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,Dermatologist,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,Dermatologist,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [None]:
def append_non_nan(row):
    values = [str(val) for val in row[2:] if pd.notna(val)]
    return ' '.join(values)

disease_df['Symptom'] = disease_df.apply(append_non_nan, axis=1)
print(disease_df['Symptom'])

0       itching skin_rash nodal_skin_eruptions dischro...
1       skin_rash nodal_skin_eruptions dischromic _pat...
2        itching nodal_skin_eruptions dischromic _patches
3                   itching skin_rash dischromic _patches
4                  itching skin_rash nodal_skin_eruptions
                              ...                        
4915    vomiting headache nausea spinning_movements lo...
4916     skin_rash pus_filled_pimples blackheads scurring
4917    burning_micturition bladder_discomfort foul_sm...
4918    skin_rash joint_pain skin_peeling silver_like_...
4919    skin_rash high_fever blister red_sore_around_n...
Name: Symptom, Length: 4920, dtype: object


In [None]:
disease_df_randomized = disease_df.sample(frac=1, random_state=52)

disease_df_randomized.head()

Unnamed: 0,Disease,Specialist,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Symptom
3338,Dengue,Internal Medcine,skin_rash,chills,joint_pain,vomiting,fatigue,high_fever,headache,nausea,loss_of_appetite,pain_behind_the_eyes,back_pain,malaise,muscle_pain,red_spots_over_body,,,,skin_rash chills joint_pain vomiting fatigue h...
2322,Pneumonia,Pulmonologist,chills,fatigue,cough,high_fever,breathlessness,sweating,malaise,phlegm,chest_pain,fast_heart_rate,rusty_sputum,,,,,,,chills fatigue cough high_fever breathlessness...
1336,Hypertension,Cardiologist,headache,chest_pain,dizziness,loss_of_balance,lack_of_concentration,,,,,,,,,,,,,headache chest_pain dizziness loss_of_balance ...
3738,Diabetes,Endocrinologist,fatigue,weight_loss,restlessness,lethargy,irregular_sugar_level,blurred_and_distorted_vision,obesity,excessive_hunger,increased_appetite,polyuria,,,,,,,,fatigue weight_loss restlessness lethargy irre...
3270,Hypothyroidism,Endocrinologist,fatigue,weight_gain,cold_hands_and_feets,mood_swings,lethargy,dizziness,puffy_face_and_eyes,enlarged_thyroid,brittle_nails,swollen_extremeties,depression,irritability,abnormal_menstruation,,,,,fatigue weight_gain cold_hands_and_feets mood_...


In [None]:
training_data = []

system_message = "You are a helpful doctor. You are trying to diagnose disease based on symptoms."

def create_user_message(row):
    return f"""\n\nSymptom: {row['Symptom']}\n\nDisease & Specialization:"""

def prepare_example_conversation(row):
    messages = []
    messages.append({"role": "system", "content": system_message})

    user_message = create_user_message(row)
    messages.append({"role": "user", "content": user_message})

    messages.append({"role": "assistant", "content": "You possibly have " + row["Disease"] + " and should see a " + row['Specialist'] + " specialist"})

    return {"messages": messages}

pprint(prepare_example_conversation(disease_df_randomized.iloc[0]))

{'messages': [{'content': 'You are a helpful doctor. You are trying to '
                          'diagnose disease based on symptoms.',
               'role': 'system'},
              {'content': '\n'
                          '\n'
                          'Symptom: skin_rash chills joint_pain vomiting '
                          'fatigue high_fever headache nausea loss_of_appetite '
                          'pain_behind_the_eyes back_pain malaise muscle_pain '
                          'red_spots_over_body\n'
                          '\n'
                          'Disease & Specialization:',
               'role': 'user'},
              {'content': 'You possibly have Dengue and should see a Internal '
                          'Medcine specialist',
               'role': 'assistant'}]}


# Data Training & Validation

In [None]:
# use the first 100 rows of the dataset for training
training_df = disease_df_randomized.iloc[0:3444]

# apply the prepare_example_conversation function to each row of the training_df
training_data = training_df.apply(prepare_example_conversation, axis=1).tolist()

for example in training_data[:5]:
    print(example)

{'messages': [{'role': 'system', 'content': 'You are a helpful doctor. You are trying to diagnose disease based on symptoms.'}, {'role': 'user', 'content': '\n\nSymptom: skin_rash chills joint_pain vomiting fatigue high_fever headache nausea loss_of_appetite pain_behind_the_eyes back_pain malaise muscle_pain red_spots_over_body\n\nDisease & Specialization:'}, {'role': 'assistant', 'content': 'You possibly have Dengue and should see a Internal Medcine specialist'}]}
{'messages': [{'role': 'system', 'content': 'You are a helpful doctor. You are trying to diagnose disease based on symptoms.'}, {'role': 'user', 'content': '\n\nSymptom: chills fatigue cough high_fever breathlessness sweating malaise phlegm chest_pain fast_heart_rate rusty_sputum\n\nDisease & Specialization:'}, {'role': 'assistant', 'content': 'You possibly have Pneumonia and should see a Pulmonologist specialist'}]}
{'messages': [{'role': 'system', 'content': 'You are a helpful doctor. You are trying to diagnose disease bas

In [None]:
validation_df = disease_df_randomized.iloc[3445:4920]
validation_data = validation_df.apply(prepare_example_conversation, axis=1).tolist()

In [None]:
def write_jsonl(data_list: list, filename: str) -> None:
    with open(filename, "w") as out:
        for ddict in data_list:
            jout = json.dumps(ddict) + "\n"
            out.write(jout)

In [None]:
training_file_name = "tmp_disease_finetune_training.jsonl"
write_jsonl(training_data, training_file_name)

validation_file_name = "tmp_disease_finetune_validation.jsonl"
write_jsonl(validation_data, validation_file_name)

In [None]:
# print the first 5 lines of the training file
!head -n 5 tmp_disease_finetune_training.jsonl

{"messages": [{"role": "system", "content": "You are a helpful doctor. You are trying to diagnose disease based on symptoms."}, {"role": "user", "content": "\n\nSymptom: skin_rash chills joint_pain vomiting fatigue high_fever headache nausea loss_of_appetite pain_behind_the_eyes back_pain malaise muscle_pain red_spots_over_body\n\nDisease & Specialization:"}, {"role": "assistant", "content": "You possibly have Dengue and should see a Internal Medcine specialist"}]}
{"messages": [{"role": "system", "content": "You are a helpful doctor. You are trying to diagnose disease based on symptoms."}, {"role": "user", "content": "\n\nSymptom: chills fatigue cough high_fever breathlessness sweating malaise phlegm chest_pain fast_heart_rate rusty_sputum\n\nDisease & Specialization:"}, {"role": "assistant", "content": "You possibly have Pneumonia and should see a Pulmonologist specialist"}]}
{"messages": [{"role": "system", "content": "You are a helpful doctor. You are trying to diagnose disease bas

In [None]:
!head -n 5 tmp_disease_finetune_validation.jsonl

{"messages": [{"role": "system", "content": "You are a helpful doctor. You are trying to diagnose disease based on symptoms."}, {"role": "user", "content": "\n\nSymptom: vomiting yellowish_skin abdominal_pain swelling_of_stomach distention_of_abdomen history_of_alcohol_consumption fluid_overload\n\nDisease & Specialization:"}, {"role": "assistant", "content": "You possibly have Alcoholic hepatitis and should see a Hepatologist specialist"}]}
{"messages": [{"role": "system", "content": "You are a helpful doctor. You are trying to diagnose disease based on symptoms."}, {"role": "user", "content": "\n\nSymptom: stomach_pain acidity ulcers_on_tongue cough chest_pain\n\nDisease & Specialization:"}, {"role": "assistant", "content": "You possibly have GERD and should see a Gastroenterologist specialist"}]}
{"messages": [{"role": "system", "content": "You are a helpful doctor. You are trying to diagnose disease based on symptoms."}, {"role": "user", "content": "\n\nSymptom: fatigue weight_gain

In [None]:
with open(training_file_name, "rb") as training_fd:
    training_response = client.files.create(
        file=training_fd, purpose="fine-tune"
    )

training_file_id = training_response.id

with open(validation_file_name, "rb") as validation_fd:
    validation_response = client.files.create(
        file=validation_fd, purpose="fine-tune"
    )
validation_file_id = validation_response.id

print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)

Training file ID: file-Bd8w35VK5sIwKQwY4eMW4AcT
Validation file ID: file-fhNdpbtnPkgFeY9F69QWEuIf


In [None]:
response = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    validation_file=validation_file_id,
    model="gpt-3.5-turbo",
    suffix="disease",
)

job_id = response.id

print("Job ID:", response.id)
print("Status:", response.status)

Job ID: ftjob-wQ4Af95nyigWjYTQLnG8JubI
Status: validating_files


In [None]:
response = client.fine_tuning.jobs.retrieve(job_id)

print("Job ID:", response.id)
print("Status:", response.status)
print("Trained Tokens:", response.trained_tokens)

Job ID: ftjob-wQ4Af95nyigWjYTQLnG8JubI
Status: succeeded
Trained Tokens: 788832


In [None]:
response = client.fine_tuning.jobs.list_events(job_id)

events = response.data
events.reverse()

for event in events:
    print(event.message)

Created fine-tuning job: ftjob-gFaudh6LganVwF4XbuNbTsrL
Validating training file: file-AdQ5mrHucmfi2mrB83mmT6Gh and validation file: file-DU5yb4H2YnOghvcGWyCbju0J


In [None]:
response = client.fine_tuning.jobs.retrieve(job_id)
fine_tuned_model_id = response.fine_tuned_model

if fine_tuned_model_id is None:
    raise RuntimeError("Fine-tuned model ID not found. Your job has likely not been completed yet.")

print("Fine-tuned model ID:", fine_tuned_model_id)

Fine-tuned model ID: ft:gpt-3.5-turbo-0125:personal:disease:9NUwB8kX


# Model Test

In [None]:
test_df = disease_df.loc[201:300]
test_row = test_df.iloc[0]
test_messages = []
test_messages.append({"role": "system", "content": system_message})
user_message = create_user_message(test_row)
test_messages.append({"role": "user", "content": user_message})

pprint(test_messages)

[{'content': 'You are a helpful doctor. You are trying to diagnose disease '
             'based on symptoms.',
  'role': 'system'},
 {'content': 'Specialist: Hepatologist\n'
             '\n'
             'Symptom: itching fatigue lethargy yellowish_skin dark_urine '
             'loss_of_appetite abdominal_pain yellow_urine yellowing_of_eyes '
             'malaise receiving_blood_transfusion '
             'receiving_unsterile_injections itching fatigue lethargy '
             'yellowish_skin dark_urine loss_of_appetite abdominal_pain '
             'yellow_urine yellowing_of_eyes malaise '
             'receiving_blood_transfusion receiving_unsterile_injections\n'
             '\n'
             'Disease: ',
  'role': 'user'}]


In [None]:
test_row = test_df.iloc[94]
test_messages = []
test_messages.append({"role": "system", "content": system_message})
user_message = create_user_message(test_row)
test_messages.append({"role": "user", "content": user_message})

pprint(test_messages)

[{'content': 'You are a helpful doctor. You are trying to diagnose disease '
             'based on symptoms.',
  'role': 'system'},
 {'content': 'Specialist: Cardiologist\n'
             '\n'
             'Symptom: vomiting breathlessness sweating vomiting '
             'breathlessness sweating\n'
             '\n'
             'Disease: ',
  'role': 'user'}]


In [18]:
#fine_tuned_model_id = "ft:gpt-3.5-turbo-0125:personal:disease:9NUwB8kX"

fine_tuned_model_id = ""

prompt = "What disease? Which specialist should I consult? What should I do before seeing the doctor? The symptoms I am experiencing are I have chills, vomiting, high fever, sweating, headache nausea, and muscle_pain."
test_messages = [
    {
        "role": "user",
        "content": prompt
    }
]
response = client.chat.completions.create(
    model=fine_tuned_model_id, messages=test_messages, temperature=0, max_tokens=500
)
print(response.choices[0].message.content)

You possibly have Malaria. See a Internal Medcine specialist. Before seeing the Internal Medcine specialist, take a Anti-malarial medication and a Mosquito repellent.
