<a href="https://colab.research.google.com/github/Pauullamm/OpenAI_Pill_Checker/blob/main/OpenAI_Pill_Checker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Description: Python scripts to prepare a text-based AI model fine-tuned on an OpenAI davinci-002 model

Have you ever wanted a pill identifier tool to check the name of a tablet/capsule by its description? This Jupyter notebook outlines the steps to fine-tune an OpenAI model for this purpose, utilising pharmaceutical/manufacturer data from the Electronic Medicines Compendium.

In [None]:
!pip install --upgrade openai -q

Step 1: Retrieve the URLs of all medicines starting with a particular letter from the Electronic Medicines Compendium

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm


# Browse page listing the medicines filed under the letter "B"; used as the
# starting point for both the result count and the paginated link collection.
letter_B_url = 'https://www.medicines.org.uk/emc/browse-medicines/B'

def get_elements_of_letter(url):
  """Gets the total number of drugs under the specific letter.

  Downloads the browse page and parses the results counter, i.e. the element
  with class ``latest-updates-results-header-summary-total``.

  Args:
    url: url of the drugs of a specific letter

  Returns:
    int: the number of results reported by the page

  Raises:
    requests.HTTPError: if the page answers with an error status
    ValueError: if the results counter is missing or cannot be parsed
  """
  # A timeout keeps the notebook from hanging forever on a stalled connection.
  r = requests.get(url, timeout=30)
  r.raise_for_status()
  letter_soup = BeautifulSoup(r.text, 'html.parser')
  total_elements = letter_soup.find(class_='latest-updates-results-header-summary-total')
  if total_elements is None:
    raise ValueError(f"Could not find the results counter on {url}")

  # Drop all whitespace first so "1 234 results found" style spacing cannot
  # split the number, then accept an optional thousands separator and an
  # optional singular/plural "result(s) found" label.
  compact = "".join(total_elements.text.split())
  match = re.fullmatch(r"(\d[\d,]*)(?:results?found)?", compact)
  if match is None:
    raise ValueError(f"Unexpected results counter {total_elements.text!r} on {url}")
  return int(match.group(1).replace(",", ""))

def get_urls(num, link, show_progress=False,):
  """Collects the product links of the tablets and capsules listed under a letter.

  Walks the paginated listing 50 items at a time and keeps the link of every
  product whose title contains "ablet" or "apsule" (matching both the
  capitalised and lower-case spelling of "tablet" / "capsule").

  Args:
      num (int): number of items on the page
      link (str): url of the listing for the letter, without a query string
      show_progress: prints the item being processed to the screen, default value is False

  Returns:
      A set with the links for each item

  Raises:
      requests.HTTPError: if one of the listing pages answers with an error status
  """
  # Imported locally so the function does not depend on another cell's imports.
  from urllib.parse import urljoin

  base_url = 'https://www.medicines.org.uk/'
  dosage_form_markers = ("ablet", "apsule")
  output_urls = set()
  for i in tqdm(range(1, num + 1, 50)):

    # Each request fetches one page of up to 50 results starting at offset i.
    url_to_check = f'{link}?offset={i}&limit=50'
    response = requests.get(url_to_check, timeout=30)
    # Fail loudly instead of silently skipping a whole page of results.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    url_title_links = soup.find_all(class_="search-results-product-info-title-link emc-link")
    for j in url_title_links:
      href = j.get('href')
      # Skip anchors without a target and products that are neither a tablet
      # nor a capsule. A title matching both markers is handled once.
      if not href or not any(marker in j.text for marker in dosage_form_markers):
        continue
      if show_progress:
        print(f"Processing: {j.text}")
      # urljoin copes with hrefs that start with "/" (no "//" in the result)
      # as well as hrefs that are already absolute.
      output_urls.add(urljoin(base_url, href))

  return output_urls


# Count the medicines listed under "B", then collect the tablet/capsule
# product links across all result pages of that listing.
B_urls = get_urls(num=get_elements_of_letter(letter_B_url), link=letter_B_url)


Step 2: Screen through each drug link starting with a particular letter to obtain drug description and manufacturer details

In [None]:
# Gathering data from letter urls

import requests
from bs4 import BeautifulSoup
import pandas as pd

# nesting scraper in a single function for pharmaceutical form

def find_drug_description(url):
  """Scrapes the name, description and company of a single medicine page.

  Missing page elements are tolerated: the corresponding field is returned as
  an empty string instead of raising. The HTTP status is deliberately not
  checked, so an error page simply yields empty fields.

  Args:
    url: url of the medicine's page

  Returns:
    tuple: ``(title, description, company, discontinued)``. The first three
    are strings with newlines removed. ``description`` ends with
    " What is this drug?" so it can be used directly as a training prompt.
    ``discontinued`` is a list containing ``url`` when the product
    information section could not be found, otherwise an empty list.
  """
  discontinued = []
  # A timeout keeps one stalled page from blocking the whole scraping loop.
  r = requests.get(url, timeout=30)
  soup = BeautifulSoup(r.text, 'html.parser')

  # getting name of medicines
  title_tag = soup.find(id='PRODUCTINFO')
  try:
    title = title_tag.parent.find(class_='sectionWrapper').text
  except AttributeError:
    # One of the lookups returned None: the page has no product information.
    title = ""
    print(f"DISCONTINUED/NO SPC: {url}")
    discontinued.append(url)

  # getting description of medicine
  dsc_output = ""
  tag = soup.find(id='FORM')
  if tag is not None and tag.parent is not None:
    # Restrict the search to the direct children of the parent element.
    for desc in tag.parent.find_all(recursive=False):
      if desc != tag:
        # Exclude the target element itself. Each sibling overwrites the
        # previous one, so the text of the last sibling is what is kept.
        dsc_output = desc.text + " What is this drug?"

  # getting company name
  company_tag = soup.find(class_="product-header-company-name")
  comp_name = company_tag.text if company_tag is not None else ""

  return title.replace("\n", ""), dsc_output.replace("\n", ""), comp_name.replace("\n", ""), discontinued




# Scrape every collected link and keep one record per medicine.
output_dict = []
for i in tqdm(B_urls):
  # disc_med (the discontinued-url list) is returned but not used here.
  name, description, company, disc_med = find_drug_description(i)
  output_dict.append(dict(Name=name, Description=description, Company=company))

# Tabulate the records, show them in full and persist them for the next step.
df = pd.DataFrame(output_dict)
print(df.to_string())
df.to_csv('OSD(B).csv', index=False)



Step 3: Convert the collected data to JSON format

In [None]:
import pandas as pd
import json

def df_to_training_data(df):
  """
  Converts a pandas DataFrame to a list of dictionaries in training_data format.

  Columns are read by position: the second column becomes the prompt, and the
  first column followed by the whitespace-stripped third column becomes the
  completion. Missing cells are formatted as the text "nan".

  Args:
      df (pandas.DataFrame): The DataFrame to convert; needs at least three columns.

  Returns:
      list: A list of dictionaries in training_data format, each with the
      keys "prompt" and "completion".
  """
  training_data = []
  for _, row in df.iterrows():
    # Use .iloc for positional access: plain row[0] is a label lookup that
    # only falls back to positions (a deprecated behaviour) and picks the
    # wrong cell when the column labels are themselves integers.
    name, description, company = row.iloc[0], row.iloc[1], row.iloc[2]

    # Merge last column into first column
    completion = f"{name} {str(company).strip()}"
    merged_prompt = f"{description}"

    # Create dictionary and append to training_data
    training_data.append({"prompt": merged_prompt, "completion": completion})

  return training_data

# Reload the scraped table from disk, convert it to prompt/completion pairs
# and pretty-print the result for inspection.
df = pd.read_csv("OSD(B).csv")
training_data = df_to_training_data(df.copy())
print(json.dumps(training_data, indent=2))


Step 4: Filter through data to prepare training dataset

In [None]:
import json

# Path of the JSON Lines file that is written below and later opened for upload.
training_file_name = "training_dataB.jsonl"

def prepare_data(dictionary_data, final_file_name):
  """Writes every entry of ``dictionary_data`` as one JSON line.

  Args:
      dictionary_data: iterable of JSON-serialisable entries.
      final_file_name: path of the file to write; it is opened in write mode,
          so existing content is replaced.
  """
  with open(final_file_name, 'w') as jsonl_file:
    # One serialised entry per line, each terminated by a newline.
    jsonl_file.writelines(json.dumps(record) + '\n' for record in dictionary_data)

def _has_nan(value):
  """Returns True for a NaN float or for text containing a standalone "nan" token."""
  if isinstance(value, float):
    # NaN is the only float that does not compare equal to itself.
    return value != value
  if isinstance(value, str):
    # Token match, so ordinary words that merely contain "nan" are kept.
    return 'nan' in value.split()
  return False


def remove_nan_dicts(data):
  """
  Removes dictionaries containing "nan" values from a list of dictionaries.

  A value counts as "nan" when it is a NaN float, the text "nan", or text
  containing "nan" as a whitespace-separated token. The last case catches
  merged fields such as "nan Some Company", where only one of the parts that
  were formatted into the text was missing.

  Args:
      data (list): The dictionaries to filter; the input is not modified.

  Returns:
      list: A new list holding only the dictionaries without "nan" values.
      Every removed dictionary is printed.
  """
  # Iterate through the list of dictionaries
  output_json = []
  for d in data:
    if any(_has_nan(v) for v in d.values()):
      print(d)
      continue
    output_json.append(d)

  return output_json


# Filter the training pairs, write the survivors to the JSON Lines training
# file, then show them both pretty-printed and as a raw Python list.
clean_data = remove_nan_dicts(training_data)
prepare_data(clean_data, training_file_name)
print(json.dumps(clean_data, indent=2))
print(clean_data)


Step 5: Upload data to OpenAI API fine-tuning

In [None]:
from openai import OpenAI
import openai

# Placeholder: replace with your own key before running, and avoid saving or
# sharing the notebook with a real key in it.
api_key = "YOUR OPENAI API KEY"

client = OpenAI(api_key=api_key)

# Upload the JSON Lines training file. The context manager closes the file
# handle once the upload call has returned instead of leaving it open.
with open(training_file_name, "rb") as training_file:
  training_file_id = client.files.create(
    file=training_file,
    purpose="fine-tune"
  )

Step 6: Initiate model fine-tuning

In [None]:
# Start a fine-tuning job on the uploaded training file with explicit
# hyperparameters, then report the job id, the full response and its status.
response = client.fine_tuning.jobs.create(
  model="davinci-002",
  training_file=training_file_id.id,
  hyperparameters=dict(
    n_epochs=5,
    batch_size=3,
    learning_rate_multiplier=0.3,
  ),
)
job_id, status = response.id, response.status

print(
  f'Fine-tunning model with jobID: {job_id}.',
  f"Training Response: {response}",
  f"Training Status: {status}",
  sep="\n",
)

Step 7: Query the fine-tuned model with a prompt describing a tablet or capsule

In [None]:
from openai import OpenAI
client = OpenAI(api_key=api_key)

# Read the model name from the first job in the listing.
# NOTE(review): this assumes the listing returns the most recent job first;
# confirm against the API reference.
result = client.fine_tuning.jobs.list()
fine_tuned_model = result.data[0].fine_tuned_model

# NOTE(review): the training prompts are built with a space before
# "What is this drug?"; this prompt has none. Confirm that is intended.
new_prompt = 'Yellow coloured, scored tablets with a one-sided embossment „BIS 5".What is this drug?'
answer = client.completions.create(
  # Use the model looked up above rather than an id tied to one account.
  # The previous hard-coded id is kept only as a fallback for when the job
  # has not produced a model yet.
  model=fine_tuned_model or 'ft:davinci-002:personal::92hKSsQ8',
  prompt=new_prompt,
  max_tokens=20
)

print(answer.choices[0].text)


Bisoprolol Fumarate 5 mg Tablets Sandoz Limited ->->->->
