<a href="https://colab.research.google.com/github/Pauullamm/OpenAI_Pill_Checker/blob/main/OpenAI_Pill_Checker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Description: Python scripts to prepare a text-based AI model fine-tuned on an OpenAI davinci-002 model

Have you ever wanted a pill identifer tool to check the name of a tablet/capsule by its description?

This Jupyter notebook outlines steps to fine-tune an OpenAI model for this purpose, utilising pharmaceutical/manufacturer data from the Electronic Medicines Compendium


# **Things to fix:**

1. Data extraction methods - checking for medicines that do not have spcs, only PILs
2. Retraining model on missing medicines (consider concatenating data for single training cycle? 3 epochs might be better?)
3. Formatting of data
4. Validation/Testing sets
5. Prompt variation adjustments for increased flexibility

# UPDATE: No longer using OpenAI fine-tuning but RAG instead with Langchain

In [36]:
!pip install --upgrade openai -q
!pip install --upgrade --quiet  langchain langchain-openai faiss-cpu tiktoken
!pip install beautifulsoup4 -q
!pip install chromadb -q
!pip install unstructured -q
!pip install selenium -q
!pip install --upgrade numpy -q
!pip install jq

Collecting jq
  Downloading jq-1.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (657 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.6/657.6 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jq
Successfully installed jq-1.7.0


In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm
import json
from openai import OpenAI
import openai
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage
from langchain_community.document_loaders import SeleniumURLLoader
from langchain_community.document_loaders import JSONLoader


!pip freeze > requirements.txt

In [3]:
#@title Step 1: Retrive url links of all medicines starting with a particular letter from the electronic medicines compendium

#Set letter of drug name to search for
letter = None #@param

letter_url = f'https://www.medicines.org.uk/emc/browse-medicines/{letter}'

def get_elements_of_letter(url):
  """Gets the total number of drugs under the specific letter
  Args:
    url: url of the durgs of a specific letter
  """
  r = requests.get(url)
  letter_soup = BeautifulSoup(r.text, 'html.parser')
  total_elements = letter_soup.find(class_='latest-updates-results-header-summary-total')
  total_elements = total_elements.text.replace(" ", "")
  total_elements = int(total_elements.replace("resultsfound", ""))
  return total_elements

def get_urls(num, link, show_progress=False,):
  """
  Args:
      num (int): number of items on the page
      show_progress: prints the item being processed to the screen, default value is False

  Returns:
      A set with the links for each item
  """
  output_urls = set()
  for i in tqdm(range(1, num + 1, 50)):

    #iterate over over site number
    url_to_check = f'{link}?offset={i}&limit=50'
    response = requests.get(url_to_check)
    soup = BeautifulSoup(response.text, 'html.parser')

    url_title_links = soup.find_all(class_="search-results-product-info-title-link emc-link")
    for j in url_title_links:
      if "ablet" in j.text:
        if show_progress:
          print(f"Processing: {j.text}")
        href = 'https://www.medicines.org.uk/' + j.get('href')
        output_urls.add(href)
      if "apsule" in j.text:
        if show_progress:
          print(f"Processing: {j.text}")
        href_cap = 'https://www.medicines.org.uk/' + j.get('href')
        output_urls.add(href_cap)

  return output_urls


urls_to_check = get_urls(num=get_elements_of_letter(letter_url), link=letter_url)
def append_to_file(filename, content):
  with open(filename, "a+") as file:
    # Check if file is empty (has no data)
    if file.tell() == 0:
      file.write("")  # Add an empty line if the file is empty
    else:
      file.write("\n")  # Add a newline if there's existing content
    file.write(content)

for url in urls_to_check:
  append_to_file("url_file.txt", url)

100%|██████████| 17/17 [00:19<00:00,  1.13s/it]


In [10]:
#@title Step 2: Screen through each drug link starting with a particular letter to obtain drug description and manufacturer details

#Output anomalous spcs to an error file for subsequent review
# nesting scraper in a single function for pharmaceutical form

def find_drug_description(url):
  r = requests.get(url)
  soup = BeautifulSoup(r.text, 'html.parser')
  # getting name of medicines
  title_tag = soup.find(id='PRODUCTINFO')
  try:
    title_parent = title_tag.parent
    title = title_parent.find(class_='sectionWrapper').text
  except Exception as e:
    title = ""
    print(f"DISCONTINUED/NO SPC: {url}")
    return url

  # getting description of medicine
  tag = soup.find(id='FORM')
  try:
    desc_parent = tag.parent
    all_desc = desc_parent.find_all(recursive=False)  # Restrict search within the parent div
    dsc_output = ""
    for desc in all_desc:
        if desc != tag:
          # Exclude the target element itself # Process the sibling element
          dsc_output = desc.text
  except Exception as e:
    dsc_output = ""
    pass

  # getting company name
  try:
    comp_name = soup.find(class_="product-header-company-name").text
  except Exception:
    comp_name = ""
    pass


  return title.replace("\n", "").replace("\r", ""), dsc_output.replace("\n", ""), comp_name.replace("\n", "")

output_dict = []
for i in tqdm(urls_to_check):
  try:

    if len(find_drug_description(i)) == 3:

      name, description, company = find_drug_description(i)
      output_dict.append({"Name": name,
                  "Description": description,
                  "Company": company})
    else:
      with open(f'error_spcs{letter}.txt', 'w') as f:
        f.write(f'{find_drug_description(i)}' + '\n')
  except Exception as e:
    print(str(e))
    continue

df = pd.DataFrame(output_dict)
print(df.to_string())
df.to_csv(f'OSD({letter}).csv', index=False)



  5%|▍         | 24/486 [01:04<18:48,  2.44s/it]

DISCONTINUED/NO SPC: https://www.medicines.org.uk//emc/product/14995/pil


  5%|▌         | 25/486 [01:06<18:24,  2.40s/it]

DISCONTINUED/NO SPC: https://www.medicines.org.uk//emc/product/14995/pil


 10%|▉         | 48/486 [02:00<12:48,  1.75s/it]

DISCONTINUED/NO SPC: https://www.medicines.org.uk//emc/product/13961


 10%|█         | 49/486 [02:01<10:54,  1.50s/it]

DISCONTINUED/NO SPC: https://www.medicines.org.uk//emc/product/13961


 40%|███▉      | 192/486 [06:58<10:50,  2.21s/it]

DISCONTINUED/NO SPC: https://www.medicines.org.uk//emc/product/10704/pil


 40%|███▉      | 193/486 [06:59<09:05,  1.86s/it]

DISCONTINUED/NO SPC: https://www.medicines.org.uk//emc/product/10704/pil


 43%|████▎     | 210/486 [07:43<13:42,  2.98s/it]

DISCONTINUED/NO SPC: https://www.medicines.org.uk//emc/product/10777/pil


 43%|████▎     | 211/486 [07:45<13:09,  2.87s/it]

DISCONTINUED/NO SPC: https://www.medicines.org.uk//emc/product/10777/pil


 56%|█████▌    | 272/486 [10:08<06:16,  1.76s/it]

DISCONTINUED/NO SPC: https://www.medicines.org.uk//emc/product/14996/pil


 56%|█████▌    | 273/486 [10:10<06:18,  1.78s/it]

DISCONTINUED/NO SPC: https://www.medicines.org.uk//emc/product/14996/pil


 61%|██████▏   | 298/486 [11:00<06:19,  2.02s/it]

DISCONTINUED/NO SPC: https://www.medicines.org.uk//emc/product/14857/pil


 62%|██████▏   | 299/486 [11:01<05:46,  1.85s/it]

DISCONTINUED/NO SPC: https://www.medicines.org.uk//emc/product/14857/pil


 65%|██████▍   | 314/486 [11:45<07:37,  2.66s/it]

DISCONTINUED/NO SPC: https://www.medicines.org.uk//emc/product/13962


 65%|██████▍   | 315/486 [11:46<06:06,  2.15s/it]

DISCONTINUED/NO SPC: https://www.medicines.org.uk//emc/product/13962


 85%|████████▍ | 412/486 [15:23<02:31,  2.04s/it]

DISCONTINUED/NO SPC: https://www.medicines.org.uk//emc/product/14994/pil


 85%|████████▍ | 413/486 [15:26<03:02,  2.49s/it]

DISCONTINUED/NO SPC: https://www.medicines.org.uk//emc/product/14994/pil


100%|██████████| 486/486 [18:08<00:00,  2.24s/it]

                                                                                                                                                                                                                                                                       Name                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             




In [None]:
#@title Step 2b: Read through spc errors and re-get pill data
with open(f'error_spcs{letter}.txt', 'r') as errorFile:
  for line in errorFile:
    try:
      if len(find_drug_description(line)) == 3:

        name, description, company = find_drug_description(line)
        output_dict.append({"Name": name,
                    "Description": description,
                    "Company": company})
      else:
        with open(f'error_spcs{letter}.txt', 'w') as f:
          f.write(f'{find_drug_description(i)}' + '\n')
    except Exception as e:
      print(str(e))
      continue

In [15]:
#@title Step 3: Convert the collected data to JSON format

def direct_json(l):
  training_data = []
  for i in l:
    merged_completion = i["Name"] + " "+ i["Company"].strip()
    prompt_description = r"" + i["Description"]
    data_dict = {"prompt": prompt_description.replace('\\', ''), "completion": merged_completion}
    training_data.append(data_dict)

  return training_data
# df = pd.read_csv(f"OSD({letter}).csv")
# training_data = df_to_training_data(df.copy())
training_data = direct_json(output_dict)
print(json.dumps(training_data, indent=2, ensure_ascii=False))

with open('loader_data.json', 'w') as a:
  json.dump(training_data, a)


[
  {
    "prompt": " Tablet.  Afinitor 2.5 mg tablets  White to slightly yellow, elongated tablets of approximately 10.1 mm in length and 4.1 mm in width, with a bevelled edge and no score, engraved with “ LCL” on one side and “ NVR” on the other.  Afinitor 5 mg tablets  White to slightly yellow, elongated tablets of approximately 12.1 mm in length and 4.9 mm in width, with a bevelled edge and no score, engraved with “ 5” on one side and “ NVR” on the other.  Afinitor 10 mg tablets  White to slightly yellow, elongated tablets of approximately 15.1 mm in length and 6.0 mm in width, with a bevelled edge and no score, engraved with “ UHE” on one side and “ NVR” on the other. ",
    "completion": " Afinitor®  2.5 mg tablets Afinitor®  5 mg tablets Afinitor®  10 mg tablets  Novartis Pharmaceuticals UK Ltd"
  },
  {
    "prompt": " Hard capsule. The capsule has a dark blue centre band, and a turquoise cap and body bearing the imprint of “ alli” . ",
    "completion": " alli 60 mg hard capsu

In [None]:
#@title Step 4: Filter through data to prepare training dataset

#@markdown Rename training_file_name to unique json1 file

training_file_name = f"training_data{letter}.jsonl"

def prepare_data(dictionary_data, final_file_name):
  with open(final_file_name, 'w') as outfile:

    for entry in dictionary_data:
      json.dump(entry, outfile)
      outfile.write('\n')

def remove_nan_dicts(data):
  """
  Removes dictionaries containing "nan" values from a JSON object or list.

  Args:
      data (object): The JSON data to process (dict or list).

  Returns:
      object: The modified JSON data with "nan" dictionaries removed.
  """
  # Iterate through the list of dictionaries
  output_json = []
  for d in data:
    if 'nan' in d.values():
      continue
    d.values().replace('\r', '')
    d.values().replace('\t', '')
    output_json.append(d)

  return output_json


nanless_data = remove_nan_dicts(training_data)
prepare_data(nanless_data, training_file_name)
print(json.dumps(nanless_data, indent=2))
print(nanless_data)


In [None]:
#@title Step 5: Upload data to OpenAI API fine-tuning

#@markdown Obtain your own api key from OpenAI
api_key = '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'


client = OpenAI(api_key=api_key)

training_file_id = client.files.create(
  file=open(training_file_name, "rb"),
  purpose="fine-tune"
)

In [None]:
#@title Step 6: Initiate model fine-tuning
response = client.fine_tuning.jobs.create(
  training_file=training_file_id.id,
  model="ft:davinci-002:personal::94QgQh4Z",
  hyperparameters={
    "n_epochs": 3,
	"batch_size": 3,
	"learning_rate_multiplier": 0.3
  }
)
job_id = response.id
status = response.status

print(f'Fine-tunning model with jobID: {job_id}.')
print(f"Training Response: {response}")
print(f"Training Status: {status}")

In [None]:
#@title Step 7: Use fine-tuned model using prompts to describe tablet or capsule details
api_key = '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'

client = OpenAI(api_key=api_key)

result = client.fine_tuning.jobs.list()
fine_tuned_model = result.data[0].fine_tuned_model
for i in range(10):
  new_prompt = "I have a clear colourless/pale yellow coloured translucent oval shaped capsule. What is this drug?"
  answer = client.completions.create(
    model='ft:davinci-002:personal::94QddVCw',
    prompt=new_prompt,
    max_tokens=20
  )


  print(answer.choices[0].text)


 Auropregnic Forte 10 mg/150 mg Capsules Indications and dosage A
 I have been using either Silofast 10mg for slow release or Proamapical PL
Junel Fe 20 People+20/orange Fe2 Revision date: May 2010 Re
 PRENRON 10MG TABLETS - Patient Information Leaflet (PIL) by Booth A
 MYSOLINE 250 mg Granules for dispersion for oral solution Cadila Pharmaceuticals Ltd. An
 Chlorpropamide Hydrobromide 250 mg Sandcastle Tablets Sandcastle Pharmacy (UK) Ltd
 posted 11 Jan 2010 • 1 answer Votre Fetamine 3.5 (
 Sun Pharma do ano XXXX 10mg Simvastatin Tablets Watson Pharma Ltd 10 mg
 Strattera 60 mg capsules, opaque brown and orang .....More Sandoz Ltd Stratter
 Prednisolone 5mg 1mg Astra Zeneca sdb 1 mg A


# **NEW: Retrieval-Augmented Generation with LangChain**

Work in Progress....

In [21]:
OPENAI_API_KEY=None #@param

llm = ChatOpenAI(openai_api_key = OPENAI_API_KEY)

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are world class specialist in pharmaceutical products."),
    ("user", "{input}")
])

output_parser = StrOutputParser()
chain = prompt | llm | output_parser

In [39]:
import json
from pathlib import Path
from pprint import pprint

file_path='./loader_data.json'
JSONdata = json.loads(Path(file_path).read_text())

loader = JSONLoader(
    file_path=file_path,
    jq_schema='.',
    text_content=False)

data = loader.load()


embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(data)
vectorstore = Chroma.from_documents(documents=all_splits, embedding=embeddings)

retriever = vectorstore.as_retriever(k=len(urls_to_check))

# docs = retriever.invoke("I have a tablet with the following characteristics: White or almost white, flat, bevelled edges, barrel-shaped tablet debossed with 'C' on one side and '58' on the other side. The size is 8 mm x 6 mm. Could you tell me what this pill is, its manufacturer, and where you got that information from?")
# docs

  self._core_bpe = _tiktoken.CoreBPE(mergeable_ranks, special_tokens, pat_str)
  self._core_bpe = _tiktoken.CoreBPE(mergeable_ranks, special_tokens, pat_str)


[Document(page_content='off-white, oval shaped, film coated tablet debossed with "E" on one side and "33" on the other side approximately 17.00 mm length x 9.00 mm width. \', \'completion\': \' Atorvastatin 60 mg film-coated tablets   Brown & Burk UK Ltd\'}, {\'prompt\': \' Film-coated tablet Oval-shaped purple film-coated tablet, approximately 19 mm long by 11 mm wide and debossed with “ A7TN” on one side and “ 500” on the other side. \', \'completion\': " Abiraterone Dr. Reddy\'s 500 mg Film-Coated Tablets  Dr. Reddy\'s', metadata={'seq_num': 1, 'source': '/content/loader_data.json'}),
 Document(page_content='Film-coated tablet.  White to almost white, oval, biconvex, film-coated tablets, tablet dimensions 16 mm x 8.5 mm.      \', \'completion\': \' Atorvastatin 60 mg film-coated tablets       Krka UK Ltd\'}, {\'prompt\': " Tablet  White to off white, round biconvex, uncoated tablets marked \'Y2\' on one side and plain on other side. The tablet dimensions are 6.0 mm ", \'completion\'

In [48]:
from langchain_core.runnables import RunnablePassthrough

template = """Answer the question based only on the following context:

{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
model = ChatOpenAI(openai_api_key=OPENAI_API_KEY)


def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])


chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

question_to_ask = '''I have a tablet with the following characteristics:
White to off-white round, flat tablets, with bevelled edges, embossed with '10' on one side and plain on the other.
Could you suggest 5 drugs that it could be and the manufacturer?
Give a confidence score with each answer'''


chain.invoke(question_to_ask)

'1. Amlodipine 10 mg Tablets - FDC International Ltd (Confidence score: 100%)\n2. Amlodipine 10 mg tablets - Mylan (Confidence score: 90%)\n3. Amlodipine 10 mg Tablets - Ipca Laboratories UK Ltd (Confidence score: 80%)\n4. Amlodipine 10 mg tablets - Zentiva (Confidence score: 70%)\n5. Amlodipine 10 mg Tablets - Aurobindo Pharma - Milpharm Ltd (Confidence score: 60%)'