# Description: Python scripts to prepare a text-based AI model fine-tuned on an OpenAI davinci-002 model

Have you ever wanted a pill identifier tool to check the name of a tablet/capsule by its description?

This Jupyter notebook outlines steps to fine-tune an OpenAI model for this purpose, utilising pharmaceutical/manufacturer data from the Electronic Medicines Compendium


# **Things to fix:**

1. Data extraction methods - checking for medicines that do not have SPCs, only PILs
2. Retraining model on missing medicines (consider concatenating data for single training cycle? 3 epochs might be better?)
3. Formatting of data
4. Validation/Testing sets
5. Prompt variation adjustments for increased flexibility

# UPDATE: No longer using OpenAI fine-tuning but RAG instead with Langchain

In [3]:
#@title DEPENDENCIES
!pip install --upgrade openai -q
!pip install --upgrade --quiet  langchain langchain-openai faiss-cpu tiktoken
!pip install beautifulsoup4 -q
!pip install chromadb -q
!pip install unstructured -q
!pip install selenium -q
!pip install --upgrade numpy -q
!pip install jq -q

In [4]:
#@title IMPORTS
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm
import json
from openai import OpenAI
import openai
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage
from langchain_community.document_loaders import SeleniumURLLoader
from langchain_community.document_loaders import JSONLoader


!pip freeze > requirements.txt

In [5]:
# Start from an empty JSON list so later cells can append entries to this file
with open('./loader_data.json', mode='w', encoding='utf-8') as loader_file:
    loader_file.write(json.dumps([]))

In [24]:
#@title Step 1: Retrieve url links of all medicines starting with a particular letter from the electronic medicines compendium

#Set letter of drug name to search for
letter = "C" #@param

# Browse-page URL for the chosen letter; built from `letter` above
letter_url = f'https://www.medicines.org.uk/emc/browse-medicines/{letter}'

def get_elements_of_letter(url):
  """Gets the total number of drugs listed under a specific letter.

  Args:
    url: url of the browse page for the drugs of a specific letter

  Returns:
    int: the count shown in the page's results summary element

  Raises:
    requests.HTTPError: if the page request returns an error status.
    ValueError: if the summary element is missing or contains no number.
  """
  # A timeout prevents an unresponsive server from hanging the notebook
  r = requests.get(url, timeout=30)
  r.raise_for_status()
  letter_soup = BeautifulSoup(r.text, 'html.parser')
  summary = letter_soup.find(class_='latest-updates-results-header-summary-total')
  if summary is None:
    raise ValueError(f"Results summary not found on page: {url}")

  # Take the first run of digits (allowing spaces or commas as separators)
  # rather than relying on the exact wording around the number
  match = re.search(r'\d[\d,\s]*', summary.text)
  if match is None:
    raise ValueError(f"No result count in summary text: {summary.text!r}")
  return int(re.sub(r'[,\s]', '', match.group()))

def get_urls(num, link, show_progress=False):
  """Collect product-page links for tablets and capsules from a browse listing.

  Args:
      num (int): number of items in the listing
      link (str): base url of the listing; each page is requested with
          ``?offset=<i>&limit=50`` appended
      show_progress: prints the item being processed to the screen, default value is False

  Returns:
      A set with the links for each item
  """
  from urllib.parse import urljoin

  base_url = 'https://www.medicines.org.uk/'
  # Initial letter omitted so both capitalised and lower-case spellings match
  wanted_forms = ("ablet", "apsule")

  output_urls = set()
  for i in tqdm(range(1, num + 1, 50)):

    # iterate over the listing in pages of 50 items
    url_to_check = f'{link}?offset={i}&limit=50'
    response = requests.get(url_to_check, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    url_title_links = soup.find_all(class_="search-results-product-info-title-link emc-link")
    for j in url_title_links:
      if not any(form in j.text for form in wanted_forms):
        continue
      href = j.get('href')
      if not href:
        # An anchor without a target cannot be followed
        continue
      if show_progress:
        print(f"Processing: {j.text}")
      # urljoin avoids a doubled slash when href already starts with "/"
      output_urls.add(urljoin(base_url, href))

  return output_urls


# Collect the tablet/capsule links for the chosen letter; the item count is read from the same browse page
urls_to_check = get_urls(num=get_elements_of_letter(letter_url), link=letter_url)
def append_to_file(filename, content):
  """Append `content` to `filename`, separating entries with newlines.

  Args:
    filename: path of the text file to append to (created if missing).
    content: text to write.

  A newline is written before `content` unless the file is still empty, so
  the file never starts with a blank line.
  """
  with open(filename, "a+") as handle:
    # Position 0 right after opening for append means there is no existing content
    separator = "" if handle.tell() == 0 else "\n"
    handle.write(separator)
    handle.write(content)

# Append every collected link to url_file.txt (existing content in that file is kept)
for url in urls_to_check:
  append_to_file("url_file.txt", url)

100%|██████████| 17/17 [00:07<00:00,  2.21it/s]


In [None]:
#@title Step 2: Screen through each drug link starting with a particular letter to obtain drug description and manufacturer details

#Output anomalous spcs to an error file for subsequent review
# nesting scraper in a single function for pharmaceutical form

def find_drug_description(url):
  """Scrape one product page for its name, pharmaceutical form and company.

  Args:
    url: address of the product page to fetch.

  Returns:
    A tuple ``(name, description, company)`` of strings with newlines
    removed. Description and company are empty strings when their page
    elements are missing. When the product title cannot be located, a
    message is printed and ``url`` itself is returned instead of a tuple so
    the caller can record it for review.
  """
  # A timeout keeps one unresponsive page from stalling the whole scrape
  r = requests.get(url, timeout=30)
  soup = BeautifulSoup(r.text, 'html.parser')

  # getting name of medicines
  title_tag = soup.find(id='PRODUCTINFO')
  title_node = None
  if title_tag is not None and title_tag.parent is not None:
    title_node = title_tag.parent.find(class_='sectionWrapper')
  if title_node is None:
    print(f"DISCONTINUED/NO SPC: {url}")
    return url
  title = title_node.text

  # getting description of medicine
  dsc_output = ""
  tag = soup.find(id='FORM')
  if tag is not None and tag.parent is not None:
    # Direct children of the parent, excluding the target element itself
    siblings = [el for el in tag.parent.find_all(recursive=False) if el != tag]
    # NOTE(review): only the last sibling's text is kept, matching the previous
    # behaviour; confirm whether earlier siblings should be included too
    if siblings:
      dsc_output = siblings[-1].text

  # getting company name
  company_node = soup.find(class_="product-header-company-name")
  comp_name = company_node.text if company_node is not None else ""

  return title.replace("\n", "").replace("\r", ""), dsc_output.replace("\n", ""), comp_name.replace("\n", "")

output_dict = []
failed_urls = []
for i in tqdm(urls_to_check):
  try:
    # Fetch each page exactly once; a non-tuple result is the URL of a page
    # whose product title could not be found
    result = find_drug_description(i)
  except Exception as e:
    print(str(e))
    # Keep the URL so a transient failure can be retried later
    failed_urls.append(i)
    continue

  if isinstance(result, tuple) and len(result) == 3:
    name, description, company = result
    output_dict.append({"Name": name,
                "Description": description,
                "Company": company})
  else:
    failed_urls.append(result)

# Written once after the loop so every failing URL is kept, one per line
# (re-opening with 'w' per failure would leave only the last one)
with open(f'error_spcs{letter}.txt', 'w') as f:
  f.writelines(f'{failed}\n' for failed in failed_urls)

df = pd.DataFrame(output_dict)
print(df.to_string())
df.to_csv(f'OSD({letter}).csv', index=False)



In [None]:
#@title Step 2b: Read through spc errors and re-get pill data
error_file_name = f'error_spcs{letter}.txt'

# Read every URL up front: the file is rewritten below, and rewriting it while
# still iterating over it would truncate the input. Blank lines and trailing
# newlines are dropped so each entry is a clean URL.
with open(error_file_name, 'r') as errorFile:
  retry_urls = [line.strip() for line in errorFile if line.strip()]

still_failing = []
for retry_url in retry_urls:
  try:
    # One request per URL; a non-tuple result means the title was not found
    result = find_drug_description(retry_url)
  except Exception as e:
    print(str(e))
    still_failing.append(retry_url)
    continue

  if isinstance(result, tuple) and len(result) == 3:
    name, description, company = result
    output_dict.append({"Name": name,
                "Description": description,
                "Company": company})
  else:
    still_failing.append(retry_url)

# Leave only the URLs that still could not be scraped, one per line
with open(error_file_name, 'w') as f:
  f.writelines(f'{failed}\n' for failed in still_failing)

In [23]:
#@title Step 3: Convert the collected data to JSON format

def direct_json(l):
  """Convert scraped records into single-pair dictionaries.

  Args:
    l: iterable of dicts with "Name", "Company" and "Description" keys.

  Returns:
    A list with one dict per record, mapping "<Name> <Company stripped>" to
    the description with all backslashes removed.
  """
  return [
      {
          record["Name"] + " " + record["Company"].strip():
              ("" + record["Description"]).replace('\\', '')
      }
      for record in l
  ]


def append_to_json(filename, new_dict):
  """
  Appends a dictionary to a list in a JSON file.

  The file is read and written as UTF-8 regardless of the platform default,
  because the data is dumped with ``ensure_ascii=False`` and may contain
  non-ASCII characters.

  Args:
    filename: The path to the JSON file.
    new_dict: The dictionary to append to the list.

  Raises:
    ValueError: If the file holds valid JSON that is not a list.
  """
  try:
    with open(filename, "r", encoding="utf-8") as f:
      try:
        data = json.load(f)
      except json.JSONDecodeError:
        # Empty or unparsable file: start over with an empty list
        data = []
  except FileNotFoundError:
    # File doesn't exist yet: start with an empty list
    data = []

  # Ensure data is a list
  if not isinstance(data, list):
    raise ValueError("JSON data must be a list")

  data.append(new_dict)

  # Overwrite the file with the updated list, indented for readability
  with open(filename, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

# JSON file that receives the converted entries
filename='loader_data.json'
# One single-pair dict per scraped product (see direct_json)
training_data = direct_json(output_dict)

# Entries are appended one at a time; each call re-reads and rewrites the file
for entry in tqdm(training_data):
  append_to_json(filename, entry)

# Preview the first five converted entries
print(training_data[:5])

100%|██████████| 56/56 [00:00<00:00, 367.38it/s]

[{' Bumetanide/Amiloride 1mg/5mg Tablets  Chemidex  Pharma  Ltd': " Tablet. Cream coloured, flat, oval with bevelled edge tablets, scored on one side and engraved with '149' on the reverse. The score line is not intended for breaking the tablet. "}, {' Bylvay 600 micrograms hard capsules  Albireo Pharma': ' Hard capsule  Size 0 capsule (21.7 mm × 7.64 mm) with ivory opaque cap and body; imprinted “ A600” with black ink. '}, {' Travel Calm Tablets  THE BOOTS COMPANY PLC': ' Tablets. '}, {' Buspirone Hydrochloride 5 mg Tablets   Mylan': ' Tablet. Buspirone Hydrochloride 5 mg Tablets are white, round, bevelled edge tablets, embossed “ BR 5” on one side, “ G” on the reverse. '}, {' Busulfan 2 mg tablets  Aspen': ' Film coated tablet Busulfan 2 mg tablets are white, film-coated, round biconvex tablets engraved “ GX EF3” on one side and “ M” on the other. '}]





In [None]:
#@title Step 4: Filter through data to prepare training dataset

#@markdown Rename training_file_name to unique jsonl file

# Output path for the training file; the name embeds the current value of `letter`
training_file_name = f"training_data{letter}.jsonl" #@param

def prepare_data(dictionary_data, final_file_name):
  """Write every entry of `dictionary_data` to `final_file_name` as JSON Lines.

  Args:
    dictionary_data: iterable of JSON-serialisable entries.
    final_file_name: path of the file to (over)write, one JSON document per line.
  """
  with open(final_file_name, 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + '\n' for record in dictionary_data)

def remove_nan_dicts(data):
  """
  Removes dictionaries containing "nan" values and strips carriage returns
  and tabs from the string values of the remaining ones.

  Args:
      data (list): The list of dictionaries to process.

  Returns:
      list: New dictionaries (the inputs are not modified) for every entry
      that has no "nan" string or float NaN among its values, with '\\r' and
      '\\t' removed from each string value.
  """
  output_json = []
  for d in data:
    # `v != v` is true only for float NaN, which never equals itself
    if any(v == 'nan' or (isinstance(v, float) and v != v) for v in d.values()):
      continue
    # Clean each string value individually; non-string values pass through unchanged
    output_json.append({
        key: value.replace('\r', '').replace('\t', '') if isinstance(value, str) else value
        for key, value in d.items()
    })

  return output_json


# Filter the converted entries, then write the result to the training file
nanless_data = remove_nan_dicts(training_data)
prepare_data(nanless_data, training_file_name)
# The cleaned data is shown twice: as indented JSON, then as the raw Python repr
print(json.dumps(nanless_data, indent=2))
print(nanless_data)


In [None]:
#@title Step 5: Upload data to OpenAI API fine-tuning

#@markdown Obtain your own api key from OpenAI
api_key = None #@param

client = OpenAI(api_key=api_key)

# The context manager closes the file once the upload call returns, instead of
# leaving the handle open for the rest of the kernel session.
# `training_file_id` holds the returned file object; its `.id` is used when
# creating the fine-tuning job.
with open(training_file_name, "rb") as training_file:
  training_file_id = client.files.create(
    file=training_file,
    purpose="fine-tune"
  )

In [None]:
#@title Step 6: Initiate model fine-tuning

base_model = None #@param

# Start a fine-tuning job on the uploaded training file with fixed hyperparameters
response = client.fine_tuning.jobs.create(
    training_file=training_file_id.id,
    model=base_model,
    hyperparameters={"n_epochs": 3, "batch_size": 3, "learning_rate_multiplier": 0.3},
)
job_id, status = response.id, response.status

# Report the job identifier, the full API response and the initial status
print(f'Fine-tunning model with jobID: {job_id}.')
print(f"Training Response: {response}")
print(f"Training Status: {status}")

In [None]:
#@title Step 7: Use fine-tuned model using prompts to describe tablet or capsule details
OPENAI_API_KEY=None #@param

# Authenticate with the key entered in this cell's form
client = OpenAI(api_key=OPENAI_API_KEY)

result = client.fine_tuning.jobs.list()
if not result.data:
  raise RuntimeError("No fine-tuning jobs found for this API key.")
# NOTE(review): assumes the first listed job is the one to use; confirm the ordering
fine_tuned_model = result.data[0].fine_tuned_model
if fine_tuned_model is None:
  raise RuntimeError("The first listed fine-tuning job has no fine-tuned model yet.")

new_prompt = "I have a clear colourless/pale yellow coloured translucent oval shaped capsule. What is this drug?"
# Ask the same question ten times to see how much the completions vary
for i in range(10):
  answer = client.completions.create(
    model=fine_tuned_model,
    prompt=new_prompt,
    max_tokens=20
  )

  print(answer.choices[0].text)


# **NEW: Retrieval-Augmented Generation with LangChain**

Work in Progress....

In [10]:
!pip install langchain_text_splitters -q
from langchain_text_splitters import RecursiveJsonSplitter

In [17]:
# Splitter configured with max_chunk_size=300
splitter = RecursiveJsonSplitter(max_chunk_size=300)
# Recursively split json data - If you need to access/manipulate the smaller json chunks
# training_data is a list, hence convert_lists=True
json_chunks = splitter.split_json(json_data=training_data, convert_lists=True)


In [11]:
OPENAI_API_KEY=None #@param

# Chat model client created with the key entered above
llm = ChatOpenAI(openai_api_key = OPENAI_API_KEY)

# Two-message prompt: a fixed system persona plus the caller's text in {input}
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are world class specialist in pharmaceutical products."),
    ("user", "{input}")
])

# Pipeline: prompt -> chat model -> plain string output
output_parser = StrOutputParser()
chain = prompt | llm | output_parser

In [12]:
import json
from pathlib import Path

# Relative path, matching how the file is created and appended to elsewhere in
# this notebook, so the cell does not depend on a /content working directory
file_path='./loader_data.json'


# Read explicitly as UTF-8; the file may contain non-ASCII characters
JSONdata = json.loads(Path(file_path).read_text(encoding='utf-8'))

# jq_schema='.' selects the whole JSON document; text_content=False allows
# non-string content to be loaded
loader = JSONLoader(
    file_path=file_path,
    jq_schema='.',
    text_content=False)

data = loader.load()


# Split the loaded content into chunks of at most 250 characters with no
# overlap, then embed the chunks into a Chroma vector store
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=0)
all_splits = text_splitter.split_documents(data)
vectorstore = Chroma.from_documents(documents=all_splits, embedding=embeddings)


In [None]:
def get_json_len(fileName):
  """Return the number of top-level items in a JSON file.

  Args:
    fileName: path of the JSON file to inspect.

  Returns:
    The ``len()`` of the parsed JSON document, or None (after printing an
    error message) when the file is missing or does not contain valid JSON.
  """
  try:
    with open(fileName, "r", encoding="utf-8") as f:
      return len(json.load(f))
  except FileNotFoundError:
    # Report the path that was actually passed in
    print(f"Error: File '{fileName}' not found.")
    return None
  except json.JSONDecodeError:
    print(f"Error: Invalid JSON data in '{fileName}'.")
    return None

# NOTE(review): `k` is passed as a direct keyword argument; LangChain's documented
# way to set the number of returned documents is `search_kwargs={"k": ...}`, so
# confirm this value actually takes effect. get_json_len returns None when the
# file is missing or holds invalid JSON.
retriever = vectorstore.as_retriever(k=get_json_len(file_path))
# Sanity check: retrieve the chunks closest to a sample capsule description
docs = retriever.invoke("Opaque white and light brown hard gelatin capsule containing white to off- white powder (length: 14.5 mm).")
docs

In [26]:
from langchain_core.runnables import RunnablePassthrough

# Prompt text with two placeholders: {context} and {question}
template = """Answer the question based only on the following context:

{context}

Question: {question}
"""
# Rebinds `prompt`, which an earlier cell also assigns
prompt = ChatPromptTemplate.from_template(template)
model = ChatOpenAI(openai_api_key=OPENAI_API_KEY)


def format_docs(docs):
    """Join the `page_content` of each document into one string, separated by blank lines."""
    contents = (doc.page_content for doc in docs)
    return "\n\n".join(contents)


# Rebinds `chain`: the retriever output is formatted into {context}, the raw
# input is passed through as {question}, then prompt -> model -> string output
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

# Sample query describing a capsule by its appearance
question_to_ask = '''I have a pill with the following characteristics:
Opaque white and light brown hard gelatin capsule containing white to off- white powder  (length: 14.5 mm).
Could you tell me what this pill could be?
'''


chain.invoke(question_to_ask)

'Based on the characteristics described, the pill could be Adoport 1 mg Capsules by Sandoz Limited.'