In [None]:
#@title ICD9 Code Installs
%%capture
!git clone https://github.com/kshedden/icd9.git
!pip install --upgrade --quiet langchain lark chromadb langchain-openai
!pip install --upgrade --quiet huggingface_hub langchain_community

import os
from langchain_openai import OpenAI

from icd9 import icd9
# icd9Hierarchy is iterated by later cells as a sequence of dict-like entries;
# this notebook only reads each entry's 'icd9' and 'descLong' keys.
codes = icd9.icd9Hierarchy

In [None]:
import random
def get_code_str(k = 2):
  """Return the long descriptions of `k` randomly drawn entries of `codes`.

  Entries are drawn with `random.choices`, i.e. with replacement, so the same
  description can appear more than once. The descriptions are joined with "; ".
  """
  picked = random.choices(codes, k=k)
  return "; ".join(entry['descLong'] for entry in picked)

get_code_str(k = 1)

In [None]:
# Prefer exporting OPENAI_API_KEY in the environment and leaving the placeholder
# below untouched. Paste a key here only for a throwaway run and never commit it.
OPENAI_API_KEY = "<your_openai_key>"
if OPENAI_API_KEY == "<your_openai_key>":
  # Placeholder was not edited: keep a key that is already present in the
  # environment instead of overwriting it with a string that cannot authenticate.
  OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", OPENAI_API_KEY)
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
#@title Prompt ChatGPT For Message Prompt
import re
import time
import warnings

import langchain
from langchain.docstore.document import Document
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import StrOutputParser
from langchain_community.llms import HuggingFaceEndpoint
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter

class PromptWriter:
  """Asks an OpenAI completion model for a one-sentence patient-message description.

  The few-shot prompt built in `__init__` shows the model example
  (ICD9 description -> message description) pairs and then asks for a new
  description for the code text passed to `get_prompt`.
  """

  def __init__(self, model_name = "gpt-3.5-turbo-instruct"):
    """Build the model, the few-shot prompt and the prompt | model | parser chain.

    Args:
      model_name: name of the OpenAI completion model to use.
    """
    # Note: this switches LangChain debug output off for the whole process.
    langchain.debug = False
    self.model = OpenAI(model_name=model_name)

    prompt_template = '''Given an ICD9 code for a given patient, write a short description of a message that a patient might send to their doctor which may or may not be related to the code. Here are examples.

Example Code: Shoulder joint replacement

Example Message Description: Patient heard a snap while trying to lift heavy boxes after shoulder surgery, and is experiencing pain.

Example Code: Open wound of external ear, unspecified site, complicated

Example Message Description: Patient is inquiring about cleaning strategies for their ear wound.

Example Code: Obstetrical air embolism, delivered, with or without mention of antepartum condition

Example Message Description: Patient wishes to reschedule an upcoming appointment due to a conflict.

Example Code: Hypocalcemia

Example Message Description: Patient is vomiting and over-the-counter medications are not helping.

Write one single-sentence message description of the following code.

Code: {code_str}

Message Description:'''

    # The declared variable must match the only placeholder in the template,
    # {code_str}, which is also the key `get_prompt` passes to the chain.
    self.prompt_prompt = PromptTemplate(
        input_variables=["code_str"],
        template=prompt_template
    )

    self.prompt_chain = self.prompt_prompt | self.model | StrOutputParser()

  def _clean_str(self, response_str):
    """Return the text before the first '.' of the response, plus a period.

    Surrounding whitespace is stripped first so the leading space/newlines of a
    completion do not end up in the stored message.
    """
    return response_str.strip().split('.')[0]+'.'


  def get_prompt(self, code_str, max_retries=None, retry_delay=3):
    """Generate a one-sentence message description for `code_str`.

    Args:
      code_str: ICD9 description text substituted into the prompt.
      max_retries: how many times to retry after a failed attempt. `None`
        (the default) keeps retrying indefinitely.
      retry_delay: seconds to sleep between attempts.

    Returns:
      The cleaned first sentence of the model response.

    Raises:
      Exception: the error of the last attempt, once `max_retries` retries
        have been used up (never raised when `max_retries` is `None`).
    """
    retries_used = 0
    while True:
      try:
        response_str = self.prompt_chain.invoke({'code_str': code_str})
        return self._clean_str(response_str)
      except Exception as e:
        if max_retries is not None and retries_used >= max_retries:
          raise
        retries_used += 1
        # Surface the failure instead of swallowing it, so a persistent
        # problem (e.g. an invalid API key) is visible while the loop retries.
        warnings.warn(
            f"PromptWriter.get_prompt failed with {e!r}; retrying in {retry_delay}s",
            RuntimeWarning,
        )
        time.sleep(retry_delay)

In [None]:
import pandas as pd
from numpy.random import choice

# (inclusive upper bound of the 3-digit numeric prefix, section label), ascending.
_NUMERIC_SECTIONS = (
    (139, "100"), (239, "140"), (279, "240"), (289, "280"), (319, "290"),
    (389, "320"), (459, "390"), (519, "460"), (579, "520"), (629, "580"),
    (679, "630"), (709, "680"), (739, "710"), (759, "740"), (779, "760"),
    (799, "780"), (999, "800"),
)

def to_section(code):
  """Map an ICD-9 code to the label of the section it belongs to.

  Args:
    code: the code as a string or number; it is converted with `str()`.

  Returns:
    "V", "E" or "T" for codes starting with that letter; for numeric codes the
    label of the range containing the first three characters ("100" for
    1-139, "140" for 140-239, ..., "800" for 800-999); and "None" for missing
    values, empty strings, codes starting with "I", and prefixes that are not
    a number in 1-999.
  """
  # Missing values must be detected before str() turns them into text.
  if pd.api.types.is_scalar(code) and pd.isna(code): return "None"
  code = str(code)
  if not code or code[0] == "I": return "None"
  if code[0] in ("V", "E", "T"): return code[0]
  try:
    code_num = int(code[0:3])
  except ValueError:
    return "None"
  # Prefixes below 1 belong to no section (without this guard they would
  # fall into the first range whose upper bound they do not exceed).
  if code_num < 1: return "None"
  for upper, label in _NUMERIC_SECTIONS:
    if code_num <= upper: return label
  return "None"

# Lookup tables built in one pass over `codes`:
#   code2sec: code -> section label returned by to_section
#   sec2code: section label -> list of its codes, in the order they appear in `codes`
code2sec = {}
sec2code = {}

for c in codes:
  code = c['icd9']
  sec = code2sec[code] = to_section(code)
  sec2code.setdefault(sec, []).append(code)

In [None]:
# Sampling weight per section, keyed by the string label used in `sec2code`
# (as produced by to_section). Trailing comments give the numeric range of
# each section. The weights sum to 1; presumably empirical section
# frequencies -- their source is not shown in this notebook.
_SECTION_WEIGHTS = {
    "290": .0585,   # 290-319
    "780": .1237,   # 780-799
    "710": .1535,   # 710-739
    "740": .0055,   # 740-759
    "240": .0622,   # 240-279
    "320": .1151,   # 320-389
    "630": .0088,   # 630-679
    "800": .0289,   # 800-999
    "520": .0477,   # 520-579
    "460": .0365,   # 460-519
    "680": .0590,   # 680-709
    "V": .0971,
    "140": .0278,   # 140-239
    "T": .0463,
    "280": .0099,   # 280-289
    "580": .0561,   # 580-629
    "390": .0456,   # 390-459
    "100": .0147,   # 001-139
    "None": .0018,
    "E": .0010,
    "760": .0003,   # 760-779
}

def random_code(rng=None):
  """Draw a random ICD-9 code: a weighted section first, then a uniform code in it.

  Sections that have no codes in `sec2code` are excluded and the remaining
  weights are renormalised, which yields the same distribution as redrawing
  until an available section comes up, without the risk of looping forever.

  Args:
    rng: optional object with a numpy-style `choice` method (for example
      `numpy.random.default_rng(seed)`) for reproducible draws. When omitted,
      the global `numpy.random.choice` is used.

  Returns:
    `(code, desc)`: the code exactly as stored in `sec2code`, and the
    'descLong' of the first entry of `codes` whose 'icd9' equals it.

  Raises:
    ValueError: if none of the known sections has any code in `sec2code`.
    LookupError: if the drawn code has no entry in `codes`.
  """
  pick = choice if rng is None else rng.choice

  # first pick the section, restricted to those that actually have codes
  candidates = [(sec, w) for sec, w in _SECTION_WEIGHTS.items() if sec2code.get(sec)]
  if not candidates:
    raise ValueError("sec2code has no codes for any known ICD-9 section")
  total = sum(w for _, w in candidates)
  # Draw an index rather than a label so no int/str coercion is involved.
  sec_idx = int(pick(len(candidates), p=[w / total for _, w in candidates]))
  sec = candidates[sec_idx][0]

  # pick the code uniformly at random within the section
  section_codes = sec2code[sec]
  code = section_codes[int(pick(len(section_codes)))]

  # get the description (first matching entry)
  for c in codes:
    if c['icd9'] == code:
      return code, c['descLong']
  raise LookupError(f"code {code!r} has no entry in codes")

random_code()

In [None]:
from tqdm.auto import trange
# Number of (code, description, prompt) rows to generate.
k = 1

pr = PromptWriter()
# Column-oriented accumulator; each loop iteration appends one value per column.
data = {'icd9_code':[], 'desc':[], 'prompt':[]}

for i in trange(k):
  code, desc = random_code()
  # One model call per sampled code.
  message = pr.get_prompt(desc)
  row = {'icd9_code': code, 'desc': desc, 'prompt': message}
  for column, value in row.items():
    data[column].append(value)

df = pd.DataFrame(data)