In [6]:
# Run before to install required libraries
#!pip install langchain langchain_community ollama

In [26]:
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
import pandas as pd
import torch
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [13]:
# Check device
if torch.cuda.is_available():
    print("CUDA is available. Using GPU.")
else:
    print("CUDA is not available. Using CPU.")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


CUDA is available. Using GPU.


In [24]:
# Load the dataset
data_path = 'data/data_original.csv'
data = pd.read_csv(data_path)

# Assuming non-numeric placeholders in 'nrs_pain', convert them to NaN
data['nrs_pain'] = pd.to_numeric(data['nrs_pain'], errors='coerce')

# Save the modified data back to the CSV, replacing the original file
data.to_csv(data_path, index=False)

print("Placeholders in 'nrs_pain' have been replaced with NaN and the file has been updated.")


Placeholders in 'nrs_pain' have been replaced with NaN and the file has been updated.


In [15]:
# Downloaded llama3.1:8b-instruct-q4_K_M from ollama.com
llm = Ollama(model = "llama3.1:8b-instruct-q4_K_M")

In [4]:
for chunk in llm.stream("Hello"):
    print(chunk, end = "")

Hello! How can I assist you today?

In [16]:
embedder = OllamaEmbeddings(model="llama3.1:8b-instruct-q4_K_M")

In [17]:
print(dir(embedder))


['Config', '__abstractmethods__', '__annotations__', '__class__', '__class_vars__', '__config__', '__custom_root_type__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__exclude_fields__', '__fields__', '__fields_set__', '__format__', '__ge__', '__get_validators__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__include_fields__', '__init__', '__init_subclass__', '__iter__', '__json_encoder__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__post_root_validators__', '__pre_root_validators__', '__pretty__', '__private_attributes__', '__reduce__', '__reduce_ex__', '__repr__', '__repr_args__', '__repr_name__', '__repr_str__', '__rich_repr__', '__schema_cache__', '__setattr__', '__setstate__', '__signature__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__try_update_forward_refs__', '__validators__', '__weakref__', '_abc_impl', '_calculate_keys', '_copy_and_set_values', '_decompose_class', '_default_params', '_embed', '_enforce_dict_if_r

In [21]:
chief_complaints = data['chief_complain'].tolist()
print(chief_complaints)

['right ocular pain', 'right forearm burn', 'arm pain, Lt', 'ascites tapping', 'distension, abd', 'fever', 'With chest discomfort', 'pain, chest', 'LBP - Low back pain', 'Eczema, Eyelid', 'acute epigastric pain', 'pain, leg', 'epigastric pain', 'abd pain', 'headache', 'headache', '??', 'Open Wound', 'RUQ pain', 'Gingival swelling', 'chin pain', 'Finger Injury', 'skin rash', 'Rt. side motor weakness', 'Lt. hip joint pain', 'face laceration', 'throat pain', 'chest wall pain right', 'pain, arm', 'Seizure Like Activity', 'post seizure', 'vomiting', 'palpitation', 'syncope', '??', 'hematochezia', 'injury, finger', 'syncope', '??', 'hyperventilation', 'involuntary movt.', 'Urticarial rash', 'Foreign body sensation in neck', 'throat pain', 'swelling, facial', 'post seizure', 'dyspnea', 'mental change', 'fever', 'melena', 'Abdominal pain (finding)', 'both leg pain', 'Left leg pain', 'knee pain', 'H-Headache', 'needle stick injury', 'Left chest pain', 'palpitation', 'fever', 'abd pain', 'fever'

In [23]:
# Function to save embeddings
def save_embeddings(embeddings, filename):
    torch.save(embeddings, filename)

# Function to load embeddings if available
def load_embeddings(filename):
    if os.path.exists(filename):
        return torch.load(filename)
    return None

# Function to process embeddings in batches with error handling
def process_in_batches(texts, batch_size=100, max_attempts=3):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        batch_embeddings = None
        for attempt in range(max_attempts):
            try:
                batch_embeddings = embedder.embed_documents(batch_texts)
                break
            except Exception as e:
                if attempt < max_attempts - 1:
                    print(f"Attempt {attempt + 1} failed, retrying...")
                else:
                    print(f"Failed after {max_attempts} attempts: {e}")
                    raise
        all_embeddings.extend(batch_embeddings)
        print(f"Processed batch {i//batch_size + 1}/{len(texts)//batch_size + 1}")
    return all_embeddings

# Path to save embeddings
embeddings_file = 'data/chief_complaint_embeddings.pt'

# Check if embeddings already exist
embeddings = load_embeddings(embeddings_file)

if embeddings is None:
    # Extract the text data from the 'chief_complain' column
    chief_complaints = data['chief_complain'].tolist()

    # Generate embeddings for the textual data
    embeddings = process_in_batches(chief_complaints, batch_size=100)

    # Save the newly created embeddings for future use
    save_embeddings(embeddings, embeddings_file)
    print("Embeddings created and saved successfully.")
else:
    print("Loaded existing embeddings.")


Processed batch 1/13
Processed batch 2/13
Processed batch 3/13
Processed batch 4/13
Processed batch 5/13
Processed batch 6/13
Processed batch 7/13
Processed batch 8/13
Processed batch 9/13
Processed batch 10/13
Processed batch 11/13
Processed batch 12/13
Processed batch 13/13


In [27]:
# Assume embeddings is a list or array of embeddings loaded from previous cell
# Assume data is your DataFrame loaded with 'nrs_pain' values possibly as integers where NaN indicates missing

# Convert embeddings to a tensor for better performance with PyTorch
embeddings_tensor = torch.tensor(embeddings)

# Separate known and unknown pain indices
known_indices = data[~data['nrs_pain'].isna()].index.tolist()
unknown_indices = data[data['nrs_pain'].isna()].index.tolist()

# Get embeddings for known and unknown 'nrs_pain' values
known_embeddings = embeddings_tensor[known_indices]
unknown_embeddings = embeddings_tensor[unknown_indices]

# Calculate cosine similarity between unknown and known embeddings
cos_similarities = cosine_similarity(unknown_embeddings, known_embeddings)

# Determine top k most similar entries for each missing 'nrs_pain' entry
k = 5  # This can be adjusted on specific needs
top_k_indices = np.argsort(-cos_similarities, axis=1)[:, :k]  # Get top k indices for each unknown embedding

In [28]:
# Constructing few-shot prompts
prompts = []
for idx, indices in enumerate(top_k_indices):
    prompt = "Based on the following complaints and pain levels of a patient that has arrived to the emergency department, predict the missing pain level:\n"
    for i in indices:
        prompt += f"- Complaint: {data.loc[known_indices[i], 'chief_complain']}, Pain level: {data.loc[known_indices[i], 'nrs_pain']}\n"
    prompt += f"Predict pain level for this complaint: {data.loc[unknown_indices[idx], 'chief_complain']}"
    prompts.append(prompt)

In [None]:
# Llama model object which was initialized earlier and is ready to make predictions
predicted_pain_levels = [llm.invoke(prompt) for prompt in tqdm(prompts, desc="Predicting pain levels")]

In [30]:
# Llama model object which was initialized earlier and is ready to make predictions
predicted_pain_levels = [llm.invoke(prompt) for prompt in prompts]

In [31]:
# Impute the predicted values into the dataset
for idx, pain_level in zip(unknown_indices, predicted_pain_levels):
    data.at[idx, 'nrs_pain'] = pain_level


  data.at[idx, 'nrs_pain'] = pain_level


In [32]:
data.to_csv('data/updated_nrs_pain_data.csv', index=False)