In [1]:
import os
# GPU selection is configured here, ahead of the torch and vllm imports further
# down, so it is already in place before anything can initialise CUDA.
# With PCI_BUS_ID ordering, the indices below refer to devices in PCI-bus order.
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
# Four devices are exposed to this process (indices 1-4).
os.environ['CUDA_VISIBLE_DEVICES']='1,2,3,4'
import gc
import re
import json
import torch
import pickle
import string
import argparse
import subprocess
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.special import softmax
from collections import OrderedDict
from vllm import LLM, SamplingParams
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, AutoConfig
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

# Load data
---

In [2]:
# Read the pre-split breast-outcome frames and extract their label columns.
df_train = pd.read_pickle('/data/datasets/outcome/breast/breast_train_180_1.pkl')
df_test = pd.read_pickle('/data/datasets/outcome/breast/breast_test_180_1.pkl')

# One position is removed from each label list (62 in train, 37 in test); the
# template lists loaded in the following cell drop the same two positions.
# NOTE(review): the reason these rows are excluded is not recorded in this
# notebook — confirm and document it.
y_train = list(df_train['label'])
y_train.pop(62)
y_test = list(df_test['label'])
y_test.pop(37)

# Displayed as the cell output: (number of train labels, number of test labels).
len(y_train), len(y_test)

(400, 101)

In [3]:
# Train templates come from a pickle; test templates come from a header-less CSV.
train_templates = pd.read_pickle('/data/llm_shared/breast_summaries/templates_train.pkl')

# Take column 1 of the CSV and skip its first row.
# NOTE(review): the first row is presumably a header line written into the
# file — confirm against the CSV.
test_templates = list(
    pd.read_csv('/data/llm_shared/breast_summaries/templates_test.csv', header=None)[1].iloc[1:]
)

# Drop the same positions that were removed from the label lists (62 / 37).
train_templates.pop(62)
test_templates.pop(37)

# Train rows first, then test rows.
templates = train_templates + test_templates
len(templates), len(train_templates), len(test_templates)

(501, 400, 101)

# Folds
---

In [4]:
# Five stratified folds; shuffle is left at its default, so no seed is passed.
strat = StratifiedKFold(n_splits=5)

# Pool the original train and test rows (train first) so the folds are drawn
# over the whole cohort rather than over the pre-made split.
y = np.concatenate([y_train, y_test])
X = np.concatenate([train_templates, test_templates])

In [5]:
# Record, per fold, the row positions used for training and for evaluation.
# idxs_tr[k] / idxs_t[k] hold the indices of the (k+1)-th fold.
idxs_tr, idxs_t = [], []
for fold, (train_idx, test_idx) in enumerate(strat.split(X, y), start=1):
    idxs_tr.append(train_idx)
    idxs_t.append(test_idx)
    # These slices are overwritten on every iteration, so after the loop they
    # describe the last fold only.
    X_train_fold, y_train_fold = X[train_idx], y[train_idx]
    X_test_fold, y_test_fold = X[test_idx], y[test_idx]

In [29]:
# Load the pre-computed Jina embeddings for the templates.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
# NOTE(review): despite its name, `train_embeddings` is indexed below with both
# the training and the held-out positions, so it presumably holds one entry per
# row of `X` (train followed by test) — confirm the file is aligned with `X`.
with open('/data/llm_shared/jina_embs/jina_breast_templates.pkl', 'rb') as f:
    train_embeddings = pickle.load(f)

fold = 1    # 1-based fold number
fold -= 1   # 0-based position into idxs_t / idxs_tr
notes, labels = X, y

# Held-out part of the fold: notes, labels and embeddings picked by position.
ids = idxs_t[fold]
notes_test, y_test, embs_test = (
    [source[i] for i in ids] for source in (notes, labels, train_embeddings)
)

# Training part of the fold, picked the same way.
ids = idxs_tr[fold]
notes_train, y_train, embs_train = (
    [source[i] for i in ids] for source in (notes, labels, train_embeddings)
)

# Displayed as the cell output: (held-out size, training size).
len(embs_test), len(embs_train)

(101, 400)

# Run the pipeline - 8B Experiments
---

In [30]:
# Re-point the working split at fold 5. This rebinds the *_test / *_train lists
# that the earlier cell filled for fold 1.
fold = 5    # 1-based fold number
fold -= 1   # 0-based position into idxs_t / idxs_tr

# Held-out part of the fold.
ids = idxs_t[fold]
notes_test, y_test, embs_test = (
    [source[i] for i in ids] for source in (notes, labels, train_embeddings)
)

# Training part of the fold.
ids = idxs_tr[fold]
notes_train, y_train, embs_train = (
    [source[i] for i in ids] for source in (notes, labels, train_embeddings)
)

In [31]:
# Two-column frames (note text, label) for the current fold. These rebind
# df_train / df_test, which earlier held the frames read from the pickles.
df_train = pd.DataFrame({'note': notes_train, 'label': y_train})
df_test = pd.DataFrame({'note': notes_test, 'label': y_test})

In [32]:
# Write the current fold's frames to CSV files.
# The suffix is derived from `fold` (0-based after the fold-selection cell, hence
# the +1) instead of a hard-coded number, so the file names cannot drift from
# the fold that was actually selected. With fold 5 selected this yields the same
# example_file_5.csv / test_file_5.csv as before.
# to_csv is called with its defaults, so the frame index is written as the
# first column.
df_train.to_csv(f"../../example_file_{fold + 1}.csv")
df_test.to_csv(f"../../test_file_{fold + 1}.csv")

In [None]:
!/home/srinivasb/.conda/envs/radoncB/bin/python3 run_dynamic_prompt_B.py \
    --exp_type zero_shot_summary \
    --large False \
    --num_gpus 4 \
    --zero_shot True \
    --examples ../../example_file_3.csv \
    --test_data ../../test_file_3.csv \
    --summary True

Starting script execution...

Processing data...
Loaded 401 examples and 100 test samples.

Loading embedding model...

Creating vector database...
Generating embeddings: 100%|██████████████████| 401/401 [00:34<00:00, 11.72it/s]
Vector database created successfully.

Initializing LLM model...
INFO 02-25 16:45:48 config.py:890] Defaulting to use mp for distributed inference
INFO 02-25 16:45:48 config.py:999] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 02-25 16:45:48 llm_engine.py:213] Initializing an LLM engine (v0.6.0) with config: model='gradientai/Llama-3-8B-Instruct-262k', speculative_config=None, tokenizer='gradientai/Llama-3-8B-Instruct-262k', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=262144, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=4, pipeline_parallel_size=1, disa

# Results
---