In [1]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("sg-server-write")

In [2]:
# test-with-predictions-unsloth/Phi-3-medium-4k-it-pretrained

model_id = 'Phi-3-medium-4k-instruct'
model_type = 'pretrained'
model_name = "unsloth/Phi-3-medium-4k-instruct-bnb-4bit"

In [3]:
import pandas as pd
import os

p_test_df = pd.read_csv('/kaggle/input/symptom-disease-classification/test_primary.csv')
p_test_df = p_test_df[['text','disease']]
# Perform transformations on p_test_df
p_test_df['text'] = "Patient's query or symptoms : " + p_test_df['text']
p_test_df['disease'] = "Identified Health Problem : " + p_test_df['disease']

# Rename columns
p_test_df = p_test_df.rename(columns={'text': 'input', 'disease': 'output'})

# Add new column 'instruction'
instruction = ("You're an Advanced Medical Chatbot - Chatbot Dr. MCX, so assume yourself as a Healthcare Professional "
               "whose job is to listen to the Patient's query or symptoms and predict the underlying Health Problem the patient is suffering. "
               "Carefully listen to the Patient's query or symptoms and output only the Identified Health Problem in the following format: "
               "Identified Health Problem : [Health Problem Identified by you].")
p_test_df['instruction'] = instruction

p_test_df = p_test_df[['output','input','instruction']]


# Create the directory if it doesn't exist
output_dir = '/kaggle/working/test_dataset'
os.makedirs(output_dir, exist_ok=True)

# Define the file path
output_file = os.path.join(output_dir, 'transformed_test_primary.csv')

# Save the DataFrame to CSV
p_test_df.to_csv(output_file, index=False)

# Display the file path
print("Saved transformed test dataset to:", output_file)

Saved transformed test dataset to: /kaggle/working/test_dataset/transformed_test_primary.csv


In [4]:
%%capture
!mamba install --force-reinstall aiohttp -y
!pip install -U "xformers<0.0.26" --index-url https://download.pytorch.org/whl/cu121
!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"

# Temporary fix for https://github.com/huggingface/datasets/issues/6753
!pip install datasets==2.16.0 fsspec==2023.10.0 gcsfs==2023.10.0

import os
os.environ["WANDB_DISABLED"] = "true"


In [5]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b
] # More models at https://huggingface.co/unsloth

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2024-06-11 13:50:38.305394: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-11 13:50:38.305530: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-11 13:50:38.551653: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048 
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name, 
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token, 
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, 
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # 4x longer contexts auto supported!
    random_state = 3407,
    use_rslora = False,  # rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

# Inference Template
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

# Setting the model for inference
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN



# generating response and cleaning it
def get_raw_response(instruction, input_text):
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                f"{instruction}", # instruction
                f"{input_text}", # input
                "", # output - blank for generation!
            )
        ], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=16, use_cache=True)
    raw_response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return raw_response[0]

def get_response(instruction, input_text):
    raw_response = get_raw_response(instruction=instruction, input_text=input_text)
    # Extract Response from raw_response and return it as a string
    response = raw_response.split("### Response:\n")[1].strip()
    return response

# demo
instruction = ("You're an Advanced Medical Chatbot - Chatbot Dr. MCX, so assume yourself as a Healthcare Professional "
               "whose job is to listen to the Patient's query or symptoms and predict the underlying Health Problem the patient is suffering. "
               "Carefully listen to the Patient's query or symptoms and output only the Identified Health Problem in the following format: "
               "Identified Health Problem : [Health Problem Identified by you].")
input_text = "Patient's query or symptoms : skin rash, blackheads, scarring"

print(get_response(instruction=instruction, input_text=input_text))



config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors.index.json:   0%|          | 0.00/165k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.72G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.14k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unsloth 2024.5 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


Identified Health Problem: Acne


### Instruction:


In [7]:
# Function to generate prediction using get_response function
def generate_prediction(row):
    instruction = row['instruction']
    input_text = row['input']
    prediction = get_response(instruction=instruction, input_text=input_text)
    print(f'{model_type} {model_id} prediction ::--> ', prediction)
    return prediction

# Function to generate prediction using get_response function
def generate_prediction_chunk(chunk, batch_index, i):
    chunk['prediction'] = chunk.apply(generate_prediction, axis=1)
    
    # Create the directory if it doesn't exist
    output_dir = '/kaggle/working/output/'
    os.makedirs(output_dir, exist_ok=True)

    # Define the file path
    output_file = os.path.join(output_dir, f'predictions-{model_id}-{model_type}_batch_{batch_index}_i={i}.csv')
    chunk.to_csv(output_file, index=False)

# Load the test.csv file
test_df = pd.read_csv('/kaggle/working/test_dataset/transformed_test_primary.csv')

# Apply the generate_prediction_chunk function to create the prediction column in batches
batch_size = 100
for i in range(0, len(test_df), batch_size):
    chunk = test_df.iloc[i:i+batch_size]
    generate_prediction_chunk(chunk, i // batch_size, i)

pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Vertigo


### Instruction:
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Vertigo


### Instruction:
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Vestibular Neuritis


##
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Acne Vulgaris


###
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Acne Vulgaris


###
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Acne Vulgaris


###
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Acne Vulgaris


###
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Acne Vulgaris


###
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Acne

### Instruction:
You
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['prediction'] = chunk.apply(generate_prediction, axis=1)


pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Cervical Spondylosis


##
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Potential Spinal Infection or Meningitis
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Multiple Sclerosis


### Inst
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Multiple Sclerosis


### Inst
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Potential Spinal Disorder or Neurological
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Multiple Sclerosis


### Inst
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Cervical Spondylosis


##
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Potential Spinal Cord Issue or Neurological
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Po

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['prediction'] = chunk.apply(generate_prediction, axis=1)


pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Peripheral Neuropathy
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Allergic Rhinitis and possible Immune
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Diabetes Mellitus


##
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Peripheral Neuropathy
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Intertrigo


### Instruction:
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Attention Deficit Hyperactivity Disorder (AD
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Dermatitis


### Instruction
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Attention Deficit Hyperactivity Disorder (AD
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Diabetes Mellitus

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['prediction'] = chunk.apply(generate_prediction, axis=1)


pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Gastroesophageal Reflux Dise
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Angina Pectoris


### Inst
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Gastroesophageal Reflux Dise
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Gastroesophageal Reflux Dise
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Gastroesophageal Reflux Dise
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Gastroesophageal Reflux Dise
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Gastroesophageal Reflux Dise
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Gastroesophageal Reflux Dise
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Gastroesophageal Reflux Dise
pretrained P

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['prediction'] = chunk.apply(generate_prediction, axis=1)


pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Hepatitis


### Instruction:
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Liver Disease (possible Hepatitis)
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Liver Disease (possibly Hepatitis)
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Liver Disease


### Instruction
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Acute Hepatitis


### Inst
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Liver Disease (Potential Hepatitis
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Liver Disease


### Instruction
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Liver Disease (Potential Jaund
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Liver Disease (

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['prediction'] = chunk.apply(generate_prediction, axis=1)


pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Pneumonia


### Instruction:
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Pneumonia


### Instruction:
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Respiratory Infection

### Inst
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Respiratory Infection


###
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Pneumonia


### Instruction:
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Pneumonia


### Instruction:
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Pneumonia


### Instruction:
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Pneumonia


### Instruction:
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Pneumonia


### Instruction:
pretrain

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['prediction'] = chunk.apply(generate_prediction, axis=1)


pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Peripheral Venous Insufficiency
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Varicose Veins


### Inst
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Chronic Venous Insufficiency
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Chronic Venous Insufficiency
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Peripheral Artery Disease (P
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Varicose Veins

### Instruction
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Peripheral Artery Disease (P
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Chronic Venous Insufficiency
pretrained Phi-3-medium-4k-instruct prediction ::-->  Identified Health Problem: Peripheral Edema


##
pretrained Phi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['prediction'] = chunk.apply(generate_prediction, axis=1)


In [8]:
import os
import pandas as pd

# Directory containing the CSV files
directory = '/kaggle/working/output/'

# List of file names to merge
file_names = [
    f"predictions-{model_id}-{model_type}_batch_0_i=0.csv",
    f"predictions-{model_id}-{model_type}_batch_1_i=100.csv",
    f"predictions-{model_id}-{model_type}_batch_2_i=200.csv",
    f"predictions-{model_id}-{model_type}_batch_3_i=300.csv",
    f"predictions-{model_id}-{model_type}_batch_4_i=400.csv",
    f"predictions-{model_id}-{model_type}_batch_5_i=500.csv",
    f"predictions-{model_id}-{model_type}_batch_6_i=600.csv"
]

# Initialize an empty DataFrame to store the merged data
merged_df = pd.DataFrame()

# Merge the CSV files in sequence
for file_name in file_names:
    file_path = os.path.join(directory, file_name)
    df = pd.read_csv(file_path)
    merged_df = pd.concat([merged_df, df], ignore_index=True)
merged_df = merged_df[['output','prediction','input','instruction']]

# Create the directory if it doesn't exist
output_dir = '/kaggle/working/results/'
os.makedirs(output_dir, exist_ok=True)

# Define the file path
output_file = os.path.join(output_dir, f"predictions-{model_id}-{model_type}-output.csv")

# Save the merged DataFrame to a new CSV file
merged_df.to_csv(output_file, index=False)