In [98]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error, r2_score

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
import accelerate

from tqdm import tqdm

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [114]:
dataset = pd.read_csv('../data/bmi.csv')
dataset.drop(columns=['Bmi'], inplace=True)
# dataset.drop_duplicates(subset=['Sentence'], inplace=True)
# dataset.reset_index(drop=True, inplace=True)
dataset = dataset.iloc[:100]
dataset

Unnamed: 0,Age,Height,Weight,BmiClass
0,61,1.85,109.30,Obese Class 1
1,60,1.71,79.02,Overweight
2,60,1.55,74.70,Obese Class 1
3,60,1.46,35.90,Underweight
4,60,1.58,97.10,Obese Class 2
...,...,...,...,...
95,45,1.73,74.38,Normal Weight
96,45,1.75,83.58,Overweight
97,45,1.68,140.00,Obese Class 3
98,45,1.84,68.00,Normal Weight


In [46]:
dataset.columns

Index(['Age', 'Height', 'Weight', 'BmiClass'], dtype='object')

In [115]:
dataset['BmiClass'].value_counts()

BmiClass
Overweight       49
Underweight      19
Obese Class 2    13
Obese Class 1     9
Normal Weight     6
Obese Class 3     4
Name: count, dtype: int64

In [116]:
target_label = 'Weight'

print(f'No. of samples: {len(dataset)}')
print(f'No. of features: {dataset.drop(columns=target_label).shape[1]}')
print(f'Features: {dataset.drop(columns=target_label).columns.to_list()}')
print(f'Target: {target_label}')
print(f'Mean of target: {dataset[target_label].mean()}')
print(f'Standard dev of target: {dataset[target_label].std()}')

No. of samples: 100
No. of features: 3
Features: ['Age', 'Height', 'BmiClass']
Target: Weight
Mean of target: 77.5539
Standard dev of target: 24.64397280771659


In [117]:
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(columns=[target_label]), dataset[target_label], test_size=.2)

X_train = X_train.astype(str)
X_test = X_test.astype(str)
y_train = y_train.astype(np.float16)
y_test = y_test.astype(np.float16)

In [118]:
y_train = y_train.astype(float).to_numpy()
y_test = y_test.astype(float).to_numpy()

In [3]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

In [4]:
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_use_double_quant=True,
    bnb_8bit_quant_type='nf8',  # Can be 'nf4' or 'fp4'
    bnb_8bit_compute_dtype=torch.bfloat16  # Adjust compute type if needed
)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map='auto',
    quantization_config=quantization_config,
    output_hidden_states=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear8bitLt(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )


In [7]:
def get_model_size(model):
    total_params = sum(p.numel() for p in model.parameters())
    total_size_bytes = total_params * 2
    total_size_gb = total_size_bytes / (1024 ** 3)
    print(f"Model size: {total_size_gb:.2f} GB")

get_model_size(model)

Model size: 14.96 GB


In [8]:
model.device

device(type='cuda', index=0)

In [59]:
def prompt_llm(prompt, max_tokens=20):
    messages = [
        {"role": "system", "content": "You are weight predicting chatbot that can predict weight (in Kg) of a person based on the given height (in meter) and Weight category (amongst [Underweight, Normal, Overweight, Obese Class 1, Obese Class 2, Obese Class 3])."},
        {"role": "user", "content": prompt},
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_new_tokens=max_tokens,
            eos_token_id=terminators,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.2,
            top_p=0.9,
            output_hidden_states=True,  # Enable hidden states
            return_dict_in_generate=True  # Ensure hidden states are included in the output
        )

    generated_tokens = outputs.sequences[0][input_ids.shape[-1]:]
    
    hidden_states = outputs.hidden_states  # Shape: (num_layers, batch_size, seq_len, hidden_size)
    num_layers = len(hidden_states)
    # return hidden_states
    
    # Last Token Embedding
    first_layer_hidden_state = hidden_states[-1][0][:, -1, :].squeeze().cpu().numpy()
    middle_layer_hidden_state = hidden_states[-1][num_layers // 2][:, -1, :].squeeze().cpu().numpy()
    last_layer_hidden_state = hidden_states[-1][-1][:, -1, :].squeeze().cpu().numpy()

    response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    del input_ids
    torch.cuda.empty_cache()

    return {
        'prompt': prompt,
        'response': response_text,
        'first_layer_embedding': first_layer_hidden_state,  # Embedding from the first layer
        'middle_layer_embedding': middle_layer_hidden_state,  # Embedding from the middle layer
        'last_layer_embedding': last_layer_hidden_state  # Embedding from the last layer
    }

In [55]:
dataset.iloc[11]

Age                 58
Height             1.7
Weight           73.41
BmiClass    Overweight
Name: 11, dtype: object

In [62]:
prompt_llm(f"What will be the weight of given person:\n Age={dataset.iloc[11]['Age']}, Height={dataset.iloc[11]['Height']}, Weight Category={dataset.iloc[11]['BmiClass']}? Just give output number.", max_tokens=40)['response']

'73.5'

In [None]:
for x in X_train.iterrows():
    prompt = f"{', '.join([': '.join(i) for i in zip(x[1].index, x[1].values)])}"
    output = prompt_llm(prompt)
    print(output)
    print(len(output))
    print(len(output[0]))
    print(output[0][0].shape)
    break

In [127]:
train_embeddings_first = []
train_embeddings_middle = []
train_embeddings_last = []
train_prompts = []
train_responses = []

for x in tqdm(X_train.iterrows(), total=len(X_train), desc='Generating embeddings for training data'):
    prompt = f"What is the expected weight of the given person:\n Age={x[1]['Age']}, Height={x[1]['Height']}, Category={x[1]['BmiClass']}? Just give output number."
    output = prompt_llm(prompt)
    train_prompts.append(output['prompt'])
    train_responses.append(output['response'])
    train_embeddings_first.append(output['first_layer_embedding'])
    train_embeddings_middle.append(output['middle_layer_embedding'])
    train_embeddings_last.append(output['last_layer_embedding'])

train_embeddings_first = np.array(train_embeddings_first)
train_embeddings_middle = np.array(train_embeddings_middle)
train_embeddings_last = np.array(train_embeddings_last)

Generating embeddings for training data: 100%|██████████| 80/80 [01:06<00:00,  1.20it/s]


In [128]:
test_embeddings_first = []
test_embeddings_middle = []
test_embeddings_last = []
test_prompts = []
test_responses = []

for x in tqdm(X_test.iterrows(), total=len(X_test), desc='Generating embeddings for testing data'):
    prompt = f"What will be the weight of given person:\nHeight={x[1]['Height']}, Age={x[1]['Age']}, Category={x[1]['BmiClass']}? Just give output number."
    output = prompt_llm(prompt)
    test_prompts.append(output['prompt'])
    test_responses.append(output['response'])
    test_embeddings_first.append(output['first_layer_embedding'])
    test_embeddings_middle.append(output['middle_layer_embedding'])
    test_embeddings_last.append(output['last_layer_embedding'])

test_embeddings_first = np.array(test_embeddings_first)
test_embeddings_middle = np.array(test_embeddings_middle)
test_embeddings_last = np.array(test_embeddings_last)

Generating embeddings for testing data: 100%|██████████| 20/20 [00:16<00:00,  1.21it/s]


In [129]:
# rg_model_first = LinearRegression()
# rg_model_middle = LinearRegression()
# rg_model_last = LinearRegression()

rg_model_first = RandomForestRegressor(warm_start=True)
rg_model_middle = RandomForestRegressor(warm_start=True)
rg_model_last = RandomForestRegressor(warm_start=True)

In [118]:
# y_train = y_train.to_numpy(dtype='object')
# y_test = y_test.to_numpy(dtype='object')

In [130]:
rg_model_first.fit(train_embeddings_first, y_train)
rg_model_middle.fit(train_embeddings_middle, y_train)
rg_model_last.fit(train_embeddings_last, y_train)

In [131]:
y_pred_train_first = rg_model_first.predict(train_embeddings_first)
y_pred_train_middle = rg_model_middle.predict(train_embeddings_middle)
y_pred_train_last = rg_model_last.predict(train_embeddings_last)

y_train, y_pred_train_first, y_pred_train_middle, y_pred_train_last

(array([ 84.875  , 109.3125 ,  79.     ,  73.5    ,  91.1875 ,  36.59375,
         36.40625, 100.3125 ,  85.125  ,  79.     , 160.     ,  77.875  ,
        110.     ,  79.3125 ,  83.8125 ,  36.1875 ,  36.09375,  85.     ,
         36.6875 ,  73.6875 ,  79.3125 ,  79.3125 , 140.     ,  79.     ,
         73.3125 ,  73.4375 ,  40.     ,  79.6875 ,  74.125  , 111.1875 ,
         84.375  ,  74.375  ,  78.125  ,  79.3125 ,  35.90625,  74.1875 ,
         99.6875 ,  70.1875 ,  36.1875 , 106.875  ,  79.3125 ,  36.     ,
        104.6875 , 103.1875 ,  36.     , 107.3125 ,  79.3125 ,  79.     ,
         78.3125 , 112.1875 ,  74.1875 ,  74.6875 ,  36.90625,  79.3125 ,
         85.3125 ,  75.     ,  83.6875 ,  79.3125 ,  83.9375 ,  76.3125 ,
         36.8125 ,  83.5625 , 110.875  ,  98.5    ,  72.5    ,  36.09375,
         99.125  , 110.5    ,  79.     ,  70.     ,  74.0625 , 100.     ,
         75.8125 ,  73.8125 ,  95.     ,  42.     ,  36.5    ,  74.     ,
         84.     ,  85.1875 ]),
 array

In [132]:
print(f'First Layer MSE: {mean_squared_error(y_train, y_pred_train_first)}')
print(f'Middle Layer MSE: {mean_squared_error(y_train, y_pred_train_middle)}')
print(f'Last Layer MSE: {mean_squared_error(y_train, y_pred_train_last)}')

First Layer MSE: 574.3517347884845
Middle Layer MSE: 12.078587659912106
Last Layer MSE: 21.49789121215821


In [133]:
y_pred_test_first = rg_model_first.predict(test_embeddings_first)
y_pred_test_middle = rg_model_middle.predict(test_embeddings_middle)
y_pred_test_last = rg_model_last.predict(test_embeddings_last)

y_test, y_pred_test_first, y_pred_test_middle, y_pred_test_last

(array([ 46.    ,  79.    ,  75.3125, 101.6875,  74.    , 135.    ,
        105.125 ,  73.    ,  79.25  ,  79.25  ,  44.    ,  97.125 ,
         36.    ,  36.3125,  68.    ,  98.    ,  73.5625,  77.    ,
         79.5   ,  74.3125]),
 array([79.64702442, 79.64702442, 79.64702442, 79.64702442, 79.64702442,
        79.64702442, 79.64702442, 79.64702442, 79.64702442, 79.64702442,
        79.64702442, 79.64702442, 79.64702442, 79.64702442, 79.64702442,
        79.64702442, 79.64702442, 79.64702442, 79.64702442, 79.64702442]),
 array([ 44.5609375,  78.334375 ,  78.530625 ,  91.161875 ,  81.6028125,
        106.025625 ,  90.90875  ,  77.0184375,  80.12625  ,  79.944375 ,
         46.4346875, 105.825625 ,  41.855    ,  38.41625  ,  77.2253125,
         88.7734375,  82.31125  ,  82.6465625,  79.84125  ,  77.6153125]),
 array([41.1009375, 54.7553125, 87.6359375, 93.0078125, 79.59     ,
        97.3809375, 88.405625 , 86.4853125, 90.354375 , 85.1334375,
        44.7340625, 84.25375  , 42.89125  

In [134]:
print(f'First Layer MSE: {mean_squared_error(y_test, y_pred_test_first)}')
print(f'Middle Layer MSE: {mean_squared_error(y_test, y_pred_test_middle)}')
print(f'Last Layer MSE: {mean_squared_error(y_test, y_pred_test_last)}')

First Layer MSE: 566.5351669907324
Middle Layer MSE: 82.52481687499998
Last Layer MSE: 189.62282027832035
