In [None]:
from datasets import Dataset
import pandas as pd

In [None]:
# Read the training and test tables from CSV.
train_df = pd.read_csv('data/training.csv')
test_df = pd.read_csv('data/test.csv')
# Show both frames as the cell output.
(train_df, test_df)

In [None]:
# Wrap the raw frames as Hugging Face datasets, hold out 10% of the training
# rows for validation, and persist all three splits for the later cells.

# Fixed seed so data/train and data/validation contain the same rows on every
# re-run of this cell; without it each run writes a different split to disk.
SPLIT_SEED = 42

train_dataset = Dataset.from_dict({'text': train_df['sentence'], 'label': train_df['label']})
test_dataset = Dataset.from_dict({'id': test_df['id'], 'text': test_df['sentence']})

# Shuffle, then split off the validation rows.
train_dataset = train_dataset.shuffle(seed=SPLIT_SEED)
splits = train_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = splits['train']
validation_dataset = splits['test']

# Save the datasets to disk
train_dataset.save_to_disk('data/train')
test_dataset.save_to_disk('data/test')
validation_dataset.save_to_disk('data/validation')

In [None]:
from transformers import AutoTokenizer
# Tokenizer used to measure the length of each training sentence.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")


def _token_count(example):
    """Return the number of ids `tokenizer.encode` produces for the row's text."""
    token_ids = tokenizer.encode(example['text'])
    return {"len": len(token_ids)}


# Add a `len` column holding the token count of every training example.
train_dataset = train_dataset.map(_token_count)

In [None]:
# Largest value in the `len` column of the training set.
token_lengths = train_dataset['len']
max_length = max(token_lengths)
print(max_length)

In [None]:
from vllm import LLM, SamplingParams

# Start a vLLM engine from a local checkpoint directory.
llm = LLM(
    model="checkpoints/0.5b-rl/checkpoint-620",
    max_model_len=1500,
    gpu_memory_utilization=0.8,
)

In [23]:
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn as nn

# Build a 3-way sequence classifier on SmolLM2 and initialize its
# classification head from the language-model head rows of the label words.
tokenizer = AutoTokenizer.from_pretrained(
    "HuggingFaceTB/SmolLM2-360M",
    padding_side="left",
    use_fast=True,
)
model = AutoModelForSequenceClassification.from_pretrained(
    "HuggingFaceTB/SmolLM2-360M",
    num_labels=3,
    torch_dtype=torch.bfloat16,
).to("cuda")
model.config.use_cache = False
model.config.pad_token_id = tokenizer.pad_token_id
print(model)

print("Loading llm weights for head initialization.")
# Deliberately not bound to `llm`: that name holds the vLLM engine whose
# `get_tokenizer()` and `generate(..., sampling_params=...)` the generation
# cells call, and rebinding it here would break them on a top-to-bottom run.
causal_lm = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-360M",
    torch_dtype=torch.bfloat16,
).to("cuda")

label2id = {
    "negative": 0,
    "neutral": 1,
    "positive": 2
}
# Dict insertion order matches the ids, so row i of the head is label id i.
classes = list(label2id.keys())
# Only the last token id of each encoded label word is used.
# NOTE(review): if a label word encodes to several tokens, the earlier ones
# are ignored - confirm each word is a single token for this tokenizer.
tokens = [tokenizer.encode(c)[-1] for c in classes]

# Copy the selected lm_head rows into a fresh parameter for the score layer.
model.score.weight = nn.Parameter(causal_lm.lm_head.weight[tokens].detach().clone())

# The causal LM was only needed as a weight source; drop the reference.
del causal_lm
model.score

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM2-360M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 960)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=960, out_features=960, bias=False)
          (k_proj): Linear(in_features=960, out_features=320, bias=False)
          (v_proj): Linear(in_features=960, out_features=320, bias=False)
          (o_proj): Linear(in_features=960, out_features=960, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=960, out_features=2560, bias=False)
          (up_proj): Linear(in_features=960, out_features=2560, bias=False)
          (down_proj): Linear(in_features=2560, out_features=960, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((960,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((960,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((960,), eps=1e-05)
    (rotary_emb):

Linear(in_features=960, out_features=3, bias=False)

In [24]:
# Sanity check: run one sentence through the classifier and display the output.
sample_ids = tokenizer.encode("I love you")
sample_batch = torch.tensor([sample_ids]).to("cuda")
model(input_ids=sample_batch)

SequenceClassifierOutputWithPast(loss=None, logits=tensor([[1.8594, 0.2676, 2.1875]], device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<IndexBackward0>), past_key_values=None, hidden_states=None, attentions=None)

In [None]:
from datasets import load_from_disk
# Reload the persisted validation and test splits.
val_dataset, test_dataset = (
    load_from_disk(path) for path in ('data/validation', 'data/test')
)

In [None]:
# Generate one sampled completion per validation example with the vLLM engine.
sampling_params = SamplingParams(
    n = 1,
    temperature=1.0,
    top_p=0.9,
    max_tokens=1500,
)
tokenizer = llm.get_tokenizer()

# Read the prompt template inside a context manager so the file handle is
# closed instead of being left for the garbage collector.
with open("prompt_templates/sentiment.txt", "r") as prompt_file:
    prompt = prompt_file.read()

# Add a `conv` column: the template filled with the example text, rendered as
# a single-turn chat string ending with the generation prompt.
val_dataset = val_dataset.map(
    lambda example: {
        "conv": tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt.format(example['text'])}],
            tokenize=False,
            add_generation_prompt=True,
        )
    }
)

to_process = val_dataset['conv']
outputs = llm.generate(
    to_process,
    sampling_params=sampling_params,
)

In [None]:
# Turn every generation into one of the three sentiment labels.

# Label -> marker searched for in the generated text. The order is the
# tie-break order used by `max` below.
VERDICT_MARKERS = {
    'negative': 'boxed{negative}',
    'positive': 'boxed{positive}',
    'neutral': 'boxed{neutral}',
}
# Label used when no completion contains any marker. Without it, `max` over
# all-zero counts returns the first label, so unparseable generations would be
# silently reported as 'negative'. 'neutral' is chosen because the notebook's
# `score` function gives a 'neutral' verdict at least 0.5 against every label.
FALLBACK_VERDICT = 'neutral'

verdicts = []
for request_output in outputs:
    texts = [completion.text for completion in request_output.outputs]
    # Majority vote over the sampled completions of this prompt.
    votes = {
        label: sum(marker in text for text in texts)
        for label, marker in VERDICT_MARKERS.items()
    }
    best_label = max(votes, key=votes.get)
    verdicts.append(best_label if votes[best_label] > 0 else FALLBACK_VERDICT)

In [None]:
import numpy as np
def score(label, verdict):
    """Grade a predicted sentiment against the reference label.

    Returns 1 for an exact match. For a 'positive' or 'negative' label a
    'neutral' verdict earns 0.5 and anything else earns 0. For every other
    label, a 'neutral' verdict earns 1 and any other verdict earns 0.5.
    """
    if label in ('positive', 'negative'):
        if verdict == label:
            return 1
        return 0.5 if verdict == 'neutral' else 0
    # Any label other than 'positive'/'negative' is graded as neutral.
    return 1 if verdict == 'neutral' else 0.5
# Grade every validation verdict against its label, then report mean and std.
labels = val_dataset['label']
scores = [score(label, verdicts[idx]) for idx, label in enumerate(labels)]
mean_score = np.mean(scores)
print(mean_score, np.std(scores))

In [None]:
# Generate one sampled completion per test example with the vLLM engine.
sampling_params = SamplingParams(
    n = 1,
    temperature=1.0,
    top_p=0.9,
    max_tokens=1500,
)
tokenizer = llm.get_tokenizer()

# Read the prompt template inside a context manager so the file handle is
# closed instead of being left for the garbage collector.
with open("prompt_templates/sentiment.txt", "r") as prompt_file:
    prompt = prompt_file.read()

# Add a `conv` column: the template filled with the example text, rendered as
# a single-turn chat string ending with the generation prompt.
test_dataset = test_dataset.map(
    lambda example: {
        "conv": tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt.format(example['text'])}],
            tokenize=False,
            add_generation_prompt=True,
        )
    }
)

to_process = test_dataset['conv']
outputs = llm.generate(
    to_process,
    sampling_params=sampling_params,
)

In [None]:
# Turn every generation into one of the three sentiment labels.

# Label -> marker searched for in the generated text. The order is the
# tie-break order used by `max` below.
VERDICT_MARKERS = {
    'negative': 'boxed{negative}',
    'positive': 'boxed{positive}',
    'neutral': 'boxed{neutral}',
}
# Label used when no completion contains any marker. Without it, `max` over
# all-zero counts returns the first label, so unparseable generations would be
# silently submitted as 'negative'. 'neutral' is chosen because the notebook's
# `score` function gives a 'neutral' verdict at least 0.5 against every label.
FALLBACK_VERDICT = 'neutral'

verdicts = []
for request_output in outputs:
    texts = [completion.text for completion in request_output.outputs]
    # Majority vote over the sampled completions of this prompt.
    votes = {
        label: sum(marker in text for text in texts)
        for label, marker in VERDICT_MARKERS.items()
    }
    best_label = max(votes, key=votes.get)
    verdicts.append(best_label if votes[best_label] > 0 else FALLBACK_VERDICT)

In [None]:
# save verdicts to file
import pandas as pd
# Pair each test id with its verdict and write the two-column CSV.
submission_columns = {'id': test_dataset['id'], 'label': verdicts}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('submission.csv', index=False)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader
import tqdm

# Load the 3-label sequence classifier and its tokenizer from the same checkpoint id.
classifier_checkpoint = "rd211/custom-trainer"
model = AutoModelForSequenceClassification.from_pretrained(
    classifier_checkpoint,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    num_labels=3,
)
tokenizer = AutoTokenizer.from_pretrained(classifier_checkpoint)


In [None]:
from hydra import compose, initialize
from omegaconf import OmegaConf
from data_loader import get_dataset

# Run the classifier over the validation split and collect logits and argmax predictions.
with initialize(version_base=None, config_path="config/classifier", job_name="train"):
    cfg = compose(config_name="train")

# Point the composed config at the validation split before building the dataset.
cfg.data.path = './data/validation'
ds_val, collator = get_dataset(cfg, tokenizer=tokenizer)

# NOTE(review): batches are turned into tensors without padding below, so this
# presumably relies on batch_size == 1 or on pre-padded inputs - confirm
# before raising it.
batch_size = 1
ds_val = ds_val['train'].batch(batch_size)

model.eval()

all_predictions, all_logits = [], []

with torch.no_grad():
    for batch in tqdm.tqdm(ds_val, desc="Inference"):
        input_ids = torch.tensor(batch['input_ids']).to('cuda')
        attention_mask = torch.tensor(batch['attention_mask']).to('cuda')
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        # Move results off the GPU before accumulating them.
        all_predictions.append(predictions.cpu())
        all_logits.append(logits.cpu())

final_predictions = torch.cat(all_predictions)
final_logits = torch.cat(all_logits)

In [None]:
from data_loader import id2label
# Map predicted class ids to label names and preview the first ten.
verdicts = [id2label[class_id] for class_id in final_predictions.tolist()]
print(verdicts[:10])

In [None]:
import numpy as np
def score(label, verdict):
    """Grade a predicted sentiment against the reference label.

    Returns 1 for an exact match. For a 'positive' or 'negative' label a
    'neutral' verdict earns 0.5 and anything else earns 0. For every other
    label, a 'neutral' verdict earns 1 and any other verdict earns 0.5.
    """
    if label in ('positive', 'negative'):
        if verdict == label:
            return 1
        return 0.5 if verdict == 'neutral' else 0
    # Any label other than 'positive'/'negative' is graded as neutral.
    return 1 if verdict == 'neutral' else 0.5
        
import random
# Grade the classifier's validation verdicts, then report mean and std.
labels = val_dataset['label']
# random.shuffle(labels)
scores = [score(label, verdicts[idx]) for idx, label in enumerate(labels)]
mean_score = np.mean(scores)
print(mean_score, np.std(scores))

In [None]:
from hydra import compose, initialize
from omegaconf import OmegaConf
from data_loader import get_dataset

# Run the classifier over the test split and collect logits and argmax predictions.
with initialize(version_base=None, config_path="config/classifier", job_name="train"):
    cfg = compose(config_name="train")

# Point the composed config at the test split before building the dataset.
cfg.data.path = './data/test'
ds_test, collator = get_dataset(cfg, tokenizer=tokenizer)

# NOTE(review): batches are turned into tensors without padding below, so this
# presumably relies on batch_size == 1 or on pre-padded inputs - confirm
# before raising it.
batch_size = 1
ds_test = ds_test['train'].batch(batch_size)

model.eval()

all_predictions, all_logits = [], []

with torch.no_grad():
    for batch in tqdm.tqdm(ds_test, desc="Inference"):
        input_ids = torch.tensor(batch['input_ids']).to('cuda')
        attention_mask = torch.tensor(batch['attention_mask']).to('cuda')
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        # Move results off the GPU before accumulating them.
        all_predictions.append(predictions.cpu())
        all_logits.append(logits.cpu())

final_predictions = torch.cat(all_predictions)
final_logits = torch.cat(all_logits)

In [None]:
from data_loader import id2label
# Map predicted class ids to label names.
verdicts = [id2label[class_id] for class_id in final_predictions.tolist()]

In [None]:
# `ds_test` was batched, so its `id` column holds one list per batch; flatten it.
ids_ = ds_test['id']
ids = [example_id for batch_ids in ids_ for example_id in batch_ids]
print(ids[:10])

In [None]:
import pandas as pd
# Pair each flattened test id with its predicted label and write the CSV.
submission_columns = {'id': ids, 'label': verdicts}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('submission.csv', index=False)

In [None]:
from datasets import load_from_disk
# Reload the persisted validation and test splits.
val_dataset, test_dataset = (
    load_from_disk(path) for path in ('data/validation', 'data/test')
)

In [2]:
from rag import EmbeddingStore

from datasets import load_from_disk
# Validation split reloaded from disk; it is passed to the EmbeddingStore built below.
val_dataset = load_from_disk('data/validation')

In [5]:
# Display the dataset summary (feature names and row count).
val_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 5105
})

In [6]:
# Build the retrieval store over the validation examples.
# NOTE(review): few-shot examples come from the validation split; evaluating on
# that same split would presumably let a query retrieve itself - confirm.
embedding_store = EmbeddingStore(embedding_model='jinaai/jina-embeddings-v3', ds=val_dataset)

[2025-05-23 05:00:35,710] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/rd/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/rd/miniconda3/compiler_compat/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status
  @custom_fwd
  @custom_bwd


In [10]:
# Display the 5 stored examples returned for a sample query.
embedding_store.get_k_nearest("Hello world", k=5)

[{'review': 'Look forward to seeing you.', 'classification': 'positive'},
 {'review': 'Right...', 'classification': 'neutral'},
 {'review': 'What?', 'classification': 'neutral'},
 {'review': 'Right!', 'classification': 'neutral'},
 {'review': 'Please come here.', 'classification': 'positive'}]

In [11]:
from jinja2 import Template
# Load the k-shot prompt template; the context manager closes the file handle
# instead of leaving it open until garbage collection.
with open("prompt_templates/sentiment_kshot.txt", "r") as template_file:
    template = Template(template_file.read())

In [12]:
# Render a k-shot prompt for a sample query and display the resulting string.
# NOTE(review): the recorded output of this cell ends with an empty "Review: "
# line, so the "Hello world" text passed as `query` does not appear in the
# prompt - presumably the template reads a different variable name; check
# prompt_templates/sentiment_kshot.txt.
template.render(
  examples=embedding_store.get_k_nearest("Hello world", k=5),
  query="Hello world",
)

'You are tasked with indetifying if a certain review is negative, neutral or positive.\n\n\nReview: Look forward to seeing you.\nClassification: positive\n\nReview: Right...\nClassification: neutral\n\nReview: What?\nClassification: neutral\n\nReview: Right!\nClassification: neutral\n\nReview: Please come here.\nClassification: positive\n\nReview: \nClassification:'