In [None]:
from datasets import Dataset
import pandas as pd

In [None]:
# Read the raw CSV files (paths are relative to the working directory).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('data/training.csv', 'data/test.csv')
)
# Last expression of the cell: show both frames for a quick visual check.
train_df, test_df

In [None]:
# Fixed seed so the shuffle and the train/validation split are reproducible.
# Without it every re-run writes a different validation set to disk, which makes
# scores from different runs (and from the later cells that reload
# 'data/validation') impossible to compare.
SPLIT_SEED = 42

# Wrap the raw frames as datasets: training rows keep text + label,
# test rows keep id + text.
train_dataset = Dataset.from_dict({'text': train_df['sentence'], 'label': train_df['label']})
test_dataset = Dataset.from_dict({'id': test_df['id'], 'text': test_df['sentence']})
# shuffle
train_dataset = train_dataset.shuffle(seed=SPLIT_SEED)

# Hold out 10% of the labelled rows as a validation set.
split = train_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = split['train']
validation_dataset = split['test']
# Save the datasets to disk
train_dataset.save_to_disk('data/train')
test_dataset.save_to_disk('data/test')
validation_dataset.save_to_disk('data/validation')

In [None]:
from transformers import AutoTokenizer
# Tokenizer for Qwen2.5-0.5B-Instruct; used here to measure how long each sentence is in tokens.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")


def _token_count(example):
    """Return {"len": n} where n is the number of ids `tokenizer.encode` yields for the example's text."""
    token_ids = tokenizer.encode(example['text'])
    return {"len": len(token_ids)}


# Adds a `len` column to every training row.
train_dataset = train_dataset.map(_token_count)

In [None]:
# Longest training example, measured via the `len` column.
token_lengths = train_dataset['len']
max_length = max(token_lengths)
print(max_length)

In [None]:
from vllm import LLM, SamplingParams

# Build the vLLM engine from a local fine-tuned checkpoint.
llm = LLM(
    model="checkpoints/0.5b-rl/checkpoint-620",
    max_model_len=1500,
    gpu_memory_utilization=0.8,
)

In [None]:
from datasets import load_from_disk
# Reload the validation and test splits that were written to disk earlier in the notebook.
val_dataset, test_dataset = (
    load_from_disk(saved_path) for saved_path in ('data/validation', 'data/test')
)

In [None]:
# One sampled completion per prompt.
# NOTE(review): max_tokens equals the max_model_len (1500) the engine was built
# with, so prompt + completion cannot both use that budget - confirm the intended limit.
sampling_params = SamplingParams(
    n=1,
    temperature=1.0,
    top_p=0.9,
    max_tokens=1500,
)
tokenizer = llm.get_tokenizer()

# Read the template inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open("prompt_templates/sentiment.txt", "r") as template_file:
    prompt = template_file.read()

# Render every validation sentence into a single-turn chat prompt string
# (tokenize=False keeps it as text; the generation prompt is appended).
val_dataset = val_dataset.map(
    lambda example: {
        "conv": tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt.format(example['text'])}],
            tokenize=False,
            add_generation_prompt=True,
        )
    }
)

to_process = val_dataset['conv']
outputs = llm.generate(
    to_process,
    sampling_params=sampling_params,
)

In [None]:
# Answer tags searched for in each completion. The list order is also the
# tie-breaking order: when two tags have the same count, the earlier one wins.
type_of_verdicts = ['boxed{negative}', 'boxed{positive}', 'boxed{neutral}']
# Verdict used when no completion contains any tag. Previously this case fell
# through to the first list entry ('negative') purely because of how max()
# resolves ties; 'neutral' is used instead because `score` in this notebook
# never rates a neutral verdict below 0.5, whatever the label.
fallback_verdict = 'neutral'

verdicts = []
for request_output in outputs:
    texts = [completion.text for completion in request_output.outputs]
    # We do a majority vote to get the final verdict
    votes = {tag: sum(tag in text for text in texts) for tag in type_of_verdicts}
    winner = max(type_of_verdicts, key=votes.get)
    if votes[winner] == 0:
        # Nothing parseable was generated for this prompt.
        verdicts.append(fallback_verdict)
    else:
        verdicts.append(winner.replace('boxed{', '').replace('}', ''))

In [None]:
import numpy as np
def score(label, verdict):
    """Rate a predicted verdict against the gold label.

    For a 'positive' or 'negative' label: 1 for an exact match, 0.5 when the
    verdict is 'neutral', 0 for anything else. For any other label (treated as
    neutral): 1 when the verdict is 'neutral', otherwise 0.5.
    """
    if label in ('positive', 'negative'):
        if verdict == label:
            return 1
        return 0.5 if verdict == 'neutral' else 0
    return 1 if verdict == 'neutral' else 0.5
# Score each verdict against the validation label at the same position.
labels = val_dataset['label']
scores = [score(labels[idx], verdicts[idx]) for idx in range(len(labels))]
# get the mean of the scores
mean_score = np.mean(scores)
print(mean_score, np.std(scores))

In [None]:
# One sampled completion per prompt.
# NOTE(review): max_tokens equals the max_model_len (1500) the engine was built
# with, so prompt + completion cannot both use that budget - confirm the intended limit.
sampling_params = SamplingParams(
    n=1,
    temperature=1.0,
    top_p=0.9,
    max_tokens=1500,
)
tokenizer = llm.get_tokenizer()

# Read the template inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open("prompt_templates/sentiment.txt", "r") as template_file:
    prompt = template_file.read()

# Render every test sentence into a single-turn chat prompt string
# (tokenize=False keeps it as text; the generation prompt is appended).
test_dataset = test_dataset.map(
    lambda example: {
        "conv": tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt.format(example['text'])}],
            tokenize=False,
            add_generation_prompt=True,
        )
    }
)

to_process = test_dataset['conv']
outputs = llm.generate(
    to_process,
    sampling_params=sampling_params,
)

In [None]:
# Answer tags searched for in each completion. The list order is also the
# tie-breaking order: when two tags have the same count, the earlier one wins.
type_of_verdicts = ['boxed{negative}', 'boxed{positive}', 'boxed{neutral}']
# Verdict used when no completion contains any tag. Previously this case fell
# through to the first list entry ('negative') purely because of how max()
# resolves ties; 'neutral' is used instead because `score` in this notebook
# never rates a neutral verdict below 0.5, whatever the label.
fallback_verdict = 'neutral'

verdicts = []
for request_output in outputs:
    texts = [completion.text for completion in request_output.outputs]
    # We do a majority vote to get the final verdict
    votes = {tag: sum(tag in text for text in texts) for tag in type_of_verdicts}
    winner = max(type_of_verdicts, key=votes.get)
    if votes[winner] == 0:
        # Nothing parseable was generated for this prompt.
        verdicts.append(fallback_verdict)
    else:
        verdicts.append(winner.replace('boxed{', '').replace('}', ''))

In [None]:
# save verdicts to file
import pandas as pd
# One row per test example: its id next to the verdict at the same position.
submission_columns = {'id': test_dataset['id'], 'label': verdicts}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('submission.csv', index=False)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader
import tqdm

# Sequence-classification model and its tokenizer, both loaded from the same repository id.
classifier_repo = "rd211/custom-trainer"
model = AutoModelForSequenceClassification.from_pretrained(
    classifier_repo,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    num_labels=3,
)
tokenizer = AutoTokenizer.from_pretrained(classifier_repo)


In [None]:
from hydra import compose, initialize
from omegaconf import OmegaConf
from data_loader import get_dataset

# Compose the classifier's Hydra config so the data goes through `get_dataset`.
with initialize(version_base=None, config_path="config/classifier", job_name="train"):
    cfg = compose(config_name="train")

# Point the config at the validation split saved to disk earlier.
cfg.data.path = './data/validation'
# `collator` is returned by get_dataset but not used in this cell.
ds_val, collator = get_dataset(cfg, tokenizer=tokenizer)

# The loaded data is exposed under the 'train' key; iterate it one example per batch.
# NOTE(review): batch_size=1 presumably avoids padding since the collator is unused - confirm.
batch_size = 1
ds_val = ds_val['train'].batch(batch_size)

model.eval()

# Send inputs to the device the model actually lives on. The model is loaded
# with device_map="auto", so a hard-coded 'cuda' breaks on CPU-only machines
# and can target the wrong GPU.
device = model.device

all_predictions = []
all_logits = []

with torch.no_grad():
    for batch in tqdm.tqdm(ds_val, desc="Inference"):
        outputs = model(
            input_ids=torch.tensor(batch['input_ids']).to(device),
            attention_mask=torch.tensor(batch['attention_mask']).to(device),
        )

        logits = outputs.logits

        # Predicted class id = index of the largest logit.
        predictions = torch.argmax(logits, dim=-1)
        all_predictions.append(predictions.cpu())
        all_logits.append(logits.cpu())

# Concatenate per-batch results along the first dimension.
final_predictions = torch.cat(all_predictions)
final_logits = torch.cat(all_logits)

In [None]:
from data_loader import id2label
# Translate predicted class ids into label names via id2label.
verdicts = [id2label[class_id] for class_id in final_predictions.tolist()]
print(verdicts[:10])

In [None]:
import numpy as np
def score(label, verdict):
    """Rate a predicted verdict against the gold label.

    For a 'positive' or 'negative' label: 1 for an exact match, 0.5 when the
    verdict is 'neutral', 0 for anything else. For any other label (treated as
    neutral): 1 when the verdict is 'neutral', otherwise 0.5.
    """
    if label in ('positive', 'negative'):
        if verdict == label:
            return 1
        return 0.5 if verdict == 'neutral' else 0
    return 1 if verdict == 'neutral' else 0.5
        
import random
# Score each classifier verdict against the validation label at the same position.
# NOTE(review): assumes the verdicts are in the same order as val_dataset - confirm.
labels = val_dataset['label']
# random.shuffle(labels)
scores = [score(labels[idx], verdicts[idx]) for idx in range(len(labels))]
# get the mean of the scores
mean_score = np.mean(scores)
print(mean_score, np.std(scores))

In [None]:
from hydra import compose, initialize
from omegaconf import OmegaConf
from data_loader import get_dataset

# Compose the classifier's Hydra config so the data goes through `get_dataset`.
with initialize(version_base=None, config_path="config/classifier", job_name="train"):
    cfg = compose(config_name="train")

# Point the config at the test split saved to disk earlier.
cfg.data.path = './data/test'
# `collator` is returned by get_dataset but not used in this cell.
ds_test, collator = get_dataset(cfg, tokenizer=tokenizer)

# The loaded data is exposed under the 'train' key; iterate it one example per batch.
# NOTE(review): batch_size=1 presumably avoids padding since the collator is unused - confirm.
batch_size = 1
ds_test = ds_test['train'].batch(batch_size)

model.eval()

# Send inputs to the device the model actually lives on. The model is loaded
# with device_map="auto", so a hard-coded 'cuda' breaks on CPU-only machines
# and can target the wrong GPU.
device = model.device

all_predictions = []
all_logits = []

with torch.no_grad():
    for batch in tqdm.tqdm(ds_test, desc="Inference"):
        outputs = model(
            input_ids=torch.tensor(batch['input_ids']).to(device),
            attention_mask=torch.tensor(batch['attention_mask']).to(device),
        )

        logits = outputs.logits

        # Predicted class id = index of the largest logit.
        predictions = torch.argmax(logits, dim=-1)
        all_predictions.append(predictions.cpu())
        all_logits.append(logits.cpu())

# Concatenate per-batch results along the first dimension.
final_predictions = torch.cat(all_predictions)
final_logits = torch.cat(all_logits)

In [None]:
from data_loader import id2label
# Translate predicted class ids into label names via id2label.
verdicts = [id2label[class_id] for class_id in final_predictions.tolist()]

In [None]:
# ds_test is batched, so its 'id' column holds one list of ids per batch.
ids_ = ds_test['id']
# We flatten the list of ds_test
ids = [single_id for batch_ids in ids_ for single_id in batch_ids]
print(ids[:10])

In [None]:
import pandas as pd
# Pair each flattened test id with the classifier verdict at the same position.
# This writes to the same 'submission.csv' path as the generation-based cell earlier in the notebook.
submission_columns = {'id': ids, 'label': verdicts}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('submission.csv', index=False)

In [None]:
from datasets import load_from_disk
# Reload the validation and test splits from disk, replacing any mapped copies held in memory.
val_dataset, test_dataset = (
    load_from_disk(saved_path) for saved_path in ('data/validation', 'data/test')
)

In [None]:
import numpy as np
import xgboost as xgb

def score(label, verdict):
    """Rate a predicted verdict against the gold label.

    For a 'positive' or 'negative' label: 1 for an exact match, 0.5 when the
    verdict is 'neutral', 0 for anything else. For any other label (treated as
    neutral): 1 when the verdict is 'neutral', otherwise 0.5.
    """
    if label in ('positive', 'negative'):
        if verdict == label:
            return 1
        return 0.5 if verdict == 'neutral' else 0
    return 1 if verdict == 'neutral' else 0.5

def custom_objective(preds, dtrain):
    """Gradient and Hessian for maximising the expected custom score.

    The raw predictions are reshaped to (n_samples, 3) and turned into class
    probabilities with a softmax. With `s` the per-class score for each
    sample's true label and `mu = sum(probs * s)` the expected score, the
    returned gradient is that of `-mu` with respect to the raw predictions,
    and the Hessian is the matching per-element second derivative, floored
    at 1e-6.

    Args:
        preds: array of raw predictions; reshaped to (-1, 3) internally.
        dtrain: object whose `get_label()` returns the class ids
            (0 = negative, 1 = neutral, 2 = positive).

    Returns:
        (grad, hess): two flattened arrays.
    """
    num_class = 3
    logits = preds.reshape(-1, num_class)

    # Softmax; subtracting the row maximum first keeps exp() from overflowing.
    shifted_exp = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    probs = shifted_exp / np.sum(shifted_exp, axis=1, keepdims=True)

    labels = dtrain.get_label().astype(int)

    # Score earned by predicting each class, keyed by the true class id.
    # Rows whose label is not 0, 1 or 2 stay all-zero.
    score_rows = {
        0: [1.0, 0.5, 0.0],  # negative
        1: [0.5, 1.0, 0.5],  # neutral
        2: [0.0, 0.5, 1.0],  # positive
    }
    s = np.zeros_like(probs)
    for class_id, row in score_rows.items():
        s[labels == class_id] = row

    # Expected score per sample, shape (n, 1).
    mu = np.sum(probs * s, axis=1, keepdims=True)

    grad = probs * (mu - s)
    # Floor the Hessian so no entry is zero or negative.
    hess = np.maximum(grad * (1 - 2 * probs), 1e-6)

    return grad.flatten(), hess.flatten()

from data_loader import id2label, label2id

# Features: the classifier logits currently held in `final_logits`.
# Targets: validation labels mapped to integer ids via label2id.
X_train = final_logits.float()
y_train = val_dataset['label']
y_train = [label2id[i] for i in y_train]

# `final_logits` is whatever the most recently executed inference cell left
# behind. On a top-to-bottom run that is the TEST inference, which does not
# correspond to the validation labels - fail fast with a clear message.
if len(X_train) != len(y_train):
    raise ValueError(
        f"final_logits has {len(X_train)} rows but val_dataset has {len(y_train)} labels; "
        "re-run the validation inference cell before this one."
    )

X_train_np = X_train.detach().cpu().numpy()  # training features
# NOTE: the "validation" arrays are the same data as the training arrays, so
# the 'eval' numbers below are not a held-out estimate.
X_val_np   = X_train.detach().cpu().numpy()      # validation features

# Convert labels (already processed via label2id) to numpy arrays.
y_train_np = np.array(y_train)
y_val_np   = np.array(y_train)

# Prepare DMatrix for training and validation.
dtrain = xgb.DMatrix(X_train_np, label=y_train_np)
dval   = xgb.DMatrix(X_val_np, label=y_val_np)

params = {
    'num_class': 3,
    'eval_metric': 'mlogloss'  # this metric is used for logging; training uses the custom objective
}

# Create a watchlist to monitor performance.
watchlist = [(dtrain, 'train'), (dval, 'eval')]

# Train the model using xgb.train with the custom objective.
bst = xgb.train(params, dtrain, num_boost_round=10000, obj=custom_objective, evals=watchlist)

# -------------------------------
# Evaluation using the custom scoring function.
# With a custom objective, predictions are raw per-class scores, not labels.
preds = bst.predict(dval)
print(preds)
# Reshape to (n_samples, num_class) and take the argmax to get one class id per
# sample. This step was missing: the raw scores were passed straight to int().
pred_labels = np.argmax(np.asarray(preds).reshape(-1, params['num_class']), axis=1)
print(y_val_np, pred_labels)
# Mapping from indices to verdict strings (order must match the one used above).
verdicts = ['negative', 'neutral', 'positive']

# Calculate custom scores for each instance.
scores = [score(verdicts[int(y_true)], verdicts[int(y_pred)])
          for y_true, y_pred in zip(y_val_np, pred_labels)]

mean_score = np.mean(scores)
std_score = np.std(scores)
print("Mean custom score:", mean_score)
print("Std custom score:", std_score)


In [None]:
# Display the logits tensor left by the most recently executed inference cell.
final_logits