In [None]:
%%capture
# Install the released Unsloth package from PyPI (the cell's output is captured
# by the %%capture magic above, so the pip logs are not shown).
!pip install unsloth
# Then replace it with the current Unsloth code from GitHub; --no-deps reinstalls
# only the package itself and does not touch its dependencies.
# NOTE(review): neither install is version-pinned, so results may drift over time.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
from unsloth import FastLanguageModel
import torch
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# --- Model loading options -------------------------------------------------
# Maximum sequence length handed to the loader (any value can be chosen; the
# original note says RoPE scaling is supported automatically).
max_seq_length = 2048
# None lets the loader auto-detect the dtype (original note: Float16 for
# Tesla T4 / V100, Bfloat16 for Ampere+).
dtype = None
# 4-bit quantization to reduce memory usage; can be set to False.
load_in_4bit = True

# The model name should be the checkpoint that was used for training.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-7B",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)

In [None]:
# Move the model to the GPU, freeze its parameters and switch to eval mode.
device = torch.device('cuda')
model.to(device)
model.requires_grad_(False).eval()
None  # last expression is None so the cell does not echo the model repr

In [None]:
# `!cmd` runs in a throwaway subshell, so `! cd` never changed the notebook's
# working directory. The %cd magic does, and the relative archive name below
# is resolved against it.
%cd /content
!unzip -q public_data_dev.zip

In [None]:
def get_embeddings(texts: list[str], bs=128):
  """Embed each text as the hidden state of its last non-padding token.

  Texts are tokenized in batches of `bs` (padded, truncated to 256 tokens) and
  run through `model.model`; for every row the `last_hidden_state` vector at
  the last attended position is kept.

  Relies on the notebook-level `tokenizer`, `model` and `device`.

  Args:
    texts: strings to embed.
    bs: number of texts per forward pass.

  Returns:
    A 2-D numpy array with one row per input text. For empty input an empty
    float32 array of shape (0, model.config.hidden_size) is returned.
  """
  if len(texts) == 0:
    # np.concatenate raises on an empty list, so return a well-shaped empty result.
    return np.empty((0, model.config.hidden_size), dtype=np.float32)

  embeddings = []
  with torch.no_grad():
    for start in tqdm(range(0, len(texts), bs)):
      batch = texts[start: start + bs]
      encoded = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=256).to(device)
      mask = encoded['attention_mask']
      # Index of the last attended token in each row. Scanning the reversed mask
      # for its first 1 works for both right- and left-padded batches, whereas
      # `sum(mask) - 1` is only correct when padding is on the right.
      positions = mask.shape[1] - 1 - mask.flip(dims=[1]).argmax(dim=1)
      rows = torch.arange(positions.shape[0], device=positions.device)
      hidden = model.model(**encoded)['last_hidden_state']
      batch_emb = hidden[rows, positions, :]
      if batch_emb.dtype == torch.bfloat16:
        # numpy has no bfloat16, so .numpy() would raise; upcast first.
        batch_emb = batch_emb.float()
      embeddings.append(batch_emb.cpu().numpy())
  return np.concatenate(embeddings, axis=0)

In [None]:
# Directory path of the public dev data.
data_dir = '/content/public_data_dev/'

# Language codes; each one names a `<code>.csv` file read later in the notebook.
LANGS = [
    'eng',
    'deu',
    'esp',
    'ron',
    'ukr',
    'hin',
]

In [None]:
# Smoke test: push one short sentence through the full model without gradients.
# The result is kept in `x` for manual inspection only.
with torch.no_grad():
  x = model(
      **tokenizer(
          ['trust me, I have to do it once'],
          return_tensors='pt',
      ).to(device)
  )

In [None]:
# For every language, embed three CSV splits and save each as a .npy file.
# All splits share the same read -> embed -> save -> report steps, so they are
# driven from one table instead of three copy-pasted stanzas.
for lang in LANGS:
  print(lang)

  # (sub-directory under data_dir, prefix of the saved .npy file)
  for split_dir, out_prefix in (
      ('track_a/train', 'train_'),
      ('track_a/dev', 'dev_a_'),
      ('track_c/dev', 'dev_c_'),
  ):
    mtd = pd.read_csv(os.path.join(data_dir, split_dir, f'{lang}.csv'))
    # Empty text cells are read as NaN (a float); turn everything into plain
    # strings so only valid text reaches the tokenizer.
    texts = mtd.text.fillna('').astype(str).to_list()
    embs = get_embeddings(texts)
    np.save('/content/' + out_prefix + lang + '.npy', embs)
    print(embs.shape)


In [None]:
# Bundle every .npy file in the current working directory into one archive.
# NOTE(review): the arrays are saved under /content, so this assumes the
# working directory is /content — confirm.
! zip all_embedding.zip *.npy