In [4]:
!export CUDA_VISIBLE_DEVICES=1,3
bge_path = "/media/wuyuhuan/bge-small-zh"
from transformers import AutoTokenizer, AutoModel
import torch
# Sentences we want sentence embeddings for
sentences = ["样例数据-1", "样例数据-2"]

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(bge_path)
model = AutoModel.from_pretrained(bge_path)
model.eval()

# Tokenize sentences
encoded_input = tokenizer(sentences, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
# for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages)
# encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)
    # Perform pooling. In this case, cls pooling.
    sentence_embeddings = model_output[0][:, 0]
# normalize embeddings
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
print("Sentence embeddings:", sentence_embeddings)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Sentence embeddings: tensor([[-0.0141, -0.0318,  0.0410,  ..., -0.0393, -0.0152,  0.0084],
        [-0.0252, -0.0326,  0.0487,  ..., -0.0431, -0.0079,  0.0234]])




## Loading whole training data: df.iterrow() approach
```python
def tokenize_data(self, load_batch_size=1000):
        """
        Tokenize the data in the dataset. Modify the dataset in place to 
        reduce memory costs.
        """
        iter_data = tqdm(self.dataset.iterrows(), total=self.dataset.shape[0], desc=f"Tokenizing {self.mode} Dataset")
        for idx, row in iter_data:
            cv = row['cv'] 
            jd = row['jd']
            # TODO: More on Modifying the Tokenizing methods.
            tokenized_cv = self.tokenizer(cv, return_tensors='pt', padding='max_length', truncation=True, max_length=512) # (input_ids, attention_mask, token_type_ids): (bs, seq_len)
            tokenized_jd = self.tokenizer(jd, return_tensors='pt', padding='max_length', truncation=True, max_length=512) # (input_ids, attention_mask, token_type_ids): (bs, seq_len)
            self.data.append((row['user_id:token'], row['job_id:token'], tokenized_cv, tokenized_jd, torch.tensor(row['browsed:label'], dtype=torch.float32)))
        return 
```
This approach costs around **20-30** min to load the training dataset


In [None]:
%time
bge_path = "/media/wuyuhuan/bge-small-zh"
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn.utils.clip_grad import clip_grad_norm_

import logging

import time
import random
from tqdm import tqdm

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
logging.info(f"Usable GPU: {torch.cuda.device_count()}")  
tokenizer = AutoTokenizer.from_pretrained(bge_path)

def same_seed(seed):
    '''Fixes random number generator seeds for reproducibility.'''
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

same_seed(42)

class BGE_FTDataset(Dataset):
    def __init__(self, mode: str, in_file: str, tokenizer: AutoTokenizer, ratio: float = 1.0):
        """
        mode: str, one of ['train', 'valid', 'test']
        in_file: str, path to the input csv file.
        tokenizer: AutoTokenizer, tokenizer for the model.
        ratio: float, the ratio of the data to be used. Default: 1.0. 
            set to 0.01 for functionality testing.
        """
        self.mode = mode
        self.dataset = pd.read_csv(in_file).sample(frac=ratio)
        self.tokenizer = AutoTokenizer.from_pretrained(bge_path)
        logging.info(f"Dataset {mode} Loaded. Shape: {self.dataset.shape}")
        self.data = []
        self.tokenize_data() # -> self.data will look like: [(user_id, job_id, tokenized_cv, tokenized_jd, label), ...] 


    def __len__(self):
        return self.dataset.shape[0]
    
    def __getitem__(self, idx):
        return {
            "user_id": self.data[idx][0],
            "job_id": self.data[idx][1],
            "model_input": {
                "model_input_cv": self.data[idx][2],
                "model_input_jd": self.data[idx][3],
                "label": self.data[idx][4],
            }
        }
        
    def tokenize_data(self):
        """
        Tokenize the data in the dataset. Modify the dataset in place to 
        reduce memory costs.
        """
        iter_data = tqdm(self.dataset.iterrows(), total=self.dataset.shape[0], desc=f"Tokenizing {self.mode} Dataset")
        for idx, row in iter_data:
            cv = row['cv'] 
            jd = row['jd']
            # TODO: More on Modifying the Tokenizing methods.
            tokenized_cv = self.tokenizer(cv, return_tensors='pt', padding='max_length', truncation=True, max_length=512) # (input_ids, attention_mask, token_type_ids): (bs, seq_len)
            tokenized_jd = self.tokenizer(jd, return_tensors='pt', padding='max_length', truncation=True, max_length=512) # (input_ids, attention_mask, token_type_ids): (bs, seq_len)
            self.data.append((row['user_id:token'], row['job_id:token'], tokenized_cv, tokenized_jd, torch.tensor(row['browsed:label'], dtype=torch.float32)))
        return 
        
train_dataset = BGE_FTDataset('train', 'dataset/processed_train.csv', tokenizer,ratio=1)
# valid_dataset = BGE_FTDataset('valid', "dataset/processed_valid.csv", tokenizer,ratio=1)
# test_dataset = BGE_FTDataset('test', "dataset/processed_test.csv", tokenizer, ratio=5)

# train_dataset[0]
# valid_dataset[0]
# test_dataset[0]

KeyboardInterrupt: 

batch approach costs around 3 min.
but the problem is how to extract
```python
def batch_tokenize(self):
    batch_size = 1000
    results = []
    
    iter_data = tqdm(
        range(0, len(self.data), batch_size),
        total= len(self.data) // batch_size + 1,
        desc=f"Tokenizing {self.mode} Dataset"
    )
    for i in iter_data:
        batch = self.data.iloc[i:i+batch_size]
        # 批量tokenize
        batch_tokens = self.tokenizer(
            batch['cv'].tolist(),
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        results.append(batch_tokens)
```

In [22]:
from tqdm import tqdm
class BGE_FTDataset(Dataset):
    def __init__(self, mode, file_path, tokenizer, ratio=1):
        self.data = pd.read_csv(file_path).sample(frac=ratio)
        logging.log
        self.tokenizer = tokenizer
        self.mode = mode
        
        # 批量tokenize并存储结果
        self.tokenized_data = self.batch_tokenize()
        
    def batch_tokenize(self, process_batch_size=1000):
        self.data['model_input_cv'] = None
        self.data['model_input_jd'] = None

        def tokenize_text(text):
            return self.tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
        
        iter_data = tqdm(
            range(0, len(self.data), process_batch_size),
            total = len(self.data) // process_batch_size + 1, 
            desc=f"Tokenizing {self.mode} Dataset",
        )
        for i in iter_data:
            end_idx = min(i + process_batch_size, len(self.data))
            self.data.loc[i: end_idx - 1, 'model_input_cv'] = self.data.loc[i:end_idx-1, 'cv'].apply(tokenize_text)
            self.data.loc[i: end_idx - 1, 'model_input_jd'] = self.data.loc[i:end_idx-1, 'jd'].apply(tokenize_text)

    def __len__(self):
        return len(self.data)     
    
    def __getitem__(self, idx):
        # 直接从预处理好的数据中获取
        return {
            "user_id": self.data[idx][0],
            "job_id": self.data[idx][1],
            "model_input": {
                "model_input_cv": self.data[idx][2],
                "model_input_jd": self.data[idx][3],
                "label": self.data[idx][4],
            }
        }
    
train_dataset = BGE_FTDataset('train', 'dataset/processed_train.csv', tokenizer,ratio=0.01)

train_dataset.data

Tokenizing train Dataset:   0%|          | 0/6 [00:00<?, ?it/s]


KeyError: 0

In [19]:
type(train_dataset.data['model_input_cv'].iloc[0])

transformers.tokenization_utils_base.BatchEncoding

In [14]:
def estimate_memory_theoretical():
    num_samples = 500000
    seq_length = 512
    
    # 计算主要组件大小
    input_ids_size = num_samples * seq_length * 4  # int32
    attention_mask_size = num_samples * seq_length * 4  # int32
    token_type_ids_size = num_samples * seq_length * 4  # int32
    
    # 总大小（字节）
    total_bytes = (input_ids_size + attention_mask_size + token_type_ids_size) * 2 # include cv and jd
    
    # 转换为更易读的单位
    total_gb = total_bytes / (1024**3)
    
    print(f"Theoretical estimation: {total_gb:.2f} GB")

estimate_memory_theoretical()

Theoretical estimation: 5.72 GB
