In [1]:
# Toggle for the text-cleaning step. When True, the question bodies are passed
# through clean_body_remove_symbol before encoding and the question embeddings
# are saved under the plain filename; when False the "-NoClean" filename is used.
clean = True

# Linq-Embed-Mistral 进行 embedding 编码

```
nohup jupyter nbconvert --to notebook --execute B榜-v004Linq_mistral-Embedding.ipynb > B榜-v004Linq_mistral-Embedding.ipynb.nohup.out 2>&1 &
```


模型的总参数量: 7110660096


In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import gc 
import ctypes
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

import json
import pandas as pd 
import re
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import numpy as np 
import pickle

def clean_memory():
    """Release as much host and GPU memory as possible.

    Runs the Python garbage collector, asks glibc to hand freed heap pages
    back to the operating system, and empties PyTorch's CUDA caching allocator.
    """
    gc.collect()
    try:
        # malloc_trim only exists in glibc. On other platforms loading
        # "libc.so.6" (OSError) or resolving the symbol (AttributeError) fails;
        # the trim is an optimisation, so skip it rather than abort the cleanup.
        ctypes.CDLL("libc.so.6").malloc_trim(0)
    except (OSError, AttributeError):
        pass
    torch.cuda.empty_cache()

In [3]:
# Number of texts per DataLoader batch; used by both the question encoder and
# the passage encoder below.
BATCH_SIZE = 32

## question embedding 

In [4]:
def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    """Pick the hidden state of the last attended token of every sequence.

    If every row of ``attention_mask`` ends with a 1 the batch is treated as
    left-padded and the final position is returned for all rows. Otherwise the
    position ``attention_mask.sum(dim=1) - 1`` is gathered row by row.
    """
    num_rows = attention_mask.shape[0]
    if attention_mask[:, -1].sum() == num_rows:
        # Left padding: the last column is a real token for every sequence.
        return last_hidden_states[:, -1]

    # Right padding: index of the last real token differs per row.
    last_positions = attention_mask.sum(dim=1) - 1
    row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_index, last_positions]

def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the two-line prompt: an ``Instruct:`` line followed by a ``Query:`` line."""
    instruct_line = f'Instruct: {task_description}'
    query_line = f'Query: {query}'
    return '\n'.join((instruct_line, query_line))


def encode_questions_Linq_Embedding_Mistral(
        df,
        model_path='/mntdata/wangql43/A000Files/A003Model/Linq-Embed-Mistral/',
        max_length=2048):
    """Embed every question in ``df`` with the Linq-Embed-Mistral checkpoint.

    Each row is rendered as an instruction-prefixed query built from its
    ``question`` and ``body`` columns, tokenized (truncated to ``max_length``
    tokens), run through the model under CUDA autocast, pooled with
    ``last_token_pool`` and L2-normalised.

    Args:
        df: DataFrame with ``question`` and ``body`` columns. Missing values
            are replaced by empty strings.
        model_path: Directory holding the tokenizer and the model weights.
        max_length: Maximum number of tokens kept per query.

    Returns:
        A float16 NumPy array with one embedding row per row of ``df``.
    """
    df = df.fillna('')

    ## model
    # Alternative checkpoint previously referenced here:
    # '/mntdata/wangql43/A000Files/A003Model/recallModel/SFR-Embedding-Mistral/'
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModel.from_pretrained(model_path).eval().to('cuda')

    ## encode
    # Each query must come with a one-sentence instruction that describes the task.
    # Earlier prompt variants, kept for reference:
    # task = 'Given Question and The Detailed Analysis of the Question, retrieve most relevant Title and Abstract that answer the Question.'
    # task = 'Please retrieve and provide the most relevant title and abstract of literature based on the user’s specific question and its detailed analysis. Ensure a deep understanding of the question’s underlying meaning, and prioritize the accuracy and relevance of the information in the returned results.'
    task = 'Retrieve the most pertinent title and abstract addressing the user’s question and its analysis, emphasizing understanding, accuracy, and relevance.'

    ## df -- Query
    # Iterating the two columns directly avoids building a Series per row.
    queries = [
        get_detailed_instruct(
            task,
            f"Question: {question}\n\n The Detailed Analysis of the Question: {body}")
        for question, body in zip(df['question'], df['body'])
    ]

    ## dl
    dataloader = DataLoader(
        queries, batch_size=BATCH_SIZE, num_workers=16,
        collate_fn=lambda batch: tokenizer(batch, max_length=max_length, padding=True, truncation=True, return_tensors='pt')
    )

    ## model
    embeddings = []
    with torch.no_grad():
        with torch.autocast(device_type='cuda'):
            for batch in tqdm(dataloader):
                batch = batch.to(model.device)
                model_output = model(**batch)
                sentence_embeddings = last_token_pool(model_output.last_hidden_state, batch['attention_mask'])
                sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
                # Move each batch off the GPU right away so device memory does
                # not grow with the number of queries.
                embeddings.append(sentence_embeddings.cpu())
                del batch, model_output, sentence_embeddings
    embeddings = torch.cat(embeddings, dim=0).numpy().astype(np.float16)

    # Drop the last reference to the weights *before* cleaning up; while the
    # model is still bound, flushing the CUDA cache cannot return its memory.
    del model
    clean_memory()

    return embeddings

In [5]:
## 读取数据的内容 
def read_train_valid_test(path):
    """Load a JSON-lines ``.txt`` file into a DataFrame.

    Args:
        path: Path to a ``.txt`` file holding one JSON object per line.

    Returns:
        DataFrame built from the parsed objects, one row per non-blank line.

    Raises:
        AssertionError: if ``path`` does not end with ``.txt``.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    assert path.endswith('.txt')

    # Decode explicitly as UTF-8 so the result does not depend on the locale's
    # default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing empty line) carry no record and would
        # make json.loads raise, so they are skipped.
        records = [json.loads(line) for line in f if line.strip()]

    return pd.DataFrame(records)

def read_json_to_df(json_path):
    """Load a ``{pid: {field: value, ...}}`` JSON file into a DataFrame.

    Args:
        json_path: Path to a JSON file whose top-level object maps each pid to
            a dict of fields.

    Returns:
        DataFrame with one row per top-level key: a ``pids`` column holding the
        key, followed by one column per inner field.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the locale's
    # default encoding.
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Top-level keys become columns, so transpose to get one row per pid, then
    # turn the index into a regular "pids" column. rename_axis + reset_index is
    # equivalent to reset_index(names=['pids']) and also works on pandas
    # versions that predate the ``names`` argument.
    return pd.DataFrame(data).T.rename_axis('pids').reset_index()


## 清洗数据 
def clean_body_remove_symbol(text):
    """Strip markup and URL fragments from ``text`` and squeeze its whitespace.

    Steps, in order: replace every ``<...>`` tag with a space, delete newline
    characters (without inserting a space), trim both ends, collapse each
    whitespace run into a single space, then delete the substrings
    ``http://``, ``https://``, ``.com`` and ``.cn`` wherever they occur.
    """
    without_tags = re.sub('<[^<]+?>', ' ', text)
    single_line = without_tags.replace('\n', '').strip()
    cleaned = re.sub(r'\s+', ' ', single_line)
    # Same removal order as the chained replace calls this replaces.
    for fragment in ('http://', 'https://', '.com', '.cn'):
        cleaned = cleaned.replace(fragment, '')
    return cleaned

In [6]:
## 
## Load the public test questions
testpath = 'data/AQA-test-public/qa_test_wo_ans_new.txt'
test = read_train_valid_test(testpath)

# Length statistics (space-separated token counts) of "question\nbody" on the
# raw text, before any cleaning. Printed explicitly: a bare expression in the
# middle of a cell is evaluated and discarded, so it never reaches the output.
print((test['question'] + '\n' + test['body']).apply(lambda x : len(x.split(' '))).describe())

## Optionally clean the question bodies
if clean:
    test['body'] = test['body'].apply(clean_body_remove_symbol)

In [8]:
clean_memory()

# Only the output filename depends on the `clean` flag; the encoding itself is
# the same call in both cases (the cleaning was applied to `test` earlier).
question_embedding_path = (
    'outslgb/encoded_question_Linq_mistral_test.pkl' if clean
    else 'outslgb/encoded_question_Linq_mistral-NoClean_test.pkl'
)
# Create the output directory up front so a missing folder cannot discard the
# embeddings after the encoding has already been paid for.
os.makedirs(os.path.dirname(question_embedding_path), exist_ok=True)

Linq_mistral_embeddings = encode_questions_Linq_Embedding_Mistral(test)
with open(question_embedding_path, 'wb') as f:
    pickle.dump(Linq_mistral_embeddings, f)
clean_memory()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/94 [00:02<?, ?it/s]

## passageJson embedding

In [9]:
def encode_passageJson_Linq_Embedding_Mistral(
        df,
        model_path='/mntdata/wangql43/A000Files/A003Model/Linq-Embed-Mistral/',
        max_length=2048):
    """Embed every passage in ``df`` with Linq-Embed-Mistral and pickle the result.

    Each row is rendered as an instruction-prefixed text built from its
    ``title`` and ``abstract`` columns, tokenized (truncated to ``max_length``
    tokens), run through the model under CUDA autocast, pooled with
    ``last_token_pool`` and L2-normalised. Every batch is moved to the CPU as
    float16 immediately after it is computed.

    Side effect: the final array is written to
    ``outslgb/encoded_passageJson_Linq_mistral_{max_length}.pkl``.

    Args:
        df: DataFrame with ``title`` and ``abstract`` columns. Missing values
            are replaced by empty strings.
        model_path: Directory holding the tokenizer and the model weights.
        max_length: Maximum number of tokens kept per passage; also part of the
            output filename.

    Returns:
        A float16 NumPy array with one embedding row per row of ``df``.
    """
    df = df.fillna('')

    # Make sure the output directory exists *before* the long encoding run, so
    # a missing folder cannot discard the finished embeddings at save time.
    output_path = f'outslgb/encoded_passageJson_Linq_mistral_{max_length}.pkl'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    ## model
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_path).eval().to('cuda')

    ## encode
    # Instruction prepended to every passage text.
    task = "Represent the Title and Abstract for Paper Searching. "

    ## df -- Query
    # Iterating the two columns directly avoids building a Series per row.
    queries = [
        get_detailed_instruct(task, f"Title: {title}\n\n The Abstract of the Title: {abstract}")
        for title, abstract in tqdm(zip(df['title'], df['abstract']), total=len(df))
    ]

    ## dl
    dataloader = DataLoader(
        queries, batch_size=BATCH_SIZE, num_workers=16,
        collate_fn=lambda batch: tokenizer(batch, max_length=max_length, padding=True, truncation=True, return_tensors='pt')
    )

    ## model
    all_embeddings = []
    with torch.no_grad():
        with torch.autocast(device_type='cuda'):
            for batch in tqdm(dataloader, total=len(dataloader)):
                batch = batch.to(model.device)
                model_output = model(**batch)
                sentence_embeddings = last_token_pool(model_output.last_hidden_state, batch['attention_mask'])
                sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
                # Move the batch to the CPU as float16
                cpu_embeddings = sentence_embeddings.cpu().numpy().astype(np.float16)
                all_embeddings.append(cpu_embeddings)
                # Free this batch's GPU tensors before the next one is loaded
                del batch, model_output, sentence_embeddings
                clean_memory()

    embeddings = np.concatenate(all_embeddings, axis=0)

    # Drop the last reference to the weights *before* cleaning up; while the
    # model is still bound, flushing the CUDA cache cannot return its memory.
    del model
    clean_memory()

    with open(output_path, 'wb') as f:
        pickle.dump(embeddings, f)

    return embeddings

In [10]:
## Load both passage collections (pid -> title / abstract)
json_path = 'data/AQA/pid_to_title_abs_new.json'
df_json_old = read_json_to_df(json_path)

json_path = 'data/AQA-test-public/pid_to_title_abs_update_filter.json'
df_json_new = read_json_to_df(json_path)

# Number of pids that appear only in the new collection. Printed explicitly: a
# bare expression in the middle of a cell is evaluated and discarded.
print(len(set(df_json_new['pids']).difference(set(df_json_old['pids']))))

# Union of the two collections.
# NOTE(review): the join key includes title and abstract, so a pid whose text
# differs between the two files yields two rows — confirm this is intended.
df_json = pd.merge(df_json_new, df_json_old, how='outer', on=['pids', 'title', 'abstract'])

del df_json_new, df_json_old

In [11]:
## Clean the passage text
df_json['title'] = df_json['title'].fillna('None').apply(clean_body_remove_symbol)
# Missing abstracts are replaced by '' first: clean_body_remove_symbol applies
# regex / str operations and raises TypeError on NaN or None.
df_json['abstract'] = df_json['abstract'].fillna('').apply(clean_body_remove_symbol)

In [12]:
# Preview the first rows of the merged, cleaned passage table.
df_json.head()

Unnamed: 0,pids,title,abstract
0,5390877920f70186a0d2cb29,A New Use Of An Automated Reasoning Assistant ...,The field of automated reasoning is an outgrow...
1,5390877920f70186a0d2cc14,Why AM an EUISKO appear to work.,"Seven years ago, the AM program was constructe..."
2,5390877920f70186a0d2cc43,Movement Problems for 2-Dimensional Linkages,NO ABSTRACT SUPPLIED
3,5390877920f70186a0d2cf01,Amortized efficiency of list update and paging...,In this article we study the amortized efficie...
4,5390877920f70186a0d2cf9c,Applications Of Symbolic Evaluation,Symbolic evaluation is a program analysis meth...


In [13]:
# Encode every passage; the function also pickles the array to
# outslgb/encoded_passageJson_Linq_mistral_2048.pkl (default max_length).
# NOTE(review): this rebinds Linq_mistral_embeddings, which held the question
# embeddings produced by the earlier question-encoding cell.
Linq_mistral_embeddings = encode_passageJson_Linq_Embedding_Mistral(df_json) 

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/466387 [00:00<?, ?it/s]

  0%|          | 0/14575 [00:02<?, ?it/s]

In [14]:
## Load the passage embeddings back from disk
# Same path that encode_passageJson_Linq_Embedding_Mistral writes for max_length = 2048.
with open('outslgb/encoded_passageJson_Linq_mistral_2048.pkl', 'rb') as f:
    passage_embedding = pickle.load(f) 

In [None]:
# End-of-notebook marker, printed once every cell above has run.
print('Finish !!!')