In [None]:
# Variant: use our own pre-trained model.
# Maximum sequence lengths (in tokens) for text and code inputs.
# NOTE(review): the class defined in this cell does not reference them itself.
MAX_TEXT_LEN = 512
MAX_CODE_LEN = 256

from torch import nn, tensor, cat, load

class RelatednessModel(nn.Module):
    """Two-tower relatedness classifier on top of a project-specific embedder.

    Each side of a pair (title, code, description) is embedded separately by
    ``self.embedder``; the two embeddings are concatenated and passed through
    dropout, a ReLU hidden layer, dropout again, and a 4-way sigmoid output.
    """

    def __init__(self, clas, tags_count, freeze_embedder=False, pretrained_model_path=None, dropout_prob=0.1, hidden_size=4096, embedding_size=768):
        """Build the embedder and the classification head.

        Args:
            clas: embedder class; called as ``clas(tags_count)`` and expected to
                expose a class attribute ``embedders`` and a ``freeze_bert()``
                method.
            tags_count: forwarded unchanged to ``clas``.
            freeze_embedder: when true, ``embedder.freeze_bert()`` is called.
            pretrained_model_path: optional state-dict file loaded into the
                embedder.
            dropout_prob: probability used by both dropout layers.
            hidden_size: width of the hidden dense layer.
            embedding_size: width of one embedder output; the default 768
                matches the previously hard-coded value.
        """
        super(RelatednessModel, self).__init__()
        print('加载模型')
        self.embedder = clas(tags_count)
        if pretrained_model_path is not None:
            self.embedder.load_state_dict(load(pretrained_model_path))
        if freeze_embedder:
            self.embedder.freeze_bert()

        self.dropout_embeddings = nn.Dropout(dropout_prob)
        # Two sides are concatenated, hence the factor 2.
        self.dense_hidden = nn.Linear(embedding_size * clas.embedders * 2, hidden_size)
        self.relu_hidden = nn.ReLU()
        self.dropout_hidden = nn.Dropout(dropout_prob)
        self.dense_classifaction = nn.Linear(hidden_size, 4)
        self.sigmoid_classifaction = nn.Sigmoid()

    def forward(self, batch_title1, batch_code1, batch_desc1, batch_title2, batch_code2, batch_desc2):
        """Return ``(class_probabilities, concatenated_embeddings)`` for a batch of pairs."""
        # Element [1] of the embedder output is used as the embedding of each side.
        embeddings1 = self.embedder(batch_title1, batch_code1, batch_desc1)[1]
        embeddings2 = self.embedder(batch_title2, batch_code2, batch_desc2)[1]

        embeddings = cat((embeddings1, embeddings2), dim=1)
        # Each dropout layer is now applied where its name says; both share the
        # same probability, so the computation itself is unchanged.
        hidden_input = self.dropout_embeddings(embeddings)
        hidden = self.relu_hidden(self.dense_hidden(hidden_input))
        logits = self.dense_classifaction(self.dropout_hidden(hidden))
        classifaction = self.sigmoid_classifaction(logits)
        return classifaction, embeddings

In [None]:
# Variant: use our own pre-trained model, with the two inputs of a pair merged into one sequence.
# Maximum packed sequence lengths (in tokens) passed to cross_embedding for the
# text encoder and the code encoder respectively.
MAX_TEXT_LEN = 512
MAX_CODE_LEN = 256

from torch import nn, tensor, cat, load

class RelatednessModelCross(nn.Module):
    """Cross-encoder relatedness classifier built on a project-specific embedder.

    Both inputs of a pair are packed into a single sequence
    ``[CLS] first [SEP] second [SEP]`` and encoded by ``embedder.bert_text``
    (and additionally by ``embedder.bert_code`` when ``code_model`` is true).
    The resulting embedding goes through dropout, a ReLU hidden layer, dropout
    and a 4-way sigmoid output layer.
    """

    def __init__(self, clas, tags_count, freeze_embedder=False, pretrained_model_path=None, dropout_prob=0.1, hidden_size=4096, code_model=False,
            text_cls_token_id=101, text_sep_token_id=102, code_cls_token_id=0, code_sep_token_id=2):
        """Build the embedder and the classification head.

        Args:
            clas: embedder class; called as ``clas(tags_count)``. It must expose
                ``embedders``, ``freeze_bert()``, ``bert_text`` and
                ``tokenizer_text`` (plus ``bert_code``/``tokenizer_code`` when
                ``code_model`` is true).
            tags_count: forwarded unchanged to ``clas``.
            freeze_embedder: when true, ``embedder.freeze_bert()`` is called.
            pretrained_model_path: optional state-dict file for the embedder.
            dropout_prob: probability used by both dropout layers.
            hidden_size: width of the hidden dense layer.
            code_model: also cross-encode the code snippets and concatenate
                that embedding to the text embedding.
            text_cls_token_id, text_sep_token_id: special token ids inserted
                around the text pair.
            code_cls_token_id, code_sep_token_id: special token ids inserted
                around the code pair.
        """
        super(RelatednessModelCross, self).__init__()
        print('加载模型')
        self.embedder = clas(tags_count)
        if pretrained_model_path is not None:
            self.embedder.load_state_dict(load(pretrained_model_path))
        if freeze_embedder:
            self.embedder.freeze_bert()

        self.dropout_embeddings = nn.Dropout(dropout_prob)
        # NOTE(review): forward() feeds 768 * 2 features when code_model is true
        # and 768 otherwise, so clas.embedders presumably has to match - confirm.
        self.dense_hidden = nn.Linear(768 * clas.embedders, hidden_size)
        self.relu_hidden = nn.ReLU()
        self.dropout_hidden = nn.Dropout(dropout_prob)
        self.dense_classifaction = nn.Linear(hidden_size, 4)
        self.sigmoid_classifaction = nn.Sigmoid()

        self.code_model = code_model
        self.text_cls_token_id = text_cls_token_id
        self.text_sep_token_id = text_sep_token_id
        self.code_cls_token_id = code_cls_token_id
        self.code_sep_token_id = code_sep_token_id

    def tokenize(self, tokenizer, batch_sentences, length):
        """Tokenize without special tokens, truncating each sentence to ``length`` tokens."""
        encoded = tokenizer.batch_encode_plus(batch_sentences, add_special_tokens=False, truncation=True, max_length=length)
        return encoded

    def embedding(self, bert, batch_input_ids, batch_attention_mask):
        """Run ``bert`` on already padded id/mask lists and return element [1] of its output."""
        # Follow the encoder's device instead of assuming CUDA, so the model
        # also runs on CPU or on a non-default GPU.
        device = next(bert.parameters()).device
        tokens_ids = tensor(batch_input_ids, device=device)
        attention_mask = tensor(batch_attention_mask, device=device)
        bert_result = bert(input_ids=tokens_ids, attention_mask=attention_mask)
        embeddings = bert_result[1]
        return embeddings

    def cross_embedding(self, bert, tokenizer, batch1, batch2, cls_token_id, sep_token_id, max_len):
        """Pack each pair as ``[CLS] a [SEP] b [SEP]``, pad to ``max_len`` and embed it.

        When a pair is too long, both halves are shortened proportionally to
        their lengths so that the packed sequence fits into ``max_len``.
        """
        input_ids1 = self.tokenize(tokenizer, batch1, max_len)['input_ids']
        input_ids2 = self.tokenize(tokenizer, batch2, max_len)['input_ids']

        # Three positions are reserved for the CLS token and the two SEP tokens.
        pair_budget = max_len - 3
        batch_input_ids = []
        batch_attention_masks = []
        for ids1, ids2 in zip(input_ids1, input_ids2):
            total = len(ids1) + len(ids2)
            if total > pair_budget:
                ids1 = ids1[:len(ids1) * pair_budget // total]
                ids2 = ids2[:len(ids2) * pair_budget // total]
            input_ids = [cls_token_id] + ids1 + [sep_token_id] + ids2 + [sep_token_id]
            padding = max_len - len(input_ids)
            # Padded positions carry id 0 and are hidden by the attention mask.
            batch_attention_masks.append([1] * len(input_ids) + [0] * padding)
            batch_input_ids.append(input_ids + [0] * padding)

        return self.embedding(bert, batch_input_ids, batch_attention_masks)

    def forward(self, batch_title1, batch_code1, batch_desc1, batch_title2, batch_code2, batch_desc2):
        """Return ``(class_probabilities, embeddings)`` for a batch of pairs."""
        batch_text1 = [title + ' ' + desc for title, desc in zip(batch_title1, batch_desc1)]
        batch_text2 = [title + ' ' + desc for title, desc in zip(batch_title2, batch_desc2)]

        embeddings = self.cross_embedding(self.embedder.bert_text, self.embedder.tokenizer_text, batch_text1, batch_text2, self.text_cls_token_id, self.text_sep_token_id, MAX_TEXT_LEN)
        if self.code_model:
            embeddings_code = self.cross_embedding(self.embedder.bert_code, self.embedder.tokenizer_code, batch_code1, batch_code2, self.code_cls_token_id, self.code_sep_token_id, MAX_CODE_LEN)
            embeddings = cat((embeddings, embeddings_code), 1)

        # Each dropout layer is applied where its name says; both share the same
        # probability, so the computation itself is unchanged.
        hidden_input = self.dropout_embeddings(embeddings)
        hidden = self.relu_hidden(self.dense_hidden(hidden_input))
        logits = self.dense_classifaction(self.dropout_hidden(hidden))
        classifaction = self.sigmoid_classifaction(logits)
        return classifaction, embeddings

In [None]:
# Smoke test: build the cross-encoder on top of the project's TagRecommandModel
# (embedder weights loaded from a saved checkpoint, embedder frozen) and run one
# forward pass on a dummy batch.
from models.BtdCModel import TagRecommandModel
model = RelatednessModelCross(TagRecommandModel, 23687, pretrained_model_path='大数据量/2022-04-08 041514-epoch1.dat', freeze_embedder=True, hidden_size=4096)

# The same two sentences are reused for every title/code/description argument.
batch_sentences = ['test aaa hahaha', 'test aaa hahaha']
model = model.cuda()
print(model(batch_sentences, batch_sentences, batch_sentences, batch_sentences, batch_sentences, batch_sentences))

In [None]:
# Variant: use a publicly available pre-trained model.
# MAX_TEXT_LEN is the tokenizer max_length / padding length for text inputs.
# NOTE(review): MAX_CODE_LEN is not referenced by the class in this cell.
MAX_TEXT_LEN = 512
MAX_CODE_LEN = 256

from torch import nn, tensor, cat, load
from transformers import AutoTokenizer, AutoModel

class RelatednessModel(nn.Module):
    """Two-tower relatedness classifier on top of a Hugging Face encoder.

    Title and description of each side are joined with a space, encoded
    separately by the same encoder, and the two embeddings are concatenated
    and passed through dropout, a ReLU hidden layer, dropout and a 4-way
    sigmoid output layer. The code arguments of ``forward`` are ignored.
    """

    def __init__(self, ptm_name, freeze_embedder=False, dropout_prob=0.1, hidden_size=4096):
        """Load tokenizer and encoder and build the classification head.

        Args:
            ptm_name: model name or path given to ``AutoTokenizer`` /
                ``AutoModel.from_pretrained``.
            freeze_embedder: when true, all encoder parameters get
                ``requires_grad = False``.
            dropout_prob: probability used by both dropout layers.
            hidden_size: width of the hidden dense layer.
        """
        super(RelatednessModel, self).__init__()
        print('加载模型')
        self.tokenizer_text = AutoTokenizer.from_pretrained(ptm_name)

        self.bert_text = AutoModel.from_pretrained(ptm_name)

        if freeze_embedder:
            for param in self.bert_text.parameters():
                param.requires_grad = False

        # Read the encoder width from its config instead of hard-coding 768
        # (identical for the 768-wide base models used so far).
        encoder_width = self.bert_text.config.hidden_size

        self.dropout_embeddings = nn.Dropout(dropout_prob)
        # Two sides are concatenated, hence the factor 2.
        self.dense_hidden = nn.Linear(encoder_width * 2, hidden_size)
        self.dense_hidden.weight.data.normal_(0, 0.01)
        self.relu_hidden = nn.ReLU()
        self.dropout_hidden = nn.Dropout(dropout_prob)
        self.dense_classifaction = nn.Linear(hidden_size, 4)
        self.dense_classifaction.weight.data.normal_(0, 0.01)
        self.sigmoid_classifaction = nn.Sigmoid()

    def part_embedding(self, tokenizer, bert, batch_sentences, length):
        """Tokenize ``batch_sentences`` (padded/truncated to ``length``) and return element [1] of the encoder output."""
        encoded = tokenizer.batch_encode_plus(batch_sentences, add_special_tokens=True, truncation=True, max_length=length, padding='max_length')
        # Follow the encoder's device instead of assuming CUDA.
        device = next(bert.parameters()).device
        tokens_ids = tensor(encoded['input_ids'], device=device)
        attention_mask = tensor(encoded['attention_mask'], device=device)
        bert_result = bert(input_ids=tokens_ids, attention_mask=attention_mask)
        embeddings = bert_result[1]
        return embeddings

    def forward(self, batch_title1, batch_code1, batch_desc1, batch_title2, batch_code2, batch_desc2):
        """Return ``(class_probabilities, concatenated_embeddings)`` for a batch of pairs."""
        batch_text1 = [title + ' ' + desc for title, desc in zip(batch_title1, batch_desc1)]
        embeddings_text1 = self.part_embedding(self.tokenizer_text, self.bert_text, batch_text1, MAX_TEXT_LEN)
        batch_text2 = [title + ' ' + desc for title, desc in zip(batch_title2, batch_desc2)]
        embeddings_text2 = self.part_embedding(self.tokenizer_text, self.bert_text, batch_text2, MAX_TEXT_LEN)

        embeddings = cat((embeddings_text1, embeddings_text2), dim=1)
        # Each dropout layer is applied where its name says; both share the same
        # probability, so the computation itself is unchanged.
        hidden_input = self.dropout_embeddings(embeddings)
        hidden = self.relu_hidden(self.dense_hidden(hidden_input))
        logits = self.dense_classifaction(self.dropout_hidden(hidden))
        classifaction = self.sigmoid_classifaction(logits)
        return classifaction, embeddings

In [None]:
# Variant: use a publicly available pre-trained model, with the two sentences of a pair concatenated directly.
# MAX_TEXT_LEN is the length every packed pair is truncated/padded to.
# NOTE(review): MAX_CODE_LEN is not referenced by the class in this cell.
MAX_TEXT_LEN = 512
MAX_CODE_LEN = 256

from torch import nn, tensor, cat, load
from transformers import AutoTokenizer, AutoModel

class RelatednessModelCross(nn.Module):
    """Cross-encoder relatedness classifier on top of a Hugging Face encoder.

    Title and description of each side are joined with a space; the two texts
    are packed as ``[CLS] first [SEP] second [SEP]``, encoded once, and the
    embedding goes through dropout, a ReLU hidden layer, dropout and a 4-way
    sigmoid output layer. The code arguments of ``forward`` are ignored.
    """

    def __init__(self, ptm_name, freeze_embedder=False, dropout_prob=0.1, hidden_size=4096, cls_token_id=101, sep_token_id=102):
        """Load tokenizer and encoder and build the classification head.

        Args:
            ptm_name: model name or path given to ``AutoTokenizer`` /
                ``AutoModel.from_pretrained``.
            freeze_embedder: when true, all encoder parameters get
                ``requires_grad = False``.
            dropout_prob: probability used by both dropout layers.
            hidden_size: width of the hidden dense layer.
            cls_token_id, sep_token_id: special token ids inserted manually
                when packing a pair.
        """
        super(RelatednessModelCross, self).__init__()
        print('加载模型')
        self.tokenizer_text = AutoTokenizer.from_pretrained(ptm_name)

        self.bert_text = AutoModel.from_pretrained(ptm_name)

        if freeze_embedder:
            for param in self.bert_text.parameters():
                param.requires_grad = False

        # Read the encoder width from its config instead of hard-coding 768
        # (identical for the 768-wide base models used so far).
        encoder_width = self.bert_text.config.hidden_size

        self.dropout_embeddings = nn.Dropout(dropout_prob)
        self.dense_hidden = nn.Linear(encoder_width, hidden_size)
        self.dense_hidden.weight.data.normal_(0, 0.01)
        self.relu_hidden = nn.ReLU()
        self.dropout_hidden = nn.Dropout(dropout_prob)
        self.dense_classifaction = nn.Linear(hidden_size, 4)
        self.dense_classifaction.weight.data.normal_(0, 0.01)
        self.sigmoid_classifaction = nn.Sigmoid()
        self.cls = cls_token_id
        self.sep = sep_token_id

    def tokenize(self, tokenizer, batch_sentences, length):
        """Tokenize without special tokens, truncating each sentence to ``length`` tokens."""
        encoded = tokenizer.batch_encode_plus(batch_sentences, add_special_tokens=False, truncation=True, max_length=length)
        return encoded

    def embedding(self, bert, batch_input_ids, batch_attention_mask):
        """Run ``bert`` on already padded id/mask lists and return element [1] of its output."""
        # Follow the encoder's device instead of assuming CUDA.
        device = next(bert.parameters()).device
        tokens_ids = tensor(batch_input_ids, device=device)
        attention_mask = tensor(batch_attention_mask, device=device)
        bert_result = bert(input_ids=tokens_ids, attention_mask=attention_mask)
        embeddings = bert_result[1]
        return embeddings

    def _build_pair_inputs(self, input_ids1, input_ids2, max_len):
        """Pack token-id pairs as ``[CLS] a [SEP] b [SEP]`` and pad ids and masks to ``max_len``."""
        # Three positions are reserved for the CLS token and the two SEP tokens.
        pair_budget = max_len - 3
        batch_input_ids = []
        batch_attention_masks = []
        for ids1, ids2 in zip(input_ids1, input_ids2):
            total = len(ids1) + len(ids2)
            if total > pair_budget:
                # Shorten both halves proportionally to their lengths.
                ids1 = ids1[:len(ids1) * pair_budget // total]
                ids2 = ids2[:len(ids2) * pair_budget // total]
            input_ids = [self.cls] + ids1 + [self.sep] + ids2 + [self.sep]
            padding = max_len - len(input_ids)
            # Padded positions carry id 0 and are hidden by the attention mask.
            batch_attention_masks.append([1] * len(input_ids) + [0] * padding)
            batch_input_ids.append(input_ids + [0] * padding)
        return batch_input_ids, batch_attention_masks

    def forward(self, batch_title1, batch_code1, batch_desc1, batch_title2, batch_code2, batch_desc2):
        """Return ``(class_probabilities, embeddings)`` for a batch of pairs."""
        batch_text1 = [title + ' ' + desc for title, desc in zip(batch_title1, batch_desc1)]
        batch_text2 = [title + ' ' + desc for title, desc in zip(batch_title2, batch_desc2)]
        input_ids1 = self.tokenize(self.tokenizer_text, batch_text1, MAX_TEXT_LEN)['input_ids']
        input_ids2 = self.tokenize(self.tokenizer_text, batch_text2, MAX_TEXT_LEN)['input_ids']

        batch_input_ids, batch_attention_masks = self._build_pair_inputs(input_ids1, input_ids2, MAX_TEXT_LEN)
        embeddings = self.embedding(self.bert_text, batch_input_ids, batch_attention_masks)

        # Each dropout layer is applied where its name says; both share the same
        # probability, so the computation itself is unchanged.
        hidden_input = self.dropout_embeddings(embeddings)
        hidden = self.relu_hidden(self.dense_hidden(hidden_input))
        logits = self.dense_classifaction(self.dropout_hidden(hidden))
        classifaction = self.sigmoid_classifaction(logits)
        return classifaction, embeddings

In [None]:
# Variant: publicly available pre-trained model, the two sentences concatenated directly,
# with fewer layers after the encoder.
# MAX_TEXT_LEN is the length every packed pair is truncated/padded to.
# NOTE(review): MAX_CODE_LEN is not referenced by the class in this cell.
MAX_TEXT_LEN = 512
MAX_CODE_LEN = 256

from torch import nn, tensor, cat, load
from transformers import AutoTokenizer, AutoModel

class RelatednessModelCross(nn.Module):
    """Cross-encoder relatedness classifier with a single linear output layer.

    Title and description of each side are joined with a space; the two texts
    are packed as ``[CLS] first [SEP] second [SEP]``, encoded once, and the
    embedding goes through dropout and a 4-way sigmoid output layer. The code
    arguments of ``forward`` are ignored.
    """

    def __init__(self, ptm_name, freeze_embedder=False, dropout_prob=0.1, hidden_size=4096, cls_token_id=101, sep_token_id=102):
        """Load tokenizer and encoder and build the output layer.

        Args:
            ptm_name: model name or path given to ``AutoTokenizer`` /
                ``AutoModel.from_pretrained``.
            freeze_embedder: when true, all encoder parameters get
                ``requires_grad = False``.
            dropout_prob: dropout probability applied to the embedding.
            hidden_size: accepted for signature compatibility; unused here
                because this variant has no hidden layer.
            cls_token_id, sep_token_id: special token ids inserted manually
                when packing a pair.
        """
        super(RelatednessModelCross, self).__init__()
        print('加载模型')
        self.tokenizer_text = AutoTokenizer.from_pretrained(ptm_name)

        self.bert_text = AutoModel.from_pretrained(ptm_name)

        if freeze_embedder:
            for param in self.bert_text.parameters():
                param.requires_grad = False

        self.dropout_embeddings = nn.Dropout(dropout_prob)
        # Read the encoder width from its config instead of hard-coding 768
        # (identical for the 768-wide base models used so far).
        self.dense_classifaction = nn.Linear(self.bert_text.config.hidden_size, 4)
        self.dense_classifaction.weight.data.normal_(0, 0.01)
        self.sigmoid_classifaction = nn.Sigmoid()
        self.cls = cls_token_id
        self.sep = sep_token_id

    def tokenize(self, tokenizer, batch_sentences, length):
        """Tokenize without special tokens, truncating each sentence to ``length`` tokens."""
        encoded = tokenizer.batch_encode_plus(batch_sentences, add_special_tokens=False, truncation=True, max_length=length)
        return encoded

    def embedding(self, bert, batch_input_ids, batch_attention_mask):
        """Run ``bert`` on already padded id/mask lists and return element [1] of its output."""
        # Follow the encoder's device instead of assuming CUDA.
        device = next(bert.parameters()).device
        tokens_ids = tensor(batch_input_ids, device=device)
        attention_mask = tensor(batch_attention_mask, device=device)
        bert_result = bert(input_ids=tokens_ids, attention_mask=attention_mask)
        embeddings = bert_result[1]
        return embeddings

    def _build_pair_inputs(self, input_ids1, input_ids2, max_len):
        """Pack token-id pairs as ``[CLS] a [SEP] b [SEP]`` and pad ids and masks to ``max_len``."""
        # Three positions are reserved for the CLS token and the two SEP tokens.
        pair_budget = max_len - 3
        batch_input_ids = []
        batch_attention_masks = []
        for ids1, ids2 in zip(input_ids1, input_ids2):
            total = len(ids1) + len(ids2)
            if total > pair_budget:
                # Shorten both halves proportionally to their lengths.
                ids1 = ids1[:len(ids1) * pair_budget // total]
                ids2 = ids2[:len(ids2) * pair_budget // total]
            input_ids = [self.cls] + ids1 + [self.sep] + ids2 + [self.sep]
            padding = max_len - len(input_ids)
            # Padded positions carry id 0 and are hidden by the attention mask.
            # (The former second mask-padding statement appended nothing.)
            batch_attention_masks.append([1] * len(input_ids) + [0] * padding)
            batch_input_ids.append(input_ids + [0] * padding)
        return batch_input_ids, batch_attention_masks

    def forward(self, batch_title1, batch_code1, batch_desc1, batch_title2, batch_code2, batch_desc2):
        """Return ``(class_probabilities, embeddings)`` for a batch of pairs."""
        batch_text1 = [title + ' ' + desc for title, desc in zip(batch_title1, batch_desc1)]
        batch_text2 = [title + ' ' + desc for title, desc in zip(batch_title2, batch_desc2)]
        input_ids1 = self.tokenize(self.tokenizer_text, batch_text1, MAX_TEXT_LEN)['input_ids']
        input_ids2 = self.tokenize(self.tokenizer_text, batch_text2, MAX_TEXT_LEN)['input_ids']

        batch_input_ids, batch_attention_masks = self._build_pair_inputs(input_ids1, input_ids2, MAX_TEXT_LEN)
        embeddings = self.embedding(self.bert_text, batch_input_ids, batch_attention_masks)

        logits = self.dense_classifaction(self.dropout_embeddings(embeddings))
        classifaction = self.sigmoid_classifaction(logits)
        return classifaction, embeddings

In [None]:
# Cross-encoding + TAPT + hidden-layer
# MAX_TEXT_LEN is the length every packed pair is truncated/padded to.
# NOTE(review): MAX_CODE_LEN is not referenced by the classes in this cell.
MAX_TEXT_LEN = 512
MAX_CODE_LEN = 256

from torch import nn, Tensor, tensor, cat, load
from transformers import AutoTokenizer, AutoModel

class BertPooler(nn.Module):
    """Pool a sequence of hidden states into one vector per example.

    The hidden state at position 0 of every sequence is passed through a
    square linear layer (``config.hidden_size`` wide) followed by ``tanh``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: Tensor) -> Tensor:
        # Only the first token of each sequence contributes to the pooled output.
        first_token_state = hidden_states[:, 0]
        return self.activation(self.dense(first_token_state))

class RelatednessModelCrossHidden(nn.Module):
    """Cross-encoder relatedness classifier that pools a selectable hidden layer.

    Title and description of each side are joined with a space; the two texts
    are packed as ``[CLS] first [SEP] second [SEP]`` and encoded once. The
    hidden states at index ``used_hidden_layer`` are pooled and passed through
    dropout and a 4-way sigmoid output layer. The code arguments of
    ``forward`` are ignored.
    """

    def __init__(self, ptm_name, freeze_embedder=False, dropout_prob=0.1, hidden_size=4096, cls_token_id=101, sep_token_id=102, used_hidden_layer=1):
        """Load tokenizer and encoder and build the output layer.

        Args:
            ptm_name: model name or path given to ``AutoTokenizer`` /
                ``AutoModel.from_pretrained``.
            freeze_embedder: when true, all encoder parameters get
                ``requires_grad = False``.
            dropout_prob: dropout probability applied to the embedding.
            hidden_size: accepted for signature compatibility; unused here
                because this variant has no hidden layer.
            cls_token_id, sep_token_id: special token ids inserted manually
                when packing a pair.
            used_hidden_layer: index into the encoder's ``hidden_states``
                tuple selecting the layer that is pooled.
        """
        super(RelatednessModelCrossHidden, self).__init__()
        print('加载模型')
        self.tokenizer_text = AutoTokenizer.from_pretrained(ptm_name)

        self.bert_text = AutoModel.from_pretrained(ptm_name)

        if freeze_embedder:
            for param in self.bert_text.parameters():
                param.requires_grad = False

        self.dropout_embeddings = nn.Dropout(dropout_prob)
        # Read the encoder width from its config instead of hard-coding 768
        # (identical for the 768-wide base models used so far).
        self.dense_classifaction = nn.Linear(self.bert_text.config.hidden_size, 4)
        self.dense_classifaction.weight.data.normal_(0, 0.01)
        self.sigmoid_classifaction = nn.Sigmoid()
        self.cls = cls_token_id
        self.sep = sep_token_id
        self.used_hidden_layer = used_hidden_layer
        # Kept so existing checkpoints still load; used only when the encoder
        # has no pooler of its own (see embedding()).
        self.pooler = BertPooler(self.bert_text.config)

    def tokenize(self, tokenizer, batch_sentences, length):
        """Tokenize without special tokens, truncating each sentence to ``length`` tokens."""
        encoded = tokenizer.batch_encode_plus(batch_sentences, add_special_tokens=False, truncation=True, max_length=length)
        return encoded

    def embedding(self, bert, batch_input_ids, batch_attention_mask):
        """Encode padded id/mask lists and pool the hidden states of the selected layer."""
        # Follow the encoder's device instead of assuming CUDA.
        device = next(bert.parameters()).device
        tokens_ids = tensor(batch_input_ids, device=device)
        attention_mask = tensor(batch_attention_mask, device=device)
        bert_result = bert(input_ids=tokens_ids, attention_mask=attention_mask, output_hidden_states=True)
        # Prefer the encoder's own pooler (previous behavior); fall back to this
        # module's pooler for encoders that do not provide one.
        pooler = getattr(bert, 'pooler', None)
        if pooler is None:
            pooler = self.pooler
        embeddings = pooler(bert_result.hidden_states[self.used_hidden_layer])
        return embeddings

    def _build_pair_inputs(self, input_ids1, input_ids2, max_len):
        """Pack token-id pairs as ``[CLS] a [SEP] b [SEP]`` and pad ids and masks to ``max_len``."""
        # Three positions are reserved for the CLS token and the two SEP tokens.
        pair_budget = max_len - 3
        batch_input_ids = []
        batch_attention_masks = []
        for ids1, ids2 in zip(input_ids1, input_ids2):
            total = len(ids1) + len(ids2)
            if total > pair_budget:
                # Shorten both halves proportionally to their lengths.
                ids1 = ids1[:len(ids1) * pair_budget // total]
                ids2 = ids2[:len(ids2) * pair_budget // total]
            input_ids = [self.cls] + ids1 + [self.sep] + ids2 + [self.sep]
            padding = max_len - len(input_ids)
            # Padded positions carry id 0 and are hidden by the attention mask.
            # (The former second mask-padding statement appended nothing.)
            batch_attention_masks.append([1] * len(input_ids) + [0] * padding)
            batch_input_ids.append(input_ids + [0] * padding)
        return batch_input_ids, batch_attention_masks

    def forward(self, batch_title1, batch_code1, batch_desc1, batch_title2, batch_code2, batch_desc2):
        """Return ``(class_probabilities, embeddings)`` for a batch of pairs."""
        batch_text1 = [title + ' ' + desc for title, desc in zip(batch_title1, batch_desc1)]
        batch_text2 = [title + ' ' + desc for title, desc in zip(batch_title2, batch_desc2)]
        input_ids1 = self.tokenize(self.tokenizer_text, batch_text1, MAX_TEXT_LEN)['input_ids']
        input_ids2 = self.tokenize(self.tokenizer_text, batch_text2, MAX_TEXT_LEN)['input_ids']

        batch_input_ids, batch_attention_masks = self._build_pair_inputs(input_ids1, input_ids2, MAX_TEXT_LEN)
        embeddings = self.embedding(self.bert_text, batch_input_ids, batch_attention_masks)

        logits = self.dense_classifaction(self.dropout_embeddings(embeddings))
        classifaction = self.sigmoid_classifaction(logits)
        return classifaction, embeddings

In [None]:
# Tokenizer experiment: compare the encoding of a plain sentence with the
# encoding of the same sentence wrapped in literal "<s>" / "</s>" markers.
name = 'roberta-base'
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained(name)
# The model is loaded but only used by the commented-out lines below.
model = AutoModel.from_pretrained(name)
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
print(encoded_input)
text = "<s>Replace me by any text you'd like.</s>"
encoded_input = tokenizer(text, return_tensors='pt')
print(encoded_input)
# Disabled: run the model on the encoded input and inspect its outputs.
# output = model(**encoded_input)
# print(encoded_input)
# print(output[0])
# print(output[1])
# print(output)

In [None]:
# Smoke test: build RelatednessModelCrossHidden on 'bert-base-uncased' with a
# frozen encoder and run one forward pass on a dummy batch.
# NOTE(review): TagRecommandModel is imported here but not used in this cell.
from models.BtdModel import TagRecommandModel
# The same three sentences are reused for every title/code/description argument.
batch_sentences = ['test aaa hahaha', 'var i = 0', "Replace me by any text you'd like."]
model = RelatednessModelCrossHidden('bert-base-uncased', freeze_embedder=True)
model = model.cuda()
print(model(batch_sentences, batch_sentences, batch_sentences, batch_sentences, batch_sentences, batch_sentences))

In [None]:
import json
from torch import tensor
from glob import glob
import pandas as pd

# Path to a JSON file, presumably holding the list of common tags.
# NOTE(review): not referenced by the Dataset class below - confirm it is still needed.
TAGS_LIST_SAVE = '../data/mid/commonTags.json'

class Dataset():
    """In-memory dataset of question pairs read from a CSV file.

    Every item is a dict with the stripped text fields ``title1``, ``desc1``,
    ``code1``, ``title2``, ``desc2``, ``code2`` and a float one-hot ``class``
    tensor of length 4 encoded with ``label_dict``.
    """

    # Maps the label text in the CSV ``class`` column to its one-hot position.
    label_dict = {
        'duplicate': 0,
        'direct': 1,
        'indirect': 2,
        'isolated': 3
    }

    # Output field name -> CSV column it is read from.
    _TEXT_COLUMNS = {
        'title1': 'q1_Title',
        'desc1': 'q1_Body',
        'code1': 'q1_BodyCode',
        'title2': 'q2_Title',
        'desc2': 'q2_Body',
        'code2': 'q2_BodyCode',
    }

    def __init__(self, data_path):
        """Read ``data_path`` with pandas and pre-build all items."""
        self.source = pd.read_csv(data_path, lineterminator="\n")
        self.size = len(self.source)
        self.data = [self._build_item(self.source.iloc[i]) for i in range(self.size)]

    @staticmethod
    def _clean_text(value):
        """Return the stripped string, or '' for non-string cells (e.g. NaN for empty cells)."""
        return value.strip() if isinstance(value, str) else ''

    @classmethod
    def _build_item(cls, item):
        """Convert one CSV row into the dict served by ``__getitem__``."""
        result = [0] * len(cls.label_dict)
        result[cls.label_dict[item['class'].strip()]] = 1
        entry = {field: cls._clean_text(item[column]) for field, column in cls._TEXT_COLUMNS.items()}
        entry['class'] = tensor(result).float()
        return entry

    def __getitem__(self, i):
        """Return item ``i``; raise ``IndexError`` when ``i`` is past the end."""
        # IndexError is what the sequence protocol expects; it still ends a plain
        # ``for`` loop over the dataset, but does not leak a StopIteration into
        # surrounding generators.
        if i >= self.size:
            raise IndexError(f'index {i} is out of range for a dataset of size {self.size}')
        return self.data[i]

    def __len__(self):
        """Return the number of rows read from the CSV file."""
        return self.size

In [None]:
# Paths of the train and test CSV splits.
DATASET_TRAIN = '../data/raw/medium_link_prediction_noClue_shuffled_train.csv'
DATASET_TEST = '../data/raw/medium_link_prediction_noClue_shuffled_test.csv'

# Sanity check: print the q1_Id of the first raw row, then the first item
# produced by the Dataset wrapper for the same file.
a = pd.read_csv(DATASET_TEST)
print(a.iloc[0]['q1_Id'])
dataset = Dataset(DATASET_TEST)
print(dataset[0])

In [None]:
# Batch size of the DataLoader built inside val().
VAL_BATCH_SIZE = 8
from torch.utils.data import DataLoader
from torch import topk, arange
import torch
# The previous probe read the misspelled attribute ``__class_._name__`` and
# swallowed the resulting error with a bare ``except``, so the notebook
# progress bar was never selected. tqdm.auto picks the notebook widget when it
# is usable and the console bar otherwise.
try:
    from tqdm.auto import tqdm
except ImportError:
    # tqdm releases without the ``auto`` submodule
    from tqdm import tqdm

# Class index -> label name; the inverse of Dataset.label_dict.
LABEL_BACK_DICT = ['duplicate', 'direct', 'indirect', 'isolated']

def _per_class_metrics(c):
    """Return per-class ``(precision, recall, f1_score)`` dicts for confusion matrix ``c[actual][predicted]``."""
    classes = range(len(c))
    precision = {}
    recall = {}
    f1_score = {}
    for k in classes:
        hits = c[k][k]
        # A class without any hit scores 0, which also avoids dividing by zero.
        precision[k] = hits / sum(c[row][k] for row in classes) if hits != 0 else 0
        recall[k] = hits / sum(c[k]) if hits != 0 else 0
        total = precision[k] + recall[k]
        f1_score[k] = 2 * precision[k] * recall[k] / total if total != 0 else 0
    return precision, recall, f1_score


def val(model, val_dataset, show=False):
    """Evaluate ``model`` on ``val_dataset`` and return macro-averaged metrics.

    The model is put into eval mode and run without gradients. The class with
    the highest probability is the prediction; a 4x4 confusion matrix indexed
    ``[actual][predicted]`` is accumulated and printed.

    Args:
        model: callable taking the six text batches and returning a tuple
            whose first element holds the class probabilities.
        val_dataset: dataset whose items provide ``title1``, ``code1``,
            ``desc1``, ``title2``, ``code2``, ``desc2`` and a one-hot ``class``.
        show: when true, per-class precision/recall/F1 are printed as well.

    Returns:
        dict with the keys ``precision``, ``recall`` and ``f1_score``, each
        averaged over the four classes.

    Raises:
        ValueError: if a label row has no entry equal to 1.
    """
    val_dataloader = DataLoader(val_dataset, batch_size=VAL_BATCH_SIZE, num_workers=0, shuffle=False)

    c = [[0] * 4 for _ in range(4)]
    model.eval()
    with torch.no_grad():
        for data in tqdm(val_dataloader):
            probability = model(data['title1'], data['code1'], data['desc1'], data['title2'], data['code2'], data['desc2'])[0]
            # Convert once per batch to plain Python lists instead of indexing
            # the confusion matrix with individual tensor elements.
            predictions = topk(probability, 1, sorted=True).indices.tolist()
            labels = data['class'].tolist()
            for label_row, predicted in zip(labels, predictions):
                positives = [k for k, flag in enumerate(label_row) if flag == 1]
                if not positives:
                    # Previously the class of an earlier sample was silently reused here.
                    raise ValueError('validation sample has no positive class label')
                act = positives[-1]
                for index in predicted:
                    c[act][index] += 1
    print(c)
    precision, recall, f1_score = _per_class_metrics(c)
    if show:
        for k in range(4):
            print(f"Precision@{LABEL_BACK_DICT[k]} = {precision[k]}, Recall@{LABEL_BACK_DICT[k]} = {recall[k]}, F1-score@{LABEL_BACK_DICT[k]} = {f1_score[k]}")
    return {'precision': sum(precision.values()) / 4, 'recall': sum(recall.values()) / 4, 'f1_score': sum(f1_score.values()) / 4}

In [None]:
import torch
from transformers import AdamW, get_scheduler
import numpy as np
import random
from datetime import datetime
from torch.utils.data import DataLoader
import json
# The previous probe read the misspelled attribute ``__class_._name__`` and
# swallowed the resulting error with a bare ``except``, so the notebook
# progress bar was never selected. tqdm.auto picks the notebook widget when it
# is usable and the console bar otherwise.
try:
    from tqdm.auto import tqdm
except ImportError:
    # tqdm releases without the ``auto`` submodule
    from tqdm import tqdm

# Seed handed to setup_seed() at the start of train().
RANDOM_SEED = 20
# Learning rate of the optimizer.
LEARNING_RATE = 5e-5
# Batch size of the training DataLoader.
TRAIN_BATCH_SIZE = 8
# Number of passes over the training set.
EPOCHS = 10
# Number of batches whose gradients are accumulated before one optimizer step.
BATCHSTEP_PER = 4
# Paths of the train and test CSV splits.
DATASET_TRAIN = '../data/raw/medium_link_prediction_noClue_shuffled_train.csv'
DATASET_TEST = '../data/raw/medium_link_prediction_noClue_shuffled_test.csv'

def setup_seed(seed):
    """Seed the torch (CPU and CUDA), NumPy and ``random`` generators and make cuDNN deterministic."""
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for seed_generator in seeders:
        seed_generator(seed)
    # Disable cuDNN auto-tuning and request its deterministic algorithms.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def train_one_epoch(train_dataloader, model, optimizer, lr_scheduler, criterion):
    """Run one training epoch with gradient accumulation.

    Gradients of ``BATCHSTEP_PER`` consecutive batches are accumulated before
    the optimizer and the learning-rate scheduler take one step.

    Args:
        train_dataloader: iterable of batches with the keys ``title1``,
            ``code1``, ``desc1``, ``title2``, ``code2``, ``desc2``, ``class``.
        model: called with the six text batches; element [0] of its result is
            passed to ``criterion``.
        optimizer: optimizer stepped every ``BATCHSTEP_PER`` batches.
        lr_scheduler: scheduler stepped together with the optimizer.
        criterion: loss function called as ``criterion(output, target)``.
    """
    model.train()
    optimizer.zero_grad()
    for i, data in enumerate(train_dataloader):
        output = model(data['title1'], data['code1'], data['desc1'], data['title2'], data['code2'], data['desc2'])[0]
        # Put the target on whatever device the model produced its output on,
        # instead of assuming the default CUDA device.
        target = data['class'].to(output.device)
        loss = criterion(output, target)
        loss.backward()
        if (i + 1) % BATCHSTEP_PER == 0:
            print(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
    # NOTE(review): gradients of a trailing group smaller than BATCHSTEP_PER are
    # never applied; they are cleared by zero_grad() at the start of the next call.

def train(model, optimizer, lr_scheduler, criterion, train_dataset, val_dataset):
    """Train for ``EPOCHS`` epochs, validating and saving a checkpoint after each one.

    Args:
        model: model to train; its ``state_dict()`` is saved after every epoch
            to a file named with the current timestamp and the epoch index.
        optimizer, lr_scheduler, criterion: forwarded to ``train_one_epoch``.
        train_dataset: dataset wrapped in a shuffling DataLoader.
        val_dataset: dataset passed to ``val`` after every epoch.

    Returns:
        The metrics dict returned by ``val`` for the last epoch, or ``None``
        when ``EPOCHS`` is 0.
    """
    setup_seed(RANDOM_SEED)
    train_dataloader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=0, shuffle=True, pin_memory=True)
    # Defined up front so that the return below cannot hit an unbound name.
    acc = None
    for epoch in tqdm(range(EPOCHS), total=EPOCHS):
        train_one_epoch(train_dataloader, model, optimizer, lr_scheduler, criterion)
        acc = val(model, val_dataset, show=True)
        print(acc)
        torch.save(model.state_dict(), datetime.now().strftime('%Y-%m-%d %H%M%S') + f'-epoch{epoch}.dat')
    return acc

# Model selection: exactly one assignment to ``model`` is left active; the
# commented-out lines are the other configurations that can be swapped in.

# from models.BtdCModel import TagRecommandModel
# model = RelatednessModelCross(TagRecommandModel, 23687, pretrained_model_path='大数据量/2022-04-08 041514-epoch1.dat', freeze_embedder=True, hidden_size=4096, code_model=True)

# from models.BtdModel import TagRecommandModel
# model = RelatednessModelCross(TagRecommandModel, 23687, pretrained_model_path='大数据量/2022-03-24 145103-epoch1.dat', freeze_embedder=True, hidden_size=4096)

# model = RelatednessModel('roberta-base', freeze_embedder=False, cls_token_id=0, sep_token_id=2)

# model = RelatednessModel('bert-base-uncased', freeze_embedder=True)

# model = RelatednessModel('jeniya/BERTOverflow', freeze_embedder=False)

# model = RelatednessModel('albert-base-v2', freeze_embedder=True)

# model = RelatednessModel('./pre-training/roberta-tapt', freeze_embedder=False)
# model = RelatednessModelCross('./pre-training/roberta-tapt', freeze_embedder=False, cls_token_id=0, sep_token_id=2)
model = RelatednessModelCrossHidden('./pre-training/roberta-tapt', freeze_embedder=False, cls_token_id=0, sep_token_id=2, used_hidden_layer=12)

# model = RelatednessModelCross('./pre-training/bertoverflow-tapt', freeze_embedder=True)

model = model.cuda()
print(model)

train_dataset = Dataset(DATASET_TRAIN)
val_dataset = Dataset(DATASET_TEST)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# The scheduler is stepped once per optimizer step, i.e. once every
# BATCHSTEP_PER batches, and an epoch has ceil(len(dataset) / TRAIN_BATCH_SIZE)
# batches. The previous formula left out the batch size, which made the linear
# schedule about TRAIN_BATCH_SIZE times too long, so the rate hardly decayed.
batches_per_epoch = -(-len(train_dataset) // TRAIN_BATCH_SIZE)
num_training_steps = EPOCHS * (batches_per_epoch // BATCHSTEP_PER)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

criterion = torch.nn.BCELoss()

es = train(model, optimizer, lr_scheduler, criterion, train_dataset, val_dataset)
print(es)

In [None]:
from torch import load, no_grad, topk, arange
from models.RelatednessModel import RelatednessModel
from torch.utils.data import DataLoader
# Error analysis: load a saved cross-encoder checkpoint, predict on the mini
# test split, and write the indices of correctly classified samples to right.txt.
DATASET_TEST = '../data/raw/medium_link_prediction_noClue_shuffled_test-mini.csv'

# NOTE(review): RelatednessModelCross is not imported in this cell, so it has to
# come from an earlier notebook cell - confirm which definition is active.
model = RelatednessModelCross('./pre-training/roberta-tapt', freeze_embedder=False, cls_token_id=0, sep_token_id=2)
model.load_state_dict(load('C:\\Users\\3090\\Documents\\相似比对\\cross-encoder\\Roberta-TAPT，不锁，epoch3\\2022-07-09 185026-epoch2.dat'))
model = model.cuda()

val_dataset = Dataset(DATASET_TEST)
val_dataloader = DataLoader(val_dataset, batch_size=8, num_workers=0, shuffle=False)

# NOTE(review): c is initialised but never updated or read in this cell.
c = [[0 for j in range(4)] for i in range(4)]
model.eval()
# act collects the true class index of every sample, pred the predicted one.
act = []
pred = []
with no_grad():
    for i, data in enumerate(val_dataloader):
        # Prints the raw batch for inspection.
        print(data)
        input_title1 = data['title1']
        input_code1 = data['code1']
        input_desc1 = data['desc1']
        input_title2 = data['title2']
        input_code2 = data['code2']
        input_desc2 = data['desc2']
        probability = model(input_title1, input_code1, input_desc1, input_title2, input_code2, input_desc2)[0]
        # Top-1 over the class probabilities gives the predicted class per sample.
        tops = topk(probability, 1, sorted=True)
        for indices, j in zip(tops.indices, arange(len(tops.indices))):
            # Every position of the label row equal to 1 is recorded as a true class.
            for k in range(4):
                if data['class'][j][k] == 1:
                    act.append(k)
            for index in indices:
                pred.append(index.item())

# One line per correctly classified sample: "<sample index> <predicted> <actual>".
with open('right.txt', 'w', encoding='utf-8') as f:
    for i in range(len(act)):
        if act[i] == pred[i]:
            f.write(f'{i} {pred[i]} {act[i]}\n')