In [70]:
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as npa
from tqdm import tqdm
import json
import jieba
from transformers import BertTokenizer

from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [71]:
with open('/Users/xiachunxuan/User_config/cn_stopwords.txt', 'r', encoding='utf-8') as f:
    STOPWORDS = f.read()
    STOPWORDS = STOPWORDS.split('\n')

In [27]:
def get_word_list(sentence):
    # tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    # return tokenizer.tokenize(sentence)
    return set(sentence)

In [28]:
get_word_list('你好啊 好好好')

{' ', '你', '啊', '好'}

In [29]:
set(['你好啊'])

{'你好啊'}

In [30]:
def text_to_sequence(text, vocab):
    word_list = get_word_list(text)
    return [vocab[word] for word in word_list]

In [31]:
def build_vocab(filepath):
    # stopwords = {' '}
    global STOPWORDS
    df = pd.read_csv(filepath, sep='\t', names=['sentence1', 'sentence2', 'label'])
    X = df['sentence1'].tolist() + df['sentence2'].tolist()
    idx = 2
    vocab = {'<SEP>':0, "<PAD>":1}
    for sentence in tqdm(X, desc="Building Vocabulary"):
        for word in get_word_list(sentence):
            if word not in vocab:
                vocab[word] = idx
                idx += 1 
    return vocab

In [32]:
vocab = build_vocab('atec_nlp_sim_train_all.csv')

Building Vocabulary: 100%|██████████| 184954/184954 [00:00<00:00, 619544.95it/s]


In [33]:
vocab

{'<SEP>': 0,
 '<PAD>': 1,
 '消': 2,
 '费': 3,
 '呗': 4,
 '花': 5,
 '还': 6,
 '能': 7,
 '吗': 8,
 '最': 9,
 '前': 10,
 '了': 11,
 '低': 12,
 '提': 13,
 '收': 14,
 '少': 15,
 '钱': 16,
 '额': 17,
 '的': 18,
 '我': 19,
 '多': 20,
 '是': 21,
 '度': 22,
 '怎': 23,
 '么': 24,
 '办': 25,
 '取': 26,
 '想': 27,
 '就': 28,
 '不': 29,
 '商': 30,
 '蚂': 31,
 '借': 32,
 '贷': 33,
 '网': 34,
 '有': 35,
 '没': 36,
 '蚁': 37,
 '现': 38,
 '抢': 39,
 '返': 40,
 '清': 41,
 '，': 42,
 '来': 43,
 '转': 44,
 '面': 45,
 '宝': 46,
 '日': 47,
 '里': 48,
 '才': 49,
 '款': 50,
 '今': 51,
 '进': 52,
 '刚': 53,
 '天': 54,
 '在': 55,
 '余': 56,
 '为': 57,
 '用': 58,
 '啥': 59,
 '交': 60,
 '成': 61,
 '显': 62,
 '示': 63,
 '功': 64,
 '账': 65,
 '果': 66,
 '如': 67,
 '换': 68,
 '号': 69,
 '算': 70,
 '单': 71,
 '看': 72,
 '所': 73,
 '查': 74,
 '停': 75,
 '金': 76,
 '拜': 77,
 '摩': 78,
 '次': 79,
 '押': 80,
 '两': 81,
 '之': 82,
 '变': 83,
 '*': 84,
 '分': 85,
 '期': 86,
 '！': 87,
 '月': 88,
 '个': 89,
 '长': 90,
 '要': 91,
 '可': 92,
 '扣': 93,
 '家': 94,
 '小': 95,
 '点': 96,
 '心': 97,
 '便': 98,
 '码': 99,
 '

In [34]:
def pad_seq(seq, max_length):
    return seq[:max_length] + [vocab['<PAD>']] * (max_length - len(seq))

In [35]:
a = [i for i in range(10)]

In [36]:
def generate_dataset(filepath, vocab):
    df = pd.read_csv('atec_nlp_sim_train_all.csv', sep='\t', names=['sentence1', 'sentence2', 'label'])
    sentences = []
    labels = []
    for s1, s2, label in zip(df['sentence1'], df['sentence2'], df['label']):
        sentence = pad_seq(text_to_sequence(s1, vocab), 60) + pad_seq(text_to_sequence(s2, vocab), 60)
        # sentence = pad_seq(sentence, 60)
        sentences.append(sentence)
        labels.append(label)
    
    X = torch.tensor(sentences, dtype=torch.long)
    y = torch.tensor(labels, dtype=torch.long)
    
    return X, y

In [37]:
X, y = generate_dataset('atec_nlp_sim_train_all.csv', vocab)

In [38]:
y = torch.tensor(y, dtype=torch.float)

  y = torch.tensor(y, dtype=torch.float)


In [39]:
import logging

In [61]:
class Attention(nn.Module):
    def __init__(self, hidden_size) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.attn = nn.Linear(hidden_size, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        
    def score(self, hidden, encoder_outputs):
        # print(f"hidden: {hidden.shape}, encoder_outputs: {encoder_outputs.shape}")
        # print(f"attn: {self.attn(encoder_outputs).shape}")
        energy = torch.tanh(self.attn(encoder_outputs) + hidden)
        energy = energy.transpose(1,2)
        v = self.v.repeat(encoder_outputs.size(0), 1).unsqueeze(1)
        energy = torch.bmm(v, energy)
        return energy.squeeze(1)
    
    def forward(self, hidden, encoder_outputs):
        timestep = encoder_outputs.size(1)
        h = hidden.repeat(timestep, 1, 1).transpose(0, 1)
        # encoder_outputs = encoder_outputs.transpose(1, 2)
        attn_energies = self.score(h, encoder_outputs)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [62]:
class TextMatchModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size) -> None:
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True, bidirectional=True) 
        self.attention = Attention(hidden_size * 2)
        self.fc = nn.Linear(hidden_size * 2, 1)
        
    def forward(self, in1, in2):
        # print(in1.shape) # 32*60
        embed1 = self.embedding(in1)
        embed2 = self.embedding(in2)
        # print(embed1.shape) #32*60*100
        
        output1, _ = self.gru(embed1)
        output2, _ = self.gru(embed2)
        
        # print(output1.shape) #32*60*128
        
        attn_weights1 = self.attention(output1[:, -1, :], output1)
        attn_weights2 = self.attention(output2[:, -1, :], output2)
        # print(f"attn_weights1: {attn_weights1.shape}") # 32*1*60
        
        context1 = attn_weights1.bmm(output1) # 32*2*128
        context2 = attn_weights2.bmm(output2)
        
        combined = torch.cat((context1, context2), dim=1)
        output = self.fc(combined.squeeze(1)).squeeze(-1)
        output = nn.Sigmoid()(output.mean(dim=1, keepdim=True))
        # output[output > 0.5] = 1
        # output[output <= 0.5] = 0
        # output = torch.tensor(output, dtype=torch.long)
        return output # 32*2*1 

In [63]:
class TextPairDataset(Dataset):
    def __init__(self, sentence1, sentence2, label) -> None:
        super().__init__()
        self.sentence1 = sentence1
        self.sentence2 = sentence2
        self.label = label
        
    def __len__(self):
        return len(self.sentence1)
    
    def __getitem__(self, index):
        return self.sentence1[index], self.sentence2[index], self.label[index]

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [65]:
X_train.shape

torch.Size([73981, 120])

In [66]:
# 文本转Tensor
TrainDataset = TextPairDataset(X_train[:,:60], X_train[:, 60:], y_train)
TestDataset = TextPairDataset(X_test[:,:60], X_test[:, 60:], y_test)

# collect_fn
TrainDataloader = DataLoader(TrainDataset, batch_size=32, shuffle=True)
TestDataloader = DataLoader(TestDataset, batch_size=32, shuffle=False)

In [67]:
vocab_size = len(vocab)
embedding_dim = 100
hidden_size = 128
model = TextMatchModel(vocab_size, embedding_dim, hidden_size)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [68]:
epoch_nums = 10
# 训练模型
model.train()
for _ in range(epoch_nums):
    with tqdm(TrainDataloader, unit='batch') as tepoch:
        for input_ids1, input_ids2, labels in tepoch:
            optimizer.zero_grad()
            output = model(input_ids1, input_ids2)
            # print(output.shape)
            # print(output)
            loss = criterion(output, labels.unsqueeze(1))
            loss.backward()
            optimizer.step()

100%|██████████| 2312/2312 [03:12<00:00, 11.99batch/s]
100%|██████████| 2312/2312 [03:10<00:00, 12.12batch/s]
100%|██████████| 2312/2312 [03:21<00:00, 11.49batch/s]
100%|██████████| 2312/2312 [03:11<00:00, 12.05batch/s]
100%|██████████| 2312/2312 [03:13<00:00, 11.97batch/s]
100%|██████████| 2312/2312 [03:17<00:00, 11.71batch/s]
100%|██████████| 2312/2312 [03:12<00:00, 11.98batch/s]
100%|██████████| 2312/2312 [03:13<00:00, 11.95batch/s]
100%|██████████| 2312/2312 [03:17<00:00, 11.73batch/s]
100%|██████████| 2312/2312 [03:18<00:00, 11.64batch/s]


In [78]:
model.eval()
with torch.no_grad():
    input_ids1, input_ids2 = X_test[:, :60], X_test[:, 60:]
    labels = y_test
    output = model(input_ids1, input_ids2)
    output[output > 0.5] = 1
    output[output <= 0.5] = 0
    print(accuracy_score(labels, output))

0.8226102941176471


In [73]:
output

tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]], grad_fn=<IndexPutBackward0>)