In [30]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets
import random 
import numpy as np
import transformers
import logging
import random
import time

# Configure logging first so every later message shares one timestamped format.
logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')

# Seed the Python, NumPy and PyTorch random number generators with one value.
SEED = 2020
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Device selection: a negative gpu id forces CPU; otherwise CUDA is used when available.
gpu = 0
use_cuda = torch.cuda.is_available() if gpu >= 0 else False
if use_cuda:
    torch.cuda.set_device(gpu)
device = torch.device("cuda", gpu) if use_cuda else torch.device("cpu")
logging.info("Use cuda: %s, gpu id: %d.", use_cuda, gpu)

2020-08-04 15:12:51,503 INFO: Use cuda: True, gpu id: 0.


In [32]:
# Number of folds the training data is split into.
fold_num = 10
# Training file; all_data2fold reads it as tab-separated text with 'text' and 'label' columns.
DATA_PATH = 'train_set.csv'
import pandas as pd

def all_data2fold(fold_num, num=10000, data_path=None):
    """Split the first ``num`` rows of the training file into label-stratified folds.

    The rows are shuffled, grouped by label, and each label's rows are dealt
    out across the folds as evenly as possible. Folds that end up larger than
    ``total // fold_num`` donate their overflow to the folds that end up
    smaller, so every returned fold has exactly ``total // fold_num`` samples
    (any remainder of ``total`` is dropped). Each fold is shuffled once more
    before being returned.

    Shuffling uses NumPy's global random state, so results are reproducible
    only if ``np.random.seed`` was called beforehand.

    Args:
        fold_num: Number of folds to produce; must be at least 1.
        num: Maximum number of leading rows of the file to use.
        data_path: Tab-separated file with ``text`` and ``label`` columns.
            Defaults to the module-level ``DATA_PATH``.

    Returns:
        A list of ``fold_num`` dicts, each ``{'label': [...], 'text': [...]}``.

    Raises:
        ValueError: If ``fold_num`` is smaller than 1.
    """
    if fold_num < 1:
        raise ValueError("fold_num must be at least 1, got %r" % (fold_num,))
    if data_path is None:
        data_path = DATA_PATH

    frame = pd.read_csv(data_path, sep='\t', encoding='UTF-8')
    texts = frame['text'].tolist()[:num]
    labels = frame['label'].tolist()[:num]

    # Global shuffle so the per-label groups below are in random order.
    total = len(labels)
    order = list(range(total))
    np.random.shuffle(order)
    all_texts = [texts[i] for i in order]
    all_labels = [labels[i] for i in order]

    # Map each label to the (shuffled) positions that carry it.
    label2positions = {}
    for position, label in enumerate(all_labels):
        label2positions.setdefault(str(label), []).append(position)

    # Deal every label's positions across the folds; when a label does not
    # divide evenly, the first `remainder` folds receive one extra sample.
    all_index = [[] for _ in range(fold_num)]
    for positions in label2positions.values():
        base, remainder = divmod(len(positions), fold_num)
        cursor = 0
        for fold in range(fold_num):
            size = base + 1 if fold < remainder else base
            all_index[fold].extend(positions[cursor:cursor + size])
            cursor += size

    # Equalise fold sizes. Larger folds always come first (they are the ones
    # that received the extras above), so the spare pool is filled before any
    # smaller fold needs to draw from it.
    fold_size = total // fold_num
    spare_texts = []
    spare_labels = []
    spare_start = 0
    fold_data = []
    for fold in range(fold_num):
        fold_texts = [all_texts[i] for i in all_index[fold]]
        fold_labels = [all_labels[i] for i in all_index[fold]]

        surplus = len(fold_labels) - fold_size
        if surplus > 0:
            spare_texts.extend(fold_texts[fold_size:])
            spare_labels.extend(fold_labels[fold_size:])
            fold_texts = fold_texts[:fold_size]
            fold_labels = fold_labels[:fold_size]
        elif surplus < 0:
            # Top the fold up from the spare pool left by earlier folds.
            spare_end = spare_start - surplus
            fold_texts = fold_texts + spare_texts[spare_start:spare_end]
            fold_labels = fold_labels + spare_labels[spare_start:spare_end]
            spare_start = spare_end
        assert fold_size == len(fold_labels)

        # Shuffle inside the fold so samples are no longer grouped by label.
        order = list(range(fold_size))
        np.random.shuffle(order)
        fold_data.append({
            'label': [fold_labels[i] for i in order],
            'text': [fold_texts[i] for i in order],
        })

    logging.info("Fold lens %s", str([len(fold['label']) for fold in fold_data]))
    return fold_data

# Use the fold_num constant defined above rather than repeating the literal 10,
# so the fold count is configured in exactly one place.
fold_data = all_data2fold(fold_num,100000)

2020-08-04 15:51:56,931 INFO: Fold lens [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000]


In [33]:
fold_id = 9
# Validation set: the single held-out fold.
dev_data = fold_data[fold_id]

# Training set: every fold except the held-out one. Iterating over all folds
# (instead of only those before fold_id) keeps the split correct when fold_id
# is not the last fold. The loop variable is named `fold` so it does not
# overwrite the `data` module imported from torchtext.
train_texts = []
train_labels = []
for i, fold in enumerate(fold_data):
    if i == fold_id:
        continue
    train_texts.extend(fold['text'])
    train_labels.extend(fold['label'])

train_data = {'label':train_labels,'text':train_texts}

# Test set: the file is read only for its 'text' column, so every sample
# gets a placeholder label of 0.
test_data_file = 'test_a.csv'
f = pd.read_csv(test_data_file,sep='\t',encoding='UTF-8')
texts = f['text'].tolist()
test_data = {'label':[0]*len(texts),'text':texts}

In [34]:
# build vocab
from collections import Counter
from transformers import BasicTokenizer

# Module-level BasicTokenizer (from transformers) constructed with its default settings.
basic_tokenizer = BasicTokenizer()

class Vocab():
    """Word, external-word and label vocabularies built from a training split.

    Index 0 is reserved for ``[PAD]`` and index 1 for ``[UNK]`` in both the
    trainable word vocabulary and the external (pretrained) word vocabulary.

    Args:
        train_data: Dict with a ``'text'`` list of whitespace-separated token
            strings and a ``'label'`` list of integer labels in ``0..k-1``.
    """
    def __init__(self,train_data):
        # Words are kept only when they occur MORE than min_count times.
        self.min_count = 5
        self.pad = 0
        self.unk = 1
        self._id2word = ['[PAD]','[UNK]']
        self._id2extword = ['[PAD]','[UNK]']

        self._id2label = []
        self.target_names = []
        self.build_vocab(train_data)

        reverse = lambda x: dict(zip(x,range(len(x))))
        self._word2id = reverse(self._id2word)
        self._label2id = reverse(self._id2label)
        # Defined up front so extword2id() works (mapping everything but the
        # two special tokens to [UNK]) before pretrained embeddings are loaded.
        self._extword2id = reverse(self._id2extword)

        logging.info("Bulid vocab: word %d, labels %d."%(self.word_size,self.label_size))

    def build_vocab(self,data):
        """Count words and labels in ``data`` and fill the id->word / id->label lists."""
        self.word_counter = Counter()
        for text in data['text']:
            self.word_counter.update(text.split())
        # most_common() yields words by descending frequency, so ids follow frequency.
        for word, count in self.word_counter.most_common():
            if count > self.min_count:
                self._id2word.append(word)

        label2name = {0: '科技', 1: '股票', 2: '体育', 3: '娱乐', 4: '时政', 5: '社会', 6: '教育', 7: '财经',
                      8: '家居', 9: '游戏', 10: '房产', 11: '时尚', 12: '彩票', 13: '星座'}
        self.label_counter = Counter(data['label'])
        # Labels are taken to be the contiguous integers 0..k-1, where k is the
        # number of distinct labels seen.
        for label in range(len(self.label_counter)):
            self._id2label.append(label)
            self.target_names.append(label2name[label])

    def load_pretrained_embs(self,embfile):
        """Load pretrained word vectors from a word2vec-style text file.

        The first line holds ``<word_count> <embedding_dim>``; each following
        line holds a word followed by its vector components. The external-word
        vocabulary is rebuilt from scratch on every call, so calling this
        method again (e.g. when a notebook cell is re-run) is safe.

        Args:
            embfile: Path to the UTF-8 encoded embedding file.

        Returns:
            ``numpy.ndarray`` of shape ``(word_count + 2, embedding_dim)``:
            row 0 is ``[PAD]`` (zeros), row 1 is ``[UNK]`` (mean of all loaded
            vectors), and the whole matrix is divided by its standard deviation.
        """
        with open(embfile,encoding='UTF-8') as f:
            lines = f.readlines()
        header = lines[0].split()
        word_count, embedding_dim = int(header[0]), int(header[1])

        id2extword = ['[PAD]','[UNK]']
        index = len(id2extword)
        embeddings = np.zeros((word_count + index, embedding_dim))
        for line in lines[1:]:
            values = line.split()
            if not values:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            id2extword.append(values[0])
            vector = np.array(values[1:],dtype='float64')
            embeddings[self.unk] += vector
            embeddings[index] = vector
            index += 1

        # The [UNK] embedding is the mean of all pretrained vectors
        # (true division; floor division would truncate the components).
        embeddings[self.unk] = embeddings[self.unk] / word_count
        embeddings = embeddings / np.std(embeddings)

        assert len(set(id2extword)) == len(id2extword)

        reverse = lambda x: dict(zip(x,range(len(x))))
        self._id2extword = id2extword
        self._extword2id = reverse(self._id2extword)

        return embeddings

    # dict.get() returns the value for a key, or the given default when the key is absent.
    def word2id(self,xs):
        """Map a word, or a list of words, to ids; unknown words map to ``self.unk``."""
        if isinstance(xs,list):
            return [self._word2id.get(x,self.unk) for x in xs]
        return self._word2id.get(xs,self.unk)

    def extword2id(self,xs):
        """Map a word, or a list of words, to external-vocabulary ids; unknown words map to ``self.unk``."""
        if isinstance(xs,list):
            return [self._extword2id.get(x,self.unk) for x in xs]
        return self._extword2id.get(xs,self.unk)

    def label2id(self,xs):
        """Map a label, or a list of labels, to ids; unseen labels fall back to ``self.unk``."""
        if isinstance(xs,list):
            return [self._label2id.get(x,self.unk) for x in xs]
        return self._label2id.get(xs,self.unk)

    @property
    def word_size(self):
        """Number of entries in the trainable word vocabulary (including [PAD] and [UNK])."""
        return len(self._id2word)

    @property
    def extword_size(self):
        """Number of entries in the external word vocabulary (including [PAD] and [UNK])."""
        return len(self._id2extword)

    @property
    def label_size(self):
        """Number of distinct labels."""
        return len(self._id2label)

# Build the vocabulary from train_data only, i.e. without the held-out dev fold.
vocab = Vocab(train_data)

2020-08-04 15:52:51,065 INFO: Bulid vocab: word 5503, labels 14.


In [48]:
class Attention(nn.Module):
    """Additive-style attention pooling with a learned query vector.

    Each hidden state is projected with a learned weight matrix and bias to
    form a key; the key is scored against a learned query vector, and the
    softmax-normalised scores weight the keys into one vector per sequence.

    Args:
        hidden_size: Size of the last dimension of the hidden states.
    """
    def __init__(self,hidden_size):
        super(Attention,self).__init__()
        self.weight = nn.Parameter(torch.Tensor(hidden_size,hidden_size))
        self.weight.data.normal_(mean=0.0,std=0.05)

        # Bias starts at zero.
        self.bias = nn.Parameter(torch.zeros(hidden_size))

        self.query = nn.Parameter(torch.Tensor(hidden_size))
        self.query.data.normal_(mean=0.0,std=0.05)

    def forward(self,batch_hidden,batch_masks):
        """Pool ``batch_hidden`` over the sequence dimension.

        Args:
            batch_hidden: Tensor of shape ``(batch, length, hidden_size)``.
            batch_masks: Numeric tensor of shape ``(batch, length)`` with 1 at
                positions to attend to and 0 at positions to ignore.

        Returns:
            Tuple ``(batch_outputs, attention_scores)`` with shapes
            ``(batch, hidden_size)`` and ``(batch, length)``.
        """
        ignored = (1 - batch_masks).bool()

        # Keys: matmul broadcasts the 2-D weight over the batch dimension.
        key = torch.matmul(batch_hidden,self.weight) + self.bias

        # Raw attention logits: one scalar per position.
        outputs = torch.matmul(key,self.query)

        # Push ignored positions to a very large negative value before softmax.
        masked_outputs = outputs.masked_fill(ignored,float(-1e32))
        attention_scores = F.softmax(masked_outputs,dim=1)

        # Zero the ignored positions again so a fully-masked row (which softmax
        # turns into a uniform distribution) contributes nothing.
        masked_attention_scores = attention_scores.masked_fill(ignored,0.0)

        # (batch, 1, length) x (batch, length, hidden) -> (batch, hidden)
        batch_outputs = torch.bmm(masked_attention_scores.unsqueeze(1),key).squeeze(1)

        return batch_outputs, attention_scores

In [None]:
# Path of the pretrained word-vector text file passed to Vocab.load_pretrained_embs.
word2vec_path = 'word2vec.txt'
# Dropout probability. WordCNNEncoder reads the module-level name `dropout`;
# the original `drop_out` spelling is kept as an alias for any code using it.
dropout = 0.15
drop_out = dropout

class WordCNNEncoder(nn.Module):
    def __init__(self,vocab):
        super(WordCNNEncoder,self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.word_dims = 100
        
        self.word_embed = nn.Embedding(vocab.word_size,self.word_dims,padding_idx=0)
        
        extword_embed = vocab.load_pretrained_embs(word2vec_path)
        extword_size, word_dims = extword_embed.shape
        logging.info("Load extword embed: words %d, dims %d."%(extword_size,word_dims))
        
        self.extword_embed = nn.Embedding(extword_size, word_dims, padding_idx=0)
        self.extword_embed.weight.data.copy_(torch.from_numpy(extword_embed))
        self.extword_embed.weigth.requires_grad = False
        
        input_size = self.word_dims
        # n_gram window
        self.filter_sizes = [2,3,4]
        