In [15]:
import torch
import torch.nn as nn


config = {
    'train_file_path': 'dataset/train.csv',
    'test_file_path': 'dataset/test.csv',
    'embedding_path': 'dataset/sgns.weibo.word.bz2',
    'train_val_ratio': 0.1,
    'vocab_size': 30000,
    'batch_size': 64,
    'num_epochs': 10,
    'learning_rate': 1e-3,
    'logging_step': 300,
    'seed': 10003
}

config['device'] = 'cuda' if torch.cuda.is_available() else 'cpu'

import random
import numpy as np

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed

seed_everything(config['seed'])

10003

In [7]:
# train.csv 4 cols: id, label, label_desc, sentence
from collections import Counter
from tqdm import tqdm
import jieba

def get_vocab(config):
    token_counter = Counter()
    with open(config['train_file_path'], 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in tqdm(lines, desc='Counting tokens', total=len(lines)):
            sent = line.split(',')[-1].strip()
            sent_cut = list(jieba.cut(sent))
            token_counter.update(sent_cut)
            # token_counter {'我': 2, '是': 5, ...}
    
    vocab = set(token for token, _ in token_counter.most_common(config['vocab_size']))
    return vocab

In [9]:
vocab = get_vocab(config)

Counting tokens: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 53361/53361 [00:07<00:00, 6960.64it/s]


In [12]:
import bz2

with bz2.open(config['embedding_path']) as f:
    token_vector = f.readlines()

In [14]:
for i, line in enumerate(token_vector):
    if i == 10:
        line = line.split()
        print(line[0].decode('utf-8'))
        # print(line[1:])
        print(len(line[1:]))
        break

是
300


In [18]:
# 将 词典(vocab) 中的token 转化为 词向量
# token -> embedding
# token -> id

# '是' <-> 10 <-> 300d vector

def get_embedding(vocab):
    token2embedding = {}
    
    with bz2.open('dataset/sgns.weibo.word.bz2') as f:
        token_vector = f.readlines()
        
        meta_info = token_vector[0].split()
        print(f'{int(meta_info[0])} tokens in embedding file in total, vector size is {int(meta_info[1])}')
        
        # sgns.weibo.word.bz2 从第二行开始, 每一行是 'token embedding' 的形式
        # '我' 0.88383 .... (300)
        for line in tqdm(token_vector[1:]):
            line = line.split()
            token = line[0].decode('utf-8')
            
            vector = line[1:]
            
            if token in vocab:
                token2embedding[token] = [float(num) for num in vector]
        
        # enumerate(, [start])
        token2id = {token: idx for idx, token in enumerate(token2embedding.keys(), 4)}
        id2embedding = {token2id[token]: embedding for token, embedding in token2embedding.items()}
        
        PAD, UNK, BOS, EOS = '<pad>', '<unk>', '<bos>', 'eos'
        
        token2id[PAD] = 0
        token2id[UNK] = 1
        token2id[BOS] = 2
        token2id[EOS] = 3
        
        id2embedding[0] = [.0] * int(meta_info[1])
        id2embedding[1] = [.0] * int(meta_info[1])
        id2embedding[2] = np.random.random(int(meta_info[1])).tolist()
        id2embedding[3] = np.random.random(int(meta_info[1])).tolist()
        
        emb_mat = [id2embedding[idx] for idx in range(len(id2embedding))]
        
        return torch.tensor(emb_mat, dtype=torch.long), token2id, len(vocab) + 4

In [21]:
emb_mat, token2id, config['vocab_size'] = get_embedding(vocab)
print(token2id['你'])

195202 tokens in embedding file in total, vector size is 300


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 195202/195202 [00:04<00:00, 44767.69it/s]

31



  return torch.tensor(emb_mat, dtype=torch.long), token2id, len(vocab) + 4


In [22]:
def tokenizer(sent, token2id):
    return [token2id.get(token, 1) for token in jieba.cut(sent)]

In [23]:
import pandas as pd
from collections import defaultdict


def read_data(config, token2id, mode='train'):
    data_df = pd.read_csv(config[f'{mode}_file_path'], sep=',')
    if mode == 'train':
        X_train, y_train = defaultdict(list), []
        X_val, y_val = defaultdict(list), []
        num_val = int(config['train_val_ratio'] * len(data_df))
    else:
        X_test, y_test = defaultdict(list), []
        
    for i, row in tqdm(data_df.iterrows(), desc=f'Preprocessing {mode} data', total=len(data_df)):
        label = row[1] if mode == 'train' else 0
        sentence = row[-1]
        inputs = tokenizer(sentence, token2id)
        if mode == 'train':
            if i < num_val:
                X_val['input_ids'].append(inputs)
                y_val.append(label)
            else:
                X_train['input_ids'].append(inputs)
                y_train.append(label)
        else:
            X_test['input_ids'].append(inputs)
            y_test.append(label)
    
    if mode == 'train':
        label2id = {label: i for i, label in enumerate(np.unique(y_train))}
        id2label = {i: label for label, i in label2id.items()}
        
        y_train = torch.tensor([label2id[label] for label in y_train], dtype=torch.long)
        y_val = torch.tensor([label2id[label] for label in y_val], dtype=torch.long)
        
        return X_train, y_train, X_val, y_val, label2id, id2label
    else:
        y_test = torch.tensor(y_test, dtype=torch.long)
        return X_test, y_test

In [24]:
X_train, y_train, X_val, y_val, label2id, id2label = read_data(config, token2id, mode='train')
X_test, y_test = read_data(config, token2id, mode='test')

Preprocessing train data: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 53360/53360 [00:09<00:00, 5613.96it/s]
Preprocessing test data: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:01<00:00, 5764.72it/s]


In [25]:
from torch.utils.data import Dataset

class TNEWSDataset(Dataset):
    def __init__(self, X, y):
        self.x = X
        self.y = y
        
    def __getitem__(self, idx):
        return {
            'input_ids': self.x['input_ids'][idx],
            'label': self.y[idx]
        }
    
    def __len__(self):
        return self.y.size(0)

In [26]:
def collate_fn(examples):
    input_ids_list = []
    labels = []
    for example in examples:
        input_ids_list.append(example['input_ids'])
        labels.append(example['label'])
        
    # 1.找到input_ids_list中最长的句子
    max_length = max(len(input_ids) for input_ids in input_ids_list)
    # 2. 定义一个Tensor
    input_ids_tensor = torch.zeros((len(labels), max_length), dtype=torch.long)
    
    for i, input_ids in enumerate(input_ids_list):
        # 3.得到当前句子的长度
        seq_len = len(input_ids)
        input_ids_tensor[i: seq_len] = torch.tensor(input_ids, dtype=torch.long)
        
    return {
        'input_ids': input_ids_tensor,
        'label': torch.tensor(labels, dtype=torch.long)
    }

In [27]:
from torch.utils.data import DataLoader

def build_dataloader(config, vocab):
    X_train, y_train, X_val, y_val, label2id, id2label = read_data(config, token2id, mode='train')
    X_test, y_test = read_data(config, token2id, mode='test')
    
    train_dataset = TNEWSDataset(X_train, y_train)
    val_dataset = TNEWSDataset(X_val, y_val)
    test_dataset = TNEWSDataset(X_test, y_test)
    
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=config['batch_size'], num_workers=8, shuffle=True, collate_fn=collate_fn)
    val_dataloader = DataLoader(dataset=val_dataset, batch_size=config['batch_size'], num_workers=8, shuffle=False, collate_fn=collate_fn)
    test_dataloader = DataLoader(dataset=test_dataset, batch_size=config['batch_size'], num_workers=8, shuffle=False, collate_fn=collate_fn)
    
    return id2label, train_dataloader, val_dataloader, test_dataloader

In [28]:
id2label, train_dataloader, val_dataloader, test_dataloader = build_dataloader(config, vocab)

Preprocessing train data: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 53360/53360 [00:09<00:00, 5535.85it/s]
Preprocessing test data: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:01<00:00, 5729.76it/s]
