In [1]:
import os
from argparse import Namespace
from collections import Counter
import json
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

In [2]:
class Vocabulary(object):
    #初始化
    def __init__(self,token_to_idx=None):
        if token_to_idx is None:
            token_to_idx={}
        self.token_to_idx=token_to_idx
        self.idx_to_token={v:k for k,v in self.token_to_idx.items()}
    #序列化输出
    def to_serializable(self):
        return {'token_to_idx':self.token_to_idx}
    @classmethod
    def from_serializable(cls,contents):
        return cls(**contents)
    #添加token
    def add_token(self,token):
        if token in self.token_to_idx:
            return self.token_to_idx[token]
        else:
            index=len(self.token_to_idx)
            self.token_to_idx[token]=index
            self.idx_to_token[index]=token
        
        return index
    #添加多个token
    def add_many(self,tokens):
        return [self.add_token(token) for token in tokens]
    #查找token的idx
    def lookup_token(self,token):
        return self.token_to_idx[token]
    #查找idx的token
    def lookup_idx(self,index):
        if index not in self.idx_to_idx:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self.idx_to_token(index)
    #
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)
    
    def __len__(self):
        return len(self.token_to_idx)

In [3]:
class SequenceVocabulary(Vocabulary):
    
    def __init__(self,token_to_idx=None,unk_token='<UNK>',mask_token='<MASK>',begin_seq_token='<BEGIN>',end_seq_token='<END>'):
        super(SequenceVocabulary,self).__init__(token_to_idx)
        
        self.mask_token=mask_token
        self.unk_token=unk_token
        self.begin_seq_token=begin_seq_token
        self.end_seq_token=end_seq_token
        
        self.mask_idx=self.add_token(self.mask_token)
        self.unk_idx=self.add_token(self.unk_token)
        self.begin_seq_idx=self.add_token(self.begin_seq_token)
        self.end_seq_idx=self.add_token(self.end_seq_token)
    
    #序列化
    def to_serializable(self):
        #获取父类的序列化输出
        contents=super(SequenceVocabulary,self).to_serializable()
        #加入子类的数据
        contents.update(
        {
            'unk_token':self.unk_token,
            'mask_token':self.mask_token,
            'begin_seq_token':self.begin_seq_token,
            'end_seq_token':self.end_seq_token
        })
    def lookup_token(self,token):
        if self.unk_idx>=0:
            return self.token_to_idx.get(token,self.unk_idx)
        else:
            return self.token_to_idx[token]

In [4]:
class NewsVectorizer(object):
    
    def __init__(self,title_vocab,category_vocab):
        #文本字典
        self.title_vocab=title_vocab
        #标签字典
        self.category_vocab=category_vocab
    
    
    def vectorize(self,title,vector_length=-1):
        
        indices=[self.title_vocab.begin_seq_idx]
        #找出句子中的每个token的序号
        indices.extend(self.title_vocab.lookup_token(token) for token in title.split(' '))
        indices.append(self.title_vocab.end_seq_idx)
        
        if vector_length<0:
            vector_length=len(indices)
        #vector_length 为超参数，即为训练文本的长度
        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)]=indices
        out_vector[len(indices):]=self.title_vocab.mask_idx
        
        
        return out_vector
    
    
    @classmethod
    def from_dataframe(cls, news_df, cutoff=25):
        """
        从csv文件加载
        """
        category_vocab = Vocabulary() 
        #创建标签字典
        for category in sorted(set(news_df.category)):
            category_vocab.add_token(category)
        
        #统计每个token 出现的次数
        word_counts=Counter()
        for title in news_df.title:
            for token in title.split(' '):
                if token not in string.punctuation:
                    word_counts[token]+=1
        
        #文本字典
        title_vocab=SequenceVocabulary()
        for word,word_count in word_counts.items():
            #如果token 出现的频率大于cutoff，就添加入词典
            if word_count>cutoff==25:
                title_vocab.add_token(word)
        
        return cls(title_vocab,category_vocab)
    #根据序列化初始
    @classmethod
    def from_serializable(cls, contents):
        title_vocab =SequenceVocabulary.from_serializable(contents['title_vocab'])
        category_vocab =Vocabulary.from_serializable(contents['category_vocab'])

        return cls(title_vocab=title_vocab, category_vocab=category_vocab)
    #转化为序列化
    def to_serializable(self):
        return {'title_vocab': self.title_vocab.to_serializable(),
                'category_vocab': self.category_vocab.to_serializable()}

In [5]:
#创建数据集的类
class NewsDataset(Dataset):
    
    def __init__(self,news_df,vectorizer):
        self.news_df=news_df
        self.vectorizer=vectorizer
        
        measure_len=lambda context:len(context.split(' '))
        #获取文本的最长长度
        self.max_seq_length=max(map(measure_len,news_df.title))+2
        #获取各数据集
        self.train_df = self.news_df[self.news_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.news_df[self.news_df.split=='val']
        self.val_size = len(self.val_df)

        self.test_df = self.news_df[self.news_df.split=='test']
        self.test_size = len(self.test_df)
        
        #建立数据集字典
        self.lookup_dict={
            'train':(self.train_df,self.train_size),
            'val':(self.val_df,self.val_size),
            'test':(self.test_df,self.test_size)
        }
        
        #设置初始化哪种数据
        self.set_split('train')
        
        
        #统计各标签样本的多少
        class_counts=news_df.category.value_counts().to_dict()
        
        def sort_key(item):
            #返回标签的序号
            return self.vectorizer.category_vocab.lookup_token(item[0])
        #根据标签序号进行排序
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        #获取每个标签的数据集大小
        frequencies=[count for _,count in sorted_counts]
        
        self.class_weights=1.0/torch.tensor(frequencies,dtype=torch.float32)
    
    #加载数据并且创造Vectorizer类
    @classmethod
    def load_dataset_and_make_vectorizer(cls, news_csv):
        news_df=pd.read_csv(news_csv)
        train_news_df=news_df[news_df.split=='train']
        
        return cls(news_df,NewsVectorizer.from_dataframe(train_news_df))
    
    #加载数据和vectorizer
    @classmethod
    def load_dataset_and_load_vectorizer(cls, news_csv, vectorizer_filepath):
        news_df = pd.read_csv(news_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(news_csv, vectorizer)
    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return NameVectorizer.from_serializable(json.load(fp))
        
    #保存vectorizer为json文件
    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self.vectorizer.to_serializable(), fp)
            
    def get_vectorizer(self):
        return self.vectorizer
    
    #设置当前数据集
    def set_split(self,split='train'):
        self.target_split=split
        #获取对应数据集的文本和标签
        self.target_df,self.target_size=self.lookup_dict[split]
    
    
    def __len__(self):
        return self.target_size
    
    def __getitem__(self,idx):
        #获取idx的那一行数据
        row=self.target_df.iloc[idx]
        
        title_vector=self.vectorizer.vectorize(row.title,self.max_seq_length)
        
        category_vector=self.vectorizer.category_vocab.lookup_token(row.category)
        
        return {
            'x_data':title_vector,
            'y_target':category_vector
        }
    #计算多少个batch
    def get_num_batches(self,batch_size):
        return len(self)//batch_size
    
#用来生成数据
def generate_batches(dataset,batch_size,shuffle=True,drop_last=True,device='cpu'):
        dataloader=DataLoader(dataset=dataset,batch_size=batch_size,shuffle=shuffle,drop_last=drop_last)
        
        for data_dict in dataloader:
            out_data_dict={}
            
            for name,tensor in data_dict.items():
                out_data_dict[name]=data_dict[name].to(device)
            yield out_data_dict

In [6]:
#模型定义

class NewsClassifier(nn.Module):
    
    def __init__(self,embedding_size,num_embeddings,
                 num_channels,hidden_dim,num_classes,dropout_p,pretrained_embeddings=None,padding_idx=0):
        """
        embedding_size: 词向量的长度
        num_embedidngs：词向量的数量
        num_channels:每层卷积的数量
        hidden_dim:隐藏层的数目
        num_classes:类别数目
        """
        
        super(NewsClassifier,self).__init__()
        
        if pretrained_embeddings is None:
            self.emb=nn.Embedding(embedding_dim=embedding_size,num_embeddings=num_embeddings,padding_idx=padding_idx)
        else:
            pretrained_embeddings=torch.from_numpy(pretrained_embeddings).float()
            #给予权重
            self.emb=nn.Embedding(embedding_dim=embedding_size,num_embeddings=num_embeddings,padding_idx=padding_idx,_weight=pretrained_embeddings)
        
        self.conv=nn.Sequential(
            nn.Conv1d(embedding_size,num_channels,3),
            nn.ELU(),
            nn.Conv1d(num_channels,num_channels,3,stride=2),
            nn.ELU(),
            nn.Conv1d(num_channels,num_channels,3,stride=2),
            nn.ELU(),
            nn.Conv1d(num_channels,num_channels,3),
            nn.ELU()
        )
        
        self.dropout_p=dropout_p
        
        self.fc1=nn.Linear(num_channels,hidden_dim)
        self.fc2=nn.Linear(hidden_dim,num_classes)
    
    def  forward(self,x_in,apply_softmax=False):
        
        #permute 变化size
        x_embedded=self.emb(x_in).permute(0,2,1)
        features=self.conv(x_embedded)
        
        remaining_size=features.size(dim=2)
        features=F.avg_pool1d(features,remaining_size).squeeze(dim=2)
        features = F.dropout(features, p=self.dropout_p)
        
        intermediate_vector = F.relu(F.dropout(self.fc1(features), p=self.dropout_p))
        prediction_vector = self.fc2(intermediate_vector)
        
        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim=1)
        return prediction_vector

In [7]:
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}
def update_train_state(args, model, train_state):
    
    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # If loss worsened
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state
def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [8]:
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

def load_glove_from_file(glove_filepath):
    
    word_to_index={}
    embeddings=[]
    
    with open(glove_filepath,'r',encoding='utf-8') as fp:
        for index,line in enumerate(fp):
            line=line.split(' ')
            word_to_index[line[0]]=index
            embedding_i=np.array([float(val) for val in line[1:]])
            embeddings.append(embedding_i)
        return word_to_index,np.stack(embeddings)
def make_embedding_matrix(glove_filepath, words):
    word_to_idx, glove_embeddings = load_glove_from_file(glove_filepath)
    embedding_size=glove_embeddings.shape[1]
    
    final_embeddings=np.zeros((len(words),embedding_size))
    
    for i,word in enumerate(words):
        if word in word_to_idx:
            final_embeddings[i, :] = glove_embeddings[word_to_idx[word]]
        else:
            embedding_i = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(embedding_i)
            final_embeddings[i, :] = embedding_i

    return final_embeddings

In [33]:
from argparse import Namespace
args = Namespace(
    # Data and Path hyper parameters
    news_csv="data/news_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="model_storage/document_classification",
    # Model hyper parameters
    glove_filepath='data/glove/glove.6B.100d.txt', 
    use_glove=False,
    embedding_size=100, 
    hidden_dim=100, 
    num_channels=100, 
    # Training hyper parameter
    seed=1337, 
    learning_rate=0.001, 
    dropout_p=0.1, 
    batch_size=128, 
    num_epochs=100, 
    early_stopping_criteria=5, 
    # Runtime option
    cuda=True, 
    catch_keyboard_interrupt=True, 
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
) 
if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))
    
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False
    
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/document_classification\vectorizer.json
	model_storage/document_classification\model.pth
Using CUDA: True


In [10]:
args.use_glove = True
#如果是重新加载
if args.reload_from_files:
    dataset = NewsDataset.load_dataset_and_load_vectorizer(args.news_csv,
                                                              args.vectorizer_file)
else:
    #创造数据库
    dataset = NewsDataset.load_dataset_and_make_vectorizer(args.news_csv)
    dataset.save_vectorizer(args.vectorizer_file)
#获取向量化工具类
vectorizer = dataset.get_vectorizer()

#判断是否使用预训练词向量
if args.use_glove:
    words=vectorizer.title_vocab.token_to_idx.keys()
    embeddings=make_embedding_matrix(args.glove_filepath,words=words)
    print("Using pre-trained embeddings")
else:
    print("Not using pre-trained embeddings")
    embeddings = None

    
classifier = NewsClassifier(embedding_size=args.embedding_size, 
                            num_embeddings=len(vectorizer.title_vocab),
                            num_channels=args.num_channels,
                            hidden_dim=args.hidden_dim, 
                            num_classes=len(vectorizer.category_vocab), 
                            dropout_p=args.dropout_p,
                            pretrained_embeddings=embeddings,
                            padding_idx=0)

Using pre-trained embeddings


In [11]:
len(vectorizer.title_vocab)

3297

In [12]:
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
    
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm_notebook(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm_notebook(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm_notebook(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)
try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on

        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, 
                                  epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            # compute the output
            y_pred =  classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

HBox(children=(IntProgress(value=0, description='training routine', style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='split=train', max=656, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='split=val', max=140, style=ProgressStyle(description_width='i…

In [31]:
classifier.load_state_dict(torch.load(train_state['model_filename']))

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred =  classifier(batch_dict['x_data'])
    
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [32]:
print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

Test loss: 0.5621352142521316;
Test Accuracy: 82.91852678571425


In [None]:
def preprocess_text(text):
    text = ' '.join(word.lower() for word in text.split(" "))
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text
#用来推断信
def predict_category(title, classifier, vectorizer, max_length):