In [1]:
import numpy as np
import pandas as pd
import os
import time
import random
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from keras.preprocessing import text, sequence
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F
from argparse import Namespace

Using TensorFlow backend.


In [2]:
data=pd.read_csv('./data/news_with_splits.csv')

In [3]:
data.head()

Unnamed: 0,category,split,title
0,Business,train,"jobs , tax cuts key issues for bush"
1,Business,train,jarden buying mr . coffee s maker
2,Business,train,retail sales show festive fervour
3,Business,train,intervoice s customers come calling
4,Business,train,boeing expects air force contract


In [4]:
categories=set(data.category)
categories

{'Business', 'Sci/Tech', 'Sports', 'World'}

In [5]:
results={}
for i,value in  enumerate(categories):
    results[value]=i

In [6]:
results

{'World': 0, 'Sports': 1, 'Sci/Tech': 2, 'Business': 3}

In [7]:
data['category']=data['category'].map(results).astype(int)

In [8]:
data.head()

Unnamed: 0,category,split,title
0,3,train,"jobs , tax cuts key issues for bush"
1,3,train,jarden buying mr . coffee s maker
2,3,train,retail sales show festive fervour
3,3,train,intervoice s customers come calling
4,3,train,boeing expects air force contract


In [9]:
len(data)

120000

In [10]:
from torch.utils.data import Dataset, DataLoader

In [11]:
class NewsDataset(Dataset):
    
    def __init__(self,tokenizer,data):
        
        self.tokenizer=tokenizer
        self.data=data
        
        #获取文本的最长长度
        measure_len=lambda context:len(context.split(' '))
        self.max_seq_length=max(map(measure_len,self.data.title))
        
        
        #获取各种类型数据的数据集
        train_data=data[data.split=='train']
        val_data=data[data.split=='val']
        test_data=data[data.split=='test']
        
        #获取各个样本的x转化为index,
        x_train = tokenizer.texts_to_sequences(list(train_data.title))
        x_val=tokenizer.texts_to_sequences(list(val_data.title))
        x_test = tokenizer.texts_to_sequences(list(test_data.title))
        
        #pad,即获取各数据集的x_data
        self.x_train = sequence.pad_sequences(x_train, maxlen=self.max_seq_length)
        self.x_val=sequence.pad_sequences(x_val,maxlen=self.max_seq_length)
        self.x_test = sequence.pad_sequences(x_test, maxlen=self.max_seq_length)
        
        #获取y值
        self.y_train=list(train_data.category)
        self.y_val=list(val_data.category)
        self.y_test=list(test_data.category)
        
        #获取各样本的长度
        self.train_size=len(train_data)
        self.val_size=len(val_data)
        self.test_size=len(test_data)
        
        #建立数据集字典
        self.lookup_dict={
            'train':(self.x_train,self.y_train,self.train_size),
            'val':(self.x_val,self.y_val,self.val_size),
            'test':(self.x_test,self.y_test,self.test_size)
        }
        
        #默认训练数据
        self.set_split('train')
    
    #设置数据集
    def set_split(self,split='train'):
        self.target_x,self.target_y,self.target_size=self.lookup_dict[split]
        
    #计算多少个batch
    def get_num_batches(self,batch_size):
        return len(self)//batch_size
    
    def __len__(self):
        return self.target_size
    
    def __getitem__(self,index):
        x_data=self.target_x[index]
        y_data=self.target_y[index]
        
        x_data=torch.tensor(x_data,dtype=torch.long)
        y_data=torch.tensor(y_data,dtype=torch.long)
        return {
            'x_data':x_data,
            'y_target':y_data
        }

In [12]:
class NewsClassifier(nn.Module):
    def __init__(self,embedding_matrix,max_features,num_channels,hidden_dim,num_classes,dropout_p):
        super(NewsClassifier,self).__init__()
        embed_size = embedding_matrix.shape[1]
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.conv=nn.Sequential(
            nn.Conv1d(embed_size,num_channels,3),
            nn.ELU(),
            nn.Conv1d(num_channels,num_channels,3,stride=2),
            nn.ELU(),
            nn.Conv1d(num_channels,num_channels,3,stride=2),
            nn.ELU(),
            nn.Conv1d(num_channels,num_channels,3),
            nn.ELU()
        )
        self.dropout_p=dropout_p
        
        self.fc1=nn.Linear(num_channels,hidden_dim)
        self.fc2=nn.Linear(hidden_dim,num_classes)
        
    def  forward(self,x_in,apply_softmax=False):
        
        #permute 变化size
        x_embedded=self.embedding(x_in).permute(0,2,1)
        features=self.conv(x_embedded)
        
        remaining_size=features.size(dim=2)
        features=F.avg_pool1d(features,remaining_size).squeeze(dim=2)
        features = F.dropout(features, p=self.dropout_p)
        
        intermediate_vector = F.relu(F.dropout(self.fc1(features), p=self.dropout_p))
        prediction_vector = self.fc2(intermediate_vector)
        
        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim=1)
        return prediction_vector

In [13]:
tokenizer=text.Tokenizer()
tokenizer.fit_on_texts(list(data.title))
newsDataset=NewsDataset(tokenizer,data)

In [14]:
dataloader=DataLoader(dataset=newsDataset,batch_size=128,shuffle=True,drop_last=True)
for data_dict in dataloader:
    print(data_dict['x_data'].type())
    print(data_dict['y_target'].type())
    break

torch.LongTensor
torch.LongTensor


In [15]:
#用来生成数据
def generate_batches(dataset,batch_size,shuffle=True,drop_last=True,device='cpu'):
        dataloader=DataLoader(dataset=dataset,batch_size=batch_size,shuffle=shuffle,drop_last=drop_last)
        
        for data_dict in dataloader:
            out_data_dict={}
            
            for name,tensor in data_dict.items():
                out_data_dict[name]=data_dict[name].to(device)
            yield out_data_dict
#创作训练状态
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}
#更新训练状态
def update_train_state(args, model, train_state):
    
    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # If loss worsened
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state
#计算准确率
def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

#
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)
#文件夹处理
def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

In [16]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')
#加载普通txt和vec预训练词向量
def load_embeddings_normal(path):
    with open(path,'r',encoding='utf-8') as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in tqdm(f))
    
#建议英文文本时，不要将所有文本转化为小写，这可能丢失许多信息
#根据word_index 建立index_word矩阵，如果当前word在词中找不到对应的向量，转化为小写
def build_matrix(word_index, path,embedding_size):
    embedding_index = load_embeddings_normal(path)
    embedding_matrix = np.zeros((max_features + 1, embedding_size))
    unknown_words = []
    
    for word, i in word_index.items():
        if i <= max_features:
            try:
                embedding_matrix[i] = embedding_index[word]
            except KeyError:
                try:
                    embedding_matrix[i] = embedding_index[word.lower()]
                except KeyError:
                    try:
                        embedding_matrix[i] = embedding_index[word.title()]
                    except KeyError:
                        unknown_words.append(word)
    return embedding_matrix, unknown_words

In [17]:
#训练参数
args = Namespace(
    # Data and Path hyper parameters
    news_csv="data/news_with_splits.csv",
    model_state_file="model.pth",
    save_dir="model_storage/document_classification",
    # Model hyper parameters
    glove_filepath='data/glove/glove.6B.100d.txt', 
    embedding_size=100, 
    hidden_dim=100, 
    num_channels=100, 
    # Training hyper parameter
    seed=1337, 
    learning_rate=0.001, 
    dropout_p=0.1, 
    batch_size=128, 
    num_epochs=100, 
    early_stopping_criteria=5, 
    # Runtime option
    cuda=True, 
    expand_filepaths_to_save_dir=True
) 

#如果选择创建文件夹
if args.expand_filepaths_to_save_dir:
   
    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)    
    print("Expanded filepaths: ")
    print("\t{}".format(args.model_state_file))
# 检查CUDA
if not torch.cuda.is_available():
    args.cuda = False
    
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/document_classification\model.pth
Using CUDA: True


In [18]:
max_features = None
max_features = max_features or len(tokenizer.word_index) + 1

In [19]:
#建立词向量矩阵
glove_matrix, unknown_words_glove = build_matrix(tokenizer.word_index, args.glove_filepath,args.embedding_size)
print('n unknown words (crawl): ', len(unknown_words_glove))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


n unknown words (crawl):  1682


In [20]:
max_features

32183

In [21]:
#创建模型
classifier=NewsClassifier(embedding_matrix=glove_matrix,max_features=max_features,num_channels=args.num_channels,
                         hidden_dim=args.hidden_dim,
                         num_classes=4,
                         dropout_p=args.dropout_p
                         )

In [22]:
#查看模型结构
print(classifier)

NewsClassifier(
  (embedding): Embedding(32183, 100)
  (conv): Sequential(
    (0): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
    (1): ELU(alpha=1.0)
    (2): Conv1d(100, 100, kernel_size=(3,), stride=(2,))
    (3): ELU(alpha=1.0)
    (4): Conv1d(100, 100, kernel_size=(3,), stride=(2,))
    (5): ELU(alpha=1.0)
    (6): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
    (7): ELU(alpha=1.0)
  )
  (fc1): Linear(in_features=100, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=4, bias=True)
)


In [23]:
import torch.optim as optim
print(args.device)
classifier=classifier.to(args.device)
loss_func=nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_state = make_train_state(args)
epoch_bar = tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

newsDataset.set_split('train')
train_bar = tqdm(desc='split=train',
                          total=newsDataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
newsDataset.set_split('val')
val_bar = tqdm(desc='split=val',
                        total=newsDataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)
try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index
        
        newsDataset.set_split('train')
        batch_generator = generate_batches(newsDataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()
        
        
        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, 
                                  epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)
        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        newsDataset.set_split('val')
        batch_generator = generate_batches(newsDataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            # compute the output
            y_pred =  classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

cuda


HBox(children=(IntProgress(value=0, description='training routine', style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='split=train', max=656, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='split=val', max=140, style=ProgressStyle(description_width='i…