In [1]:
# 作业要求：接触NLP 当中一个简单的task —— 语句分类（文本分类），给定一个语句，判断他有没有恶意（负面标1，正面标0）
# 环境：Anaconda3+Jupyter Notebook，Python 3.6.8

In [2]:
# 1.utils.py
# 这个block用来定义一下用到的函数
import warnings
warnings.filterwarnings('ignore')# 过滤警告
import torch # 深度学习框架
import numpy as np # 支持大量的维度数组与矩阵运算
import pandas as pd # 用于数据分析
import torch.optim as optim # 主要包含用来更新参数的优化算法
import torch.nn.functional as F # 主要包含用来搭建各个层的模块和一系列有用的loss函数

# 把training 时需要的data读取出来
def load_training_data(path='training_label.txt'): 
    # 如果是‘training_label.txt',需要读取label；如果是’training_nolabel.txt'，不需要读取label
    if 'training_label' in path:
        with open(path,'r',encoding='utf-8') as f:
            lines=f.readlines()
            lines=[line.strip('\n').split(' ') for line in lines] 
            # line为lines中删除开头结尾处的‘\n’并按照‘  ’分割
        x=[line[2:] for line in lines]
        y=[line[0] for line in lines]
        return x,y
    else:
        with open(path,'r',encoding='utf-8') as f:
            lines=f.readlines()
            x=[line.strip('\n').split(' ') for line in lines]
        return x
    
# 把 testing 时需要的data读取出来    
def load_testing_data(path='testing_data.txt'):
    with open(path,'r',encoding='utf-8') as f:
        lines=f.readlines()
        # 将lines中的line删除开头结尾处的‘\n’并按‘ ’分割加入新的空字符串中，再删除空白符
        X=["".join(line.strip('\n').split(',')[1:]).strip() for line in lines]
        # 取出X中的元素并以‘ ’分割
        X=[sen.split(' ') for sen in X]
    return X

# outputs  => 概率(float)  labels   => labels
def evaluation(outputs,labels):
    # 大于等于0.5即为正面
    outputs[outputs>=0.5]=1
    # 小于0.5为负面
    outputs[outputs<0.5]=0 
    # torch.sum(input)对输入的tensor数据的某一维度求和
    # torch.eq(input,other)比较元素是否相等，other可以是一个数或者是一个和input同类型形状的张量
    #        返回结果是一个torch.ByteTensor张量，包含了每个位置的比较结果（相等为1，不等为0）
    # correct即为将outputs和labels比较后的形成的tensor中的元素
    correct=torch.sum(torch.eq(outputs,labels)).item()
    return correct

In [3]:
# 2.w2v.py 
# 这个block将训练word变成vector做word embedding
import os
import numpy as np
import pandas as pd
import argparse # python命令行解析模块
from gensim.models import Word2Vec  #用于初始化并训练一个单词到向量转换

# 训练 word to vector 的 word embedding词嵌入
def train_word2vec(x):    
    # 参数说明：
    # x---语料集，size---特征向量的维度，
    # window---表示当前词与预测词在一个句子中最大的距离
    # min_count---对字典做截断，词频少于min_count次数的单词会被丢弃掉
    # workers---控制训练的并行数
    # iter---迭代次数
    # sg---设置训练算法 默认为0---CBOW算法，sg=1则采用skip-gram算法
    model = Word2Vec(x,size=250,window=5,min_count=5,workers=12,iter=10,sg=1)
    return model

# 此部分的代码入口在此
if __name__=="__main__": 
    # 加载训练数据
    print("loading training data ...")
    train_x,y=load_training_data('training_label.txt')
    train_x_no_label=load_training_data('training_nolabel.txt')
    
    # 加载测试数据
    print("loading testing data ...")
    test_x = load_testing_data('testing_data.txt')
    
    # 将word转换成vector
    #model = train_word2vec(train_x+train_x_no_label+test_x)
    model=train_word2vec(train_x+test_x)
    
    print("saving model ...")
    #model.save(os.path.join('./','model/w2v_all.model'))
    model.save(os.path.join('./','w2v_all.model'))

loading training data ...
loading testing data ...
saving model ...


In [4]:
# preprocess.py
# 这个block用来做data的预处理
from torch import nn
from gensim.models import Word2Vec

class Preprocess():
    def __init__(self, sentences, sen_len, w2v_path="./w2v.model"):
        self.w2v_path = w2v_path # word2vetor模型路径
        self.sentences = sentences # 句子
        self.sen_len = sen_len # 句子长度
        self.idx2word = [] # idx 到 单词
        self.word2idx = {} # 单词 到 inx
        self.embedding_matrix = [] # 词嵌入
    def get_w2v_model(self):
        # 把之前训练好的 word to vector 模型读取出来
        self.embedding = Word2Vec.load(self.w2v_path)# 加载模型
        self.embedding_dim = self.embedding.vector_size # 获取维度
    def add_embedding(self, word):
        # 把 word 加进 embedding，并赋予他一个随机生成的represrntation vector
        # word 只会是“<PAD>”或者“<UNK>”
        vector = torch.empty(1, self.embedding_dim)
        # 创建一个未被初始化数值的tensor，形状是是size（1，词嵌入的维度）
        torch.nn.init.uniform_(vector)
        # 从均匀分布（1，embedding_dim)中生成值
        self.word2idx[word] = len(self.word2idx)
        # 将长度放入self.word2idx中
        self.idx2word.append(word)
        # 将word添加到self.idx2word的末尾
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)
        # 竖着将 embedding_matrix和vector拼接起来
    def make_embedding(self, load=True):
        print("Get embedding ...")
        # 取得训练好的 word2vex word embedding
        if load:
            print("loading word to vec model ...")
            self.get_w2v_model()
        else:
            raise NotImplementedError
        # 制作一个 word2idx的字典
        # 制作一个 idx2word的列表
        # 制作一个 word2vector的列表
        for i, word in enumerate(self.embedding.wv.vocab):
        # enumerate()用于将词嵌入中单词组合为一个索引序列，同时列出数据和数据下标
            print('get words #{}'.format(i+1), end='\r')
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(self.embedding[word])
        print('')
        self.embedding_matrix = torch.tensor(self.embedding_matrix)
         # 将“<PAD>”和“<UNK>”加入embedding中
        self.add_embedding("<PAD>")
        self.add_embedding("<UNK>")
        print("total words: {}".format(len(self.embedding_matrix)))
        return self.embedding_matrix
    def pad_sequence(self, sentence):
         # 将每个句子变成一样的长度
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]
            # 如果句子长则截断
        else:
            pad_len = self.sen_len - len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2idx["<PAD>"])
        assert len(sentence) == self.sen_len
        #句子长度相等正常运行，句子长度不等触发异常
        return sentence
    def sentence_word2idx(self):
       # 把句子里面的字转成相对应的 index
        sentence_list = []
        for i, sen in enumerate(self.sentences):
            print('sentence count #{}'.format(i+1), end='\r')
            sentence_idx = []
            for word in sen:
                if (word in self.word2idx.keys()):
                    sentence_idx.append(self.word2idx[word])
                    # 单词在句子中则将其放入索引之中
                else:
                    sentence_idx.append(self.word2idx["<UNK>"])
            # 单词不在句子中则在索引中放入未知的标识
            # 将每个句子变成一样的长度
            sentence_idx = self.pad_sequence(sentence_idx)
            sentence_list.append(sentence_idx)
            # 将句子索引添加到列表的末尾
        return torch.LongTensor(sentence_list)
    def labels_to_tensor(self, y):
        # 把 labels 轉成 tensor
        y = [int(label) for label in y]
        return torch.LongTensor(y)


In [5]:
# data.py
# 这部分所需要的“__init__","__getitem__","__len__"好让dataloader能使用
import torch
from torch.utils import data

class TwitterDataset(data.Dataset):
    """
    期望的数据形状大致像：（数据数量，数据长度）
    数据可以是numpy数组的list或者列表的list
    输入数据的形状：（数据数量，序列长度，特征维度）
    __len__将返回数据的数量
    """
    def __init__(self, X, y):
        self.data = X
        self.label = y
    def __getitem__(self, idx):
        if self.label is None: return self.data[idx]
        return self.data[idx], self.label[idx]
    def __len__(self):
        return len(self.data)

In [6]:
# model.py
# 这个bolck是要拿来训练的模型
import torch
from torch import nn
class LSTM_Net(nn.Module):
    def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout=0.5, fix_embedding=True):
        super(LSTM_Net, self).__init__()
        # 制作 embedding layer
        self.embedding = torch.nn.Embedding(embedding.size(0),embedding.size(1))
        self.embedding.weight = torch.nn.Parameter(embedding)
        # 是否将embedding fix住，如果fix_embedding 为 False，在训练过程中，embedding也会跟着被训练
        self.embedding.weight.requires_grad = False if fix_embedding else True
        self.embedding_dim = embedding.size(1)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.classifier = nn.Sequential( nn.Dropout(dropout),
                                         nn.Linear(hidden_dim, 1),
                                         nn.Sigmoid() )
    def forward(self, inputs):
        inputs = self.embedding(inputs)
        x, _ = self.lstm(inputs, None)
        # x 的 dimension (batch, seq_len, hidden_size)
        # 取用LSTM最后一层的隐藏状态
        x = x[:, -1, :] 
        x = self.classifier(x)
        return x

In [7]:
# train.py
# 这个block是用来训练模型的
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F

def training(batch_size, n_epoch, lr, model_dir, train, valid, model, device):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('\nstart training, parameter total:{}, trainable:{}\n'.format(total, trainable))
    model.train() # 将model的模式设为train，这样optimiser就看也i更新model的参数
    criterion = nn.BCELoss() # 定义算是函数 这里使用binary cross entropy
    t_batch = len(train) 
    v_batch = len(valid) 
    optimizer = optim.Adam(model.parameters(), lr=lr)  # 将模型的参数给optimizer，并给与适当的learning rate
    total_loss, total_acc, best_acc = 0, 0, 0
    for epoch in range(n_epoch):
        total_loss, total_acc = 0, 0
        # 這段做 training
        for i, (inputs, labels) in enumerate(train):
            inputs = inputs.to(device, dtype=torch.long) 
            # device 为 “cuda”，将inputs转成torch.cuda.LongTensor
            labels = labels.to(device, dtype=torch.float) 
            # device 为“cuda”，将labels转成torch.duda.FloatTensor
            optimizer.zero_grad() 
            # 由于loss.backward() 的 gradient 会累加，所以每次喂完一个batch后需要归零
            outputs = model(inputs)  
            # 将input 喂给模型
            outputs = outputs.squeeze()  
            # 去掉最外面的dimension，好让outputs 可以喂进criterion()
            loss = criterion(outputs, labels) 
            # 计算此时模型的training loss
            loss.backward() 
            # 计算loss的gradient
            optimizer.step() 
            # 更新训练模型的参数
            correct = evaluation(outputs, labels) 
            # 计算此时模型的training accuracy
            total_acc += (correct / batch_size)
            total_loss += loss.item()
            print('[ Epoch{}: {}/{} ] loss:{:.3f} acc:{:.3f} '.format(
            	epoch+1, i+1, t_batch, loss.item(), correct*100/batch_size), end='\r')
        print('\nTrain | Loss:{:.5f} Acc: {:.3f}'.format(total_loss/t_batch, total_acc/t_batch*100))

        # 这段做 validation
        model.eval() #将model的模式设为eval，这样model的参数就会固定住
        with torch.no_grad():
            total_loss, total_acc = 0, 0
            for i, (inputs, labels) in enumerate(valid):
                inputs = inputs.to(device, dtype=torch.long) 
                # decice为“cude",将input转成torch.cuda.LongTensor
                labels = labels.to(device, dtype=torch.float) 
                # device为"cude",将labels转成torch cuda.FloatTensor
                outputs = model(inputs) 
                # 將 input 放入模型中
                outputs = outputs.squeeze()  
                # 去掉最外面的 dimension，好让 outputs 可以放进 criterion()
                loss = criterion(outputs, labels)  
                # 计算此时模型的 validation loss
                correct = evaluation(outputs, labels)  
                # 计算此时模型的 validation accuracy
                total_acc += (correct / batch_size)
                total_loss += loss.item()

            print("Valid | Loss:{:.5f} Acc: {:.3f} ".format(total_loss/v_batch, total_acc/v_batch*100))
            if total_acc > best_acc:
                 # 如果validation的结果优于之前的结果，就把当下的模型保存下来以备之后做预测时使用
                best_acc = total_acc
                #torch.save(model, "{}/val_acc_{:.3f}.model".format(model_dir,total_acc/v_batch*100))
                torch.save(model, "{}/ckpt.model".format(model_dir))
                print('saving model with acc {:.3f}'.format(total_acc/v_batch*100))
        print('-----------------------------------------------')
        model.train() # 将model的模式设为train，这样optimizer就可以更新model的参数（因为刚刚转成eval模式）

In [8]:
# test.py
# 这个 block 用来对testing_data.txt做预测
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F

def testing(batch_size, test_loader, model, device):
    model.eval()
    ret_output = []
    with torch.no_grad():
        for i, inputs in enumerate(test_loader):
            inputs = inputs.to(device, dtype=torch.long)
            outputs = model(inputs)
            outputs = outputs.squeeze()
            outputs[outputs>=0.5] = 1 # 大于等于0.5为正面
            outputs[outputs<0.5] = 0 # 小于0.5为负面
            ret_output += outputs.int().tolist()
    
    return ret_output

In [9]:
path_prefix = './'
# main.py
import os
import torch
import argparse
import numpy as np
from torch import nn
from gensim.models import word2vec
from sklearn.model_selection import train_test_split

# 通过 torch.cuda.is_available() 的回传值进行判断是否有使用 GPU 的环境，如果有的话 device 就设为 "cuda"，没有的话就设为 "cpu"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 处理好各个 data 的路径
train_with_label = os.path.join(path_prefix, 'training_label.txt')
train_no_label = os.path.join(path_prefix, 'training_nolabel.txt')
testing_data = os.path.join(path_prefix, 'testing_data.txt')

w2v_path = os.path.join(path_prefix, 'w2v_all.model') # 處理 word to vec model 的路徑

# 定义句子长度、要不要固定 embedding、batch 大小、要训练几个 epoch、learning rate 的值、model 的资料夹路径
sen_len = 20
fix_embedding = True # 训练时fix住embedding
batch_size = 128
epoch = 5
lr = 0.001
model_dir = path_prefix # 检查点模型的目录

print("loading data ...") 
# 把 'training_label.txt' 跟 'training_nolabel.txt' 读进来
train_x, y = load_training_data(train_with_label)
train_x_no_label = load_training_data(train_no_label)

# 对 input 跟 labels 做预处理
preprocess = Preprocess(train_x, sen_len, w2v_path=w2v_path)
embedding = preprocess.make_embedding(load=True)
train_x = preprocess.sentence_word2idx()
y = preprocess.labels_to_tensor(y)

# 制作一个 model 的对象
model = LSTM_Net(embedding, embedding_dim=250, hidden_dim=150, num_layers=1, dropout=0.5, fix_embedding=fix_embedding)
model = model.to(device) 
# device为 "cuda"，model 使用 GPU 来训练（喂进去的 inputs 也需要是 cuda tensor）

# 把 data 分为 training data 跟 validation data（将一部份 training data 拿去当作 validation data）
X_train, X_val, y_train, y_val = train_x[:180000], train_x[180000:], y[:180000], y[180000:]

# 把 data 做成 dataset 供 dataloader 取用
train_dataset = TwitterDataset(X=X_train, y=y_train)
val_dataset = TwitterDataset(X=X_val, y=y_val)

# 把 data 转成 batch of tensors
train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                            batch_size = batch_size,
                                            shuffle = True,
                                            num_workers = 0)

val_loader = torch.utils.data.DataLoader(dataset = val_dataset,
                                            batch_size = batch_size,
                                            shuffle = False,
                                            num_workers = 0)

# 开始训练
training(batch_size, epoch, lr, model_dir, train_loader, val_loader, model, device)

loading data ...
Get embedding ...
loading word to vec model ...
get words #24694
total words: 24696
sentence count #200000
start training, parameter total:6415351, trainable:241351

[ Epoch1: 1407/1407 ] loss:0.356 acc:21.094 
Train | Loss:0.49712 Acc: 75.117
Valid | Loss:0.45631 Acc: 77.956 
saving model with acc 77.956
-----------------------------------------------
[ Epoch2: 1407/1407 ] loss:0.526 acc:19.531 
Train | Loss:0.44308 Acc: 79.154
Valid | Loss:0.44218 Acc: 78.856 
saving model with acc 78.856
-----------------------------------------------
[ Epoch3: 1407/1407 ] loss:0.358 acc:20.312 
Train | Loss:0.42594 Acc: 80.178
Valid | Loss:0.43172 Acc: 79.553 
saving model with acc 79.553
-----------------------------------------------
[ Epoch4: 1407/1407 ] loss:0.478 acc:18.750 
Train | Loss:0.41360 Acc: 80.882
Valid | Loss:0.42230 Acc: 80.295 
saving model with acc 80.295
-----------------------------------------------
[ Epoch5: 1407/1407 ] loss:0.568 acc:17.969 
Train | Loss:0.4

In [12]:
# 开始测试模型并做预测
print("loading testing data ...")
test_x = load_testing_data(testing_data)
preprocess = Preprocess(test_x, sen_len, w2v_path=w2v_path)
embedding = preprocess.make_embedding(load=True)
test_x = preprocess.sentence_word2idx()
test_dataset = TwitterDataset(X=test_x, y=None)
test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                            batch_size = batch_size,
                                            shuffle = False,
                                            num_workers = 0)
print('\nload model ...')
model = torch.load(os.path.join(model_dir, 'ckpt.model'))
outputs = testing(batch_size, test_loader, model, device)

tmp = pd.DataFrame({"id":[str(i) for i in range(len(test_x))],"label":outputs})
print("save csv ...")
tmp.to_csv(os.path.join(path_prefix, 'predict.csv'), index=False)
print("Finish Predicting")


loading testing data ...
Get embedding ...
loading word to vec model ...
get words #24694
total words: 24696
sentence count #200001
load model ...
save csv ...
Finish Predicting
