In [33]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchtext
from torchtext.vocab import FastText
import pkuseg
import pandas as pd
import numpy as np

In [2]:
WORD_VEC_DIM = 300
BATCH = 128

In [3]:
totalpath = '../data/movie_remiew/train_data.csv'
testpath = '../data/movie_remiew/test_data.csv'

In [4]:
totaldf = pd.read_csv(totalpath)
totaldf.head(3)

Unnamed: 0,label,comment
0,0,国王的工作就是读几句稿子啊
1,0,小朋友看嫌复杂大朋友看想快进的尴尬人物似曾相识唯一的泪点也是复制黏贴
2,0,一个非常丰富传奇的故事拍得这么浅薄大家是怎么给出五星的好奇


In [5]:
totaldf.label.value_counts()

1    24960
0    19108
Name: label, dtype: int64

In [6]:
totaldf = totaldf.sample(frac=1).reset_index(drop=True)

In [7]:
totaldf.head()

Unnamed: 0,label,comment
0,1,好看治愈温情搞笑梦工厂的作品一直都很好看个人非常喜欢
1,1,确实和标签里写的一样有思考价值这部电影低成本的低到普通人也可以拍了但是里面蕴涵的深度比大片可牛多了
2,1,小时候老妈带着去电影院看的感动
3,1,修复版大漠孤烟英雄儿女刀光剑影江湖旧梦一直被模仿从未被超越
4,1,上座率非常高看完大家都没走一直到序幕结束才有序离开


In [8]:
validdf = totaldf.iloc[:4000]
traindf = totaldf.iloc[4000:]

validdf.to_csv('../data/movie_remiew/validset.csv', index=False)
traindf.to_csv('../data/movie_remiew/trainset.csv', index=False)

In [9]:
testdf = pd.read_csv(testpath)
testdf.head(3)

Unnamed: 0,comment
0,画面很美但台词实在是太太太矫情不是十几岁少年的正常青涩哪怕故作的沧桑而像没文化的油腻中年对键...
1,这片子好看好看好看好看重要的问题问四遍竟然是前百失望致幻的嗨药是弱者逃避现实的选择你觉得现实...
2,老电影有老电影说不出来的美感即使是近三个小时的电影细节做的仍非精致七武士的故事人类百科


In [10]:
testdf.shape[0]

9999

In [11]:
with open('./stopwords.txt', 'r') as f:
    lines = f.readlines()
    stopwords = [word.strip() for word in lines]

In [55]:
seg = pkuseg.pkuseg()

In [13]:
TEXT = torchtext.data.Field(init_token='<sos>', eos_token='<eos>', tokenize=seg.cut, stop_words=stopwords)
LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

In [14]:
train, valid = torchtext.data.TabularDataset.splits(path='../data/movie_remiew/', 
                                                    train='trainset.csv', 
                                                    validation='validset.csv',
                                                    format='csv', 
                                                    skip_header=True,
                                                    fields=[('label', LABEL), 
                                                            ('text', TEXT),])

In [15]:
test = torchtext.data.TabularDataset(path='../data/movie_remiew/test_data.csv', 
                                    format='csv', 
                                    skip_header=True,
                                    fields=[('text', TEXT),])

In [16]:
vectors = FastText(language='zh')

In [17]:
TEXT.build_vocab(train, vectors=vectors)

In [18]:
len(TEXT.vocab)

56927

In [19]:
TEXT.vocab.vectors.size()

torch.Size([56927, 300])

In [20]:
class TextDataset(Dataset):
    def __init__(self, tabularDataset, filed):
        super().__init__()
        self.tabular = tabularDataset
        self.filed = filed
        
    def __getitem__(self, index):
        text = self.tabular[index].text
        label = getattr(self.tabular[index], 'label', None)
        text_index = [self.filed.vocab.stoi[word] for word in text]
        
        return torch.tensor(text_index), torch.tensor(int(label)) if label is not None else None
    
    def __len__(self):
        return len(self.tabular)

In [21]:
# trainiter, validiter =  torchtext.data.BucketIterator.splits((train, valid), batch_sizes=(64, 64))

In [22]:
trainset = TextDataset(train, TEXT)
validset = TextDataset(valid, TEXT)
testset = TextDataset(test, TEXT)

In [23]:
def collate(batch):
    texts = [item[0] for item in batch]
    
    if batch[0][1] is None:
        labels = None
    else:
        labels = torch.tensor([item[1] for item in batch])
    offsets = [0] + [len(text) for text in texts][:-1]
    offsets = torch.tensor(offsets).cumsum(dim=0)
    texts = torch.cat(texts)
    
    return texts, labels, offsets

In [25]:
trainloader = DataLoader(trainset, batch_size=BATCH, shuffle=True, num_workers=2, collate_fn=collate)
validloader = DataLoader(validset, batch_size=BATCH, shuffle=False, num_workers=2, collate_fn=collate)
testloader = DataLoader(testset, batch_size=BATCH, shuffle=False, num_workers=2, collate_fn=collate)

In [26]:
class Net(nn.Module):
    def __init__(self, vectors):
        super().__init__()
        vocab_size = vectors.size(0)
        vec_dim = vectors.size(1)
        self.embedding = nn.EmbeddingBag(vocab_size, vec_dim)
        self.fc = nn.Linear(vec_dim, 1)
        self.init_weights(vectors)
        
    def init_weights(self, vectors):
        self.embedding.weight.data.copy_(vectors)
        self.fc.weight.data.uniform_(-0.5, 0.5)
        self.fc.bias.data.zero_()
        
    def forward(self, x, offsets):
        x = self.embedding(x, offsets)
        x = self.fc(x)
        return x

In [27]:
net = Net(TEXT.vocab.vectors)
net.embedding.requires_grad = False

In [28]:
def train(net, trainloader, epochs, lr=0.01):
    net.train()
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(net.parameters())
    
    for epoch in range(epochs):
        for step, (texts, labels, offsets) in enumerate(trainloader):
            labels = labels.unsqueeze(1).to(dtype=torch.float32)
            
            outputs = net(texts, offsets)
            loss = loss_fn(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if step % 100 == 99:
                with torch.no_grad():
                    accuracy = ((torch.sigmoid(outputs).numpy() > 0.5) == labels.numpy()).sum() / BATCH
                
                print(f"loss: {loss.item():.4f}    \taccuracy: {accuracy:.4f}")

In [45]:
train(net, trainloader, 5, 0.001)

loss: 0.1916    	accuracy: 0.9375
loss: 0.1711    	accuracy: 0.9609
loss: 0.1781    	accuracy: 0.9609
loss: 0.1164    	accuracy: 0.9766
loss: 0.1894    	accuracy: 0.9297
loss: 0.1732    	accuracy: 0.9609
loss: 0.1725    	accuracy: 0.9453
loss: 0.1710    	accuracy: 0.9141
loss: 0.1516    	accuracy: 0.9297
loss: 0.1587    	accuracy: 0.9453
loss: 0.1493    	accuracy: 0.9375
loss: 0.1663    	accuracy: 0.9375
loss: 0.2602    	accuracy: 0.9219
loss: 0.1165    	accuracy: 0.9531
loss: 0.1112    	accuracy: 0.9766


In [46]:
def evaluate(net, loader):
    net.eval()
    
    total = 0
    corrects = 0
    with torch.no_grad():
        for texts, labels, offsets in loader:
            outputs = net(texts, offsets)
            corrects += ((torch.sigmoid(outputs).numpy() > 0.5) == labels.numpy()).sum()
            total += outputs.size(0)
            
    return corrects / total

In [47]:
evaluate(net, validloader)

65.388

In [48]:
def predict(net, loader):
    net.eval()
    
    predictions = []
    with torch.no_grad():
        for texts, _, offsets in loader:
            outputs = net(texts, offsets)
            batch_pre = torch.sigmoid(outputs)
            predictions.append((batch_pre.numpy() > 0.5).squeeze().astype(np.int))
            
    return np.concatenate(predictions)

In [49]:
predictions = predict(net, testloader)

In [50]:
df_submit = pd.DataFrame(data=predictions, columns=['label'])

In [51]:
df_submit.head(3)

Unnamed: 0,label
0,0
1,1
2,0


In [52]:
df_submit.to_csv('../data/movie_remiew/zhangshulin_2019_9_4.csv', index=False)