This notebook is inspired by https://www.kaggle.com/abefetterman/pytorch-gru-pooling-with-torchtext/code

https://gist.github.com/ceshine/50a71e266722d0b7b00e2641fc86eb6f

In [1]:
from torchtext import data
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import spacy
import logging

# Load the spaCy model registered under the name 'en'.
NLP = spacy.load('en')
# NOTE(review): presumably a cap on review length in characters — confirm where it is applied.
MAX_CHARS = 20000
# NOTE(review): presumably the fraction of rows held out for validation — confirm against prepare_csv.
VAL_RATIO = 0.2
LOGGER = logging.getLogger("yelp")

# Column / attribute names used when declaring dataset fields and when
# reading values back off a batch object.
id_label = 'id'
text_label = 'text'
stars_label = 'stars'

# File name of the raw review CSV.
yelp_reviews = 'yelp_review.csv'

# embedding_file = 'crawl-300d-2M.vec'

# some iterators produce StopIteration, which is no longer a warning, we don't need to hear about it
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [2]:
%load_ext memory_profiler

In [3]:
# NOTE(review): `%memit` on a line of its own is a line magic with no
# statement, so the figure it prints likely does not cover the vector load
# below; the cell magic `%%memit` may be what was intended — confirm.
%memit
from torchtext.vocab import FastText
# Pretrained fastText vectors for the 'simple' language code; reused later
# when the text vocabulary is built.
vectors = FastText('simple')

peak memory: 328.74 MiB, increment: 0.33 MiB


In [4]:
%memit
# Ordered rewrite rules for clean_str, compiled once.  Replacements are plain
# text: the previous " \( ", " \) " and " \? " templates leaked a literal
# backslash into the output (re.sub keeps unknown non-letter escapes as-is).
_CLEAN_STR_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),  # blank out everything outside the kept set
        (r"\'s", " 's"),                   # split common English clitics off the word
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),                     # surround punctuation with spaces
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),                  # collapse the whitespace runs created above
    )
)


def clean_str(string):
    """
    Tokenization/string cleaning.
    Original from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Applies the rules in ``_CLEAN_STR_RULES`` in order: characters outside
    ``A-Za-z0-9(),!?'` `` become spaces, clitics ('s, 've, n't, 're, 'd, 'll)
    and the punctuation marks ``, ! ( ) ?`` are separated by spaces, and runs
    of whitespace are collapsed.

    Args:
        string: raw text to clean.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace and
        lower-cased.  Parentheses and question marks are emitted without a
        preceding backslash.
    """
    for pattern, replacement in _CLEAN_STR_RULES:
        string = pattern.sub(replacement, string)

    return string.strip().lower()

peak memory: 466.30 MiB, increment: 0.02 MiB


In [5]:
# %memit
# df = pd.read_csv('yelp_review.csv')

In [6]:
# df = df[['text', 'stars']]
# df.head()

In [7]:
def prepare_csv(df, seed=999):
    """Clean the review text and write train/validation CSV files.

    The 'text' column of ``df`` is cleaned with ``clean_str`` in place (the
    caller's DataFrame is modified), the rows are split into a training and a
    validation part, and both parts are written without the index to
    ``yelp_tmp/dataset_train.csv`` and ``yelp_tmp/dataset_val.csv``.

    Args:
        df: DataFrame with at least a 'text' column.
        seed: random state for the split, so repeated runs produce the same
            partition.
    """
    import os  # local import: only needed to create the output directory

    df['text'] = df['text'].apply(clean_str)
    # Use the module-level validation ratio and honour `seed`; previously the
    # ratio was hard-coded and the seed was silently ignored.
    df_train, df_test = train_test_split(df, test_size=VAL_RATIO, random_state=seed)
    # Make sure the target directory exists before writing into it.
    os.makedirs("yelp_tmp", exist_ok=True)
    df_train.to_csv("yelp_tmp/dataset_train.csv", index=False)
    df_test.to_csv("yelp_tmp/dataset_val.csv", index=False)

In [8]:
# %%time
# prepare_csv(df)
# df = 1

In [9]:
import gc
# Trigger a garbage-collection pass right away; gc.collect() returns the
# number of unreachable objects it found.
gc.collect()

0

In [None]:
%memit
# Define all the types of fields
# pip install spacy for the tokenizer to work (or remove to use default)
# TEXT: lower-cased, spaCy-tokenized, padded/truncated to 150 tokens; batches
# carry (token_ids, lengths) because include_lengths=True.
TEXT = data.Field(lower=True, include_lengths=True, fix_length=150, tokenize='spacy')
# The star rating is a single, already-numeric value per row, so the field is
# declared non-sequential: with sequential=True the value would be tokenized
# and, with use_vocab=False, never converted to a number.
LABEL = data.Field(sequential=False, use_vocab=False)

# we use the index field to re-sort test data after processing
INDEX = data.Field(sequential=False)

# Fields are matched to CSV columns by position; the training file is read
# without an id column.
train_fields = [
#     (id_label, INDEX),
    (text_label, TEXT),
    (stars_label, LABEL),
]

train = data.TabularDataset(
    path='yelp_tmp/dataset_train.csv', format='csv', skip_header=True,
    fields=train_fields)

# NOTE(review): prepare_csv writes the validation file from the same DataFrame
# as the training file, yet this list expects an extra leading id column.
# One of the two field lists is presumably misaligned with the CSV layout —
# confirm the column order before trusting ids/text/stars read from here.
test_fields = [
    (id_label, INDEX),
    (text_label, TEXT),
    (stars_label, LABEL),
]
test = data.TabularDataset(
    path='yelp_tmp/dataset_val.csv', format='csv', skip_header=True,
    fields=test_fields)

peak memory: 466.54 MiB, increment: 0.02 MiB


In [None]:
# Value handed to TEXT.build_vocab as its max_size argument.
max_size = 30000

# The id field's vocabulary is built from the validation split only.
INDEX.build_vocab(test)

# The text vocabulary is built over both splits and paired with the
# pretrained vectors loaded earlier.
TEXT.build_vocab(*(train, test), max_size=max_size, vectors=vectors)

# Report how many entries the text vocabulary ended up with.
ntokens = len(TEXT.vocab)
print('ntokens', ntokens)

In [None]:
def _text_length(example):
    """Sort key for bucketing: length of the example's ``text`` attribute."""
    return len(example.text)


# From here on `train` / `test` name the batch iterators, not the datasets.
train = data.BucketIterator(
    train,
    batch_size=32,
    sort_key=_text_length,
    sort_within_batch=True,
    repeat=False,
)
test = data.BucketIterator(
    test,
    batch_size=128,
    sort_key=_text_length,
    sort_within_batch=True,
    train=False,
    repeat=False,
)

def get_text(batch):
    """Return the attribute of ``batch`` named by the module-level ``text_label``."""
    text_field = getattr(batch, text_label)
    return text_field
def get_labels(batch, label_cols=None):
    """Get the labels as one float tensor from the batch object.

    Each named attribute of ``batch`` gets a new axis at position 1 and the
    results are concatenated along that axis.

    Args:
        batch: object exposing one tensor attribute per label column.
        label_cols: iterable of attribute names to stack.  Defaults to the
            single ``stars_label`` column; the function previously read a
            global ``label_cols`` that is not defined in this notebook.

    Returns:
        Float tensor; for 1-D label attributes its shape is
        (batch_size, len(label_cols)).
    """
    if label_cols is None:
        label_cols = (stars_label,)
    columns = [getattr(batch, name).unsqueeze(1) for name in label_cols]
    return torch.cat(columns, dim=1).float()

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable

class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder.

    Pipeline: embedding lookup -> dropout -> a stack of single-layer
    LSTM/GRU modules -> average and max pooling over the time axis ->
    linear decoder on the concatenated pooled features.

    Args:
        rnn_type: 'LSTM' or 'GRU'.
        ntoken: number of rows in the embedding table.
        ninp: embedding dimension.
        nhid: hidden size of each recurrent layer (per direction).
        nout: number of decoder outputs.
        nlayers: number of stacked recurrent modules.
        dropemb: dropout probability applied to the embeddings.
        droprnn: ``dropout`` argument forwarded to every recurrent module.
        bidirectional: whether the recurrent modules are bidirectional.
    """

    def __init__(self, rnn_type, ntoken, ninp, nhid, nout, nlayers, dropemb=0.2, droprnn=0.0, bidirectional=True):
        super(RNNModel, self).__init__()
        self.encoder = nn.Embedding(ntoken, ninp)
        # NOTE(review): Dropout2d is applied to a 3-D embedding tensor; how the
        # dropped slices are chosen depends on the PyTorch version — confirm.
        self.drop = nn.Dropout2d(dropemb)
        self.ndir = 2 if bidirectional else 1
        assert rnn_type in ['LSTM', 'GRU'], 'RNN type is not supported'
        rnn_cls = {'LSTM': nn.LSTM, 'GRU': nn.GRU}[rnn_type]

        # One single-layer module per level; levels after the first consume
        # the (possibly bidirectional) output of the previous level.
        self.rnns = nn.ModuleList([
            rnn_cls(ninp if layer == 0 else nhid * self.ndir, nhid, 1,
                    dropout=droprnn, bidirectional=bidirectional)
            for layer in range(nlayers)
        ])
        self.avg_pool = nn.AdaptiveAvgPool1d(1)
        self.max_pool = nn.AdaptiveMaxPool1d(1)
        # Decoder input is avg-pooled and max-pooled features side by side.
        self.decoder = nn.Linear(nhid * self.ndir * 2, nout)

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def forward(self, input, lengths=None):
        """Run the model on a batch of token indices.

        Args:
            input: index tensor laid out (seq_len, batch); the batch size is
                read from dimension 1 of the recurrent output.
            lengths: optional tensor of sequence lengths.  When given, the
                embeddings are packed before the recurrent layers, which
                requires the batch to be sorted by decreasing length.

        Returns:
            Tensor of shape (batch, nout) with the raw decoder outputs.
        """
        raw_output = self.drop(self.encoder(input))

        use_packing = lengths is not None
        if use_packing:
            raw_output = nn.utils.rnn.pack_padded_sequence(
                raw_output, lengths.view(-1).tolist())

        for rnn in self.rnns:
            raw_output, _ = rnn(raw_output)

        if use_packing:
            raw_output, _ = nn.utils.rnn.pad_packed_sequence(raw_output)

        bsz = raw_output.size(1)
        # (seq_len, batch, features) -> (batch, features, seq_len) so the 1-D
        # pools reduce over time.  Pooling covers the full padded length, so
        # padded positions take part in both the average and the max.
        time_last = raw_output.permute(1, 2, 0)
        rnn_avg = self.avg_pool(time_last)
        rnn_max = self.max_pool(time_last)
        rnn_out = torch.cat([rnn_avg.view(bsz, -1), rnn_max.view(bsz, -1)], dim=1)

        # Decode once (the decoder used to be evaluated twice per call).
        return self.decoder(rnn_out)

In [None]:
import torch.optim as optim

# Hyperparameters for the recurrent classifier.
nhidden, emsize, nlayers = 100, 300, 1
dropemb, droprnn = 0.2, 0.0
use_cuda = torch.cuda.is_available()

# Bidirectional GRU with 6 outputs; the embedding table is overwritten with
# the vectors attached to the text vocabulary.
model = RNNModel(
    rnn_type='GRU', ntoken=ntokens, ninp=emsize, nhid=nhidden, nout=6,
    nlayers=nlayers, dropemb=dropemb, droprnn=droprnn, bidirectional=True,
)
model.encoder.weight.data.copy_(TEXT.vocab.vectors)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.7, 0.99))

# Move model and loss to the GPU when one is available.
if use_cuda:
    model, criterion = model.cuda(), criterion.cuda()

In [None]:
from tqdm import tqdm_notebook as tqdm

epochs = 2

# Train for a fixed number of epochs, tracking a sample-weighted running loss.
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_count = 0
    model.train()
    t = tqdm(train)
    for batch in t:
        # The text field yields (token_ids, lengths).
        (x, xl) = get_text(batch)
        y = get_labels(batch)

        optimizer.zero_grad()

        preds = model(x, lengths=xl)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

        # x is laid out (seq_len, batch), so the batch size is dimension 1;
        # len(x) would give the sequence length instead.
        batch_size = x.size(1)
        # loss is a 0-dim tensor: read it with .item() rather than indexing.
        running_loss += loss.item() * batch_size
        running_count += batch_size
        t.set_postfix(loss=(running_loss / running_count))

    epoch_loss = running_loss / running_count

    print('Epoch: {}, Loss: {:.5f}'.format(epoch, epoch_loss))

In [None]:
def get_ids(batch):
    """Return the batch attribute named by ``id_label`` as an int NumPy array."""
    id_tensor = getattr(batch, id_label)
    # Move to host memory before converting to NumPy.
    id_array = id_tensor.data.cpu().numpy()
    return id_array.astype(int)

In [None]:
import numpy as np
# One row of 6 scores per entry of the id vocabulary, filled batch by batch.
test_preds = np.zeros((len(INDEX.vocab), 6))
model.eval()
# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    for batch in test:
        (x, xl) = get_text(batch)
        ids = get_ids(batch)
        preds = model(x, lengths=xl)
        preds = preds.data.cpu().numpy()
        # Sigmoid over the logits; clipping to [-10, 10] keeps exp() from
        # overflowing.
        preds = 1 / (1 + np.exp(-np.clip(preds, -10, 10)))
        # Scatter this batch's rows to the positions given by their ids.
        test_preds[ids] = preds