In [0]:
import torch as tt
import pandas as pd
from nltk.corpus import stopwords
import numpy as np
from torchtext.data import Field, LabelField, BucketIterator, TabularDataset, Iterator
import nltk
import torch.nn.functional as F
#from tqdm.autonotebook import tqdm
from tqdm import tqdm
from sklearn import metrics

In [0]:
np.random.seed(42)

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
data = pd.read_csv('imdb_master.csv', encoding='latin-1')

In [0]:
data = data.loc[data["label"] != "unsup"]

In [0]:
data.to_csv('imdb_master.csv', encoding='utf-8')

In [95]:
data

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,type,review,label,file
0,0,0,0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,1,1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,2,2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,3,3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,4,4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt
...,...,...,...,...,...,...,...,...
49995,49995,49995,49995,49995,train,"Seeing as the vote average was pretty low, and...",pos,9998_9.txt
49996,49996,49996,49996,49996,train,"The plot had some wretched, unbelievable twist...",pos,9999_8.txt
49997,49997,49997,49997,49997,train,I am amazed at how this movie(and most others ...,pos,999_10.txt
49998,49998,49998,49998,49998,train,A Christmas Together actually came before my t...,pos,99_8.txt


In [0]:
data[data['type'] == 'train'].to_csv('train.csv', encoding='utf-8')

In [0]:
data_train = pd.read_csv('train.csv')

In [102]:
data_train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,type,review,label,file
0,25000,25000,25000,25000,25000,train,Story of a man who has unnatural feelings for ...,neg,0_3.txt
1,25001,25001,25001,25001,25001,train,Airport '77 starts as a brand new luxury 747 p...,neg,10000_4.txt
2,25002,25002,25002,25002,25002,train,This film lacked something I couldn't put my f...,neg,10001_4.txt
3,25003,25003,25003,25003,25003,train,"Sorry everyone,,, I know this is supposed to b...",neg,10002_1.txt
4,25004,25004,25004,25004,25004,train,When I was little my parents took me along to ...,neg,10003_1.txt


In [0]:
data[data['type'] == 'test'].to_csv('test.csv', encoding='utf-8')

In [0]:
data_test = pd.read_csv('test.csv')

In [105]:
data_test.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,type,review,label,file
0,0,0,0,0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,1,1,1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,2,2,2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,3,3,3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,4,4,4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [0]:
import spacy


spacy_en = spacy.load('en')

def tokenizer(text): 
    return [tok.text for tok in spacy_en.tokenizer(text) if tok.text.isalpha()]

In [0]:
classes={
    'neg':0,
    'pos':1
}
#classes={
#    'neg':0,
#    'pos':1,
#    'unsup':2
#}

TEXT = Field(include_lengths=True, batch_first=True, 
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english'))

LABEL = LabelField(dtype=tt.int64, use_vocab=True, preprocessing=lambda x: classes[x])

dataset = TabularDataset('imdb_master.csv', format='csv',
                         fields=[(None, None), (None, None), (None, None), (None, None), (None, None), (None, None), ('review', TEXT), ('label', LABEL), (None, None)], 
                         skip_header=True)

train = TabularDataset('train.csv', format='csv',
                         fields=[(None, None), (None, None), (None, None), (None, None), (None, None), (None, None), ('review', TEXT), ('label', LABEL), (None, None)], 
                         skip_header=True)

test = TabularDataset('test.csv', format='csv',
                         fields=[(None, None), (None, None), (None, None), (None, None), (None, None), (None, None), ('review', TEXT), ('label', LABEL), (None, None)], 
                         skip_header=True)

In [0]:
TEXT.build_vocab(dataset, min_freq=10, vectors="glove.6B.100d")

In [109]:
len(TEXT.vocab.itos)

27263

In [0]:
LABEL.build_vocab(dataset)

In [0]:
train, valid = train.split(0.7, stratified=True)

In [112]:
np.unique([x.label for x in train.examples], return_counts=True)

(array([0, 1]), array([8750, 8750]))

In [113]:
np.unique([x.label for x in valid.examples], return_counts=True)

(array([0, 1]), array([3750, 3750]))

In [0]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, kernels):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        
        self.convs = nn.ModuleList([nn.Conv1d(embed_size, hidden_size, k, padding=5) for k in kernels])
        self.relu = tt.nn.ReLU(inplace=True)
        self.drop_out = nn.Dropout(0.1)
        self.fc = nn.Linear(hidden_size * len(kernels), 2)
        
    def forward(self, x):
        
        x = self.embedding(x)
        x = self.drop_out(x)
        x = x.transpose(1,2)
        
        concatenated = []
        for conv in self.convs:
            z = conv(x)
            z = F.avg_pool1d(z, kernel_size=z.size(2))
            z = z.squeeze(2)
            concatenated.append(z)
            
        x = tt.cat(concatenated, 1)
        x = self.relu(x) 
        x = self.fc(x)
        return x

In [0]:
device = tt.device('cuda') if tt.cuda.is_available() else tt.device('cpu')

In [0]:
model = MyModel(len(TEXT.vocab.itos), embed_size=100, hidden_size=128, kernels=[2,3,4,5])

In [117]:
model

MyModel(
  (embedding): Embedding(27263, 100)
  (convs): ModuleList(
    (0): Conv1d(100, 128, kernel_size=(2,), stride=(1,), padding=(5,))
    (1): Conv1d(100, 128, kernel_size=(3,), stride=(1,), padding=(5,))
    (2): Conv1d(100, 128, kernel_size=(4,), stride=(1,), padding=(5,))
    (3): Conv1d(100, 128, kernel_size=(5,), stride=(1,), padding=(5,))
  )
  (relu): ReLU(inplace=True)
  (drop_out): Dropout(p=0.1, inplace=False)
  (fc): Linear(in_features=512, out_features=2, bias=True)
)

In [0]:
criterion = tt.nn.CrossEntropyLoss()
optimizer = tt.optim.Adam(params=model.parameters())

model = model.to(device)
criterion = criterion.to(device)

In [0]:
tt.cuda.empty_cache()

batch_size = 32
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.review))

In [122]:
epochs = 10
losses = []      
best_val_loss = 10.

acc_score = []

for n_epoch in range(epochs):
  train_losses = []
  val_losses = []
  val_targets = []
  val_pred_class = []

  progress_bar = tqdm(total=len(train_iterator.dataset), desc='Epoch {}'.format(n_epoch + 1))
              
  model.train()

  for batch in train_iterator:
    x = batch.review[0].to(device)
    y = batch.label.to(device)

    optimizer.zero_grad()

    pred = model(x)
    loss = criterion(pred, y)
    loss.backward() 
                  
    optimizer.step()
      
    train_losses.append(loss.item())
    losses.append(loss.item())
  
    progress_bar.set_postfix(train_loss = np.mean(losses[-500:]))
    progress_bar.update(x.shape[0])
          
  progress_bar.close()
              
  model.eval()

  for batch in valid_iterator:
    x = batch.review[0].to(device)
    y = batch.label
                  
    with tt.no_grad():
      val_pred = model(x)
      val_pred.cpu()
      val_targets.append(y.numpy())
      val_pred_class.append(np.argmax(val_pred, axis=1))
      val_loss = criterion(val_pred, y)
      val_losses.append(val_loss.item())
              
  mean_val_loss = np.mean(val_losses)            
  val_targets = np.concatenate(val_targets).squeeze()
  val_pred_class = np.concatenate(val_pred_class).squeeze()

  acc = metrics.accuracy_score(val_targets, val_pred_class)
  acc_score.append(acc)
              
  print('Losses: train - {:.3f}, valid - {:.3f}'.format(np.mean(train_losses), mean_val_loss))

  print('Accuracy score - {:.3f}'.format(acc))
        
  # Early stopping:
  if mean_val_loss < best_val_loss:
    best_val_loss = mean_val_loss
  else:
    print('Early stopping')
    break

Epoch 1: 100%|██████████| 17500/17500 [04:22<00:00, 57.18it/s, train_loss=0.639]
Epoch 2:   0%|          | 0/17500 [00:00<?, ?it/s]

Losses: train - 0.644, valid - 0.495
Accuracy score - 0.816


Epoch 2: 100%|██████████| 17500/17500 [04:23<00:00, 59.70it/s, train_loss=0.391]
Epoch 3:   0%|          | 0/17500 [00:00<?, ?it/s]

Losses: train - 0.397, valid - 0.456
Accuracy score - 0.857


Epoch 3: 100%|██████████| 17500/17500 [04:24<00:00, 63.56it/s, train_loss=0.299]


Losses: train - 0.297, valid - 0.475
Accuracy score - 0.870
Early stopping


In [0]:
def test_model(model, test_iterator):
  model.eval()
  
  test_targets = []
  test_pred_class = []
  
  for batch in test_iterator:
      x = batch.review[0].to(device)
      y = batch.label

      with tt.no_grad():
          test_pred = model(x)

          test_pred= test_pred.cpu()

          test_targets.append(y.numpy())
          test_pred_class.append(np.argmax(test_pred, axis=1))

  test_targets = np.concatenate(test_targets).squeeze()
  test_pred_class = np.concatenate(test_pred_class).squeeze()

  acc = metrics.accuracy_score(test_targets, test_pred_class)
  return acc

In [125]:
test_model(model, test_iterator)

0.86544

In [0]:
for instance in list(tqdm._instances): 
    tqdm._decr_instances(instance)