In [1]:
import os
import sys
from os.path import join

import nltk
import nltk.corpus
import numpy as np
import pandas as pd
import torch
from scipy.stats import pearsonr,spearmanr
from torchtext import data, datasets

from train_model import sent_util

In [8]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [9]:
def get_sst_trees():
    """Load the binary SST splits and build the fields, vocabularies and iterators.

    Neutral examples are filtered out of every split, and ``train_subtrees=True``
    is passed to the SST loader.

    Returns:
        tuple: ``(inputs, answers, train_iter, dev_iter, test_iter)`` -- the text
        field, the label field, and one ``BucketIterator`` per split. Each
        iterator uses ``batch_size=1`` and the module-level ``device``.
    """
    # `lower` is a boolean flag. The previous value, the string 'preserve-case',
    # was merely truthy, so text was lowercased despite what the literal
    # suggested. True states the actual behaviour explicitly.
    inputs = data.Field(lower=True)
    answers = data.Field(sequential=False, unk_token=None)

    train_s, dev_s, test_s = datasets.SST.splits(
        inputs, answers,
        fine_grained=False,
        train_subtrees=True,
        filter_pred=lambda ex: ex.label != 'neutral')

    # NOTE(review): the text vocabulary is built from all three splits, while
    # the label vocabulary only uses the training split -- confirm the former
    # is intended (it exposes dev/test tokens to the vocabulary).
    inputs.build_vocab(train_s, dev_s, test_s)
    answers.build_vocab(train_s)

    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train_s, dev_s, test_s), batch_size=1, device=device)

    return inputs, answers, train_iter, dev_iter, test_iter


In [10]:
# Build the fields and per-split iterators once and keep them as notebook globals.
inputs, answers, train_iter, dev_iter, test_iter = get_sst_trees()

In [19]:
# Ask the project helper for batch numbers 0..9 of the training iterator.
# NOTE(review): the helper's exact selection semantics live in sent_util -- confirm there.
batches = sent_util.get_batches_iterator(list(range(10)), train_iter)

getting batches...


In [24]:
# Decode every fetched batch back into readable tokens plus its label string.
for index in batches:
    current_batch = batches[index]
    # Take column 0 of the text tensor, then map each id through the text vocabulary.
    text = current_batch.text.data[:,0]
    decoded_tokens = [inputs.vocab.itos[token_id] for token_id in text]
    decoded_label = answers.vocab.itos[current_batch.label.data]
    print(decoded_tokens, decoded_label)

['entertainment'] positive
['precious'] positive
['a', 'stunning', 'fusion', 'of', 'music', 'and', 'images'] positive
['the', 'strength', 'of', 'the', 'film', 'lies', 'in', 'its', 'two', 'central', 'performances', 'by', 'sven', 'wollter', 'as', 'the', 'stricken', 'composer', 'and', 'viveka', 'seldahl', 'as', 'his', 'desperate', 'violinist', 'wife', '.'] positive
['the', 'ending', 'is', 'all', 'too', 'predictable', 'and', 'far', 'too', 'cliched', 'to', 'really', 'work'] negative
['the', 'action', 'is', 'stilted', 'and'] negative
['of', 'the', 'best', 'war', 'movies', 'ever', 'made'] positive
['on', 'its', 'way'] positive
['with', 'the', 'same', 'number', 'of', 'continuity', 'errors'] negative


In [25]:
# Directory holding the SST bracketed tree files (e.g. train.txt).
# Set the SST_TREES_DIR environment variable to point at your own copy; the
# default is the original author's local path and will not exist elsewhere.
SST_TREES_DIR = os.environ.get(
    "SST_TREES_DIR",
    "/Users/silanhe/Documents/McGill/Grad/WINTER2020/NLU/sst/trees")
# The dot before "txt" is escaped so that only names ending in ".txt" match.
sst_reader = nltk.corpus.BracketParseCorpusReader(SST_TREES_DIR, r".*\.txt")

In [26]:
# Tokenised sentences from train.txt (each sentence is a sequence of word strings).
sst_sentences = sst_reader.sents("train.txt")

In [27]:
# Parse trees from the same train.txt file; nodes carry numeric labels
# (presumably the SST sentiment classes -- confirm against the dataset docs).
sst = sst_reader.parsed_sents("train.txt")

In [28]:
# Number of parsed sentences read from train.txt.
len(sst)

8544

In [38]:
# Peek at the first training sentence only (the loop exits after one pass):
# print its lowercased alphanumeric tokens, then the matching parse tree.
for index, tree in enumerate(sst):
    # Tokens that are not purely alphanumeric (punctuation, "'s", ...) are dropped.
    kept_tokens = (token for token in sst_sentences[index] if token.isalnum())
    # Only the first piece returned by word_tokenize is kept for each token.
    words = [nltk.word_tokenize(token.lower())[0] for token in kept_tokens]
    sentence_text = ' '.join(words)
    print(sentence_text)
    print(tree)
    break

the rock is destined to be the 21st century new conan and that he going to make a splash even greater than arnold schwarzenegger van damme or steven segal
(3
  (2 (2 The) (2 Rock))
  (4
    (3
      (2 is)
      (4
        (2 destined)
        (2
          (2
            (2
              (2
                (2 to)
                (2
                  (2 be)
                  (2
                    (2 the)
                    (2
                      (2 21st)
                      (2
                        (2 (2 Century) (2 's))
                        (2 (3 new) (2 (2 ``) (2 Conan))))))))
              (2 ''))
            (2 and))
          (3
            (2 that)
            (3
              (2 he)
              (3
                (2 's)
                (3
                  (2 going)
                  (3
                    (2 to)
                    (4
                      (3
                        (2 make)
                        (3
                          (3 (2 a) (3 splash))
 