# Time Vector
In this notebook, focus only on the time vector. The hypothesis is: The vector difference bert("I eat lunch") - bert("I ate lunch") resides on some concentrated directions, and is disentangled from other factors.

In [11]:
import os, time, sys
import pandas as pd
import numpy as np
import spacy
import pickle
import transformers
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

from utils import timed_func

tags_to_tense = {
    "VB": "base", 
    "VBD": "past",
    "VBG": "present_participle",
    "VBN": "past_participle",
    "VBP": "non_3rd_singular_present",
    "VBZ": "3rd_singular_present"
}
verb_tags = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]

In [2]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
bert_model = BertModel.from_pretrained('bert-base-cased', output_hidden_states=False)
nlp = spacy.load("en_core_web_md")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




In [3]:
@timed_func
def read_sst_sentences(verbose=True):
    """
    Output: list of spacy-processed Docs
    """
    L = []
    wordcounts = []
    with open("../data/stanfordSentimentTreebank/datasetSentences.txt", "r") as f:
        for line in f.readlines():
            raw_text = line.split("\t")[1]
            for sent in sent_tokenize(raw_text):
                doc = nlp(sent)
                L.append(doc)
                wordcounts.append(len(doc))
    if verbose:
        print ("SST contains {} sentences, avg words {:.2f}, stdvar {:.2f}.".format(
            len(wordcounts), np.mean(wordcounts), np.std(wordcounts)
        ))
    return L

@timed_func
def read_sst_sentences_plaintext(verbose=True):
    """
    Output: list of string
    """
    L = []
    wordcounts = []
    with open("../data/stanfordSentimentTreebank/datasetSentences.txt", "r") as f:
        for line in f.readlines():
            raw_text = line.split("\t")[1]
            for sent in sent_tokenize(raw_text):
                L.append(sent)
                wordcounts.append(len(sent.split()))
    if verbose:
        print ("SST contains {} sentences, avg words {:.2f}, stdvar {:.2f}.".format(
            len(wordcounts), np.mean(wordcounts), np.std(wordcounts)
        ))
    return L

#sst_sentences = read_sst_sentences()
sst_plaintext = read_sst_sentences_plaintext()

SST contains 11978 sentences, avg words 18.97, stdvar 9.24.
read_sst_sentences_plaintext done in 0.44 seconds.


In [4]:
@timed_func
def count_tags_freq(sentences, verb_only=True):
    tags_freq = {}
    for sent in sentences:
        for token in sent:
            t = token.tag_
            if t not in tags_freq:
                tags_freq[t] = 1
            else:
                tags_freq[t] += 1
                
    if verb_only:
        return [(k,tags_freq[k]) for k in tags]
    else:
        return [(k,tags_freq[k]) for k in tags_freq]

print(count_tags_freq(sst_sentences))

NameError: name 'sst_sentences' is not defined

In [None]:
@timed_func
def histogram_sentence_tenses(sentences):
    # Count how many tenses each sentence has.
    num_tenses_histogram = {}
    for sent in sentences:
        indicator = [0] * len(verb_tags)
        for token in sent:
            if token.tag_ in verb_tags:
                indicator[tags.index(token.tag_)] = 1
        num = np.array(indicator).sum()
        if num not in num_tenses_histogram:
            num_tenses_histogram[num] = 1
            print(f"Example of {num} tenses:", sent, [token.tag_ for token in sent])
        else:
            num_tenses_histogram[num] += 1
    print(num_tenses_histogram)
    
histogram_sentence_tenses(sst_sentences)

### Some observations
1. There are many annotation errors in determining verb tenses. Need manual checks. (but the dataset size is only several thousands. Can be done within hours)  
2. Let me systematically determine the tense of a sentence with the tense of **the outmost verb** in constituency parsing.  

In [None]:
@timed_func
def filter_past_tense_sentences_v1(sentences):
    # Intuition 1: if any "past tense" (VBD) verb form exists. This does not work well.
    results = []
    for sent in sentences:
        tags = [token.tag_ for token in sent]
        
        if "VBD" in tags:
            results.append(str(sent)+"\n")
        
        
    with open(os.path.join("20200701_outputs", "past_tense.txt"), "w+") as f:
        f.writelines(results)

filter_past_tense_sentences(sst_sentences)

In [5]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.structured_prediction
allennlp_predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/elmo-constituency-parser-2020.02.10.tar.gz")

HBox(children=(FloatProgress(value=0.0, max=710808161.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=336.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=374434792.0), HTML(value='')))




In [6]:
def find_main_verb_and_tense(sentence):
    res = allennlp_predictor.predict(sentence=sentence)
    tr = nltk.tree.Tree.fromstring(res['trees'])
    treepositions = sorted(tr.treepositions(), key=lambda tup: len(tup))  # Level-order traversal. 
    for pos in treepositions:
        node = tr[pos]
        depth = len(pos)
        if hasattr(node, "label") and node.label() in verb_tags:  # The first occurrence
            return node.label(), node[0], tags_to_tense[node.label()]
    return None, None, None
                
find_main_verb_and_tense("If I brought you 10 dollars, can you buy me lunch?")

Your label namespace was 'pos'. We recommend you use a namespace ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by default to your vocabulary.  See documentation for `non_padded_namespaces` parameter in Vocabulary.


('VB', 'buy', 'base')

In [7]:
find_main_verb_and_tense("The gorgeously elaborate continuation of `` The Lord of the Rings '' trilogy is so huge that a column of words can not adequately describe co-writer\/director Peter Jackson 's expanded vision of J.R.R.")

('VBZ', 'is', '3rd_singular_present')

In [15]:
@timed_func
def compute_tenses(plaintexts):
    tense_results = {}
    for text in tqdm(plaintexts):
        # Use the tense of the main verb 
        # (where the main verb is the highest verb in constituency parsing tree)
        verbtag, verb, tense = find_main_verb_and_tense(text)
        if verbtag is not None:
            if verbtag not in tense_results:
                tense_results[verbtag] = [(text, verb)]
            else:
                tense_results[verbtag].append(tuple([text, verb]))
    with open("20200701_outputs/tense_results.pkl", "wb+") as f:
        pickle.dump(tense_results, f)
        
compute_tenses(sst_plaintext)

100%|██████████| 11978/11978 [1:59:14<00:00,  1.67it/s] 

compute_tenses done in 7155.02 seconds.





In [23]:
with open("20200701_outputs/tense_results.pkl", "rb") as f:
    tense_results = pickle.load(f)
for verbtag in tense_results:
    print("\n{}: {}, {} sentences".format(verbtag, tags_to_tense[verbtag], len(tense_results[verbtag])))
    print(tense_results[verbtag][0])
    print(tense_results[verbtag][1])
    print(tense_results[verbtag][2])
    print(tense_results[verbtag][3])


VBZ: 3rd_singular_present, 6565 sentences
("The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .", 'is')
("The gorgeously elaborate continuation of `` The Lord of the Rings '' trilogy is so huge that a column of words can not adequately describe co-writer\\/director Peter Jackson 's expanded vision of J.R.R.", 'is')
('If you sometimes like to go to the movies to have fun , Wasabi is a good place to start .', 'is')
('The film provides some great insight into the neurotic mindset of all comics -- even those who have reached the absolute top of the game .', 'provides')

VB: base, 1372 sentences
("Emerges as something rare , an issue movie that 's so honest and keenly observed that it does n't feel like one .", 'Emerges')
("The movie 's ripe , enrapturing beauty will tempt those willing to probe its inscrutable mysteries .", 'tempt')
('Fuller would surely have c