In [None]:
# Mount Google Drive at /content/drive (Colab-only). The embedding files
# written near the end of this notebook are saved under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from __future__ import division
import itertools
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import re
import scipy.sparse
import time

from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE as tsne

In [None]:
#@title bAbI corpus reader
import sys, os
import glob
import re
from collections import namedtuple

# Struct types for the two kinds of lines in the bAbI dataset.
# StoryLine represents "ID text" lines as (int, string).
# QALine represents "ID question answer support" lines as
# (int, string, string, list(int)).
# If tokenized, string fields are replaced with list(string).
StoryLine = namedtuple("StoryLine", ["id", "text"])
QALine = namedtuple("QALine", ["id", "question", "answer", "support_ids"])

class BabiTaskCorpusReader(object):
    """Corpus reader for the bAbI tasks dataset.

    See https://research.fb.com/downloads/babi/ for details.

    This class exposes a similar interface to NLTK's corpus readers, and should
    be interchangeable with them in many applications.

    Example usage:

    import babi_utils
    import nltk
    tok = nltk.tokenize.treebank.TreebankWordTokenizer()
    cr = babi_utils.BabiTaskCorpusReader("/home/babi/en",
                                         tokenizer=tok.tokenize)
    words = list(cr.words())
    print(words[:8])
    # ['John', 'travelled', 'to', 'the', 'hallway', '.', 'Mary', 'journeyed']

    """

    ALL_FILES = [
        'qa10_indefinite-knowledge_test.txt',
        'qa10_indefinite-knowledge_train.txt',
        'qa11_basic-coreference_test.txt',
        'qa11_basic-coreference_train.txt',
        'qa12_conjunction_test.txt',
        'qa12_conjunction_train.txt',
        'qa13_compound-coreference_test.txt',
        'qa13_compound-coreference_train.txt',
        'qa14_time-reasoning_test.txt',
        'qa14_time-reasoning_train.txt',
        'qa15_basic-deduction_test.txt',
        'qa15_basic-deduction_train.txt',
        'qa16_basic-induction_test.txt',
        'qa16_basic-induction_train.txt',
        'qa17_positional-reasoning_test.txt',
        'qa17_positional-reasoning_train.txt',
        'qa18_size-reasoning_test.txt',
        'qa18_size-reasoning_train.txt',
        'qa19_path-finding_test.txt',
        'qa19_path-finding_train.txt',
        'qa1_single-supporting-fact_test.txt',
        'qa1_single-supporting-fact_train.txt',
        'qa20_agents-motivations_test.txt',
        'qa20_agents-motivations_train.txt',
        'qa2_two-supporting-facts_test.txt',
        'qa2_two-supporting-facts_train.txt',
        'qa3_three-supporting-facts_test.txt',
        'qa3_three-supporting-facts_train.txt',
        'qa4_two-arg-relations_test.txt',
        'qa4_two-arg-relations_train.txt',
        'qa5_three-arg-relations_test.txt',
        'qa5_three-arg-relations_train.txt',
        'qa6_yes-no-questions_test.txt',
        'qa6_yes-no-questions_train.txt',
        'qa7_counting_test.txt',
        'qa7_counting_train.txt',
        'qa8_lists-sets_test.txt',
        'qa8_lists-sets_train.txt',
        'qa9_simple-negation_test.txt',
        'qa9_simple-negation_train.txt'
    ]

    def __init__(self, directory, mask="qa*.txt",
                 file_list=ALL_FILES,
                 file_reader=open,
                 tokenizer=lambda s: s.split(),
                 verbose=False):
        """Construct a corpus reader for the bAbI tasks dataset.

        Args:
            directory: (string) path to bAbI text files (e.g. /home/babi/en/)
            mask: (string) file glob to match particular files. Use
                "qa16_*" e.g. to match task 16.
            file_list: (list(string) or None) If None (or empty), will glob
                directory to find files. Otherwise, will filter the given list
                of basenames with `mask` without touching the filesystem.
            file_reader: (function string -> fd) optional replacement for
                Python's built-in open(...) method, to be used for reading
                from alternative file-like objects.
            tokenizer: function string -> list(string), used to split
                sentences.
            verbose: (bool) if true, will report progress on stderr when
                reading files.

        Raises:
            AssertionError: if no file matches `mask`.
        """
        self._open = file_reader
        self._tokenizer = tokenizer
        self._verbose = verbose

        if file_list:
            basenames = glob.fnmatch.filter(file_list, mask)
            filenames = [os.path.join(directory, f) for f in basenames]
        else:
            # Glob directory
            pattern = os.path.join(directory, mask)
            filenames = glob.glob(pattern)

        # Filenames of form qaXX_task-name_train.txt
        # Want to sort by XX as a number (so qa2 precedes qa10), then by name.
        key_fn = lambda f: (int(os.path.basename(f).split("_")[0][2:]), f)
        self._filenames = sorted(filenames, key=key_fn)
        # Filenames should be nonempty!
        assert self._filenames, "No files found matching [{:s}]".format(mask)

    def filenames(self):
        """Return the list of file paths this reader iterates over, in order."""
        return self._filenames

    def parse_line(self, line):
        """Parse a single line from the bAbI corpus.

        Line is of one of the two forms:
        ID text
        ID question[tab]answer[tab]supporting fact IDs

        See https://research.fb.com/downloads/babi/

        Args:
            line: (string)

        Returns:
            StoryLine(id, text) as (int, string)
            OR QALine(id, question, answer, [ids]) as
            (int, string, string, list(int))

        Raises:
            ValueError: if the line has no "ID " prefix or a QA line does not
                have the expected tab-separated fields.
        """
        id_text, rest = line.split(" ", 1)
        line_id = int(id_text)
        if "\t" in rest:
            fields = rest.split("\t")
            if len(fields) == 2:
                # The supporting-fact field disappears when a trailing tab has
                # been stripped from the line; treat it as "no support ids".
                fields.append("")
            question, answer, s_ids_text = fields
            # Build a real list: map() is a one-shot iterator on Python 3.
            s_ids = [int(s) for s in s_ids_text.split()]
            return QALine(line_id, question.strip(), answer.strip(), s_ids)
        return StoryLine(line_id, rest.strip())

    def tokenize_parsed_line(self, line):
        """Return a copy of a parsed line with its text fields tokenized.

        Args:
            line: StoryLine or QALine whose text fields are plain strings.

        Returns:
            The same kind of tuple with text/question/answer replaced by the
            tokenizer's output; ids and support_ids are passed through.
        """
        if isinstance(line, StoryLine):
            return StoryLine(line.id, self._tokenizer(line.text))
        return QALine(line.id,
                      self._tokenizer(line.question),
                      self._tokenizer(line.answer),
                      line.support_ids)

    def _line_iterator(self):
        """Yield every non-blank line (stripped) from each file, in order."""
        for f in self._filenames:
            if self._verbose:
                # sys.stderr.write works on both Python 2 and Python 3.
                sys.stderr.write("Reading {:s} ".format(os.path.basename(f)))
            with self._open(f) as fd:
                for line in fd:
                    line = line.strip()
                    if line:  # blank lines cannot be parsed; skip them
                        yield line
            if self._verbose:
                sys.stderr.write("...done!\n")

    def examples(self, tokenize=True):
        """Iterator over complete stories (training examples).

        A story spans multiple lines, of the form:

        1 text one
        2 text two
        3 text three
        4 question[tab]answer[tab]supporting fact IDs

        A new story starts whenever the line id does not increase.

        Args:
            tokenize: (bool) If true, will tokenize text fields.

        Returns:
            iterator yielding list(StoryLine|QALine)
              if tokenize=True, then text, question, and answer will be
              list(string); otherwise they will be plain strings.
        """
        buffer = []
        for line in self._line_iterator():
            parsed = self.parse_line(line)
            if tokenize:
                parsed = self.tokenize_parsed_line(parsed)
            # If new story item, flush buffer.
            if buffer and parsed.id <= buffer[-1].id:
                yield buffer
                buffer = []
            buffer.append(parsed)
        # Flush at end, but never emit an empty story for an empty corpus.
        if buffer:
            yield buffer

    def _raw_sents_impl(self, stories=False, questions=False, answers=False):
        """Yield the selected kinds of raw (untokenized) utterances.

        Args:
            stories: (bool) include the text of story lines.
            questions: (bool) include the question of QA lines.
            answers: (bool) include the answer of QA lines.
        """
        for line in self._line_iterator():
            parsed = self.parse_line(line)
            if isinstance(parsed, StoryLine):
                # Story lines have no question/answer fields, so they must
                # never fall through to the QA branch.
                if stories:
                    yield parsed.text
            else:
                if questions:
                    yield parsed.question
                if answers:
                    yield parsed.answer

    def raw_sents(self):
        """Iterator over utterances in the corpus.

        Returns untokenized sentences (story text, questions and answers).

        Returns:
            iterator yielding string
        """
        return self._raw_sents_impl(stories=True,
                                    questions=True,
                                    answers=True)

    def sents(self):
        """Iterator over utterances in the corpus.

        Returns tokenized sentences, a la NLTK.

        Returns:
            iterator yielding list(string)
        """
        for sentence in self.raw_sents():
            yield self._tokenizer(sentence)


    def words(self):
        """Iterator over words in the corpus.

        Returns:
            iterator yielding string
        """
        for sentence in self.sents():
            for word in sentence:
                yield word

In [None]:
import os
import tarfile

# Unpack the bAbI tasks archive into /tmp. The context manager guarantees the
# archive handle is closed even if extraction fails part-way through.
# NOTE: extractall() trusts the member paths stored in the archive, so only
# use this with an archive from a trusted source.
local_tar = '/tmp/babi_tasks_1-20_v1-2.tar.gz'
with tarfile.open(local_tar, 'r:gz') as tar_ref:
    tar_ref.extractall('/tmp')

In [59]:
# Load the Drive helper and mount Google Drive.
# NOTE(review): this repeats the mount performed in the notebook's first cell;
# presumably kept so the mount can be refreshed mid-session — confirm or remove.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Download and load the Brown corpus from NLTK.
# NOTE: the variable keeps its historical name `babi_corpus` because later
# cells read it, but the object it holds is the Brown corpus reader.
import sys
# sys.stdout.write keeps the progress message on one line on Python 2 and 3.
sys.stdout.write('Loading Brown corpus... ')
import operator, functools
import nltk
nltk.download('brown')
from nltk.corpus import brown
babi_corpus = brown
print('Done')

Loading bAbI corpus... [nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
 Done


In [None]:
# Materialise every sentence of the corpus as a list of token lists; later
# cells traverse `corpus` several times, so it must be a real list.
corpus = list(babi_corpus.sents())
print('Sentences: {}'.format(len(corpus)))

# Print the first 5 sentences of the corpus.
for i, sent in enumerate(corpus[:5]):
    print(u'{} {}'.format(i, u' '.join(sent)))

Sentences: 57340
0 The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .
1 The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise and thanks of the City of Atlanta '' for the manner in which the election was conducted .
2 The September-October term jury had been charged by Fulton Superior Court Judge Durwood Pye to investigate reports of possible `` irregularities '' in the hard-fought primary which was won by Mayor-nominate Ivan Allen Jr. .
3 `` Only a relative handful of such reports was received '' , the jury said , `` considering the widespread interest in the election , the number of voters and the size of this city '' .
4 The jury said it did find that many of Georgia's registration and election laws `` are outmoded or inadequate and often ambiguous '' .


In [None]:
#@title Utilities
import re
import time
import itertools
import numpy as np

# For pretty-printing
import pandas as pd
from IPython.display import display, HTML

# Placeholder returned by canonicalize_word for words outside the given wordset.
UNK_TOKEN   = u"<unk>"

def flatten(list_of_lists):
    """Concatenate the items of every inner iterable into one flat list."""
    flat = []
    for inner in list_of_lists:
        flat.extend(inner)
    return flat

def pretty_print_matrix(M, rows=None, cols=None, dtype=float, float_fmt="{0:.04f}"):
    """Pretty-print a matrix using Pandas.

    The float format is applied only while the matrix is rendered; the
    previous Pandas setting is restored afterwards, even if rendering raises.

    Args:
      M : 2D numpy array
      rows : list of row labels
      cols : list of column labels
      dtype : data type (float or int)
      float_fmt : format specifier for floats
    """
    df = pd.DataFrame(M, index=rows, columns=cols, dtype=dtype)
    # option_context restores the old formatter on exit (including on error),
    # and the fully-qualified option name avoids ambiguous abbreviations.
    with pd.option_context('display.float_format', float_fmt.format):
        display(df)

def pretty_timedelta(fmt="%d:%02d:%02d", since=None, until=None):
    """Pretty-print a timedelta, using the given format string.

    Args:
      fmt : %-style format receiving (hours, minutes, seconds)
      since : start time in seconds (e.g. from time.time()); None means "now"
      until : end time in seconds; None means "now"

    Returns:
      (string) the formatted duration `until - since`
    """
    # Compare against None explicitly: a timestamp of 0 is a valid value and
    # must not be replaced by the current time.
    now = time.time()
    if since is None:
        since = now
    if until is None:
        until = now
    delta_s = until - since
    hours, remainder = divmod(delta_s, 3600)
    minutes, seconds = divmod(remainder, 60)
    return fmt % (hours, minutes, seconds)


##
# Word processing functions
def canonicalize_digits(word):
    """Replace every digit of a purely non-alphabetic token with "DG".

    Tokens containing any alphabetic character are returned unchanged. If the
    result starts with "DG" (i.e. the token began with a digit), commas are
    removed so that thousands separators do not create distinct tokens.

    Args:
      word : (string) token to canonicalize

    Returns:
      (string) canonicalized token
    """
    if any(c.isalpha() for c in word):
        return word
    # Raw string: "\d" in a normal string literal is an invalid escape.
    word = re.sub(r"\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

def canonicalize_word(word, wordset=None, digits=True):
    """Lowercase a word, optionally canonicalize digits, and map unknowns.

    Args:
      word : (string) token to canonicalize
      wordset : optional container of known words (anything supporting `in`);
          None disables unknown-word handling
      digits : if true, try canonicalize_digits when the lowercased word is
          not already in wordset

    Returns:
      (string) the canonical word, or UNK_TOKEN if wordset is given and the
      canonical word is not in it
    """
    word = word.lower()
    if digits:
        # Identity tests against None: `!= None` would invoke the container's
        # own comparison operators.
        if wordset is not None and word in wordset:
            return word
        word = canonicalize_digits(word) # try to canonicalize numbers
    if wordset is None or word in wordset:
        return word
    return UNK_TOKEN

def canonicalize_words(words, **kw):
    """Apply canonicalize_word to each word, forwarding keyword options.

    Returns:
      list(string) of canonicalized words, in input order
    """
    canonical = []
    for token in words:
        canonical.append(canonicalize_word(token, **kw))
    return canonical

##
# Data loading functions
def get_corpus(name="brown"):
    """Download (if needed) and return the NLTK corpus reader called `name`.

    Args:
      name : (string) NLTK corpus identifier, e.g. "brown"

    Returns:
      the corpus reader object exposed as nltk.corpus.<name>

    Raises:
      LookupError: if nltk.download reports failure
    """
    import nltk
    # Keep the download outside an assert: assert statements are removed
    # entirely under `python -O`, which would silently skip the download.
    if not nltk.download(name):
        raise LookupError("Could not download NLTK corpus {!r}".format(name))
    # getattr() is the supported way to look up an attribute by name; calling
    # the module's __getattr__ directly depends on the module defining one.
    return getattr(nltk.corpus, name)

def build_vocab(corpus, V=10000):
    """Build a size-V vocabulary from the canonicalized words of a corpus.

    Args:
      corpus : corpus reader supporting a words() method
      V : (int) vocabulary size passed to vocabulary.Vocabulary

    Returns:
      vocabulary.Vocabulary built from the canonicalized token stream

    NOTE(review): this imports an external `vocabulary` module; confirm that
    module is importable in the environment where this function is called.
    """
    import vocabulary
    canonical_tokens = (canonicalize_word(token) for token in corpus.words())
    return vocabulary.Vocabulary(canonical_tokens, size=V)

def get_train_test_sents(corpus, split=0.8, shuffle=True):
    """Generate train/test split for unsupervised tasks.

    Args:
      corpus: nltk.corpus that supports sents() function
      split (double): fraction to use as training set
      shuffle (int or bool): seed for shuffle of input data, or False to just
      take the training data as the first xx% contiguously.

    Returns:
      train_sentences, test_sentences ( array(list(string)) ): the train and
      test splits, as 1D object arrays with one sentence per element
    """
    all_sentences = list(corpus.sents())
    # Fill a 1D object array element by element. np.array(..., dtype=object)
    # would silently build a 2D array when all sentences have equal length.
    sentences = np.empty(len(all_sentences), dtype=object)
    for idx, sentence in enumerate(all_sentences):
        sentences[idx] = sentence
    fmt = (len(sentences), sum(len(s) for s in sentences))
    print("Loaded {:,} sentences ({:,} tokens)".format(*fmt))

    if shuffle:
        rng = np.random.RandomState(shuffle)
        rng.shuffle(sentences)  # in-place
    # Honour the caller's requested fraction instead of a fixed 0.8.
    split_idx = int(split * len(sentences))
    train_sentences = sentences[:split_idx]
    test_sentences = sentences[split_idx:]

    fmt = (len(train_sentences), sum(len(s) for s in train_sentences))
    print("Training set: {:,} sentences ({:,} tokens)".format(*fmt))
    fmt = (len(test_sentences), sum(len(s) for s in test_sentences))
    print("Test set: {:,} sentences ({:,} tokens)".format(*fmt))

    return train_sentences, test_sentences

def preprocess_sentences(sentences, vocab, use_eos=False, emit_ids=True):
    """Preprocess sentences by canonicalizing and mapping to ids.

    Args:
      sentences ( list(list(string)) ): input sentences
      vocab: Vocabulary object, already initialized
      use_eos: if true, will add </s> token to end of sentence.
      emit_ids: if true, will emit as ids. Otherwise, will be preprocessed
          tokens.

    Returns:
      ids ( array(int) ): flattened array of sentences, including boundary <s>
      tokens. When emit_ids is false the array holds the token strings
      (dtype=object) instead of ids.
    """
    # Add sentence boundaries, canonicalize, and handle unknowns
    word_preproc = lambda w: canonicalize_word(w, wordset=vocab.word_to_id)
    ret = []
    for s in sentences:
        # pad_sentence concatenates onto a list, so pass a real list rather
        # than a lazy map object.
        canonical_words = vocab.pad_sentence([word_preproc(w) for w in s],
                                             use_eos=use_eos)
        ret.extend(vocab.words_to_ids(canonical_words) if emit_ids else
                   canonical_words)
    if not use_eos:  # add additional <s> to end if needed
        ret.append(vocab.START_ID if emit_ids else vocab.START_TOKEN)
    return np.array(ret, dtype=(np.int32 if emit_ids else object))


def load_corpus(corpus, split=0.8, V=10000, shuffle=0):
    """Load a corpus, build its vocabulary, and return train/test id arrays.

    Convenience wrapper that chains get_corpus, build_vocab,
    get_train_test_sents and preprocess_sentences.

    Sentences are canonicalized and converted to ids according to the
    constructed vocabulary, and interspersed with <s> tokens to denote
    sentence boundaries.

    Args:
        corpus: (string | corpus reader) If a string, will fetch the
            NLTK corpus of that name.
        split: (float in (0,1]) fraction of examples in train split
        V: (int) vocabulary size (including special tokens)
        shuffle: (int) if > 0, use as random seed to shuffle sentences prior
            to split. Can change this to get different splits.

    Returns:
        (vocab, train_ids, test_ids)
        vocab: vocabulary.Vocabulary object
        train_ids: flat (1D) np.array(int) of ids
        test_ids: flat (1D) np.array(int) of ids
    """
    reader = get_corpus(corpus) if isinstance(corpus, str) else corpus
    vocab = build_vocab(reader, V)
    sentence_splits = get_train_test_sents(reader, split, shuffle)
    # Train split first, then test split.
    train_ids, test_ids = [preprocess_sentences(part, vocab)
                           for part in sentence_splits]
    return vocab, train_ids, test_ids

##
# Window and batch functions
def rnnlm_batch_generator(ids, batch_size, max_time):
    """Convert ids to data-matrix form for RNN language modeling.

    Args:
      ids: 1D np.array of token ids
      batch_size: (int) number of rows in every batch
      max_time: (int) maximum number of columns (time steps) per batch

    Yields:
      (input_w, target_y) pairs of arrays with batch_size rows and at most
      max_time columns, where target_y holds the id following each input id.
    """
    # Clip to a multiple of batch_size so the ids reshape into equal rows.
    # Floor division: `/` would produce a float (unusable as a slice index)
    # under `from __future__ import division` and on Python 3.
    clip_len = ((len(ids)-1) // batch_size) * batch_size
    input_w = ids[:clip_len]     # current word
    target_y = ids[1:clip_len+1]  # next word
    # Reshape so we can select columns
    input_w = input_w.reshape([batch_size,-1])
    target_y = target_y.reshape([batch_size,-1])

    # Yield batches
    for i in range(0, input_w.shape[1], max_time):
        yield input_w[:,i:i+max_time], target_y[:,i:i+max_time]


def build_windows(ids, N, shuffle=True):
    """Build window input to the window model.

    Takes a sequence of ids, and returns a data matrix where each row
    is a window and target for the window model. For N=3:
        windows[i] = [w_3, w_2, w_1, w_0]

    For language modeling, N is the context size and you can use y = windows[:,-1]
    as the target words and x = windows[:,:-1] as the contexts.

    For CBOW, N is the window size and you can use y = windows[:,N//2] as the target words
    and x = np.hstack([windows[:,:N//2], windows[:,N//2+1:]]) as the contexts.

    For skip-gram, you can use x = windows[:,N//2] as the input words and y = windows[:,i]
    where i != N//2 as the target words.

    Args:
      ids: np.array(int32) of input ids
      N: (int) each window spans N+1 consecutive ids
      shuffle: if true, will randomly shuffle the rows (uses numpy's global
        random state)

    Returns:
      windows: integer np.array of shape [len(ids)-N, N+1]
        i.e. each row is a window, of length N+1
    """
    windows = np.zeros((len(ids)-N, N+1), dtype=int)
    for i in range(N+1):
        # First column: first word, etc.
        windows[:,i] = ids[i:len(ids)-(N-i)]
    if shuffle:
        # Shuffle rows
        np.random.shuffle(windows)
    return windows


def batch_generator(data, batch_size):
    """Generate minibatches from data.

    Args:
      data: array-like, supporting len() and slicing along first dimension
      batch_size: int, batch size

    Yields:
      consecutive slices of data of size batch_size; the last one may be
      shorter
    """
    # range() exists on both Python 2 and 3 (xrange is Python 2 only).
    for i in range(0, len(data), batch_size):
        yield data[i:i+batch_size]

In [None]:
#@title Vocabulary helper functions
import collections
from collections import defaultdict

class Vocabulary(object):
  """Frequency-ordered mapping between words and integer ids.

  Ids 0, 1 and 2 are reserved for <s>, </s> and <unk>; remaining ids are
  assigned in order of decreasing unigram count.
  """

  START_TOKEN = u"<s>"
  END_TOKEN   = u"</s>"
  UNK_TOKEN   = u"<unk>"

  def __init__(self, tokens, size=None):
    """Create a Vocabulary object.

    Args:
        tokens: iterable( string ); may be a one-shot iterator or generator,
              it is traversed exactly once.
        size: None for unlimited, or int > 0 for a fixed-size vocab.
              Vocabulary size includes special tokens <s>, </s>, and <unk>
    """
    # Count unigrams and bigrams in a single pass. Counting unigrams first
    # with Counter(tokens) would exhaust a generator and leave nothing for
    # the bigram pass.
    self.unigram_counts = collections.Counter()
    self.bigram_counts = defaultdict(lambda: defaultdict(int))
    have_previous = False
    previous = None
    for word in tokens:
        self.unigram_counts[word] += 1
        if have_previous:  # the first token has no predecessor
            self.bigram_counts[previous][word] += 1
        previous = word
        have_previous = True
    # Unseen first words now raise KeyError instead of being auto-created.
    self.bigram_counts.default_factory = None

    # Leave space for "<s>", "</s>", and "<unk>"
    top_counts = self.unigram_counts.most_common(None if size is None else (size - 3))
    vocab = ([self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN] +
             [w for w,c in top_counts])

    # Assign an id to each word, by frequency
    self.id_to_word = dict(enumerate(vocab))
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    self.word_to_id = {v:k for k,v in self.id_to_word.items()}
    self.size = len(self.id_to_word)
    if size is not None:
        assert(self.size <= size)

    # For convenience
    self.wordset = set(self.word_to_id)

    # Store special IDs
    self.START_ID = self.word_to_id[self.START_TOKEN]
    self.END_ID = self.word_to_id[self.END_TOKEN]
    self.UNK_ID = self.word_to_id[self.UNK_TOKEN]

  def words_to_ids(self, words):
    """Map words to ids; words not in the vocabulary map to UNK_ID."""
    return [self.word_to_id.get(w, self.UNK_ID) for w in words]

  def ids_to_words(self, ids):
    """Map ids back to words. Raises KeyError for an unknown id."""
    return [self.id_to_word[i] for i in ids]

  def pad_sentence(self, words, use_eos=True):
    """Return a new list: <s>, the words, and (if use_eos) a trailing </s>."""
    # list(...) lets callers pass any iterable (tuple, generator, map object).
    ret = [self.START_TOKEN] + list(words)
    if use_eos:
      ret.append(self.END_TOKEN)
    return ret

  def sentence_to_ids(self, words, use_eos=True):
    """Pad a sentence with boundary tokens and convert it to ids."""
    return self.words_to_ids(self.pad_sentence(words, use_eos))

  def ordered_words(self):
    """Return a list of words, ordered by id."""
    return self.ids_to_words(range(self.size))

In [None]:
#@title Corpus Reader
import sys, os

class TSVCorpusReader(object):
    """Corpus reader for TSV files.

    Input files are assumed to contain one sentence per line, with tokens
    separated by tabs:

    foo[tab]bar[tab]baz
    spam[tab]eggs

    Would correspond to the two-sentence corpus:
        ["foo", "bar", "baz"],
        ["spam", "eggs"]

    Blank lines are ignored.
    """

    def __init__(self, sentence_file, preload=True, file_reader=open):
        """Construct a corpus reader for the given file.

        Args:
            sentence_file: (string) path to a TSV file with one sentence per
                line.
            preload: (bool) If true, will read entire corpus to memory on
                construction. Otherwise, will load on-demand.
            file_reader: (function string -> fd) optional replacement for
                Python's built-in open(...) method, to be used for reading
                from alternative file-like objects.
        """
        self._open = file_reader
        self._sentence_file = sentence_file
        # None means "nothing cached"; an empty list is a valid cache for an
        # empty corpus and must not trigger another read of the file.
        self._sentence_cache = None

        if preload:
            self._sentence_cache = list(self.sents())

    def _line_iterator(self):
        """Yield each non-blank line of the file with surrounding whitespace removed."""
        with self._open(self._sentence_file) as fd:
            for line in fd:
                line = line.strip()
                if line:  # a blank line would otherwise become the sentence [""]
                    yield line

    def sents(self):
        """Iterator over sentences in the corpus.

        Yields:
            list(string) of tokens
        """
        if self._sentence_cache is not None:
            for sentence in self._sentence_cache:
                yield sentence
        else:
            # If no cache, actually read the file.
            for line in self._line_iterator():
                yield line.split("\t")

    def words(self):
        """Iterator over words in the corpus.

        Yields:
            (string) tokens
        """
        for sentence in self.sents():
            for word in sentence:
                yield word

## Creating the vocabulary

Let's now get started with creating the vocabulary. We'll use some of the functions defined in the utility classes we just loaded above.

(Note: the following code cell may take 20-30 seconds to complete running.)

In [None]:
# Create a vocabulary by first canonicalizing all the words -- lowercasing
# and converting all digits to a single string. The vocabulary maintains a
# mapping between words and integer ids.
vocab = Vocabulary(canonicalize_word(w)
                   for w in flatten(corpus))
print("Vocabulary: {:,} words".format(vocab.size))

# Turn the corpus into a single flattened list of tokens, where each sentence
# begins with a special marker <s>.
tokens = preprocess_sentences(corpus, vocab, use_eos=False, emit_ids=False)
print("Corpus: {:,} tokens (counting <s>)".format(len(tokens)))

# Retrieve the ids corresponding to the tokens (above). This is the data
# we'll actually use.
token_ids = vocab.words_to_ids(tokens)
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print('Sample words: {}'.format(tokens[:10]))
print('Sample ids: {}'.format(token_ids[:5]))

Vocabulary: 48,174 words
Corpus: 1,218,533 tokens (counting <s>)
Sample words: [u'<s>' u'the' u'fulton' u'county' u'grand' u'jury' u'said' u'friday'
 u'an' u'investigation']
Sample ids: [0, 3, 5613, 655, 2288]


In [None]:
# A function that produces a sparse co-occurrence matrix given a corpus,
# a vocabulary size V, and K (the context window is +-K).

from scipy import *
from scipy.sparse import csc_matrix
from scipy.sparse import coo_matrix

def co_occurrence_matrix(token_ids, V, K=2):
    """Count symmetric co-occurrences within a window of +-K positions.

    Args:
      token_ids: sequence of integer token ids
      V: vocabulary size; the result has shape (V, V)
      K: window radius; offsets 1..K on both sides are counted

    Returns:
      sparse matrix of float32 counts, where entry (a, b) is the number of
      times id b appears within K positions of id a
    """
    # Running total of the counts over every offset.
    counts = csc_matrix((V,V), dtype=np.float32)

    offset = 1
    while offset <= K:
        print (u'Counting pairs (i, i \u00B1 %d) ...' %offset)
        current = token_ids[:-offset]  # word at position t
        ahead = token_ids[offset:]     # word at position t + offset
        # One unit of count per (current, ahead) pair; duplicates are summed
        # when the COO matrix is converted.
        forward = scipy.sparse.csc_matrix(
            coo_matrix((np.ones_like(current), (current, ahead)),
                       shape=counts.shape, dtype=np.float32))
        backward = forward.T  # the same pairs seen from the later word
        counts += forward + backward
        offset += 1

    print( "Co-occurrence matrix: %d words x %d words" %counts.shape)
    print ("  %.02g nonzero elements" %counts.nnz)
    return counts

In [None]:
# Build a toy corpus with the same shape as our corpus object.
toy_corpus = [
    "nlp class is awesome",
    "nlp is awesome fun"
]
# Use a real list of token lists: the corpus is traversed twice below (once
# for the vocabulary, once for preprocessing), which a lazy map object on
# Python 3 would not survive.
toy_corpus = [sentence.split() for sentence in toy_corpus]

# Get vocab, tokens, and token_ids as above.
toy_vocab = Vocabulary(canonicalize_word(w)
                       for w in flatten(toy_corpus))
toy_tokens = preprocess_sentences(toy_corpus, toy_vocab,
                                  use_eos=False, emit_ids=False)
toy_token_ids = toy_vocab.words_to_ids(toy_tokens)

# Build the co-occurrence matrix.
toy_C = co_occurrence_matrix(toy_token_ids, toy_vocab.size, K=1)

# Display a table with the counts. The .toarray() function converts the
# sparse matrix into a dense one.
toy_labels = toy_vocab.ordered_words()
pretty_print_matrix(toy_C.toarray(), rows=toy_labels,
                    cols=toy_labels, dtype=int)

Counting pairs (i, i ± 1) ...
Co-occurrence matrix: 8 words x 8 words
  16 nonzero elements


Unnamed: 0,<s>,</s>,<unk>,nlp,is,awesome,fun,class
<s>,0,0,0,2,0,1,1,0
</s>,0,0,0,0,0,0,0,0
<unk>,0,0,0,0,0,0,0,0
nlp,2,0,0,0,1,0,0,1
is,0,0,0,1,0,2,0,1
awesome,1,0,0,0,2,0,1,0
fun,1,0,0,0,0,1,0,0
class,0,0,0,1,1,0,0,0


In [None]:
def PPMI(C):
    """Transform a counts matrix to positive pointwise mutual information.

    For every nonzero count C_ij this computes
        PMI_ij = log( C_ij * Z / (row_sum_i * col_sum_j) )
    where Z is the sum of all counts, and keeps max(0, PMI_ij).

    Args:
      C: scipy.sparse.csc_matrix of counts C_ij

    Returns:
      (scipy.sparse.csc_matrix) PPMI(C), float64, same shape as C, with
      non-positive entries removed
    """
    # Total count.
    Z = float(C.sum())

    # Sum each row (along columns) and each column (along rows). For a
    # symmetric co-occurrence matrix the two are equal, but using the column
    # sums for the second index keeps the result correct for any C.
    Zr = np.array(C.sum(axis=1), dtype=np.float64).flatten()
    Zc = np.array(C.sum(axis=0), dtype=np.float64).flatten()

    # Get indices of relevant elements.
    ii, jj = C.nonzero()  # row, column indices
    Cij = np.array(C[ii,jj], dtype=np.float64).flatten()

    # PMI equation.
    pmi = np.log(Cij * Z / (Zr[ii] * Zc[jj]))

    # Truncate to positive only.
    ppmi = np.maximum(0, pmi)  # take positive only

    # Re-format as sparse matrix.
    ret = scipy.sparse.csc_matrix((ppmi, (ii,jj)), shape=C.shape,
                                  dtype=np.float64)
    ret.eliminate_zeros()  # remove zeros
    return ret

# Display the PPMI'd version of the co-occurrence matrix, densified with
# .toarray() and labelled with the toy vocabulary's words on both axes.
pretty_print_matrix(PPMI(toy_C).toarray(), rows=toy_labels, 
                    cols=toy_labels, dtype=float)

Unnamed: 0,<s>,</s>,<unk>,nlp,is,awesome,fun,class
<s>,0.0,0.0,0.0,0.9163,0.0,0.2231,0.9163,0.0
</s>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
<unk>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nlp,0.9163,0.0,0.0,0.0,0.2231,0.0,0.0,0.9163
is,0.0,0.0,0.0,0.2231,0.0,0.9163,0.0,0.9163
awesome,0.2231,0.0,0.0,0.0,0.9163,0.0,0.9163,0.0
fun,0.9163,0.0,0.0,0.0,0.0,0.9163,0.0,0.0
class,0.0,0.0,0.0,0.9163,0.9163,0.0,0.0,0.0


In [None]:
def SVD(X, d=2):
    """Returns word vectors from SVD.

    Reduces X to d dimensions with TruncatedSVD (random_state=0) and scales
    every row to unit length.

    Args:
      X: [m, n] matrix
      d: word vector dimension

    Returns:
      Wv : [m, d] matrix, where each row is a word vector. Rows whose
      projection is all zeros are returned as zeros.
    """
    transformer = TruncatedSVD(n_components=d, random_state=0)
    Wv = transformer.fit_transform(X)

    # Normalize all vectors to unit length. Zero-length rows are divided by 1
    # instead of 0 so they stay zero rather than turning into NaN.
    norms = np.linalg.norm(Wv, axis=1).reshape([-1,1])
    norms[norms == 0] = 1.0
    Wv = Wv / norms

    print('Computed embeddings: {}'.format(Wv.shape))
    return Wv

# Compute 3-dimensional word embeddings for the toy corpus and show them as a
# table with one row per vocabulary word and one column per dimension.
dim = 3
embeddings = SVD(PPMI(toy_C).toarray(), d=dim)
pretty_print_matrix(embeddings, rows=toy_labels, cols=range(dim), dtype=float)

Computed embeddings: (8, 3)


Unnamed: 0,0,1,2
<s>,0.6871,-0.5533,0.4709
</s>,0.0785,-0.1472,0.986
<unk>,-0.0505,0.1088,0.9928
nlp,0.6871,0.5533,-0.4709
is,0.6871,0.5533,0.4709
awesome,0.6871,-0.5533,-0.4709
fun,0.726,0.6877,0.0
class,0.726,-0.6877,-0.0


Now let's try computing word vectors on the large corpus.

In [61]:
# Compute 100-dimensional embeddings with a window of size 2.
C = PPMI(co_occurrence_matrix(token_ids, vocab.size, K=2))
embeddings = SVD(C, d=100)
# Words ordered by id, so word_list[i] labels row i of every embedding matrix.
# Derived from the vocabulary rather than a hard-coded vocabulary size.
word_list = vocab.ordered_words()

Counting pairs (i, i ± 1) ...
Counting pairs (i, i ± 2) ...
Co-occurrence matrix: 48174 words x 48174 words
  1.5e+06 nonzero elements
Computed embeddings: (48174, 100)


In [62]:
# Compute 300-dimensional embeddings with a window of size 2.
# Variables are named embeddings<K><d>: here K=2, d=300.
# NOTE(review): relies on C still holding the K=2 PPMI matrix from the
# previous cell — confirm cells are run in order.
embeddings2300 = SVD(C, d=300)
embeddings2300

Computed embeddings: (48174, 300)


array([[ 2.14057577e-01,  2.76499605e-01,  2.63717840e-01, ...,
        -8.20463560e-03,  2.32135146e-02,  6.25396034e-03],
       [ 4.65172740e-06,  6.01838270e-04, -3.23303182e-03, ...,
         4.15323461e-03, -2.75751686e-02,  1.35739778e-02],
       [-9.23847547e-07, -3.26437064e-04,  2.77146934e-04, ...,
         2.23685013e-02, -3.05506652e-02,  7.14481164e-02],
       ...,
       [ 1.26243448e-01, -1.64350582e-01,  1.15027950e-01, ...,
        -7.36444925e-03, -5.37011144e-04,  1.55631558e-03],
       [ 7.82907828e-02, -6.54041587e-02, -9.23114304e-03, ...,
         2.42974959e-02, -7.63937485e-02,  1.01232290e-02],
       [ 1.62967932e-01, -2.30945697e-01,  3.57959357e-01, ...,
        -3.63380710e-03,  4.77436752e-02,  7.66685009e-02]])

In [64]:
def writedata(filename , X, words=None):
  """Write labelled embedding rows to a text file.

  Each output line is "<word> <v1> <v2> ... <vd>", with values formatted as
  "%.18e" and separated by single spaces (the same layout np.savetxt uses by
  default, prefixed with the word).

  Args:
    filename: path of the file to create or overwrite
    X: 2D array-like of shape [m, d]; a 1D input is treated as one value
      per row
    words: sequence of at least m row labels; defaults to the notebook-level
      word_list

  Raises:
    ValueError: if there are fewer labels than rows
  """
  if words is None:
    words = word_list  # notebook-level list of vocabulary words, by id
  X = np.asarray(X)
  if X.ndim == 1:
    X = X.reshape(-1, 1)
  if len(words) < X.shape[0]:
    raise ValueError("Need %d row labels but only %d were given"
                     % (X.shape[0], len(words)))
  # Format a whole row per % operation; write the file in a single pass
  # instead of saving, re-reading and rewriting it.
  row_fmt = " ".join(["%.18e"] * X.shape[1])
  with open(filename, 'w') as out:
    for word, row in zip(words, X):
      out.write(word + " " + row_fmt % tuple(row) + "\n")


In [65]:
# Save the K=2, d=300 embeddings to Drive.
# NOTE(review): this file name carries an "f_" prefix that the other
# SVD_<K>_<d>.txt outputs in this notebook do not — confirm it is intended.
writedata("/content/drive/My Drive/Colab Notebooks/data/f_SVD_2_300.txt", embeddings2300)

In [66]:
# 1000-dimensional embeddings, saved as SVD_2_1000.txt.
# NOTE(review): assumes C is still the K=2 PPMI matrix at this point — confirm.
embeddings21000 = SVD(C, d=1000)
writedata("/content/drive/My Drive/Colab Notebooks/data/SVD_2_1000.txt", embeddings21000)

Computed embeddings: (48174, 1000)


In [73]:
# Compute 100-dimensional embeddings with a window of size 10.
# This rebinds C to the K=10 PPMI matrix; every later SVD(C, ...) call uses
# whichever matrix was assigned to C most recently.
C = PPMI(co_occurrence_matrix(token_ids, vocab.size, K=10))
embeddings10100 = SVD(C, d=100)

Counting pairs (i, i ± 1) ...
Counting pairs (i, i ± 2) ...
Counting pairs (i, i ± 3) ...
Counting pairs (i, i ± 4) ...
Counting pairs (i, i ± 5) ...
Counting pairs (i, i ± 6) ...
Counting pairs (i, i ± 7) ...
Counting pairs (i, i ± 8) ...
Counting pairs (i, i ± 9) ...
Counting pairs (i, i ± 10) ...
Co-occurrence matrix: 48174 words x 48174 words
  5.9e+06 nonzero elements
Computed embeddings: (48174, 100)


In [68]:
# Save the K=10, d=100 embeddings to Drive.
# NOTE(review): this cell's execution counter (68) is lower than that of the
# cell above which defines embeddings10100 (73), so the saved file may come
# from an earlier run — re-run in order to confirm.
writedata("/content/drive/My Drive/Colab Notebooks/data/SVD_10_100.txt", embeddings10100)

In [74]:
# 300-dimensional embeddings, saved as SVD_10_300.txt.
# NOTE(review): assumes C is the K=10 PPMI matrix at this point — confirm.
embeddings10300 = SVD(C, d=300)
writedata("/content/drive/My Drive/Colab Notebooks/data/SVD_10_300.txt", embeddings10300)

Computed embeddings: (48174, 300)


In [70]:
# 1000-dimensional embeddings, saved as SVD_10_1000.txt.
# NOTE(review): this cell's execution counter (70) is lower than that of the
# K=10 cell (73), so which matrix C held when this ran cannot be told from the
# notebook — re-run in order to confirm the file matches its name.
embeddings101000 = SVD(C, d=1000)
writedata("/content/drive/My Drive/Colab Notebooks/data/SVD_10_1000.txt", embeddings101000)

Computed embeddings: (48174, 1000)


In [76]:
# Compute 100-dimensional embeddings with a window of size 5.
# This rebinds C to the K=5 PPMI matrix used by the cells that follow.
C = PPMI(co_occurrence_matrix(token_ids, vocab.size, K=5))
embeddings5100 = SVD(C, d=100)
# Save as SVD_5_100.txt.
writedata("/content/drive/My Drive/Colab Notebooks/data/SVD_5_100.txt", embeddings5100)

Counting pairs (i, i ± 1) ...
Counting pairs (i, i ± 2) ...
Counting pairs (i, i ± 3) ...
Counting pairs (i, i ± 4) ...
Counting pairs (i, i ± 5) ...
Co-occurrence matrix: 48174 words x 48174 words
  3.4e+06 nonzero elements
Computed embeddings: (48174, 100)


In [77]:
# 300-dimensional embeddings, saved as SVD_5_300.txt.
# NOTE(review): assumes C is the K=5 PPMI matrix at this point — confirm.
embeddings5300 = SVD(C, d=300)
writedata("/content/drive/My Drive/Colab Notebooks/data/SVD_5_300.txt", embeddings5300)

Computed embeddings: (48174, 300)


In [78]:
# 1000-dimensional embeddings, saved as SVD_5_1000.txt.
# NOTE(review): assumes C is the K=5 PPMI matrix at this point — confirm.
embeddings51000 = SVD(C, d=1000)
writedata("/content/drive/My Drive/Colab Notebooks/data/SVD_5_1000.txt", embeddings51000)

Computed embeddings: (48174, 1000)
