In [9]:
import numpy as np
import pickle
from collections import Counter
# shared global variables to be imported from model also
UNK = "$UNK$"
NUM = "$NUM$"


class Dataset(object):
    def __init__(self, filename, emb_file):
        self.filename = filename
        self.emb_file = emb_file


    def generate_vocab(self):
        text_vocab = self.get_vocabs()
        emb_vocab = self.get_w2v_vocab()
        self.vocab = text_vocab & emb_vocab
        # self.vocab.add(UNK)
        return self.vocab



    def get_vocabs(self):
        """
        This function just returns the vocabulary in the text
        :return:
        """
        print "Building vocab from text ..."
        with open(self.filename) as f:
            data = pickle.load(f)
            data = data[0]
            data = [x.split() for x in data]
            vocab_words= set()
            for line in data:
                vocab_words.update(line)
            print("- done. {} tokens".format(len(vocab_words)))
        return vocab_words

    def get_w2v_vocab(self):
        """
        This function return the vocabulary we have in the
        word embedding file
        :return:
        """
        print "Building vocab from embeddings ..."
        vocab = set()
        with open(self.emb_file) as f:
            for line in f:
                word = line.strip().split(' ')[0]
                vocab.add(word)
        print("- done. {} tokens".format(len(vocab)))
        return vocab

def write_vocab(vocab, filename):
    """
    Writes a vocab to a file
    :param vocab:
    :param filename:
    :return: write a word per line
    """

    print("Writing vocab...")
    with open(filename, "w") as f:
        for i, word in enumerate(vocab):
            if i != len(vocab) - 1:
                f.write("{}\n".format(word))
            else:
                f.write(word)
    print("- done. {} tokens".format(len(vocab)))

def load_vocab(filename):
    """
    Loads vocab from a file
    :param filename:
    :return: d: dict[word] = index
    """
    try:
        d = dict()
        with open(filename) as f:
            for idx, word in enumerate(f):
                word = word.strip()
                d[word] = idx

    except IOError:
        print "Generate the vocabulary and embeddings first"
    return d


def export_trimmed_w2v_vectors(vocab, glove_filename, trimmed_filename, dim=300):
    """Saves glove vectors in numpy array

    Args:
        vocab: dictionary vocab[word] = index
        glove_filename: a path to a glove file
        trimmed_filename: a path where to store a matrix in npy
        dim: (int) dimension of embeddings

    """
    embeddings = np.zeros([len(vocab), dim])
    with open(glove_filename) as f:
        for line in f:
            line = line.strip().split(' ')
            word = line[0]
            embedding = [float(x) for x in line[1:]]
            if word in vocab:
                word_idx = vocab[word]
                embeddings[word_idx] = np.asarray(embedding)

    np.savez_compressed(trimmed_filename, embeddings=embeddings)


def get_trimmed_w2v_vectors(filename):
    """
    Load embeddings vectors
    :param filename:
    :return:
    """
    try:
        with np.load(filename) as data:
            return data["embeddings"]

    except IOError:
        print "File: %s, NOT FOUND."%(filename)

In [4]:
# dataset = Dataset("sentences_train_10000.pkl", "../glove.840B.300d.txt")
# dataset.generate_vocab()
# write_vocab(dataset.vocab, "vocab_quora_train.txt")
# vocab = load_vocab("vocab_quora_train.txt")
# export_trimmed_w2v_vectors(vocab, "../glove.840B.300d.txt", "trimmed_embeddings_train_quora.npz")
# glove_array = get_trimmed_w2v_vectors("trimmed_embeddings_train_quora.npz")
# glove_array.shape

Building vocab from text ...
- done. 17443 tokens
Building vocab from embeddings ...
- done. 2196016 tokens
Writing vocab...
- done. 16836 tokens


(16836, 300)

In [1]:
import pickle
a = pickle.load(open("sentences_train_10000.pkl"))

In [16]:
import numpy as np
data = np.load("trimmed_embeddings_train_quora.npz")
data["embeddings"][vocab["yellow"]]

array([ -2.49890000e-01,   2.49400000e-01,  -4.85640000e-02,
        -2.43340000e-01,   3.33380000e-01,  -8.33040000e-02,
        -4.11020000e-01,  -1.42940000e-01,  -6.68310000e-01,
         6.69200000e-01,  -2.36680000e-01,  -2.91170000e-01,
        -1.72890000e-01,  -2.87750000e-01,   2.73950000e-02,
         1.00810000e-01,   3.77060000e-01,   1.96930000e+00,
         1.28460000e-01,  -9.41180000e-01,   3.93660000e-01,
        -5.04500000e-01,  -3.09290000e-01,   2.56340000e-01,
         7.97100000e-02,  -4.47000000e-01,  -3.25850000e-01,
        -3.42420000e-01,   3.11050000e-01,  -9.23360000e-02,
        -2.46230000e-01,   4.64460000e-02,   2.24810000e-01,
        -2.90690000e-01,  -3.74410000e-01,  -1.30850000e-01,
         9.29780000e-02,  -1.13270000e-01,   2.56250000e-01,
        -9.98050000e-02,   5.37940000e-01,   3.41820000e-01,
        -5.11250000e-02,   1.21520000e-02,   9.17360000e-01,
        -4.14920000e-01,  -2.32530000e-01,   2.25690000e-01,
         4.35490000e-01,

In [10]:
vocab = load_vocab("vocab_quora_train.txt")

In [14]:
vocab

{'mohini': 0,
 'Duncan': 13910,
 'unscientific': 1,
 'writings': 2,
 'Szczecin': 3,
 'childern': 4,
 'yellow': 5,
 'Sugar': 6,
 'four': 7,
 'prefix': 8,
 'Does': 9,
 'Olympics': 10,
 'spiders': 11,
 'ornate': 66,
 'centimeter': 14,
 'Until': 15,
 'aggression': 16,
 'Foundation': 101,
 'granting': 18,
 'eligible': 19,
 'electricity': 20,
 'adviced': 21,
 'unanswered': 22,
 'wheeled': 23,
 'Ronald': 24,
 'SPI': 25,
 'outwit': 26,
 'Belgians': 27,
 'Western': 28,
 'Climbing': 30,
 'Avada': 31,
 'Euro': 32,
 'sinking': 33,
 'digit': 34,
 'Airprint': 36,
 'oceans': 37,
 'Www': 11035,
 'experimentally': 40,
 'cobia': 41,
 'bringing': 42,
 'lumia': 43,
 'Comlete': 44,
 'differentiated': 45,
 'basics': 46,
 'internally': 47,
 'scholar': 48,
 'JNU': 49,
 'Less': 50,
 'wooden': 51,
 'CGPA': 52,
 'unblurred': 53,
 'HDFC': 54,
 'Nasdaq': 55,
 'differentiates': 56,
 'deferments': 58,
 'Paul': 59,
 'employee': 16355,
 'straight': 2705,
 'Sandy': 61,
 'Omegle': 62,
 'Rachel': 11042,
 'specially': 64,

In [11]:
len(vocab)

16836

In [13]:
data["embeddings"][]

{'mohini': 0,
 'Duncan': 13910,
 'unscientific': 1,
 'writings': 2,
 'Szczecin': 3,
 'childern': 4,
 'yellow': 5,
 'Sugar': 6,
 'four': 7,
 'prefix': 8,
 'Does': 9,
 'Olympics': 10,
 'spiders': 11,
 'ornate': 66,
 'centimeter': 14,
 'Until': 15,
 'aggression': 16,
 'Foundation': 101,
 'granting': 18,
 'eligible': 19,
 'electricity': 20,
 'adviced': 21,
 'unanswered': 22,
 'wheeled': 23,
 'Ronald': 24,
 'SPI': 25,
 'outwit': 26,
 'Belgians': 27,
 'Western': 28,
 'Climbing': 30,
 'Avada': 31,
 'Euro': 32,
 'sinking': 33,
 'digit': 34,
 'Airprint': 36,
 'oceans': 37,
 'Www': 11035,
 'experimentally': 40,
 'cobia': 41,
 'bringing': 42,
 'lumia': 43,
 'Comlete': 44,
 'differentiated': 45,
 'basics': 46,
 'internally': 47,
 'scholar': 48,
 'JNU': 49,
 'Less': 50,
 'wooden': 51,
 'CGPA': 52,
 'unblurred': 53,
 'HDFC': 54,
 'Nasdaq': 55,
 'differentiates': 56,
 'deferments': 58,
 'Paul': 59,
 'employee': 16355,
 'straight': 2705,
 'Sandy': 61,
 'Omegle': 62,
 'Rachel': 11042,
 'specially': 64,