In [37]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import numpy as np

In [38]:
class sentencesDataset(Dataset):
    def __init__(self, text_file, root_dir, transform=None):
        """
        Args:
            text_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the data
        
        """
        self.root_dir = root_dir
        self.transform = transform
        
        with open(text_file, "r") as f:
            self.sentences = f.readlines()
        
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
            
        sample = self.sentences[idx]
            
        if self.transform:
            sample = self.transform(sample)
            
        return sample 

In [52]:
yelpData = sentencesDataset("data/yelp/sentiment.train.0", "data/yelp/", 
                            transform=GenerateWordEmbeddings(50, "data/glove.6B/"))

print(yelpData[0])

Word   not in GloVe dataset
Word   not in GloVe dataset
Word   not in GloVe dataset
Word   not in GloVe dataset
Word 
 not in GloVe dataset
[array([ 1.1891e-01,  1.5255e-01, -8.2073e-02, -7.4144e-01,  7.5917e-01,
       -4.8328e-01, -3.1009e-01,  5.1476e-01, -9.8708e-01,  6.1757e-04,
       -1.5043e-01,  8.3770e-01, -1.0797e+00, -5.1460e-01,  1.3188e+00,
        6.2007e-01,  1.3779e-01,  4.7108e-01, -7.2874e-02, -7.2675e-01,
       -7.4116e-01,  7.5263e-01,  8.8180e-01,  2.9561e-01,  1.3548e+00,
       -2.5701e+00, -1.3523e+00,  4.5880e-01,  1.0068e+00, -1.1856e+00,
        3.4737e+00,  7.7898e-01, -7.2929e-01,  2.5102e-01, -2.6156e-01,
       -3.4684e-01,  5.5841e-01,  7.5098e-01,  4.9830e-01, -2.6823e-01,
       -2.7443e-03, -1.8298e-02, -2.8096e-01,  5.5318e-01,  3.7706e-02,
        1.8555e-01, -1.5025e-01, -5.7512e-01, -2.6671e-01,  9.2121e-01],
      dtype=float32), array([-0.9432  ,  1.6448  ,  1.202   ,  1.1443  , -0.063614, -0.18644 ,
        0.70952 , -1.3699  , -0.14877 , -0.

      dtype=float32)]


In [53]:
yelpData[0]

Word   not in GloVe dataset
Word   not in GloVe dataset
Word   not in GloVe dataset
Word   not in GloVe dataset
Word 
 not in GloVe dataset


[array([ 1.1891e-01,  1.5255e-01, -8.2073e-02, -7.4144e-01,  7.5917e-01,
        -4.8328e-01, -3.1009e-01,  5.1476e-01, -9.8708e-01,  6.1757e-04,
        -1.5043e-01,  8.3770e-01, -1.0797e+00, -5.1460e-01,  1.3188e+00,
         6.2007e-01,  1.3779e-01,  4.7108e-01, -7.2874e-02, -7.2675e-01,
        -7.4116e-01,  7.5263e-01,  8.8180e-01,  2.9561e-01,  1.3548e+00,
        -2.5701e+00, -1.3523e+00,  4.5880e-01,  1.0068e+00, -1.1856e+00,
         3.4737e+00,  7.7898e-01, -7.2929e-01,  2.5102e-01, -2.6156e-01,
        -3.4684e-01,  5.5841e-01,  7.5098e-01,  4.9830e-01, -2.6823e-01,
        -2.7443e-03, -1.8298e-02, -2.8096e-01,  5.5318e-01,  3.7706e-02,
         1.8555e-01, -1.5025e-01, -5.7512e-01, -2.6671e-01,  9.2121e-01],
       dtype=float32),
 array([-0.9432  ,  1.6448  ,  1.202   ,  1.1443  , -0.063614, -0.18644 ,
         0.70952 , -1.3699  , -0.14877 , -0.88396 , -0.89586 ,  0.29615 ,
        -0.81383 , -0.50923 , -0.60819 , -0.27781 , -0.48124 , -0.40438 ,
        -0.67217 , -0.23

In [54]:
class GenerateWordEmbeddings(object):
    """Transform sentence into list of word embeddings"""
    
    def __init__(self, dim, glove_path):
        """
        args:
            dim (int): dimension of word vectors (50, 100, 200, 300)
        """
        
        filenames = {
            50: "glove.6B.50d.txt",
            100: "glove.6B.100d.txt",
            200: "glove.6B.200d.txt",
            300: "glove.6B.300d.txt",
        }
            
        self.embeddings_dict = {}
        with open(glove_path+filenames[dim], 'r', encoding="utf8") as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], "float32")
                self.embeddings_dict[word] = vector
    
    def __call__(self, sample):
        """
        args:
            sample (string): sentence to be converted into list of word embeddings
        """
        token_list = self.__tokenize(sample)

        sentence_embedding = []
        for word in sample:
            try:
                vector = self.embeddings_dict[word]
                sentence_embedding.append(vector)
            except:
                print("Word " + word + " not in GloVe dataset")
                
        return sentence_embedding
            
    def __tokenize(self, string):        
        tokens = string.split(sep=" ")
        punctuation = [" ", ".", ",", "/n"]
        
        tokens_clean = []
        for t in tokens:
            if t not in punctuation:
                tokens_clean.append(t)
        
        return tokens_clean
    