In [1]:
import os

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

In [2]:
class sentencesDataset(Dataset):
    """Map-style dataset that serves the lines of a text file, one sentence per item."""

    def __init__(self, text_file, root_dir, transform=None):
        """
        Args:
            text_file (string): Path to a plain-text file with one sentence per line.
            root_dir (string): Directory with all the data (stored on the instance;
                not otherwise used by this class).
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.root_dir = root_dir
        self.transform = transform

        # Explicit encoding so the sentences read the same on every platform.
        with open(text_file, "r", encoding="utf8") as f:
            # Each entry keeps its trailing newline, exactly as readlines() returns it.
            self.sentences = f.readlines()

    def __len__(self):
        """Return the number of sentences (needed by DataLoader's default samplers)."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the sentence at ``idx``, passed through ``transform`` when one is set."""
        # Samplers may hand over a tensor index; convert it to a plain Python value.
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample = self.sentences[idx]

        if self.transform:
            sample = self.transform(sample)

        return sample

In [3]:
class GenerateWordEmbeddings(object):
    """Transform sentence into list of word embeddings"""

    def __init__(self, dim, glove_path):
        """
        Load the GloVe vectors of the requested dimension into memory.

        args:
            dim (int): dimension of word vectors (50, 100, 200, 300)
            glove_path (string): directory holding the glove.6B.<dim>d.txt files;
                a trailing path separator is optional

        raises:
            KeyError: if ``dim`` is not one of the supported dimensions
        """

        filenames = {
            50: "glove.6B.50d.txt",
            100: "glove.6B.100d.txt",
            200: "glove.6B.200d.txt",
            300: "glove.6B.300d.txt",
        }

        self.embeddings_dict = {}
        # os.path.join works whether or not glove_path ends with a separator.
        with open(os.path.join(glove_path, filenames[dim]), 'r', encoding="utf8") as f:
            for line in f:
                values = line.split()
                if not values:
                    # Tolerate blank lines instead of failing on values[0].
                    continue
                # First field is the word, the remaining fields are its vector.
                self.embeddings_dict[values[0]] = np.asarray(values[1:], dtype="float32")

    def __call__(self, sample):
        """
        Look up the embedding of every token in ``sample``.

        args:
            sample (string): sentence to be converted into list of word embeddings

        returns:
            list of numpy arrays, one per token found in the embeddings
            dictionary; tokens without an embedding are reported and skipped
        """
        sentence_embedding = []
        for word in self.__tokenize(sample):
            vector = self.embeddings_dict.get(word)
            if vector is None:
                # Only a missing word is skipped; any other error still propagates.
                print("Word " + word + " not in GloVe dataset")
            else:
                sentence_embedding.append(vector)

        return sentence_embedding

    def __tokenize(self, string):
        """Split ``string`` on whitespace and drop the "." and "," tokens."""
        # split() without arguments also strips the trailing newline and collapses
        # repeated spaces, so the last word is never left with "\n" attached and
        # no empty tokens are produced.
        punctuation = {".", ","}
        return [t for t in string.split() if t not in punctuation]

In [9]:
# Locations of the Yelp training file and of the GloVe vectors.
yelp_dir = "../data/yelp/"
yelp_train_file = "../data/yelp/sentiment.train.0"
glove_dir = "../data/glove.6B/"

# Without a transform, indexing yields the raw sentence string.
rawSentences = sentencesDataset(yelp_train_file, yelp_dir)
print(rawSentences[0])

# dimension of word embeddings set to 200 per Zhao et al.
embed_sentence = GenerateWordEmbeddings(200, glove_dir)
yelpData = sentencesDataset(yelp_train_file, yelp_dir, transform=embed_sentence)
print(yelpData[0])

i was sadly mistaken .

[array([ 2.6805e-01,  3.6032e-01, -3.3200e-01, -5.4642e-01, -5.0451e-01,
       -1.3461e-02, -8.0432e-01, -2.4214e-01,  5.3736e-01,  7.7581e-01,
       -3.2554e-01,  4.8300e-01,  8.4265e-01,  3.7780e-01, -1.4767e-01,
        5.3192e-01, -7.0518e-01,  4.4037e-01,  7.5035e-01, -1.8171e-01,
        7.0139e-01,  2.9383e+00,  4.5612e-02, -2.1176e-01,  1.9947e-01,
       -4.8175e-01, -2.5815e-01,  4.6200e-01, -5.6841e-03, -3.0563e-01,
       -5.7541e-01, -1.9527e-02, -1.3751e-01, -5.9450e-01, -3.8216e-01,
       -1.3541e-01, -6.6444e-01, -2.3028e-01, -5.5466e-02,  3.8421e-01,
       -1.6888e-01,  5.1462e-02, -2.8293e-01,  4.5076e-01, -3.6464e-01,
        3.6101e-01,  1.0935e+00, -1.1947e-01,  4.9729e-02,  4.8765e-02,
        4.8944e-01, -3.3138e-04,  1.6365e-01,  4.9743e-01,  3.3814e-01,
        1.5570e-02,  2.5762e-01, -5.8483e-01, -5.5821e-01, -2.9092e-01,
        2.3611e-01, -2.8951e-01, -3.1919e-01,  6.5705e-02, -3.1602e-01,
       -1.2054e-01, -7.7942e-01,  6.013

In [10]:
# Length of the first word vector of the first sentence.
first_word_vector = yelpData[0][0]
print(len(first_word_vector))
    

200
