In [42]:
import numpy as np
import pandas as pd
import spacy
import torch
from torchtext.data import Field, TabularDataset, Iterator, BucketIterator
from torchtext import vocab
from sklearn.model_selection import train_test_split

# Only `spacy_en.tokenizer` is used in the cells shown here, so any English
# pipeline works. The bare 'en' shortcut link no longer exists in spaCy v3, so
# try the full package name first and fall back to the legacy shortcut.
try:
    spacy_en = spacy.load('en_core_web_sm')
except OSError:  # spaCy raises OSError when the named model cannot be found
    spacy_en = spacy.load('en')

In [2]:
# Load the cleaned poems table and preview the first rows.
# The path is relative, matching the '../Data/' location this notebook uses for
# the train/val/test CSVs, so the cell does not depend on one user's home folder.
# NOTE(review): assumes '../Data/' resolves to the same folder as the former
# '~/Dropbox/DS-1012/PoetryEmotionClassification/Data/' path — confirm.
df = pd.read_csv('../Data/poems_cleaned.csv')
df.head()

Unnamed: 0,text,anger,anticipation,fear,joy,love,optimism,pess,sad
0,I’m laying with you In my room In your room in...,False,True,False,False,True,False,False,False
1,You were a soft blur in everyday sight you lef...,False,False,False,True,False,False,False,False
2,Pocket thoughts thinking away itches I’ll have...,False,False,True,False,False,False,True,False
3,An Open Letter to the Girl in My Bed Remember ...,True,False,True,False,False,False,False,True
4,Come lie at my side in the dark In the fresh s...,False,False,False,False,False,False,False,True


In [3]:
# cast every column after the first from True/False to 1/0
label_cols = df.columns[1:]
df[label_cols] = df[label_cols].astype(int)
df.head()

Unnamed: 0,text,anger,anticipation,fear,joy,love,optimism,pess,sad
0,I’m laying with you In my room In your room in...,0,1,0,0,1,0,0,0
1,You were a soft blur in everyday sight you lef...,0,0,0,1,0,0,0,0
2,Pocket thoughts thinking away itches I’ll have...,0,0,1,0,0,0,1,0
3,An Open Letter to the Girl in My Bed Remember ...,1,0,1,0,0,0,0,1
4,Come lie at my side in the dark In the fresh s...,0,0,0,0,0,0,0,1


In [4]:
'''
ONLY PERFORM THIS ONCE: SPLIT INTO TRAIN/VAL/TEST SETS FOR TORCHTEXT
'''

# The code below is kept commented out on purpose (see the note above).
# No random_state is passed to train_test_split, so un-commenting and re-running
# it would draw a different split and write that split to the same CSV paths.

#split into train/val/test
#df_train, df_test = train_test_split(df, test_size = 0.2)
#df_train, df_val = train_test_split(df_train, test_size = 0.2)

#write out to train/val/test csv
#df_train.to_csv('../Data/poems_train.csv',index=False) #80% of the original 80%
#df_val.to_csv('../Data/poems_val.csv',index=False) #20% of the original 80%
#df_test.to_csv('../Data/poems_test.csv',index=False) #20% of original data

'\nNEED ONLY PERFORM ONCE: SPLIT INTO TRAIN/VAL/TEST SETS FOR TORCHTEXT\n'

In [5]:
def tokenizer(text):
    """Return the list of token strings that the spaCy tokenizer produces for ``text``."""
    pieces = []
    for token in spacy_en.tokenizer(text):
        pieces.append(token.text)
    return pieces

# TEXT: sequential field run through `tokenizer` with lower=True.
# LABEL: non-sequential field with no vocabulary (use_vocab=False).
TEXT = Field(sequential=True, tokenize=tokenizer, lower=True)
LABEL = Field(sequential=False, use_vocab=False)

# (column name, field) pairs: the text first, then one LABEL entry per emotion
datafields = [("text", TEXT)] + [
    (emotion, LABEL)
    for emotion in ("anger", "anticipation", "fear", "joy",
                    "love", "optimism", "pess", "sad")
]

# read the three pre-split CSV files, skipping each file's header row
train, val, test = TabularDataset.splits(
    path='../Data/',
    format='csv',
    train='poems_train.csv',
    validation="poems_val.csv",
    test='poems_test.csv',
    skip_header=True,
    fields=datafields,
)

In [6]:
# view the `text` attribute of the validation example at index 1
val[1].text

['brittle',
 'bones',
 'on',
 'sinew',
 'string',
 'slice',
 'of',
 'beak',
 'and',
 'plume',
 'and',
 'wing',
 'scraps',
 'of',
 'fur',
 'with',
 'ebon',
 'beads',
 'twisted',
 'bark',
 'from',
 'septic',
 'seeds',
 'crystal',
 'charms',
 'and',
 'stones',
 'that',
 'float',
 'rest',
 'upon',
 'the',
 'shamans',
 'throat',
 'a',
 'chilling',
 'hum',
 'which',
 'glows',
 'bright',
 'red',
 'spells',
 'out',
 'letters',
 'never',
 'read',
 'and',
 'sings',
 'in',
 'voices',
 'seldom',
 'heard',
 'to',
 'speak',
 'in',
 'tongues',
 'without',
 'a',
 'word',
 'ensnared',
 'or',
 'caught',
 'with',
 'all',
 'perceived',
 'the',
 'shaman',
 'sits',
 'beneath',
 'the',
 'trees',
 'which',
 'whisper',
 'loud',
 'with',
 'leaves',
 'that',
 'peel',
 'and',
 'splinter',
 'open',
 'to',
 'reveal',
 'a',
 'face',
 'that',
 'frowns',
 'with',
 'mirthless',
 'glee',
 'in',
 'asymmetric',
 'symmetry',
 'it',
 'screams',
 'to',
 'tear',
 'the',
 'jagged',
 'sky',
 'the',
 'shaman',
 'never',
 'wonder

In [23]:
# Build the vocabulary from the training set with GloVe word embeddings.
# glove.42B.300d / glove.840B.300d take way too long to load, hence glove.6B.300d.
TEXT.build_vocab(train, vectors='glove.6B.300d') #glove42B.300d/glove.840B.300d take way too long to load 

.vector_cache/glove.6B.zip: 862MB [52:43, 273kB/s]                                
100%|█████████▉| 399484/400000 [01:10<00:00, 7589.12it/s]

In [35]:
# train/validation iterators: batches of 64 on the CPU, keyed on text length
train_iter, val_iter = BucketIterator.splits(
    (train, val),
    batch_sizes=(64, 64),
    device='cpu',
    sort_key=lambda example: len(example.text),
    sort_within_batch=False,
    repeat=False,
)

# test iterator: same batch size, sorting switched off
test_iter = Iterator(
    test,
    batch_size=64,
    device='cpu',
    sort=False,
    sort_within_batch=False,
    repeat=False,
)

In [46]:
'''
NOTE:
BucketIterator yields Batch objects instead of (text indices, labels) pairs.
Also, a Batch object cannot be iterated over the way the output of a PyTorch DataLoader can.
This wrapper extracts the text and the labels from each Batch and yields them as an (X, y) pair.
'''

class BatchWrapper:
    """Adapt an iterable of batch objects so that iterating yields ``(X, y)`` tuples.

    Args:
        dl: sized iterable of batch objects that expose their fields as attributes.
        x_field: attribute name of the input tensor on each batch.
        y_field: sequence of attribute names of the target tensors; these are
            combined column-wise into one float tensor.
    """

    def __init__(self, dl, x_field, y_field):
        self.dl = dl
        self.x_field = x_field
        self.y_field = y_field

    def __len__(self):
        """Return the length of the wrapped iterable."""
        return len(self.dl)

    def __iter__(self):
        """Yield ``(X, y)`` for every batch of the wrapped iterable."""
        for batch in self.dl:
            inputs = getattr(batch, self.x_field)
            # stacking along dim 1 is the same as unsqueeze(1) on each target + cat(dim=1)
            columns = [getattr(batch, name) for name in self.y_field]
            targets = torch.stack(columns, dim=1).float()
            yield (inputs, targets)

In [52]:
# label columns, in the order they are stacked into y
emotions = ['anger', 'anticipation', 'fear', 'joy', 'love', 'optimism', 'pess', 'sad']

# wrap each iterator so that it yields (X, y) tuples
train_batch, valid_batch, test_batch = (
    BatchWrapper(iterator, "text", emotions)
    for iterator in (train_iter, val_iter, test_iter)
)

In [53]:
# peek at the first (X, y) pair produced by the training wrapper
next(iter(train_batch))

(tensor([[ 377, 1611,   49,  ...,   11,    6,   12],
         [  50,  109,    4,  ...,  855,  152,   34],
         [ 140,   30,  124,  ...,   48,   13,   23],
         ...,
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1]]),
 tensor([[0., 0., 0., 0., 0., 0., 1., 0.],
         [0., 0., 0., 0., 1., 1., 0., 0.],
         [0., 0., 1., 1., 0., 0., 0., 0.],
         [1., 1., 0., 0., 0., 0., 0., 0.],
         [0., 1., 0., 0., 0., 1., 0., 0.],
         [0., 0., 0., 0., 0., 0., 1., 1.],
         [0., 0., 0., 1., 1., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1., 0., 1.],
         [1., 0., 1., 0., 0., 0., 1., 0.],
         [1., 0., 1., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 0., 1., 1.],
         [1., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 1., 1.],
         [0., 0., 0., 1., 1., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0., 0., 1.],
         [0., 0., 0.,