# Install torchtext

In [4]:
#!conda install -c pytorch torchtext --y

# Loading Data
1. Internal (datasets that ship with torchtext)
2. External (your own files, e.g. CSV)

In [26]:
import torch

In [27]:
from torchtext.data.utils import get_tokenizer
import string
import spacy


# Load the small English spaCy pipeline with the NER, parser and tagger
# components switched off.
nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser', 'tagger'])

# Every punctuation character is treated as a stop word as well.
stop_punctuations = list(string.punctuation)

# Stop-word list = spaCy defaults + punctuation + a few extra tokens.
all_stopwords = [
    *nlp.Defaults.stop_words,
    *stop_punctuations,
    "/><br", "---", "...", "-pron-",
]

In [28]:
def tokenizer(document, print_n=1000, label=None):
    """Split a document into lemmas using the module-level spaCy pipeline ``nlp``.

    Args:
        document: Raw text of a single document.
        print_n: Unused; kept so existing callers that pass it keep working.
        label: Unused; kept so existing callers that pass it keep working.

    Returns:
        list: The ``lemma_`` string of every token, in document order.
    """
    # The per-call ``import re`` / ``import string`` were never used, so they
    # are gone; the function now only runs the pipeline and collects lemmas.
    return [token.lemma_ for token in nlp(document)]

# Specify how to handle each column in the data

In [29]:
from torchtext import data
# Review text: run through the custom tokenizer, lower-cased, filtered against
# the stop-word list and backed by a vocabulary.
text_field = data.Field(
    tokenize=tokenizer,
    stop_words=all_stopwords,
    lower=True,
    sequential=True,
    use_vocab=True,  # would be False for a column that is already numeric
    batch_first=True,
)

# Sentiment label: one numeric value per row, so no vocabulary is used.
target_field = data.Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
)

# Loading an external dataset

In [30]:
from torchtext.data import TabularDataset

# Column layout of the CSV, in file order. The "index" column is paired with
# None instead of a Field.
imdb_datafields = [
    ("index", None),
    ("review", text_field),
    ("sentiment", target_field),
]

# Read the training split from disk.
train_ds = TabularDataset(
    fields=imdb_datafields,
    format="csv",
    path="data/imdb_reviews_train.csv",
    skip_header=True,  # the file starts with a header row
)

In [31]:
# Individual examples in the dataset can be accessed by index, e.g. train_ds[0]

In [32]:
# First training example: its leading ten tokens and its label.
print(
    "   Review : {}".format(train_ds[0].review[:10]),
    "Sentiment : {}".format(train_ds[0].sentiment),
    sep="\n",
)

   Review : ['movie', 'respect', 'sure', 'lot', 'memorable', 'quote', 'list', 'gem', 'imagine', 'movie']
Sentiment : 1


# Loading Train and Test Files in a single go

In [33]:
# Load both CSV files from the same directory in one call.
train_ds, test_ds = TabularDataset.splits(
    path="data",
    train="imdb_reviews_train.csv",
    test="imdb_reviews_test.csv",
    fields=imdb_datafields,
    format="csv",
    skip_header=True,
)

In [34]:
# Report how many documents each split contains.
print(
    "Number of Documents in the Corpus:",
    "Train : {}".format(len(train_ds)),
    " Test : {}".format(len(test_ds)),
    sep="\n",
)

Number of Documents in the Corpus:
Train : 25000
 Test : 25000


In [35]:
#!conda install -c conda-forge spacy  --y
#!python3 -m spacy download en_core_web_sm

# Build Vocabulary

1. The vocabulary is built from the training data only.
2. It creates an index for each token.

In [47]:
# Build the vocabulary from the training split only, with a size cap and a
# minimum token frequency.
text_field.build_vocab(
    train_ds,
    min_freq=50,
    max_size=500,
)

In [48]:
# Show the 20 highest-count tokens from the vocabulary's frequency counter.
text_field.vocab.freqs.most_common(20)

[('movie', 50366),
 ('film', 47007),
 ('good', 23827),
 ('like', 22170),
 ('time', 15559),
 ('character', 13852),
 ('watch', 13413),
 ('story', 12848),
 ('think', 12199),
 ('little', 11409),
 ('scene', 10330),
 ('look', 9841),
 ('great', 9835),
 ('know', 9310),
 ('people', 9222),
 ('end', 9182),
 ('bad', 9119),
 ('way', 8592),
 ('play', 8533),
 ('act', 8452)]

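# Frequency of each token in the training data

`vocab.freqs` holds the count of every token seen while building the vocabulary, including tokens that were not kept in the final vocabulary.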

In [49]:
import pandas as pd
# Tabulate the frequency counter: one row per token with its count.
df_token_count = (
    pd.DataFrame.from_dict(text_field.vocab.freqs, orient="index")
    .reset_index()
    .rename(columns={"index": "token", 0: "frequency"})
)
df_token_count.head()

Unnamed: 0,token,frequency
0,movie,50366
1,respect,640
2,sure,2621
3,lot,4619
4,memorable,649


# Word index creation

In [50]:
# Tabulate the string-to-index mapping: one row per token with its integer id.
df_token_idx = (
    pd.DataFrame.from_dict(text_field.vocab.stoi, orient="index")
    .reset_index()
    .rename(columns={"index": "token", 0: "idx"})
)
df_token_idx.head(10)


Unnamed: 0,token,idx
0,<unk>,0
1,<pad>,1
2,movie,2
3,film,3
4,good,4
5,like,5
6,time,6
7,character,7
8,watch,8
9,story,9


# How to create mini batches for training

## Iterator

`Iterator` plays the same role as PyTorch's `DataLoader`: it serves the dataset in mini-batches.

In [51]:
from torchtext.data import Iterator

In [52]:
# One iterator per split, each yielding mini-batches of two examples.
train_dl, test_dl = Iterator.splits(
    datasets=(train_ds, test_ds),
    batch_sizes=(2, 2),
)

In [92]:
# Inspect the token list stored for the first training example.
train_ds[0].review

['movie',
 'respect',
 'sure',
 'lot',
 'memorable',
 'quote',
 'list',
 'gem',
 'imagine',
 'movie',
 'joe',
 'piscopo',
 'actually',
 'funny',
 'maureen',
 'stapleton',
 'scene',
 'stealer',
 'moroni',
 'character',
 'absolute',
 'scream',
 'watch',
 'alan',
 'skipper',
 'hale',
 'jr',
 'police',
 'sgt']

In [53]:
# Pull only the first mini-batch and print its review and sentiment fields.
for batch in train_dl:
    print(batch.review, batch.sentiment)
    break  # stop after one batch; this cell is just a peek at the batch format

tensor([[  5, 117,   0,   0,   0,   0, 130,   2,   0,   0,   0,   0,   0,   0,
         347,   0,   3,   0,   0,   0,   0,   0, 276,   0, 488,   0,   0,   0,
           0,   0,  37,   0,   0, 235,   0, 191,   0,   0,   0,   0,   0,   0,
           0,   0, 200,   0,   0,  94,   0, 425,  17, 131,  55,   0,   0, 132,
          69,   0,  41,   0,   0,   2,   0,   0,   0,   0,   2,   0,   0, 353,
           0,  55,  55,   9,   0,  82, 100,  82,   6,   6,   0, 166,   0,   0,
           0,   0,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [102,   0,  78,   0,   0,   0

In [22]:
# Pipeline

# - chapter TBD

In [23]:
# Bucket Iterator 

# - LSTM chapter
# - valid for sequence data - groups sequences of similar length together for 
# computational efficiency while padding

In [24]:
# BPTT Iterator
# - Language Model chapter
