This tutorial will use torchtext and spacy for preprocessing and creating data for pytorch language models. Here we will use, IMDB movie review dataset as an example.

In [1]:
import torch
from torchtext import data
from torchtext import datasets
from torchtext.data import Field, LabelField, TabularDataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import spacy
from sklearn.model_selection import train_test_split

First, let us load the data, convert the label from String to Numeric values

In [8]:
pth = '~/Dropbox/DataRepo/IMDB_Movie/'
fil = 'imdb_master.csv'
encoder = LabelEncoder()
df = pd.read_csv(pth+fil, encoding = "ISO-8859-1")
df['label'] = encoder.fit_transform(df['label'])
# select necessary columns
# df = df[['review','label']]
df.head(5)

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,0,0_2.txt
1,1,test,This is an example of why the majority of acti...,0,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",0,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,0,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,0,10003_3.txt


In [9]:
# split data into test, validation and train
train = df[df['type'] == 'train']
test = df[df['type'] == 'test']
# select necessary columns and discard the rest
s_clm = ['review','label']
train,valid = train_test_split(df,train_size=0.8)
train,valid,test = train[s_clm],valid[s_clm],test[s_clm]
print('length of train data:{}'.format(len(train)))
print('length of validation data:{}'.format(len(valid)))
print('length of validation data:{}'.format(len(test)))
# write files
train.to_csv(pth+'train.csv',index=False)
valid.to_csv(pth+'valid.csv',index=False)
test.to_csv(pth+'test.csv',index=False)

length of train data:80000
length of validation data:20000
length of validation data:25000


In [10]:
# load the spacy english model, and return the tokenized text
def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    # load the spacy language model, NOTE make sure to download the model in terminal using: python -m spacy download <language name>
    spacy_en = spacy.load('en')
    # tokenize text
    return [tok.text for tok in spacy_en.tokenizer(text)]

txt = Field(sequential=True, tokenize='spacy', lower=True)
label = LabelField(dtype = torch.int)
# first you  have to say how the data is structured 
d_format = [('review',txt),('label',label)]

In [11]:
trn,val,tst = TabularDataset.splits(path=pth,train='train.csv',validation='valid.csv',\
                                    test='test.csv',format='csv',skip_header=True,fields=d_format)

Visualize the splits as follows

In [12]:
print(vars(trn.examples[0]))

{'review': ['i', 'love', 'this', 'movie', '.', 'even', 'though', 'i', 'rated', 'it', 'a', '"', '4', '"', ',', 'that', "'s", 'because', 'the', 'acting', ',', 'the', 'plot', 'and', 'the', 'budget', 'were', 'all', 'slated', 'to', 'the', '"', 'b', '"', 'universe', 'even', 'before', 'this', 'movie', 'was', 'released', '.', 'but', 'that', "'s", 'ok', '!', 'it', 'is', 'an', 'entertaining', 'film', 'that', 'has', 'a', 'lot', 'to', 'offer', '!', '<', 'br', '/><br', '/>i', 'remember', 'what', 'leonard', 'maltin', 'said', 'about', '"', 'plan', '9', 'from', 'outer', 'space', '"', ':', 'a', 'film', 'so', 'bad', 'that', 'it', "'s", 'great', '!', 'lacking', 'the', 'ufo', '-', 'alien', 'plot', ',', 'the', 'thing', 'the', 'could', "n't", 'die', 'relies', 'on', 'the', 'supernatural', '(', 'divination', ',', 'a', 'buried', 'head', 'looking', 'for', 'it', "'s", 'body', ',', 'hypnosis', ',', 'etc', ')', 'to', 'tell', 'it', "'s", 'story', '.', 'the', 'acting', 'is', 'stilted', ',', 'the', 'camera', 'work', 