In [6]:
import torch
import pandas as pd
import numpy as np
from torchtext import datasets
from torchtext.legacy import data

In [3]:
input_data = pd.read_csv('polarity.tsv',delimiter='\t',header = None, names = ['text','label'])
input_data

Unnamed: 0,text,label
0,films adapted from comic books have had plenty...,1
1,every now and then a movie comes along from a ...,1
2,you've got mail works alot better than it dese...,1
3,jaws is a rare film that grabs your attentio...,1
4,moviemaking is a lot like being the general ma...,1
...,...,...
1995,"if anything , "" stigmata "" should be taken as ...",0
1996,"john boorman's "" zardoz "" is a goofy cinematic...",0
1997,the kids in the hall are an acquired taste .it...,0
1998,there was a time when john carpenter was a gre...,0


In [8]:
import re
def remove_non_char(x):
  x = ' '.join(x)
  x = re.sub("[^a-zA-Z]", ' ',x)
  x = x.split()

  return x

In [12]:
#create field and dataset
text_field = data.Field(sequential = True, dtype=torch.float64,lower=True, tokenize='spacy',preprocessing = remove_non_char)
label_field = data.Field(sequential = False)
input_data = data.TabularDataset(path='polarity.tsv', 
                                 format='tsv', 
                                 fields=[('text', text_field), ('label', label_field)])

In [13]:
#create examples
examples = input_data.examples
np.random.shuffle(examples)
#train_test_split
train_ex = examples[:int(len(examples)*0.8)]
test_ex = examples[int(len(examples)*0.8):]
#create dataset
train_data = data.Dataset(examples=train_ex, fields={'text':text_field, 'label':label_field})
test_data = data.Dataset(examples=test_ex, fields={'text':text_field, 'label':label_field})
train_data[0].label, train_data[0].text

('1',
 ['upon',
  'taking',
  'a',
  'seat',
  'at',
  'the',
  'theater',
  'and',
  'surveying',
  'the',
  'crowd',
  'i',
  'soon',
  'realized',
  'that',
  'i',
  'was',
  'the',
  'only',
  'person',
  'under',
  'forty',
  'in',
  'the',
  'premises',
  'i',
  'm',
  'twenty',
  'two',
  'but',
  'have',
  'been',
  'accused',
  'of',
  'looking',
  'sixteen',
  'yet',
  'as',
  'the',
  'film',
  'began',
  'any',
  'consciousness',
  'of',
  'setting',
  'shifted',
  'to',
  'the',
  'home',
  'of',
  'five',
  'close',
  'knit',
  'yet',
  'troubled',
  'sisters',
  'director',
  'pat',
  'o',
  'connor',
  'inventing',
  'the',
  'abbots',
  'circle',
  'of',
  'friends',
  'weaves',
  'a',
  'quiet',
  'yet',
  'affecting',
  'tale',
  'of',
  'loss',
  'need',
  'and',
  'the',
  'bonds',
  'between',
  'five',
  'sisters',
  'the',
  'family',
  'is',
  'ruled',
  'by',
  'kate',
  'meryl',
  'streep',
  'an',
  'unconsciously',
  'strict',
  'schoolteacher',
  'who',
  

In [14]:
#create dictionary
text_field.build_vocab(train_data)
label_field.build_vocab(train_data)
print(f"Vocabularies of index 0-5: {text_field.vocab.itos[:10]} \n")
print(f"words to index {text_field.vocab.stoi}")

Vocabularies of index 0-5: ['<unk>', '<pad>', 'the', 'a', 'and', 'of', 'to', 'is', 'in', 's'] 



In [18]:
train_iter, test_iter = data.Iterator.splits(datasets = (train_data,test_data),batch_sizes =(2,2) ,repeat=False,sort_key = lambda ex:len(ex.text))

In [19]:
for train_batch in train_iter:
    print(train_batch.text, train_batch.text.shape)
    print(train_batch.label, train_batch.label.shape)
    break

tensor([[2.7960e+03, 1.2500e+02],
        [6.0000e+00, 4.4480e+03],
        [2.9540e+03, 2.6570e+04],
        [4.0000e+00, 9.0000e+00],
        [8.5900e+02, 5.3060e+03],
        [7.8000e+01, 3.0000e+00],
        [7.4300e+02, 3.1840e+03],
        [4.1690e+03, 7.0100e+02],
        [3.0432e+04, 9.8900e+02],
        [1.9093e+04, 1.6000e+01],
        [3.6490e+03, 1.6800e+02],
        [1.4630e+04, 7.0000e+00],
        [4.0000e+00, 2.9000e+01],
        [7.7900e+02, 6.6980e+03],
        [2.6777e+04, 4.0830e+03],
        [8.0000e+00, 1.6200e+02],
        [2.4500e+02, 4.5000e+01],
        [1.2725e+04, 7.9000e+01],
        [6.8000e+01, 4.5020e+03],
        [2.5000e+01, 3.9780e+03],
        [2.2900e+02, 1.6160e+04],
        [1.6520e+03, 8.0000e+00],
        [2.3200e+02, 2.9000e+01],
        [1.4000e+01, 3.4270e+03],
        [3.9470e+03, 7.2000e+01],
        [3.4804e+04, 4.2000e+01],
        [3.4000e+01, 3.0000e+00],
        [2.0000e+00, 2.4140e+03],
        [5.0900e+02, 1.3000e+01],
        [5.000