<a href="https://colab.research.google.com/github/RoyElkabetz/Text-Summarization-with-Deep-Learning/blob/main/Understanding_pytorch_vocab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%matplotlib inline
import time

import torch
# from torchtext.datasets import IMDB as the_dataset
from torchtext.datasets import AG_NEWS as the_dataset
import torchtext.data as data
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torch import nn




print(f'torch {torch.__version__}')
print('Device properties:')
if torch.cuda.is_available():
    device = torch.device("cuda")
    gpu_data = torch.cuda.get_device_properties(0)
    gpu_name = gpu_data.name
    gpu_mem  = f'{gpu_data.total_memory * 1e-9:.02f} Gb'
    print(f'GPU: {gpu_name}\nMemory: {gpu_mem}')
else:
    device = torch.device("cpu")
    print('CPU')

torch 1.9.0+cu102
Device properties:
CPU


In [28]:
tokenizer = get_tokenizer('basic_english')
train_iter = the_dataset(split='train')

def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>", "<sos>", "<eos>"])
vocab.set_default_index(vocab["<unk>"])

In [29]:
# get index of a single word
vocab['yes']

5166

In [30]:
# get indices of a list of words
words = ['my', 'name', 'is', 'jdsckdsc']
vocab.forward(words)

[1302, 953, 23, 0]

In [31]:
# size of vocabulary
print(len(vocab))

95813


In [32]:
# setting the out of vocabulary (OOV) index
vocab.set_default_index(99999999)
vocab['wrbcwkcbwkj']

99999999

In [33]:
vocab.get_default_index()

99999999

In [34]:
# append a token at a specific index
print(vocab['elkabetz'])
vocab.insert_token('elkabetz', 12)
print(vocab['elkabetz'])

99999999
12


In [35]:
# try to append an excisting token
vocab.append_token('elkabetz')

RuntimeError: ignored

In [37]:
# append a nonexcisting token
vocab.append_token('royelkabetz1')
print(vocab['royelkabetz1'])

95815


In [38]:
vocab.get_stoi()

{'zzzzzz': 95813,
 'zzz': 95812,
 'zygmunt': 95810,
 'zvezda': 95807,
 'zurine': 95806,
 'zurab': 95804,
 'zuloaga': 95802,
 'zulf': 95801,
 'zuhua': 95800,
 'zseries': 95797,
 'zotinca': 95794,
 'zos': 95793,
 'zopyros': 95790,
 'zoock': 95787,
 'zoo\\as': 95786,
 'zone-h': 95782,
 'zoll': 95780,
 'zolecki': 95779,
 'zocalo': 95775,
 'zits': 95772,
 'zirconium': 95770,
 'zinter': 95768,
 'zinged': 95767,
 'zims': 95766,
 'zimmerman': 95765,
 'zilch': 95763,
 'zif': 95758,
 'zico': 95757,
 'ziadi': 95756,
 'zhukov\\said': 95755,
 'zhirkov': 95754,
 'zhiguli': 95753,
 'zesn': 95748,
 'zero-tolerance': 95746,
 'zero-profit': 95744,
 'zero-energy': 95742,
 'zeppieri': 95740,
 'zen-like': 95734,
 'zemaryalai': 95733,
 'zeitoun': 95731,
 'zeitgeist': 95730,
 'zealous': 95729,
 'zealand-qantas': 95726,
 'zdenek': 95722,
 'zd8000': 95721,
 'zawodny': 95720,
 'zavos': 95718,
 'zauchensee': 95717,
 'zauberfloete': 95716,
 'zarya': 95714,
 'zarqawis': 95713,
 'zapruder': 95711,
 'zapatistas': 95

In [39]:
vocab.get_itos()

['<unk>',
 '<sos>',
 '<eos>',
 '.',
 'the',
 ',',
 'to',
 'a',
 'of',
 'in',
 'and',
 's',
 'elkabetz',
 'on',
 'for',
 '#39',
 '(',
 ')',
 '-',
 "'",
 'that',
 'with',
 'as',
 'at',
 'is',
 'its',
 'new',
 'by',
 'it',
 'said',
 'reuters',
 'has',
 'from',
 'an',
 'ap',
 'his',
 'will',
 'after',
 'was',
 'us',
 'be',
 'over',
 'have',
 'their',
 '&lt',
 'are',
 'up',
 'quot',
 'but',
 'more',
 'first',
 'two',
 'he',
 'world',
 'u',
 'this',
 '--',
 'company',
 'monday',
 'wednesday',
 'tuesday',
 'oil',
 'out',
 'thursday',
 'one',
 'not',
 'against',
 'inc',
 'friday',
 'into',
 'they',
 'about',
 'last',
 'iraq',
 'year',
 'than',
 'york',
 'yesterday',
 'who',
 'president',
 'microsoft',
 'no',
 'were',
 '?',
 'been',
 'million',
 't',
 'says',
 'week',
 'had',
 'corp',
 'united',
 'game',
 'when',
 'sunday',
 'prices',
 'could',
 'three',
 'would',
 'today',
 'years',
 'group',
 'security',
 'government',
 'time',
 'people',
 'which',
 'may',
 'afp',
 'percent',
 'software',
 '1

In [42]:
from torchtext.vocab import vocab
from collections import Counter, OrderedDict
counter = Counter(["a", "a", "b", "b", "b"])
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
print(counter)
print(sorted_by_freq_tuples)

Counter({'b': 3, 'a': 2})
[('b', 3), ('a', 2)]


In [43]:
ordered_dict = OrderedDict(sorted_by_freq_tuples)
print(ordered_dict)

OrderedDict([('b', 3), ('a', 2)])


In [44]:
v1 = vocab(ordered_dict)
print(v1['a']) #prints 1
print(v1['out of vocab']) #raise RuntimeError since default index is not set


1


RuntimeError: ignored

In [45]:
tokens = ['e', 'd', 'c', 'b', 'a']
v2 = vocab(OrderedDict([(token, 1) for token in tokens]))
#adding <unk> token and default index
unk_token = '<unk>'
default_index = -1
if unk_token not in v2: v2.insert_token(unk_token, 0)
v2.set_default_index(default_index)
print(v2['<unk>']) #prints 0
print(v2['out of vocab']) #prints -1
#make default index same as index of unk_token
v2.set_default_index(v2[unk_token])
v2['out of vocab'] is v2[unk_token] #prints True

0
-1


True