# Word Embeddings

### Imports

In [None]:
# 跑一个单个文件都需要几分钟 可以不跑大的数据集 感觉这个数据后面也用不到

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import pickle
import re

torch.manual_seed(1)

<torch._C.Generator at 0x20d78e92a10>

### Check GPU

In [5]:
gpu_status = 'OK' if torch.cuda.is_available() else 'Unavailable'
print(f'Estado GPU: {gpu_status}')
print(f'GPU: {torch.cuda.get_device_name(0)}')

Estado GPU: OK
GPU: NVIDIA GeForce RTX 3060 Laptop GPU


### Read Data

In [6]:
with open("D:/try/split_text/split_0.txt",'r',encoding='utf-8') as f:
    text = f.read()

In [7]:
text[:600]

"['Multifaceted', 'Russian', 'experiencen', 'According', 'to', 'current', 'live', 'statistics', 'at', 'the', 'time', 'of', 'editing', 'this', 'letter', 'Russia', 'has', 'been', 'the', 'third', 'country', 'in', 'the', 'world', 'to', 'be', 'affected', 'by', 'with', 'both', 'new', 'cases', 'and', 'death', 'rates', 'rising', 'It', 'remains', 'in', 'a', 'position', 'of', 'advantage', 'due', 'to', 'the', 'later', 'onset', 'of', 'the', 'viral', 'spread', 'within', 'the', 'country', 'since', 'the', 'worldwide', 'disease', 'outbreakn', 'The', 'first', 'step', 'in', 'fighting', 'the', 'epidemic', 'was', "

### Tokenizer casero

In [8]:
PATTERN = r"\w+"
text_tuneado = re.findall(PATTERN, text)
text_tuneado

['Multifaceted',
 'Russian',
 'experiencen',
 'According',
 'to',
 'current',
 'live',
 'statistics',
 'at',
 'the',
 'time',
 'of',
 'editing',
 'this',
 'letter',
 'Russia',
 'has',
 'been',
 'the',
 'third',
 'country',
 'in',
 'the',
 'world',
 'to',
 'be',
 'affected',
 'by',
 'with',
 'both',
 'new',
 'cases',
 'and',
 'death',
 'rates',
 'rising',
 'It',
 'remains',
 'in',
 'a',
 'position',
 'of',
 'advantage',
 'due',
 'to',
 'the',
 'later',
 'onset',
 'of',
 'the',
 'viral',
 'spread',
 'within',
 'the',
 'country',
 'since',
 'the',
 'worldwide',
 'disease',
 'outbreakn',
 'The',
 'first',
 'step',
 'in',
 'fighting',
 'the',
 'epidemic',
 'was',
 'nationwide',
 'lock',
 'down',
 'on',
 'March',
 'th',
 'Most',
 'of',
 'the',
 'multidisciplinary',
 'hospitals',
 'have',
 'been',
 'repurposed',
 'as',
 'dedicated',
 'centres',
 'so',
 'the',
 'surgeons',
 'started',
 'working',
 'as',
 'infectious',
 'disease',
 'specialists',
 'Such',
 'a',
 'reallocation',
 'of',
 'health'

### N-grama modelado simple

In [10]:
# Params de modelo
CONTEXT_SIZE = 3
EMBEDDING_DIM = 10
HIDDEN_DIM = 128
EPOCHS = 100
LEARNING_RATE = 0.01

In [11]:
ngrams = [
    (
        [text_tuneado[i - j - 1] for j in range(CONTEXT_SIZE)],
        text_tuneado[i]
    )
    for i in range(CONTEXT_SIZE, len(text_tuneado))
]

print(ngrams[:3])


[(['experiencen', 'Russian', 'Multifaceted'], 'According'), (['According', 'experiencen', 'Russian'], 'to'), (['to', 'According', 'experiencen'], 'current')]


In [12]:
# encoder
vocab = set(text_tuneado)
word_to_ix = {word: i for i, word in enumerate(vocab)}

In [13]:
# 打印key，value
for i,j in word_to_ix.items():
    print(i,j)

advantage 0
risk 1
via 2
First 3
evacuation 4
testing 5
an 6
DOI 7
management 8
negative 9
option 10
approach 11
economic 12
patients 13
surgery 14
for 15
June 16
McGrath 17
onset 18
guidance 19
epidemic 20
rising 21
hospitals 22
BA 23
get 24
mostly 25
statistical 26
qualified 27
postoperative 28
Coloproctology 29
letter 30
Such 31
was 32
within 33
equipment 34
electiveness 35
spread 36
already 37
protective 38
S 39
not 40
in 41
well 42
admission 43
Medkova 44
lot 45
era 46
result 47
rearrangements 48
work 49
personal 50
need 51
treatmentn 52
remains 53
can 54
Korea 55
so 56
since 57
as 58
performedn 59
statistics 60
changed 61
up 62
Tsarkov 63
Invasive 64
operating 65
only 66
Medical 67
Minimally 68
outbreakn 69
admitted 70
endotracheal 71
A 72
reconstructive 73
one 74
curve 75
respiratory 76
services 77
live 78
significantly 79
multidisciplinary 80
undergone 81
are 82
Her 83
lockdown 84
printn 85
Multifaceted 86
procedures 87
nationwide 88
types 89
doffing 90
due 91
people 92
tests 9

In [14]:
# Modelo

class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [15]:
model = NGramLanguageModeler(
                            len(vocab),
                            EMBEDDING_DIM,
                            CONTEXT_SIZE,
                            HIDDEN_DIM).cuda()

In [16]:
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

In [17]:
losses = []

for epoch in range(EPOCHS):
    total_loss = 0
    for context, target in ngrams:

        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long).cuda()

        model.zero_grad()
        
        log_probs = model(context_idxs)

        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long).cuda())

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f'Epoch: {epoch} Loss: {total_loss}')
    losses.append(total_loss)

Epoch: 0 Loss: 4232.83390712738
Epoch: 1 Loss: 3923.931035876274
Epoch: 2 Loss: 3709.273306131363
Epoch: 3 Loss: 3511.8804318904877
Epoch: 4 Loss: 3305.346753537655
Epoch: 5 Loss: 3076.7606232762337
Epoch: 6 Loss: 2818.2889121472836
Epoch: 7 Loss: 2525.879929587245
Epoch: 8 Loss: 2202.110669732094
Epoch: 9 Loss: 1857.6769819557667
Epoch: 10 Loss: 1511.8522354960442
Epoch: 11 Loss: 1189.5340244621038
Epoch: 12 Loss: 913.4277119971812
Epoch: 13 Loss: 693.2970515750349
Epoch: 14 Loss: 525.9585544075817
Epoch: 15 Loss: 402.2911569047719
Epoch: 16 Loss: 311.2117184540257
Epoch: 17 Loss: 244.93320626113564
Epoch: 18 Loss: 197.6689312653616
Epoch: 19 Loss: 164.73960142955184
Epoch: 20 Loss: 141.55613748636097
Epoch: 21 Loss: 124.69833179563284
Epoch: 22 Loss: 111.98307395260781
Epoch: 23 Loss: 102.16077346960083
Epoch: 24 Loss: 94.46898445719853
Epoch: 25 Loss: 88.44665966834873
Epoch: 26 Loss: 83.46777015179396
Epoch: 27 Loss: 79.29145686840639
Epoch: 28 Loss: 75.70381501037627
Epoch: 29 Los

In [18]:
# To get the embedding of a particular word, e.g. " status"
# 这里的word需要在 word_to_ix 中有出现才行
print(model.embeddings.weight[word_to_ix["status"]])

tensor([-2.0155, -0.6443,  0.0796, -0.5784,  0.7583,  0.3287, -2.2575, -1.1584,
         0.4179, -0.1887], device='cuda:0', grad_fn=<SelectBackward0>)
