### 数据预处理
### 构建损失函数及网络
### 模型训练

In [6]:
#导入包
import numpy as np
import torch
from torch import nn,optim
import random
from collections import Counter

### read file

In [20]:
def read_file(file_path):
    """Read a text file and return its whole content as one string.

    Args:
        file_path: Path of the corpus file to read.

    Returns:
        The file content decoded as UTF-8.
    """
    # An explicit encoding keeps the result independent of the platform locale.
    with open(file_path, encoding="utf-8") as f:
        text = f.read()
    return text
    

### preprocess

In [17]:
# Preprocessing
def preprocess(text, freq=5):
    """Tokenize an English corpus and drop low-frequency words.

    The text is lower-cased, every '.' is turned into a standalone
    ``<PERIOD>`` token, and the result is split on whitespace.

    Args:
        text: Raw corpus as a single string.
        freq: Frequency threshold; only words occurring strictly more than
            ``freq`` times are kept.

    Returns:
        The list of kept tokens, in their original order.
    """
    text = text.lower()
    # Pad the marker with spaces so it becomes its own token instead of
    # being glued to the preceding word (e.g. "cat." -> "cat", "<PERIOD>").
    text = text.replace('.', ' <PERIOD> ')
    words = text.split()

    # Remove low-frequency words; Counter maps each word to its count.
    word_counts = Counter(words)
    trimmed_words = [word for word in words if word_counts[word] > freq]
    return trimmed_words


### 准备工作：词典、文本转换为数字 、训练样本准备

In [19]:
def word2id(text, freq=5):
    """Build the vocabulary, encode the corpus as ids and subsample it.

    Args:
        text: Raw corpus string, passed to ``preprocess``.
        freq: Low-frequency threshold forwarded to ``preprocess``.

    Returns:
        A tuple ``(word_freqs, vocab2id, id2vocab, train_words)`` where
        ``word_freqs`` maps word id -> relative frequency in the corpus,
        ``vocab2id`` maps word -> id, ``id2vocab`` maps id -> word, and
        ``train_words`` is the id sequence left after randomly dropping
        frequent words.
    """
    words = preprocess(text, freq)
    # Sorting makes the id assignment reproducible across runs (plain set
    # iteration order depends on string hashing).
    vocab2id = {w: c for c, w in enumerate(sorted(set(words)))}
    id2vocab = {c: w for w, c in vocab2id.items()}

    # Convert the corpus to its numeric form.
    int_words = [vocab2id[w] for w in words]

    # Subsampling of frequent words: each occurrence of word wi is dropped
    # with probability p(wi) = 1 - sqrt(t / f(wi)), where t is a
    # hyper-parameter and f(wi) is the relative frequency of wi.
    t = 1e-5
    int_word_counts = Counter(int_words)
    total_count = len(int_words)
    word_freqs = {w: c / total_count for w, c in int_word_counts.items()}
    prob_drop = {w: 1 - np.sqrt(t / word_freqs[w]) for w in int_word_counts}
    # Keep an occurrence when the random draw falls below its keep
    # probability (1 - drop probability).
    train_words = [w for w in int_words if random.random() < (1 - prob_drop[w])]
    return word_freqs, vocab2id, id2vocab, train_words




### 获取周边词/target

In [10]:
def get_target(words, idx, window_size=5):
    """Return the distinct context words around position ``idx``.

    The effective window radius is not fixed: it is drawn uniformly from
    ``1..window_size`` (inclusive) on every call.

    Args:
        words: List of tokens (slices are concatenated with ``+``, so a
            list is required).
        idx: Position of the center word in ``words``.
        window_size: Maximum window radius on each side of the center.

    Returns:
        A list of the unique tokens found in the window, excluding the
        center position itself. Order is unspecified (built from a set).
    """
    # randint's upper bound is exclusive, hence the +1 so that window_size
    # itself can be drawn (and window_size=1 is valid).
    target_window = np.random.randint(1, window_size + 1)
    start_point = max(0, idx - target_window)
    end_point = idx + target_window
    targets = set(words[start_point:idx] + words[idx + 1:end_point + 1])
    return list(targets)
        

### batch 迭代器

In [11]:
def get_batch(words, batch_size, window_size=5):
    """Yield (center, context) training pairs batch by batch.

    Args:
        words: List of tokens/ids forming the training corpus.
        batch_size: Number of center words per batch.
        window_size: Maximum context radius passed to ``get_target``.

    Yields:
        Tuples ``(batch_x, batch_y)`` of equal-length lists, where
        ``batch_x[k]`` is a center word and ``batch_y[k]`` one of its
        context words. Their length is generally larger than
        ``batch_size`` because each center word has several context words.
    """
    n_batches = len(words) // batch_size
    # Trim the corpus so that it splits into full batches only.
    words = words[:n_batches * batch_size]

    for idx in range(0, len(words), batch_size):
        batch_x, batch_y = [], []
        batch = words[idx:idx + batch_size]
        for i, x in enumerate(batch):
            # Context is looked up inside the batch only, so windows do not
            # cross batch boundaries.
            y = get_target(batch, i, window_size)
            # One center word has several context words but training uses
            # one-to-one pairs, so the center word is repeated len(y) times.
            batch_x.extend([x] * len(y))
            batch_y.extend(y)
        # Being a generator, the function resumes after this yield on the
        # next iteration and produces the following batch.
        yield batch_x, batch_y

### 构造网络结构

In [25]:
class SkipGramNeg(nn.Module):
    """Skip-gram model with separate input/output embeddings for negative sampling.

    Args:
        n_vocab: Vocabulary size (number of rows of each embedding table).
        n_embed: Embedding dimension.
        noise_dist: Optional 1-D tensor of sampling weights, one per word id,
            used to draw negative samples. When None, negatives are drawn
            uniformly over the vocabulary.
    """

    def __init__(self, n_vocab, n_embed, noise_dist=None):
        super().__init__()

        self.n_vocab = n_vocab
        self.n_embed = n_embed
        self.noise_dist = noise_dist

        # Embedding tables for center (input) and context (output) words.
        self.in_embed = nn.Embedding(n_vocab, n_embed)
        self.out_embed = nn.Embedding(n_vocab, n_embed)

        # Initialize both tables from a uniform distribution on [-1, 1]
        # (the original author notes this is believed to help convergence).
        self.in_embed.weight.data.uniform_(-1, 1)
        self.out_embed.weight.data.uniform_(-1, 1)

    def forward_input(self, input_words):
        """Look up input embeddings for a tensor of word ids."""
        # input_words holds integer indices (as nn.Embedding requires),
        # not one-hot vectors.
        input_vectors = self.in_embed(input_words)
        return input_vectors

    def forward_output(self, output_words):
        """Look up output embeddings for a tensor of word ids."""
        output_vectors = self.out_embed(output_words)
        return output_vectors

    def forward_noise(self, batch_size, n_samples):
        """Sample negative words and return their output embeddings.

        Args:
            batch_size: Number of (center, context) pairs in the batch.
            n_samples: Number of negative samples per pair.

        Returns:
            Tensor of shape ``(batch_size, n_samples, n_embed)``.
        """
        if self.noise_dist is None:
            # Uniform sampling over the vocabulary; n_vocab is an int, so
            # the weight vector is built with torch.ones.
            noise_dist = torch.ones(self.n_vocab)
        else:
            noise_dist = self.noise_dist

        # Multinomial sampling with replacement: noise_dist gives one weight
        # per word id, and batch_size * n_samples ids are drawn in total.
        noise_words = torch.multinomial(noise_dist, batch_size * n_samples, replacement=True)
        # Indices must live on the same device as the embedding table.
        noise_words = noise_words.to(self.out_embed.weight.device)

        # Reshape so each pair in the batch gets its own n_samples negative
        # vectors; this layout is what the batched dot products need later.
        noise_vectors = self.out_embed(noise_words).view(batch_size, n_samples, self.n_embed)
        return noise_vectors
            
    

### 用于负采样的单词分布

In [27]:
def get_noise_dist(word_freqs):
    """Build the negative-sampling distribution from word frequencies.

    The unigram distribution is raised to the 3/4 power and renormalized.

    Args:
        word_freqs: Dict mapping word id -> frequency.

    Returns:
        A 1-D torch tensor of probabilities summing to 1. Entries are
        ordered by ascending key, so when the keys are the ids 0..n-1,
        position ``i`` is the probability of word id ``i``.
    """
    # Order by key rather than dict insertion order so that the tensor
    # index lines up with the word id used by the embedding tables.
    freqs = np.array([word_freqs[w] for w in sorted(word_freqs)])
    unigram_dist = freqs / freqs.sum()
    powered = unigram_dist ** 0.75
    noise_dist = torch.from_numpy(powered / powered.sum())
    return noise_dist

### 构造损失函数 

In [16]:
class NegtiveSamplingLoss(nn.Module):
    """Negative-sampling loss for skip-gram training.

    For each (center, context) pair the loss is
    ``-[log sigmoid(u_o . v_c) + sum_k log sigmoid(-u_k . v_c)]``,
    averaged over the batch.
    """

    def __init__(self):
        super().__init__()

    def forward(self, input_vectors, output_vectors, noise_vectors):
        """Compute the mean negative-sampling loss of a batch.

        Args:
            input_vectors: Center-word embeddings, shape (batch, embed).
            output_vectors: Context-word embeddings, shape (batch, embed).
            noise_vectors: Negative-sample embeddings,
                shape (batch, n_samples, embed).

        Returns:
            A scalar tensor with the average loss over the batch.
        """
        batch_size, embed_size = input_vectors.shape

        # Input vectors become a batch of column vectors.
        input_vectors = input_vectors.view(batch_size, embed_size, 1)

        # Output vectors become a batch of row vectors.
        output_vectors = output_vectors.view(batch_size, 1, embed_size)

        # Batched matrix multiply (row x column) yields one dot product per
        # pair; output goes first so the result is batch_size scalars.
        # logsigmoid is used instead of sigmoid().log() so large negative
        # scores do not underflow to log(0) = -inf.
        out_loss = nn.functional.logsigmoid(torch.bmm(output_vectors, input_vectors))
        # Explicit reshape (not a bare squeeze) keeps the batch dimension
        # even when batch_size == 1.
        out_loss = out_loss.view(batch_size)

        # Negative samples: the noise vectors are negated before the dot
        # product, giving log sigmoid(-u_k . v_c).
        noise_loss = nn.functional.logsigmoid(torch.bmm(noise_vectors.neg(), input_vectors))
        # Drop only the trailing singleton dim, then sum over the samples.
        noise_loss = noise_loss.squeeze(2).sum(1)

        # Negate the summed log-likelihoods and average over the batch.
        return -(out_loss + noise_loss).mean()


# Correctly spelled alias; the original (misspelled) class name is kept so
# existing references keep working.
NegativeSamplingLoss = NegtiveSamplingLoss
        
        

### 模型的训练 

In [None]:
# Instantiate the model.
# NOTE(review): `word_freqs`, `vocab2id` and `train_words` are not assigned in
# any visible cell; presumably they come from unpacking the result of
# `word2id(text)` -- confirm before running.
embedding_dim = 300
noise_dist = get_noise_dist(word_freqs)
model = SkipGramNeg(len(vocab2id), embedding_dim, noise_dist=noise_dist)

# Use the loss defined in this file (the class is declared under the
# spelling `NegtiveSamplingLoss`).
criterion = NegtiveSamplingLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

print_every = 1500
steps = 0
epochs = 5
batch_size = 500
n_samples = 5
window_size = 5

# Train for some number of epochs.
for e in range(epochs):

    # Get the (center, context) batches.
    for input_words, target_words in get_batch(train_words, batch_size, window_size):
        steps += 1
        inputs, targets = torch.LongTensor(input_words), torch.LongTensor(target_words)

        # Input, output and noise vectors.
        input_vectors = model.forward_input(inputs)
        output_vectors = model.forward_output(targets)
        # The number of pairs in a batch is usually larger than batch_size
        # (several context words per center word), so the noise tensor is
        # sized from the actual number of pairs.
        noise_vectors = model.forward_noise(inputs.shape[0], n_samples)

        # Negative sampling loss.
        loss = criterion(input_vectors, output_vectors, noise_vectors)
        # Report every `print_every` steps.
        if steps % print_every == 0:
            print(loss)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
