In [1]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.nn.functional as F
from torch.optim import Optimizer
import os
import spacy
import unicodedata
import re
nlp=spacy.load('en_core_web_md')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [57]:


data=None
word2idx={}
idx2word={}
idx2vec={}
embed_filename='test.file'
word2vec_filename='dict.file'
data=[]
data_dir=os.path.abspath('.\\movie_lines.txt')
if os.name=='posix':
    data_dir.replace('\\','/')

'<10 padding >10 truncate'
LENGTH=10
TRAIN_LINE_NUM=100
ABBRS={'re':'are','m':'am','s':'is'}
SPECIAL=['BOS','EOS','OOV','PAD']

    
def normalizeString(s):
    s=s[:-3]
    s=s.lower()
    if '\n' in s:
        s.remove('\n')
    s=re.sub(r'([.!?])',r' \1',s)
    s=re.sub(r'[^a-zA-Z.!?]+',r' ',s)
    return s
    
    
def load_data():
    with open(data_dir,'rb') as f:
        count=0
        for line in f:
            sentence=str(line).split('+++$+++')[-1]
            sentence=normalizeString(sentence)
            sentence=sentence+' {0}'.format(SPECIAL[1])
            words=sentence.split()
            if len(words)<10:
                for i in range(10-len(words)):
                    words.append(SPECIAL[-1])
            elif len(words)>10:
                words=words[:10]
            if 'bianca' in words:
                print(words)
                print('STOP')
            data.append(words)
            #print(words)
            count+=1
            if count>=TRAIN_LINE_NUM:
                break


def embed_two_dim(idxtensor):
    'should use torch embeddings'
    vectensor=torch.Tensor(
        [[idx2vec[int(idx)] for idx in sentence] for sentence in idxtensor]
        ).to(device)
    return vectensor
    
    
'batch_first===False'
class EncoderDecoder(nn.Module):
    def __init__(self,encoder,decoder,input,output,hidden_dim,loss_fn,max_len,batch_first=True):
        'input:input_num*sourceL*batch_size=1'
        super(EncoderDecoder,self).__init__() 
        self.encoder=encoder
        self.decoder=decoder
        self.batch_size=1
        self.input_num=input.size(0)
        self.enc_len=input.size(1)
        self.dec_len=output.size(1)
        print(input.size())
        print(output.size())
        self.hidden_dim=hidden_dim
        'parameter is changing, the data should be saved in the list'
        #self.encode_hid_list=[torch.rand(1,self.batch_size,hidden_dim)*0.01]
        self.dec_out_list=[]
        self.input=input
        self.output=output
        'NLLloss'
        self.loss_fn=loss_fn
        self.loss_val=0
        self.max_len=max_len
        
        self.BOS=torch.Tensor(idx2vec[word2idx[SPECIAL[0]]]).unsqueeze(0).unsqueeze(0).to(device)


    def forward(self,input_id):
        encode_hid=(0.01*torch.rand(1,self.batch_size,self.hidden_dim)).to(device)
        enc_input_seq=embed_two_dim(self.input[input_id])
        for enc_i in range(self.enc_len):
            enc_input=enc_input_seq[enc_i].unsqueeze(0)
            enc_output,encode_hid=self.encoder(enc_input,encode_hid)
        'omit c'
        output=self.BOS
        decode_hid=encode_hid
        dec_input_seq=embed_two_dim(self.output[input_id])
        for dec_i in range(self.dec_len):
            'teacher forcing'
            dec_input=dec_input_seq[dec_i].unsqueeze(0)
            output,decode_hid=self.decoder(dec_input,decode_hid)
            target=self.output[input_id,dec_i].to(device)
            self.loss_val+=self.loss_fn(output,target)


    def loss(self):
        self.loss_val=0
        for i in range(self.input_num):
            self.forward(i)
        return self.loss_val
    
            
    def predict(self,sentence):
        with torch.no_grad():
            inputidx=[[word2idx[word] for word in sentence]]
            enc_input_seq=embed_two_dim(inputidx)[0].unsqueeze(1)
            enc_len=len(sentence)
            encode_hid=(0.01*torch.rand(1,self.batch_size,self.hidden_dim)).to(device)
            for enc_i in range(enc_len):
                enc_input=enc_input_seq[enc_i].unsqueeze(0)
                enc_output,encode_hid=self.encoder(enc_input,encode_hid)
            decode_hid=encode_hid
            decoded_words=[]
            dec_input=self.BOS
            for dec_i in range(self.max_len):
                output,decode_hid=self.decoder(dec_input,decode_hid)
                _,wordidx=output[0].data.topk(1)
                idx=int(wordidx[0])
                dec_input=embed_two_dim([[idx]])
                decoded_words.append(idx2word[idx])
                if idx==word2idx[SPECIAL[1]]:
                    break
            return decoded_words
        
        
    def check_cuda_params(self):
        print(torch.cuda.memory_allocated())
        print('cuda contains:')
        print('input/output:{0}/{1}'.format(self.input.is_cuda,self.output.is_cuda))
        if 'bianca' in word2idx:
            print('ABNORMAL:STOP')
        

class Encoder(nn.Module):
    def __init__(self,input_size,hidden_size):
        super(Encoder,self).__init__()
        self.dim=input_size
        self.hidden_dim=hidden_size
        self.rnn=nn.GRU(input_size,hidden_size)


    def forward(self,x,hidden):
        output,hidden=self.rnn(x,hidden)
        return output,hidden


class Decoder(nn.Module):
    def __init__(self,hidden_size,output_size,g=None):
        super(Decoder,self).__init__()
        self.hidden_dim=hidden_size
        self.rnn=nn.GRU(hidden_size,hidden_size)
        self.out=nn.Linear(hidden_size,output_size)
        self.softmax=nn.LogSoftmax(dim=1)


    def forward(self,y,hidden):
        output,hidden=self.rnn(y,hidden)
        output=self.softmax(self.out(output[0]))
        return output,hidden


def embed(sentences):
    one_dim_words=[]
    two_dim_words=SPECIAL
    for words in sentences:
        one_dim_words.append(words)
    embed_input=one_dim_words[0::2]
    embed_output=one_dim_words[1::2]
    
    for l in one_dim_words:
        two_dim_words.extend(l)
    two_dim_words=list(set(two_dim_words))
    idx2word=dict(enumerate(two_dim_words))
    word2idx={value:key for key,value in idx2word.items()}
    whole_sentence=' '.join(two_dim_words)

    doc=nlp(whole_sentence)
    for token in doc:
        if token.text in word2idx:
            idx2vec[word2idx[token.text]]=token.vector
        elif token.text in ABBRS:
            idx2vec[word2idx[ABBRS[token.text]]]=token.vector
        else:
            pass
    for idx in idx2word.keys():
        if idx not in idx2vec:
            idx2vec[idx]=idx2vec[word2idx['OOV']]

    embed_input=[[word2idx[word] for word in sentence] for sentence in embed_input]
    embed_output=[[word2idx[word] for word in sentence] for sentence in embed_output]
    
    with open(word2vec_filename,'wb') as f:
        pickle.dump((word2idx,idx2word,idx2vec),f)
    
    #print(word2idx)
    #print(idx2word)
    #print(idx2vec.keys())
    embed_input=torch.LongTensor(embed_input).unsqueeze(2)
    embed_output=torch.LongTensor(embed_output).unsqueeze(2)
    return embed_input,embed_output


def init_model(dataset):
    (input,target)=dataset
    input_size=300
    hidden_size=300
    encoder=Encoder(input_size,hidden_size)

    batch_size=input.size(1)
    decoder=Decoder(hidden_size,len(word2idx))

    loss_fn=nn.NLLLoss()
    max_len=10
    
    q=lambda x:x[-1]
    model=EncoderDecoder(encoder=encoder,
                        decoder=decoder,
                        input=input,
                        output=target,
                        hidden_dim=hidden_size,
                        loss_fn=loss_fn,
                        max_len=max_len,
                        )
    return model


def change():
    if os.path.exists(embed_filename):
        os.remove(embed_filename)
    if os.path.exists(word2vec_filename):
        os.remove(word2vec_filename)

        
load_data()
TEST=True
IGNORE_OUTPUT=False
model=None
if TEST:
    import pickle
    dataset=None
    change()
    if os.path.exists(embed_filename):
        with open(embed_filename,'rb') as f:
            dataset=pickle.load(f)
    else:
        dataset=embed(data)
        with open(embed_filename,'wb') as f:
            pickle.dump(dataset,f)
    with open(word2vec_filename,'rb') as f:
        (word2idx,idx2word,idx2vec)=pickle.load(f)
    model=init_model(dataset)
    model=model.cuda()
    #print(word2idx)
    model.check_cuda_params()

['bianca', 'i', 'don', 't', 'think', 'the', 'highlights', 'of', 'dating', 'joey']
STOP
torch.Size([50, 10, 1])
torch.Size([50, 10, 1])
116546560
cuda contains:
input/output:False/False
ABNORMAL:STOP


In [32]:
import numpy as np
for name,parameters in model.named_parameters():
    print(name,':',parameters.size())
para = sum([np.prod(list(p.size())) for p in model.parameters()])
type_size=4
print('Model {} : params: {:4f}M'.format(model._get_name(), para * type_size / 1000 / 1000))

encoder.rnn.weight_ih_l0 : torch.Size([900, 300])
encoder.rnn.weight_hh_l0 : torch.Size([900, 300])
encoder.rnn.bias_ih_l0 : torch.Size([900])
encoder.rnn.bias_hh_l0 : torch.Size([900])
decoder.rnn.weight_ih_l0 : torch.Size([900, 300])
decoder.rnn.weight_hh_l0 : torch.Size([900, 300])
decoder.rnn.bias_ih_l0 : torch.Size([900])
decoder.rnn.bias_hh_l0 : torch.Size([900])
decoder.out.weight : torch.Size([787, 300])
decoder.out.bias : torch.Size([787])
Model EncoderDecoder : params: 5.281948M


In [23]:
def train():
    print('start training')
    stop_loss=0.1
    encoder=model.encoder
    decoder=model.decoder
    enc_optimizer=torch.optim.SGD(encoder.parameters(),lr=0.01)
    dec_optimizer=torch.optim.SGD(decoder.parameters(),lr=0.01)
    max_train_step=400
    for step_i in range(max_train_step):
        enc_optimizer.zero_grad()
        dec_optimizer.zero_grad()
        loss=model.loss()
        loss.backward(retain_graph=True)
        enc_optimizer.step()
        dec_optimizer.step()
        if not IGNORE_OUTPUT:
            print('step:{0} loss:{1}'.format(step_i,loss))
            if loss<stop_loss:
                print('loss is considerable, stop')
                break
train()

start training
step:0 loss:1776575.0
step:1 loss:1705883.0
step:2 loss:1650433.75
step:3 loss:1607476.375
step:4 loss:1541445.0
step:5 loss:1654816.625
step:6 loss:1847900.0
step:7 loss:1845371.0
step:8 loss:1735834.625
step:9 loss:1645601.375
step:10 loss:1573419.0
step:11 loss:1566861.75
step:12 loss:1502872.875
step:13 loss:1432879.25
step:14 loss:1433208.375
step:15 loss:1408189.875
step:16 loss:1486647.5
step:17 loss:1550496.5
step:18 loss:1761083.125
step:19 loss:1824789.375
step:20 loss:1742971.75
step:21 loss:1760153.625
step:22 loss:1667416.25
step:23 loss:1550041.625
step:24 loss:1446041.375
step:25 loss:1339805.25
step:26 loss:1399628.875
step:27 loss:1567038.875
step:28 loss:1475331.5
step:29 loss:1429477.625
step:30 loss:1482535.25
step:31 loss:1415085.375
step:32 loss:1345158.5
step:33 loss:1265688.25
step:34 loss:1189510.875
step:35 loss:1146524.625
step:36 loss:1191531.25
step:37 loss:1328532.125
step:38 loss:1243207.125
step:39 loss:1195996.875
step:40 loss:1188357.5
s

step:313 loss:91379.328125
step:314 loss:109221.359375
step:315 loss:100471.5390625
step:316 loss:120577.125
step:317 loss:108541.9296875
step:318 loss:114885.3515625
step:319 loss:122065.1796875
step:320 loss:134950.796875
step:321 loss:122243.578125
step:322 loss:128284.078125
step:323 loss:123235.6796875
step:324 loss:121662.265625
step:325 loss:103515.9765625
step:326 loss:108896.5703125
step:327 loss:99835.0390625
step:328 loss:108170.046875
step:329 loss:92291.078125
step:330 loss:107209.1953125
step:331 loss:106066.5859375
step:332 loss:112334.7109375
step:333 loss:93903.734375
step:334 loss:98267.8671875
step:335 loss:91081.0859375
step:336 loss:99094.5390625
step:337 loss:91089.484375
step:338 loss:102351.03125
step:339 loss:97360.3828125
step:340 loss:111865.65625
step:341 loss:97851.3984375
step:342 loss:102074.203125
step:343 loss:95213.2421875
step:344 loss:104049.8828125
step:345 loss:83630.4453125
step:346 loss:96542.5625
step:347 loss:110436.78125
step:348 loss:122487.7

In [38]:
for i in range(10):
    print(data[i])

['they', 'do', 'not', '!', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
['they', 'do', 'to', '!', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
['i', 'hope', 'so', '.', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
['she', 'okay', '?', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
['let', 's', 'go', '.', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
['wow', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
['okay', 'you', 're', 'gonna', 'need', 'to', 'learn', 'how', 'to', 'lie']
['no', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
['i', 'm', 'kidding', '.', 'you', 'know', 'how', 'sometimes', 'you', 'just']
['like', 'my', 'fear', 'of', 'wearing', 'pastels', '?', 'EOS', 'PAD', 'PAD']


In [49]:
sentence='they'
sentence=sentence.split()
print(sentence)
print(model.predict(sentence))

['they']
['bianca', 'bianca', 'bianca', 'bianca', 'bianca']
