In [None]:
import hazm
import numpy as np
import pandas as pd
from hazm import sent_tokenize, word_tokenize
import nltk
from nltk.tokenize import word_tokenize as eng_tokenize
import pickle
import json
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from torch.utils.data import Dataset
import torch.nn.functional as F
from tensorflow.keras.utils import pad_sequences
from sklearn.preprocessing import MinMaxScaler
import keras

In [None]:
class dataset(Dataset):
    """Paired source/target token-id sequences padded to a fixed length.

    Each token in ``en_data`` / ``fa_data`` is looked up in ``en_index`` /
    ``fa_index`` and the resulting id sequences are padded (at the end) to
    ``maxlen`` with ``pad_value``.  Sequences longer than ``maxlen`` are cut by
    ``pad_sequences`` according to its default truncation side.

    Args:
        en_data: iterable of token sequences for the first language
            (presumably English, judging by the name).
        fa_data: iterable of token sequences for the second language
            (presumably Persian, judging by the name).
        en_index: mapping token -> integer id for ``en_data``.
        fa_index: mapping token -> integer id for ``fa_data``.
        maxlen: length every sequence is padded/truncated to (default 10).
        pad_value: id used for padding positions (default 1).

    Raises:
        KeyError: if a token is missing from its index mapping.
    """
    def __init__(self,en_data,fa_data,en_index,fa_index,maxlen=10,pad_value=1):
        # Zero-argument super(): the global name `dataset` is rebound later in
        # this notebook, so it must not be looked up at call time.
        super().__init__()
        # Build new id lists instead of overwriting the callers' sequences in
        # place; the in-place version made a second construction fail because
        # the inputs already contained ids rather than tokens.
        en_ids=[[en_index[token] for token in seq] for seq in en_data]
        fa_ids=[[fa_index[token] for token in seq] for seq in fa_data]
        self.en_data=torch.from_numpy(pad_sequences(en_ids,padding="post",maxlen=maxlen,value=pad_value))
        self.fa_data=torch.from_numpy(pad_sequences(fa_ids,padding="post",maxlen=maxlen,value=pad_value))
    def __len__(self):
        """Number of sequence pairs (taken from the first-language tensor)."""
        return len(self.en_data)
    def __getitem__(self,idx):
        """Return the ``(en_ids, fa_ids)`` pair of padded id tensors at ``idx``."""
        return self.en_data[idx],self.fa_data[idx]

In [21]:
# Load the preprocessed artifacts: pickled vocabularies and tokenised corpora,
# plus the JSON token -> id lookup tables.
# NOTE: pickle.load can execute arbitrary code; only unpickle files you produced yourself.
with open("./hugg_preprocessed/vocab_en", "rb") as fp:   # Unpickling
    vocab_en = pickle.load(fp)
with open("./hugg_preprocessed/vocab_fa", "rb") as fp:   # Unpickling
    # NOTE(review): bound to `vocab_fp`, not `vocab_fa` - looks like a typo;
    # the name is kept so existing references keep working.
    vocab_fp = pickle.load(fp)
with open("./hugg_preprocessed/dataset_en", "rb") as fp:   # Unpickling
    dataset_en = pickle.load(fp)
with open("./hugg_preprocessed/dataset_fa", "rb") as fp:   # Unpickling
    dataset_fa = pickle.load(fp)
# JSON is read as UTF-8 explicitly so non-ASCII tokens do not depend on the
# platform's default text encoding.
with open("./hugg_preprocessed/en_index.json", "r", encoding="utf-8") as fp:   # JSON decoding
    en_index=json.load(fp)
with open("./hugg_preprocessed/fa_index.json", "r", encoding="utf-8") as fp:   # JSON decoding
    fa_index=json.load(fp)

In [None]:
# Build the parallel-corpus dataset from the loaded token sequences and lookup tables.
ds = dataset(
    en_data=dataset_en,
    fa_data=dataset_fa,
    en_index=en_index,
    fa_index=fa_index,
)

In [28]:
class CELL(nn.Module):
    """A single LSTM step written out with explicit weight matrices.

    Every gate multiplies the concatenation ``[short_memory, x]`` (width
    ``hidden_size + embeding_size``) by its own weight matrix and adds its own
    bias.  All weights and biases are initialised with ``torch.rand``
    (uniform on [0, 1)).

    Args:
        hidden_size: width of the short-term (hidden) and long-term (cell) state.
        embeding_size: width of the per-step input vector.
    """
    def __init__(self,hidden_size,embeding_size):
        super(CELL,self).__init__()
        # self.input_net=nn.Embedding(num_embeddings=len(vocab),embedding_dim=embeding_dim)

        # Forget gate
        self.WF=nn.Parameter(torch.rand(hidden_size+embeding_size,hidden_size),requires_grad=True)
        self.BF=nn.Parameter(torch.rand(1,hidden_size),requires_grad=True)
        self.sigF=nn.Sigmoid()

        # Input gate (WI1/BI1) and candidate values (WI2/BI2)
        self.WI1=nn.Parameter(torch.rand(hidden_size+embeding_size,hidden_size),requires_grad=True)
        self.BI1=nn.Parameter(torch.rand(1,hidden_size),requires_grad=True)
        self.sigI=nn.Sigmoid()
        self.WI2=nn.Parameter(torch.rand(hidden_size+embeding_size,hidden_size),requires_grad=True)
        self.BI2=nn.Parameter(torch.rand(1,hidden_size),requires_grad=True)
        self.tanhI=nn.Tanh()

        # Output gate
        self.WO=nn.Parameter(torch.rand(hidden_size+embeding_size,hidden_size),requires_grad=True)
        self.BO=nn.Parameter(torch.rand(1,hidden_size),requires_grad=True)
        self.tanhO=nn.Tanh()
        self.sigO=nn.Sigmoid()

    def forward(self,x_batch,short_memory,long_memory):
        """Advance the cell by one step.

        Args:
            x_batch: input of shape (batch_size, embeding_size).
            short_memory: previous hidden state, (batch_size, hidden_size).
            long_memory: previous cell state, (batch_size, hidden_size).

        Returns:
            Tuple ``(short_memory, long_memory)`` with the updated hidden and
            cell state, each of shape (batch_size, hidden_size).
        """
        # emb_batch=self.input_net(x_batch)
        emb_batch=x_batch
        # Shared gate input: previous hidden state followed by the current input.
        new_batch=torch.cat((short_memory,emb_batch),dim=1) #(batch_size,hidden_size+embeding_size)

        #Forget gate
        aF=self.sigF(torch.matmul(new_batch,self.WF)+self.BF) #batch_size,hidden_size

        #Input gate scaled candidate values
        aI1=self.sigI(torch.matmul(new_batch,self.WI1)+self.BI1) #batch_size,hidden_size
        aI2=self.tanhI(torch.matmul(new_batch,self.WI2)+self.BI2) #batch_size,hidden_size
        aI=aI1*aI2

        # Cell state update: keep the gated part of the old state and add the
        # gated candidate.  The old state must only enter through the forget
        # gate; adding it a second time bypasses the gate entirely.
        long_memory=(long_memory*aF)+aI #batch_size,hidden_size

        #Output gate
        aO1=self.sigO(torch.matmul(new_batch,self.WO)+self.BO) #batch_size,hidden_size
        aO2=self.tanhO(long_memory)

        short_memory=aO1*aO2

        return short_memory,long_memory
class LSTM(nn.Module):
    """Stack of ``CELL`` layers joined by linear projections.

    ``self.Pipeline`` alternates ``CELL`` and ``nn.Linear`` modules:
    ``CELL, Linear, CELL, Linear, ..., CELL``.  Each ``Linear`` maps a hidden
    state (``hidden_size``) back to ``embeding_size`` so the next cell receives
    an input of the width it expects.  The projection after the last cell is
    dropped, so the stack's output has width ``hidden_size``.

    Args:
        layer_num: number of stacked cells.
        hidden_size: width of each cell's hidden and cell state.
        embeding_size: width of the input fed to every cell.
    """
    def __init__(self,layer_num,hidden_size,embeding_size):
        super(LSTM,self).__init__()
        layers=[]
        for _ in range(layer_num):
            layers.append(CELL(hidden_size,embeding_size))
            layers.append(nn.Linear(in_features=hidden_size,out_features=embeding_size))
        # Sub-modules belong in a ModuleList (ParameterList is meant for bare
        # parameters); the trailing projection is not needed after the last cell.
        self.Pipeline=nn.ModuleList(layers[:-1])
    def forward(self,x,memory_cache):
        """Run one time step through every layer of the stack.

        Args:
            x: input for the first cell, (batch_size, embeding_size).
            memory_cache: per-cell state; ``memory_cache[k][0]`` is the hidden
                state and ``memory_cache[k][1]`` the cell state of cell ``k``.

        Returns:
            Tuple ``(new_memory_cache, x)``: the list of updated
            ``[hidden, cell]`` pairs (one per cell) and the output of the last
            layer in the pipeline.
        """
        new_memory_cache=[]
        for i,layer in enumerate(self.Pipeline):
            if i%2==0:
                # Even positions hold cells.
                h,c=memory_cache[i//2][0],memory_cache[i//2][1]
                h_new,c_new=layer(x,h,c)
                new_memory_cache.append([h_new,c_new])
                # Feed the freshly computed hidden state forward, not the
                # stale one that was passed in.
                x=h_new
            else:
                # Odd positions hold the hidden -> embedding projections.
                x=layer(x)
        return new_memory_cache,x
class Model(nn.Module):
    """One ``CELL`` step followed by a linear head that emits one value per sample.

    Args:
        hidden_size: width of the cell's hidden and cell state.
        embeding_size: width of the per-step input vector.
    """
    def __init__(self,hidden_size,embeding_size):
        super(Model,self).__init__()
        self.cell=CELL(hidden_size,embeding_size)
        # Layers are modules, so they are held in a ModuleList (ParameterList
        # is meant for bare parameters).
        self.output_network=nn.ModuleList([
            nn.Linear(in_features=hidden_size,out_features=1),
        ])
    def forward(self,x_batch,short_memory,long_memory):
        """Advance the cell one step and compute the prediction.

        Args:
            x_batch: input of shape (batch_size, embeding_size).
            short_memory: previous hidden state, (batch_size, hidden_size).
            long_memory: previous cell state, (batch_size, hidden_size).

        Returns:
            Tuple ``(short_memory, long_memory, x)``: the updated hidden state,
            the updated cell state, and the head output of shape (batch_size, 1).
        """
        short_memory,long_memory=self.cell(x_batch,short_memory,long_memory)
        # NOTE(review): the head reads the cell state (long_memory), not the
        # hidden state (short_memory) - confirm this is intended.
        x=long_memory
        for layer in self.output_network:
            x=layer(x)
        return short_memory ,long_memory ,x

In [22]:
# Small hand-written sample sentences used to try out text vectorisation.
docs = [
    "my name is parsa",
    "hi , nice to meet you",
    "i am very sad.",
    "its nice to see you again",
]

['my name is parsa', 'hi , nice to meet you', 'i am very sad.']

In [None]:
# Fit a Keras TextVectorization layer on the sample sentences, keep its
# vocabulary, and convert the vectorised sentences to a torch tensor.
vectorizer = keras.layers.TextVectorization()
vectorizer.adapt(docs)
vocab = vectorizer.get_vocabulary()
_token_ids = vectorizer(docs).numpy()
# NOTE(review): this rebinds `dataset`, which is also the name of the Dataset
# classes defined in this notebook - re-running an earlier cell that calls
# dataset(...) after this one will fail.
dataset = torch.from_numpy(_token_ids)

In [None]:
# Read the full price table, then keep only the rows for the NOC symbol
# (re-indexed from zero).
price = pd.read_csv("./archive/prices.csv")
data = price.loc[price["symbol"].eq("NOC")].reset_index(drop=True)

In [None]:
# Hyper-parameters shared by the model and the training loop.
hidden_size = 256    # width of the hidden / cell state
embeding_dim = 1     # width of the per-step input
batch_size = 32      # samples per training batch
layer_number = 2     # number of stacked layers
# vocab_size=len(vocab)
epochs = 100         # passes over the training data

In [36]:
# Two independent single-layer stacks optimised together: each network's
# parameters form their own Adam parameter group.
lstm1 = LSTM(layer_num=1, hidden_size=2, embeding_size=3)
lstm2 = LSTM(layer_num=1, hidden_size=2, embeding_size=3)
_param_groups = [{'params': net.parameters()} for net in (lstm1, lstm2)]
oprtimizer = torch.optim.Adam(params=_param_groups, lr=0.0001)

In [None]:
# Number of rows in the filtered price table.
data.shape[0]

In [None]:

class dataset(Dataset):
    """Consecutive value pairs ``(x, y)`` from one price column of one symbol.

    The selected column is min-max scaled and split into non-overlapping pairs
    of consecutive rows: rows (0, 1), (2, 3), ...  The first element of each
    pair is the input and the second is the target.

    Args:
        data: DataFrame with a ``symbol`` column and the ``label`` column.
        label: name of the column to use (e.g. ``"open"``).
        symbol: value of the ``symbol`` column to keep.
        Train: currently unused; kept for interface compatibility.
    """
    def __init__(self,data,label,symbol,Train=True):
        # Use the frame that was passed in instead of the notebook-global
        # `price`, so the class no longer depends on hidden state.
        column=data[data["symbol"]==symbol].reset_index(drop=True)[label].values.reshape((-1,1))
        # NOTE(review): the scaler is fitted on everything passed in; if a
        # validation split is added later it should reuse the training scaler.
        scaled=MinMaxScaler().fit_transform(column)
        # Pairs need an even number of rows; drop a trailing unpaired row
        # rather than failing in reshape.
        usable=(len(scaled)//2)*2
        self.x=torch.tensor(scaled[:usable]).reshape((-1,2))
    def __len__(self):
        """Number of (input, target) pairs."""
        return len(self.x)
    def __getitem__(self,indx):
        """Return the pair at ``indx`` as two float32 tensors of shape (1,)."""
        pair=self.x[indx]
        return torch.unsqueeze(pair[0],dim=-1).float(),torch.unsqueeze(pair[1],dim=-1).float()
# Training pairs from the "open" column of NOC, batched in their original
# order (no shuffling).
train_ds = dataset(data=data, label="open", symbol="NOC")
# valid_ds=dataset(valid,"open","NOC")
train_dataloader = DataLoader(
    dataset=train_ds,
    batch_size=batch_size,
    shuffle=False,
)
# valid_dataloader=DataLoader(valid_ds,batch_size=len(valid_ds),shuffle=False)

In [None]:
# Sanity check: show the input and target shapes of the first batch only.
for x, y in train_dataloader:
    print(x.shape, y.shape, sep="\n")
    break

In [None]:
# Turn on autograd anomaly detection for the cells that follow.
# NOTE(review): this is a debugging aid and adds overhead to every backward
# pass - consider disabling it once training is stable.
torch.autograd.set_detect_anomaly(mode=True)

In [None]:
# Train the single-cell model to predict the second value of each pair from
# the first, minimising mean squared error with Adam.
model=Model(hidden_size=hidden_size,embeding_size=embeding_dim)
oprtimizer=torch.optim.Adam(params=model.parameters(),lr=0.0001)
loss_fn=torch.nn.MSELoss()
model.train()
for ep in range(epochs):
    loss_list=[]

    for x,y in train_dataloader:
        # Each sample is an independent one-step (input, target) pair, so the
        # recurrent state starts from zeros for every batch.  Sizing it from
        # the batch itself (instead of a hard-coded 32) keeps it in sync with
        # `batch_size` and lets the final, smaller batch be trained on too.
        # NOTE(review): the original also stored the detached output state at
        # the end of each step, but it was overwritten with zeros before use;
        # if carrying state across batches was intended, initialise it once
        # outside this loop instead.
        current_batch=x.shape[0]
        short_memory=torch.zeros(current_batch,hidden_size)
        long_memory=torch.zeros(current_batch,hidden_size)
        _,_,y_pred=model(x,short_memory,long_memory)
        # print(new_short)
        loss=loss_fn(input=y_pred.reshape(-1),target=y.reshape(-1))
        oprtimizer.zero_grad()
        loss.backward()
        oprtimizer.step()
        loss_list.append(loss.item())
    # Mean training loss over the batches of this epoch.
    print(np.mean(loss_list))
    

In [None]:
# Scratch check: shape of a parameter built from an uninitialised tensor.
nn.Parameter(torch.empty(hidden_size + 2, hidden_size)).shape

In [None]:
# Scratch check: embedding one column of ids yields one vector per row,
# i.e. a (rows, embedding_dim) result.
ls = torch.tensor([
    [1, 2, 10, 4, 5],
    [3, 4, 5, 4, 5],
    [10, 7, 9, 4, 5],
])
emb = nn.Embedding(num_embeddings=11, embedding_dim=4)
emb(ls[:, 1]).shape