In [68]:
from torchtext import datasets
import pandas as pd
import numpy as np
import spacy
from torchtext.vocab import FastText
from torch.utils.data import DataLoader,Dataset
import torch
from torch import nn
import  torch.nn.functional as F
from torch.optim import Adam
from sklearn.model_selection import train_test_split

In [69]:
# Fetch the AmazonReviewFull dataset with `data/` as root; the CSV paths used
# further down in this notebook point inside that directory.
# (presumably torchtext downloads/extracts the archive on first use - confirm
# against the installed torchtext version)
# NOTE(review): the names `train` and `test` are re-bound to functions by later
# cells, so these two dataset objects become unreachable once those cells run.
train,test = datasets.AmazonReviewFull(root='data/')

### **Loading the data**

In [70]:
def load(path):
    """
    Read a header-less review CSV and return texts and zero-based labels.

    Column 0 holds the star rating; columns 1 and 2 are joined with a single
    space to form the review text.

    takes - path str
    returns -> features-ndarray (review strings) and labels-ndarray (stars minus 1)
    """
    df = pd.read_csv(path,header=None)
    # shift the ratings down by one so the class labels start at 0
    stars = df[0] - 1
    # pandas parses empty CSV fields as NaN, and NaN + str stays NaN; replace
    # the missing piece with '' so every review is a real string
    first_part = df.iloc[:,1].fillna('').astype(str)
    second_part = df.iloc[:,2].fillna('').astype(str)
    reviews = first_part + ' ' + second_part

    return reviews.values,stars.values

### **splitting - train and validation set**

In [71]:
def split(path,test_size=0.2):
    '''
    Load the CSV at `path` and carve a validation subset off it.

    takes path, test ratio
    returns x_tr, x_val, y_train, y_val
    '''
    features,targets = load(path)
    print('splitted')

    # the fixed random_state keeps the partition identical between runs
    parts = train_test_split(features,targets,test_size=test_size,random_state=0)
    return parts

### **Preprocessing**

**tokens**

In [72]:
# spaCy pipelines that have already been loaded, keyed by model name
_SPACY_PIPELINES = {}

def _spacy_pipeline(name='en_core_web_sm'):
    """Return the spaCy pipeline `name`, loading it only on the first request."""
    if name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[name] = spacy.load(name)
    return _SPACY_PIPELINES[name]

def prep(text):
    """
    Tokenise one review into lemmas.

    takes - text str
    returns -> list of lemma strings, with punctuation and stop-word tokens removed
    """
    # the pipeline is loaded once and reused; loading it on every call repeats
    # the model load for every single review
    doc = _spacy_pipeline()(text)
    return [token.lemma_ for token in doc if not (token.is_punct or token.is_stop)]

**encoding**

In [73]:
def token_encoder(token,vec):
    """
    Map one token to its integer index.

    takes - token (expected str), vec (object exposing a `stoi` mapping)
    returns -> 1 for '<pad>', `vec.stoi[token]` when the token is known, 0 otherwise

    A token that is not a str is reported on stdout and encoded as 0, so the
    result is always an int (never None).
    """
    if token == '<pad>':
        return 1
    try:
        return vec.stoi[token]
    except (KeyError, TypeError):
        # KeyError: token not in the vocabulary; TypeError: unhashable token (e.g. a list)
        if not isinstance(token, str):
            print(f'expected str, but got {type(token)} instead.')
        return 0

def text_encoder(tokens,vec):
    '''
    Encode a whole review, one token at a time, via token_encoder.

    input - list of lemmatized tokens
    returns - list of encoded tokens
    '''
    encoded = []
    for tok in tokens:
        encoded.append(token_encoder(tok,vec))
    return encoded

**padding**

In [74]:
def padding(list_indexed,max_length=32,pad=1):
    """
    Force an index list to exactly `max_length` entries.

    Short lists are extended on the right with `pad`; long lists are cut off.
    A new list is returned, the argument is left untouched.
    """
    # a negative count simply yields an empty filler, so long reviews gain nothing here
    n_missing = max_length - len(list_indexed)
    filler = [pad] * n_missing
    padded = list_indexed + filler
    # truncate in case the review was longer than max_length
    return padded[:max_length]

### **Dataset and dataloader**

**dataset**

In [75]:
class CustomDataset(Dataset):
    """
    Map-style dataset of (padded token-index list, label) pairs.

    Every review in `x` is tokenised with prep(), encoded with text_encoder()
    and padded/truncated to `max_length` once, in the constructor.

    NOTE(review): collate() looks indices up in its own `vectorizer` tensor, not
    in `self.vec` - confirm the row overrides below are meant to reach the batches.
    """
    def __init__(self,x,y,max_length=32):
        self.max_length = max_length
        self.vec = FastText('simple')
        embed_size = self.vec.vectors.shape[1]
        # token_encoder uses index 0 for unknown tokens and index 1 for '<pad>';
        # give each its own constant vector (previously both writes hit row 0,
        # so the zeros were overwritten and row 1 was never set)
        self.vec.vectors[0] = torch.zeros(embed_size)
        self.vec.vectors[1] = -torch.ones(embed_size)
        self.labels = y
        self.inputs = [padding(text_encoder(prep(review),self.vec),self.max_length) for review in x]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, i):
        # every stored sequence must already have the fixed length
        assert len(self.inputs[i]) == self.max_length
        return self.inputs[i],self.labels[i]

**collate**

In [76]:
# default embedding table, filled in the first time collate() needs it
_COLLATE_VECTORS = {}

def collate(batch,vectorizer=None):
    """
    Turn a list of (token-index list, label) items into batch tensors.

    takes - batch: list of items whose index lists all have the same length
            vectorizer: 2-D tensor of embeddings indexed by token id; when omitted,
                        FastText('simple').vectors is loaded once on first use
    returns -> inputs (one embedding row per token, stacked per review and per batch)
               and labels as a LongTensor
    """
    if vectorizer is None:
        # loading here instead of in the signature avoids loading the embeddings
        # as a side effect of merely defining the function
        if 'simple' not in _COLLATE_VECTORS:
            _COLLATE_VECTORS['simple'] = FastText('simple').vectors
        vectorizer = _COLLATE_VECTORS['simple']
    # a single batched index replaces the per-token / per-review torch.stack loops
    token_ids = torch.tensor([item[0] for item in batch],dtype=torch.long)
    inputs = vectorizer[token_ids]
    #converting labels into Long type tensors, as criterion functions expects that dtype
    labels = torch.LongTensor([item[1] for item in batch])
    return inputs, labels

**dataloader**

In [77]:
def loader(x,y,batch_size=64,shuffle=False):
    """
    Wrap reviews and labels in a CustomDataset and return a DataLoader over it.

    takes - x (iterable of review strings), y (labels),
            batch_size (items per batch, default 64),
            shuffle (reshuffle every epoch; default False keeps the original order)
    returns -> DataLoader that batches with collate()
    """
    sett = CustomDataset(x,y)
    return DataLoader(sett,batch_size=batch_size,shuffle=shuffle,collate_fn=collate)

**saving dataloaders**

In [78]:
def save_loader(loader,filename):
  """
  Serialise `loader` with torch.save into '<filename>.pth'.

  returns -> the file name that was written (a str, not the loader)
  """
  target = f'{filename}.pth'
  torch.save(loader,target)
  return target

**Loading saved dataloader**

In [79]:
def load_loader(filname):
  """
  Read back an object that was written with torch.save.

  takes - filname: path of the saved file
  returns -> whatever object the file contains
  """
  # NOTE: torch.load can unpickle arbitrary objects - only open files you trust
  restored = torch.load(filname)
  return restored

### **Model**

In [80]:
embed_dim =300
class Model(nn.Module):
    """
    Three-layer fully connected classifier over flattened reviews.

    The first layer expects `max_length*embed_dim` input features; the output
    is a log-probability per class (log_softmax over dim 1).
    """
    def __init__(self,hidden1,hidden2,out_dim,max_length=32):
        super().__init__()
        flat_features = max_length*embed_dim
        self.fc = nn.Linear(flat_features,hidden1)
        self.fc2 = nn.Linear(hidden1,hidden2)
        self.fc3 = nn.Linear(hidden2,out_dim)

    def forward(self,x):
        # two ReLU hidden layers, then class scores turned into log-probabilities
        hidden = F.relu(self.fc(x))
        hidden = F.relu(self.fc2(hidden))
        scores = self.fc3(hidden)
        return F.log_softmax(scores,dim=1)


### **training**

In [81]:
def train(model,epochs,dataloader_tr,dataloader_val,lr=0.003):
    model = Model(100,50,5)
    criterion = nn.NLLLoss()
    optimizer = Adam(model.parameters(),lr=lr)
    epochs = 2
    pr = 2
    for epoch in range(epochs):
        losses_val = []
        losses_tr = []
        accs_val = []
        accs_tr = []
        acc_tr = 0
        acc_val = 0
        loss_tr = 0
        loss_val = 0

        print(f'running {epoch+1} out of {epochs} epochs ')
        model.train()
        for i,(x_tr,y_tr) in enumerate(iter(dataloader_tr)):
            optimizer.zero_grad()
            pred_tr = model.forward(x_tr.view(x_tr.shape[0],-1))
            pred_tr_ps = torch.exp(pred_tr.detach())
            loss_t = criterion(pred_tr,y_tr)
            loss_t.backward()
            optimizer.step()

            #metrics
            loss_tr += loss_t.item()
            acc_tr += y_tr.eq(pred_tr_ps.argmax()).float().mean().item()
            if (i+1) % pr == 0 :
                print(f'\trunning train iteration {i+1}')
                print(f'\t\ttrain loss : {loss_tr/pr:.3f}\t accuracy : {acc_tr/pr:.3f}')
                losses_tr.append(loss_tr/pr)
                accs_tr.append(acc_tr/pr)
                loss_tr = 0
                acc_tr = 0
        print(f'train loss : {np.mean(losses_tr)}\t accuracy : {np.mean(accs_tr)}')


        model.eval()
        with torch.no_grad():
            for j,(x_val,y_val) in enumerate(iter(dataloader_val)):
                pred_val = model.forward(x_val.view(x_val.shape[0],-1))
                pred_val_ts = torch.exp(pred_val.detach())
                loss_v = criterion(pred_val,y_val)
                #metrics
                loss_val += loss_v.item()
                acc_val += y_val.eq(torch.exp(pred_val_ts).argmax()).float().mean().item()
                if (j+1) % pr == 0 :
                    print(f'\trunning validation iteration {j+1}')
                    print(f'\t\tvalidation loss : {loss_val/pr:.3f}\t accuracy : {acc_val/pr:.3f}')
                    losses_val.append(loss_val/pr)
                    accs_val.append(acc_val/pr)
                    loss_val = 0
                    acc_val = 0
            print(f'** train loss : {np.mean(losses_val):.3f}\t accuracy : {np.mean(accs_val):3f} **')

    
    

## Testing

In [82]:
def test(model,dataloader,pr):
    model.eval()
    with torch.no_grad():
            loss_test = 0
            acc_test=0
            accs_test = []
            losses_test = []
            for j,(x_test,y_test) in enumerate(iter(dataloader)):
                pred_test = model.forward(x_test.view(x_test.shape[0],-1))
                pred_test_ts = torch.exp(pred_test.detach())
                loss_t = criterion(pred_test,y_test)
                #metrics
                loss_test += loss_t.item()
                acc_test += y_val.eq(torch.exp(pred_test_ts).argmax()).float().mean().item()
                if (j+1) % pr == 0 :
                    print(f'\trunning test iteration {j+1}')
                    print(f'\t\ttest loss : {loss_test/pr:.3f}\t accuracy : {acc_test/pr:.3f}')
                    losses_test.append(loss_test/pr)
                    accs_test.append(acc_test/pr)
                    loss_test = 0
                    acc_test = 0
            print(f'** test loss : {np.mean(losses_test):.3f}\t accuracy : {np.mean(accs_test):3f} **')

# **Running**

In [83]:
# Locations of the train/test CSV files under the dataset root used above.
test_path = './data/AmazonReviewFull/amazon_review_full_csv/test.csv'
train_path = './data/AmazonReviewFull/amazon_review_full_csv/train.csv'


# split() reads the training CSV and holds out 20% (its default test_size) for
# validation; the test CSV is loaded whole.
# NOTE: both calls read the complete CSV into memory.
x_tr,x_val,y_tr,y_val = split(train_path)
x_test,y_test = load(test_path)

Generate the training DataLoader

In [None]:
# Build the training DataLoader. CustomDataset tokenises, encodes and pads every
# review inside its constructor, so all preprocessing happens in this cell.
train_loader = loader(x_tr,y_tr)

Save the training DataLoader to disk

In [None]:
# save_loader returns the written file name, so `tr_loader` holds the path
# string 'trainloader.pth' (usable with load_loader), not a DataLoader object.
tr_loader = save_loader(train_loader,'trainloader')