In [62]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
import torch
from torch.utils.data import DataLoader,Dataset
import matplotlib.pyplot as plt
from rowan_ds_tools.EDA import nlp
import math

In [55]:
class TextDataset(Dataset):
    """Map-style dataset that pairs each row of ``X`` with the matching entry of ``Y``."""

    def __init__(self, X, Y):
        """Keep references to the inputs and targets and record the sample count.

        Args:
            X: Indexable container of inputs exposing ``.shape``; its first
                dimension is taken as the number of samples.
            Y: Indexable container of targets, looked up with the same index as ``X``.
        """
        self.X, self.Y = X, Y
        self.n_samples = X.shape[0]

    def __len__(self):
        """Return the number of samples recorded at construction time."""
        return self.n_samples

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair stored at ``index``."""
        features = self.X[index]
        target = self.Y[index]
        return features, target


# Train data prep

In [123]:
# Load the raw reviews. Forward slashes keep this relative path valid on every OS;
# the previous literal relied on the invalid escape sequences "\D" and "\g" and
# only resolved on Windows.
df = pd.read_csv('../Data/goodreads_train.csv')

# Word-level tokenizer capped at a 10,000-entry vocabulary; out-of-vocabulary
# words are mapped to the '<UNK>' token.
tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>', char_level=False)
tokenizer.fit_on_texts(df['review_text'])
# Size of the embedding table the model must allocate (one row per token id).
model_input_dim = tokenizer.num_words

# Tokenize and pad sequences: every review becomes exactly 1500 token ids,
# with the padding appended after the text ('post').
sequences = tokenizer.texts_to_sequences(df['review_text'])
sequences = pad_sequences(sequences, maxlen=1500, padding='post')

X = torch.tensor(sequences)
# Convert explicitly to a NumPy array so the result does not depend on how
# torch interprets a pandas Series.
Y = torch.tensor(df['rating'].to_numpy())

# Validation Data

In [124]:
# Sequential hold-out split: the leading 90% of the rows are used for training
# and the trailing 10% are kept aside for validation.
train_len = int(0.9 * X.shape[0])

X_train = X[:train_len, :]
X_val = X[train_len:, :]
Y_train = Y[:train_len]
Y_val = Y[train_len:]

In [121]:
# Display the shape of the training labels.
# NOTE(review): the output recorded with this cell is two-dimensional, which does
# not match labels sliced from df['rating'] — it looks stale; re-run to confirm.
Y_train.shape

torch.Size([9, 1500])

In [125]:
# Wrap the training tensors and serve them as shuffled mini-batches of 10,
# loaded in the main process (no worker subprocesses).
dataset_train = TextDataset(X=X_train, Y=Y_train)
dataloader_train = DataLoader(
    dataset_train,
    batch_size=10,
    shuffle=True,
    num_workers=0,
)

In [138]:
# Number of mini-batches the training DataLoader yields per epoch.
len(dataloader_train)

81000

In [126]:
# Wrap the held-out tensors with the same batching settings as the training loader.
dataset_val = TextDataset(X=X_val, Y=Y_val)
dataloader_val = DataLoader(
    dataset_val,
    batch_size=10,
    shuffle=True,
    num_workers=0,
)

In [127]:


class RatingModel(torch.nn.Module):
    """Bidirectional GRU classifier that maps token-id sequences to 6 rating logits.

    Pipeline: embedding (150-d) -> 2-layer bidirectional GRU (hidden size 100)
    -> linear layer over the concatenated final forward/backward states.
    """

    def __init__(self, model_input_dim):
        """Build the layers.

        Args:
            model_input_dim: Number of rows in the embedding table, i.e. one more
                than the largest token id the model will be fed.
        """
        super().__init__()
        # (N, L) token ids -> (N, L, 150)
        self.layer1 = torch.nn.Embedding(model_input_dim, 150)
        # batch_first=True is required because the embedding output is laid out as
        # (N, L, 150). With the default (sequence-first) layout the GRU would treat
        # the batch dimension as time and run the recurrence across samples.
        self.layer2 = torch.nn.GRU(150, 100, num_layers=2, bidirectional=True, batch_first=True)
        # Forward and backward final states are concatenated -> 2 * 100 features.
        self.layer3 = torch.nn.Linear(2*100, 6)

    def forward(self, x):
        """Compute the rating logits.

        Args:
            x: Integer tensor of token ids with shape (N, L).

        Returns:
            Tensor of shape (N, 6) holding unnormalised class scores.
        """
        out = self.layer1(x)
        _, hidden = self.layer2(out)
        # hidden is (num_layers * 2, N, 100), ordered layer by layer with the forward
        # direction before the backward one, so the last two entries are the top
        # layer's final forward and backward states. Slicing the output sequence at
        # the last time step instead would give a backward state that has only seen
        # a single token.
        summary = torch.cat((hidden[-2], hidden[-1]), dim=1)
        return self.layer3(summary)


In [128]:
# Instantiate the classifier with an embedding table sized to the tokenizer vocabulary.
model = RatingModel(model_input_dim=model_input_dim)

In [129]:
# Momentum SGD over all model parameters, paired with a multi-class cross-entropy objective.
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-3, momentum=0.9)
criterion = torch.nn.CrossEntropyLoss()

In [139]:
def train_one_epoch(model, dataloader, criterion, optimiser, device='cpu', backpropogate=True):
    """Run one full pass over ``dataloader`` and return the mean loss per sample.

    Args:
        model: Module called on each input batch. It is expected to already live
            on ``device``; this function does not move it.
        dataloader: Iterable yielding ``(inputs, targets)`` tensor batches.
        criterion: Loss callable taking ``(outputs, targets)``. If it exposes a
            ``reduction`` attribute equal to ``'mean'`` (or has none), each batch
            loss is re-weighted by the batch size before averaging.
        optimiser: Optimiser stepped after every batch when ``backpropogate`` is
            true; not used otherwise.
        device: Device the batches are moved to before the forward pass.
        backpropogate: When true, gradients are computed and the weights are
            updated (training). When false, the pass is evaluation only: no
            gradients are tracked and the model is put in eval mode.

    Returns:
        float: Sum of the per-sample losses divided by the number of samples
        seen, or ``0.0`` if the dataloader yielded no batches.
    """
    # A batch-averaged loss has to be multiplied back by the batch size so that a
    # smaller final batch does not get the same weight as a full one.
    batch_averaged = getattr(criterion, 'reduction', 'mean') == 'mean'

    was_training = model.training
    model.train(backpropogate)

    total_loss = 0.0
    n_seen = 0
    try:
        with torch.set_grad_enabled(backpropogate):
            for X, Y in dataloader:
                X, Y = X.to(device), Y.to(device)

                outputs = model(X)
                loss = criterion(outputs, Y)

                if backpropogate:
                    optimiser.zero_grad()
                    loss.backward()
                    optimiser.step()

                batch_n = Y.size(0)
                batch_loss = loss.item()
                total_loss += (batch_loss * batch_n) if batch_averaged else batch_loss
                n_seen += batch_n
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    return total_loss / n_seen if n_seen else 0.0

In [130]:
# Training loop: each epoch runs one optimisation pass over the training loader,
# followed by a gradient-free pass over the validation loader.
num_epochs = 2
total_samples = len(dataset_train)
# Derive the number of iterations from the loader's own batch size rather than a
# hard-coded constant, so the two cannot drift apart.
n_iterations = math.ceil(total_samples/dataloader_train.batch_size)
print(total_samples, n_iterations)

training_loss = []  # mean training loss, one entry per epoch
val_loss = []       # mean validation loss, one entry per epoch
for epoch in range(num_epochs):

    train_epoch_loss = train_one_epoch(model, dataloader_train, criterion, optimizer)
    training_loss.append(train_epoch_loss)

    # Validation must read the held-out loader, never the training data.
    with torch.no_grad():
        val_epoch_loss = train_one_epoch(model, dataloader_val, criterion, optimizer, backpropogate=False)
    val_loss.append(val_epoch_loss)


print('Finsihed Training')

810000 81000
[1,   200] loss: 1.512
[1,   400] loss: 1.497


KeyboardInterrupt: 

In [None]:
# Persist only the model's state dict (not the full module object) to 'model.pth'
# in the current working directory.
torch.save(model.state_dict(), 'model.pth')

In [71]:
# NOTE(review): in the visible cells `loss` is only assigned inside train_one_epoch,
# so this expression presumably relies on a variable left over from an earlier
# kernel session and would fail after Restart & Run All — confirm or remove.
loss

tensor(1.8341, grad_fn=<NllLossBackward0>)

In [63]:
def count_parameters(model):
    """Return the total number of elements across the parameters of ``model`` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count of `model`, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,833,606 trainable parameters
