In [2]:
import torch as t
import numpy as np
import preprocssing
from torch.utils.data import DataLoader, Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [36]:
class TweetsDataset(Dataset):
    def __init__(self, csvPath):
        '''initialize the data and labels'''
        super().__init__()
        data, labels = preprocssing.preprocessDF(csvPath)
        self.data = data
        self.labels = t.tensor(labels)[:, 1]
        self.data_size = len(data)
        self.vocabs = self.calculate_vocabs()
        self.vocab_size = len(self.vocabs)
        self.data_padding()
        self.replace_with_vocab_id()
        self.data = t.tensor(self.data)
    
    def __getitem__(self, index):
        ''' get the row item'''
        return self.data[index], self.labels[index]

    def data_padding(self):
        max_length = self.get_max_data_length()
        for data_row in self.data:
            if len(data_row) < max_length:
                padding = max_length - len(data_row)
                padding_list = [' '] * padding
                data_row += padding_list

    def get_max_data_length(self):
        max_length = -1e9
        for data_row in self.data:
            if len(data_row) > max_length:
                max_length = len(data_row)
        return max_length

    def calculate_vocabs(self):
        '''Calculate the vocabs book from the data'''
        vocabs_set = set()
        vocab_book = {}
        for tweet in self.data:
            vocabs_set.update(tweet)
        # here I started with token 1 as to leave zero token to the unknown word
        for i, word in enumerate(vocabs_set):
            vocab_book[word] = i + 1
        vocab_book['UNK'] = 0
        return vocab_book

    def replace_with_vocab_id(self):
        '''Replace the string word with its id in vocabs book'''
        for i, sentence in enumerate(self.data):
            self.data[i] = [self.vocabs[word] if word in self.vocabs else self.vocabs['UNK'] for word in sentence]


    def __len__(self):
        '''get the dataset size'''
        return self.data_size

In [112]:
class RNN(t.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_size = embedding_dim
        self.embedding = t.nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        self.rnn = t.nn.RNN(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = t.nn.Linear(hidden_size, num_classes)

    def forward(self, input):
        input = self.embedding(t.tensor(input))
        out, _ = self.rnn(input)
        print(out.shape)
        out = out[:, -1, :]
        out = self.fc(out)
        return out


In [113]:
class TweetsClassifier():
    def __init__(self):
        pass
    def train(self):
        pass
    def predict(self):
        pass

In [114]:
class RnnTweetsClassifier(TweetsClassifier):
    def __init__(self, dataset, embedding_dim, hidden_size, num_layers, num_classes, epoch_size, learning_rate):
        super().__init__()
        self.dataset = dataset
        self.embedding_dim = embedding_dim
        self.epoch_size = epoch_size
        self.learning_rate = learning_rate
        self.model = RNN(dataset.vocab_size, embedding_dim, hidden_size, num_layers, num_classes)
        self.data_loader = DataLoader(dataset=dataset, batch_size=256, shuffle=True)
        self.criterion = t.nn.CrossEntropyLoss()
        self.optimizer = t.optim.Adam(self.model.parameters(), self.learning_rate)
         
    def train(self):
        for epoch in range(self.epoch_size):
            total_model_acc = 0
            for data in self.data_loader:
                train_data = data[:][0]
                labels = data[:][1]
                # reshape the row data
                output = self.model(train_data)
                # calculate the accuracy
                print(train_data)
                total_model_acc += (t.argmax(output, dim=1) == labels).sum()

                loss = self.criterion(output, labels)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
            
            total_model_acc = total_model_acc / len(self.dataset)
            print(f"epoch num: {epoch} has accuracy: {total_model_acc}")

    def predict(self, tweet):
        pass

In [115]:
# load the dataset and dataloader
dataset = TweetsDataset('./dataset/train.csv')
tweet_classifier = RnnTweetsClassifier(dataset, 50, 50, 2, 2, 5, 0.001)

In [116]:
tweet_classifier.train()

torch.Size([256, 109, 50])
tensor([[ 7229, 22507,  7040,  ...,     0,     0,     0],
        [24036, 21792, 20482,  ...,     0,     0,     0],
        [ 8823,  9739, 15673,  ...,     0,     0,     0],
        ...,
        [ 7121, 21995, 13240,  ...,     0,     0,     0],
        [16416, 26242,  7559,  ...,     0,     0,     0],
        [23132,  6867, 10682,  ...,     0,     0,     0]])


  input = self.embedding(t.tensor(input))


IndexError: Target -1 is out of bounds.