In [1]:
import torch as t
import numpy as np
import preprocssing
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import precision_score, f1_score, recall_score, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class TweetsDataset(Dataset):
    '''Generic dataset'''
    def __init__(self, csvPath):
        '''initialize the data and labels'''
        super().__init__()
        self.data, self.labels = preprocssing.preprocessDF(csvPath)
        self.vocabs = self.calculate_vocabs()
        self.vocab_size = len(self.vocabs)
        self.data_padding()
        self.replace_with_vocab_id()
        self.data = t.tensor(self.data)

    def __getitem__(self, index):
        ''' get the row item'''
        return self.data[index], self.labels[index]

    def data_padding(self):
        max_length = self.get_max_data_length()
        for data_row in self.data:
            if len(data_row) < max_length:
                padding = max_length - len(data_row)
                padding_list = [' '] * padding
                data_row += padding_list

    def get_max_data_length(self):
        max_length = -1e9
        for data_row in self.data:
            if len(data_row) > max_length:
                max_length = len(data_row)
        return max_length

    def calculate_vocabs(self):
        '''Calculate the vocabs book from the data'''
        vocabs_set = set()
        vocab_book = {}
        for tweet in self.data:
            vocabs_set.update(tweet)
        # here I started with token 1 as to leave zero token to the unknown word
        for i, word in enumerate(vocabs_set):
            vocab_book[word] = i + 1
        vocab_book['UNK'] = 0
        return vocab_book

    def replace_with_vocab_id(self):
        '''Replace the string word with its id in vocabs book'''
        for i, sentence in enumerate(self.data):
            self.data[i] = [self.vocabs[word] if word in self.vocabs else self.vocabs['UNK'] for word in sentence]


    def __len__(self):
        '''get the dataset size'''
        return len(self.data)
        

In [3]:
class CategoriesDataset(TweetsDataset):
    '''Tweets dataset with their categories'''
    def __init__(self, csvPath):
        '''initialize the data and labels'''
        super().__init__(csvPath)
        self.labels = t.tensor(self.labels)[:, 0]
        

In [4]:
class StancesDataset(TweetsDataset):
    '''Tweets dataset with their stances'''
    def __init__(self, csvPath):
        '''initialize the data and labels'''
        super().__init__(csvPath)
        self.labels = t.tensor(list(map(lambda x: 2 if x[1] == -1 else x[1], self.labels)))

In [5]:
class RNN(t.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_size = embedding_dim
        self.embedding = t.nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        self.rnn = t.nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = t.nn.Linear(hidden_size, num_classes)

    def forward(self, input):
        input = self.embedding(t.tensor(input))
        out, _ = self.rnn(input)
        out = self.fc(out)
        return out


In [6]:
class TweetsClassifier():
    def __init__(self):
        pass
    def train(self):
        pass
    def predict(self):
        pass

In [7]:
class RnnTweetsClassifier(TweetsClassifier):
    def __init__(self, dataset, embedding_dim, hidden_size, num_layers, num_classes, epoch_size, learning_rate):
        super().__init__()
        self.dataset = dataset
        self.embedding_dim = embedding_dim
        self.epoch_size = epoch_size
        self.learning_rate = learning_rate
        self.model = RNN(dataset.vocab_size, embedding_dim, hidden_size, num_layers, num_classes)
        self.data_loader = DataLoader(dataset=dataset, batch_size=256, shuffle=True)
        self.criterion = t.nn.CrossEntropyLoss()
        self.optimizer = t.optim.Adam(self.model.parameters(), self.learning_rate)
         
    def train(self):
        for epoch in range(self.epoch_size):
            total_model_acc = 0
            total_num_samples = 0
            for data in self.data_loader:
                train_data = data[:][0]
                labels = data[:][1]
                output = self.model(train_data)
                total_model_acc += (t.argmax(output[:, -1, :], dim=1) == labels).sum()
                total_num_samples += len(labels)
                loss = self.criterion(output[:, -1, :], labels)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
            
            total_model_acc = total_model_acc / total_num_samples
            print(f"epoch num: {epoch} has accuracy: {100 * total_model_acc}")

    def predict(self, testLoader):
        with t.no_grad():
            predected = t.tensor([])
            real_labels = t.tensor([])
            for tweets in testLoader:
                test_data = tweets[:][0]
                labels = tweets[:][1]
                output = self.model(test_data)
                predected = t.cat((predected, t.argmax(output[:, -1, :], dim=1)))
                real_labels = t.cat((real_labels, labels))
            return predected, real_labels
            


In [8]:
class Evaluation():
    def __init__(self, predicted, real_labels):
        self.pred = predicted
        self.labels = real_labels
        accuracy = accuracy_score(real_labels, predicted)
        recall = recall_score(real_labels, predicted)
        precision = precision_score(real_labels, predicted)
        f1 = f1_score(real_labels, predicted)
        print(f'''  Accuracy: {accuracy}
                    Recall: {recall}
                    Precision: {precision}
                    F1-Score: {f1}''')

In [9]:
# load the dataset and dataloader
dataset = StancesDataset('./dataset/train.csv')
tweet_classifier = RnnTweetsClassifier(dataset, 50, 50, 2, 3, 10, 0.001)

  self.data = t.tensor(self.data)


In [10]:
# train the stances model
tweet_classifier.train()

  input = self.embedding(t.tensor(input))


epoch num: 0 has accuracy: 73.7550048828125
epoch num: 1 has accuracy: 79.23583221435547
epoch num: 2 has accuracy: 79.2501449584961
epoch num: 3 has accuracy: 79.2501449584961
epoch num: 4 has accuracy: 79.2501449584961
epoch num: 5 has accuracy: 79.2501449584961
epoch num: 6 has accuracy: 79.2501449584961
epoch num: 7 has accuracy: 79.2501449584961
epoch num: 8 has accuracy: 79.2501449584961
epoch num: 9 has accuracy: 79.2501449584961


In [11]:
testset = StancesDataset('./dataset/dev.csv')
testloader = DataLoader(dataset=testset, batch_size=256, shuffle=True)

In [16]:
predicted, labels = tweet_classifier.predict(testloader)
print(type(predicted))
Evaluation(predicted.numpy(), labels.numpy())

  input = self.embedding(t.tensor(input))


<class 'torch.Tensor'>


RuntimeError: Numpy is not available

In [141]:
# load the dataset and dataloader
categories_dataset = CategoriesDataset('./dataset/train.csv')
gategories_classifier = RnnTweetsClassifier(categories_dataset, 50, 50, 2, 10, 10, 0.001)

In [142]:
gategories_classifier.train()

  input = self.embedding(t.tensor(input))


epoch num: 0 has accuracy: 49.7567253112793
epoch num: 1 has accuracy: 51.745849609375
epoch num: 2 has accuracy: 51.745849609375
epoch num: 3 has accuracy: 51.745849609375
epoch num: 4 has accuracy: 51.745849609375
epoch num: 5 has accuracy: 51.745849609375
epoch num: 6 has accuracy: 51.745849609375
epoch num: 7 has accuracy: 51.745849609375
epoch num: 8 has accuracy: 51.745849609375
epoch num: 9 has accuracy: 51.745849609375


In [143]:
categories_testset = CategoriesDataset('./dataset/dev.csv')
categories_testloader = DataLoader(dataset=categories_testset, batch_size=256, shuffle=True)

In [144]:
predicted, labels = gategories_classifier.predict(categories_testloader)
Evaluation(predicted, labels)

  input = self.embedding(t.tensor(input))


Model accuracy: 8.199999809265137
