In [1]:
# Colab-only: mount Google Drive so that the /content/drive/... paths used
# by later cells (dataset CSV, GloVe file, checkpoints) can be opened.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
class Vocabulary(object):
    """Two-way mapping between tokens and contiguous integer indices."""

    def __init__(self, token_to_idx=None):
        """
        Args:
            token_to_idx (dict): optional pre-existing token -> index map.
                It is stored as-is (not copied), so tokens added later
                are written into the caller's dict as well.
        """
        self._token_to_idx = {} if token_to_idx is None else token_to_idx
        # Reverse view, kept in sync by add_token.
        self._idx_to_token = dict(
            (position, entry)
            for entry, position in self._token_to_idx.items()
        )

    def to_serializable(self):
        """Return a dict holding everything needed to rebuild this object."""
        return {'token_to_idx': self._token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        """Rebuild a Vocabulary from the dict made by `to_serializable`."""
        return cls(**contents)

    def add_token(self, token):
        """Register a token, assigning it the next free index if it is new.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        forward = self._token_to_idx
        if token not in forward:
            fresh_index = len(forward)
            forward[token] = fresh_index
            self._idx_to_token[fresh_index] = token
            return fresh_index
        return forward[token]

    def lookup_token(self, token):
        """Return the index stored for a token.

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Raises:
            KeyError: if the token has never been added
        """
        return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token stored at an index.

        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index in self._idx_to_token:
            return self._idx_to_token[index]
        raise KeyError("the index (%d) is not in the Vocabulary" % index)

    def __str__(self):
        size = len(self)
        return "<Vocabulary(size=%d)>" % size

    def __len__(self):
        # Size is defined by the forward map.
        return len(self._token_to_idx)
    
    
class SequenceVocabulary(Vocabulary):
    """Vocabulary that always contains MASK, UNK, BEGIN and END tokens.

    The four special tokens are added (in that order) right after the base
    class is initialised, and their indices are exposed as `mask_index`,
    `unk_index`, `begin_seq_index` and `end_seq_index`.
    """

    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):
        super(SequenceVocabulary, self).__init__(token_to_idx)

        self._unk_token = unk_token
        self._mask_token = mask_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        # Registration order fixes the indices in a fresh vocabulary:
        # mask, then unk, then begin, then end.
        self.mask_index = self.add_token(mask_token)
        self.unk_index = self.add_token(unk_token)
        self.begin_seq_index = self.add_token(begin_seq_token)
        self.end_seq_index = self.add_token(end_seq_token)

    def to_serializable(self):
        """Return the base-class contents plus the four special tokens."""
        payload = super(SequenceVocabulary, self).to_serializable()
        payload['unk_token'] = self._unk_token
        payload['mask_token'] = self._mask_token
        payload['begin_seq_token'] = self._begin_seq_token
        payload['end_seq_token'] = self._end_seq_token
        return payload

    def lookup_token(self, token):
        """Return the index for a token, falling back to the UNK index.

        Args:
            token (str): the token to look up
        Returns:
            index (int): the token's index, or `unk_index` if it is unknown
        Notes:
            The fallback is only used while `unk_index` is >= 0; otherwise
            an unknown token raises KeyError.
        """
        if self.unk_index < 0:
            return self._token_to_idx[token]
        return self._token_to_idx.get(token, self.unk_index)

In [3]:
import numpy as np
from collections import Counter
import string

class NewsVectorizer(object):
    """Coordinates the text and label vocabularies and turns text into index vectors."""

    def __init__(self, text_vocab, type_vocab):
        """
        Args:
            text_vocab (SequenceVocabulary): maps text tokens to integers
            type_vocab (Vocabulary): maps the `type` label to integers
        """
        self.text_vocab = text_vocab
        self.type_vocab = type_vocab

    def vectorize(self, text, vector_length=-1):
        """Convert a text into a fixed-length vector of token indices.

        The result is BEGIN + one index per space-separated token + END,
        right-padded with the mask index up to `vector_length`.

        Args:
            text (str): words separated by single spaces (other values are
                converted with `str`)
            vector_length (int): forced output length; a negative value
                means "exactly as long as the sequence"
        Returns:
            the vectorized text (numpy.ndarray of int64)
        Notes:
            A sequence longer than `vector_length` is truncated to fit
            (previously this raised a broadcast ValueError).
        """
        text = str(text)
        indices = [self.text_vocab.begin_seq_index]
        indices.extend(self.text_vocab.lookup_token(token)
                       for token in text.split(" "))
        indices.append(self.text_vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(indices)

        # Clip sequences that do not fit into the requested length.
        indices = indices[:vector_length]

        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.text_vocab.mask_index

        return out_vector

    @classmethod
    def from_dataframe(cls, news_df, cutoff=25):
        """Instantiate the vectorizer from the dataset dataframe.

        Args:
            news_df (pandas.DataFrame): the target dataset; its `type` and
                `text` columns are read
            cutoff (int): minimum token frequency for inclusion in the
                text vocabulary
        Returns:
            an instance of the NewsVectorizer
        """
        type_vocab = Vocabulary()
        for news_type in sorted(set(news_df.type)):
            type_vocab.add_token(news_type)

        word_counts = Counter()
        for text in news_df.text:
            for token in str(text).split(' '):
                # Substring test against the punctuation string: this also
                # skips empty tokens produced by consecutive spaces.
                if token not in string.punctuation:
                    word_counts[token] += 1

        text_vocab = SequenceVocabulary()
        for word, word_count in word_counts.items():
            if word_count >= cutoff:
                text_vocab.add_token(word)

        return cls(text_vocab, type_vocab)

    @classmethod
    def from_serializable(cls, contents):
        """Rebuild a vectorizer from the dict produced by `to_serializable`.

        Args:
            contents (dict): has the keys 'text_vocab' and 'type_vocab'
        Returns:
            an instance of the NewsVectorizer
        """
        text_vocab = SequenceVocabulary.from_serializable(contents['text_vocab'])
        type_vocab = Vocabulary.from_serializable(contents['type_vocab'])
        return cls(text_vocab=text_vocab, type_vocab=type_vocab)

    def to_serializable(self):
        """Return a dict with both vocabularies in serializable form."""
        return {'text_vocab': self.text_vocab.to_serializable(),
                'type_vocab': self.type_vocab.to_serializable()}

In [4]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import json

class NewsDataset(Dataset):
    """PyTorch dataset over a news dataframe with train/val/test splits."""

    def __init__(self, news_df, vectorizer):
        """
        Args:
            news_df (pandas.DataFrame): the dataset; the `text`, `type` and
                `split` columns are read
            vectorizer (NewsVectorizer): vectorizer instantiated from dataset
        """
        self.news_df = news_df
        self._vectorizer = vectorizer

        # Longest text measured in space-separated tokens, +2 for the
        # begin-of-sequence and end-of-sequence markers.
        token_counts = (len(str(entry).split(" ")) for entry in news_df.text)
        self._max_seq_length = max(token_counts) + 2

        self.train_df = self.news_df[self.news_df.split == 'train']
        self.val_df = self.news_df[self.news_df.split == 'val']
        self.test_df = self.news_df[self.news_df.split == 'test']

        self.train_size = len(self.train_df)
        self.validation_size = len(self.val_df)
        self.test_size = len(self.test_df)

        self._lookup_dict = dict(
            train=(self.train_df, self.train_size),
            val=(self.val_df, self.validation_size),
            test=(self.test_df, self.test_size),
        )

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, news_csv):
        """Load the dataset and build a new vectorizer from its train rows.

        Args:
            news_csv (str): location of the dataset
        Returns:
            an instance of NewsDataset
        """
        full_df = pd.read_csv(news_csv)
        train_rows = full_df[full_df.split == 'train']
        vectorizer = NewsVectorizer.from_dataframe(train_rows)
        return cls(full_df, vectorizer)

    @classmethod
    def load_dataset_and_load_vectorizer(cls, news_csv, vectorizer_filepath):
        """Load the dataset together with a previously saved vectorizer.

        Args:
            news_csv (str): location of the dataset
            vectorizer_filepath (str): location of the saved vectorizer
        Returns:
            an instance of NewsDataset
        """
        full_df = pd.read_csv(news_csv)
        return cls(full_df, cls.load_vectorizer_only(vectorizer_filepath))

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        """Load just the vectorizer from its JSON file.

        Args:
            vectorizer_filepath (str): the location of the serialized vectorizer
        Returns:
            an instance of NewsVectorizer
        """
        with open(vectorizer_filepath) as fp:
            contents = json.load(fp)
            return NewsVectorizer.from_serializable(contents)

    def save_vectorizer(self, vectorizer_filepath):
        """Write the vectorizer to disk as JSON.

        Args:
            vectorizer_filepath (str): the location to save the vectorizer
        """
        serialized = self._vectorizer.to_serializable()
        with open(vectorizer_filepath, "w") as fp:
            json.dump(serialized, fp)

    def get_vectorizer(self):
        """Return the vectorizer used by this dataset."""
        return self._vectorizer

    def set_split(self, split="train"):
        """Choose which split ('train', 'val' or 'test') the dataset serves."""
        target_df, target_size = self._lookup_dict[split]
        self._target_split = split
        self._target_df = target_df
        self._target_size = target_size

    def __len__(self):
        # Number of rows in the currently selected split.
        return self._target_size

    def __getitem__(self, index):
        """Return one vectorized sample from the active split.

        Args:
            index (int): positional index into the active split
        Returns:
            a dictionary holding the data point's features (x_data) and
            label (y_target)
        """
        row = self._target_df.iloc[index]
        vectorizer = self._vectorizer

        # Every sample is padded to the longest text in the whole dataframe.
        features = vectorizer.vectorize(row.text, self._max_seq_length)
        label = vectorizer.type_vocab.lookup_token(row.type)

        return {'x_data': features,
                'y_target': label}

    def get_num_batches(self, batch_size):
        """Return how many full batches fit into the active split.

        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        full_batches = len(self) // batch_size
        return full_batches

def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    """
        Generator that wraps the PyTorch DataLoader and moves every tensor
        of each batch to the requested device before yielding it.
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        yield {field: values.to(device) for field, values in batch.items()}

In [5]:
import numpy as np
import torch

def load_glove_from_file(glove_filepath):
    """
    Load GloVe embeddings from a text file.

    Each non-blank line is expected to look like `word num1 num2 ...`,
    separated by single spaces.

    Args:
        glove_filepath (str): path to the glove embeddings file
    Returns:
        word_to_index (dict), embeddings (numpy.ndarray)
    Raises:
        ValueError: if the file contains no embedding lines
    """

    word_to_index = {}
    embeddings = []
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(glove_filepath, "r", encoding="utf-8") as fp:
        for line in fp:
            fields = line.rstrip("\r\n").split(" ")
            if len(fields) < 2:
                # Blank or vector-less line: skipping it keeps every row
                # the same width for np.stack below.
                continue
            # Index by row position in the stacked matrix, so skipped
            # lines cannot shift the mapping.
            word_to_index[fields[0]] = len(embeddings)
            embeddings.append(np.array([float(val) for val in fields[1:]]))

    if not embeddings:
        raise ValueError("no embeddings found in %s" % glove_filepath)
    return word_to_index, np.stack(embeddings)

def make_embedding_matrix(glove_filepath, words):
    """
    Build an embedding matrix with one row per entry of `words`.

    Words present in the GloVe file get their GloVe vector; every other
    word gets a Xavier-uniform random vector.

    Args:
        glove_filepath (str): file path to the glove embeddings
        words (list): list of words in the dataset
    Returns:
        numpy.ndarray of shape (len(words), embedding_size)
    """
    glove_index, glove_vectors = load_glove_from_file(glove_filepath)
    width = glove_vectors.shape[1]

    matrix = np.zeros((len(words), width))

    for row, word in enumerate(words):
        if word not in glove_index:
            # No pre-trained vector available: draw a random one.
            random_row = torch.ones(1, width)
            torch.nn.init.xavier_uniform_(random_row)
            matrix[row, :] = random_row
            continue
        matrix[row, :] = glove_vectors[glove_index[word]]

    return matrix

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import timeit
import numpy as np
import random
import datetime

def repackage_hidden(h):
    """Detach a hidden state (a tensor or nested iterable of tensors) from its history."""
    if not isinstance(h, torch.Tensor):
        # Containers are rebuilt as tuples, recursing into every element.
        return tuple(map(repackage_hidden, h))
    return h.detach()

class TextRnnVAE(nn.Module):
    """Text VAE made of an Encoder and a Decoder that share one embedding table.

    The model owns one Adam optimizer (and one ReduceLROnPlateau scheduler)
    per sub-network and exposes `train_epoch` / `train_batch` helpers.

    NOTE(review): the shared embedding is a parameter of both the encoder
    and the decoder, so both optimizers update it each step — confirm this
    is intended.
    """

    def __init__(self, args, pretrained_embeddings):
        """
        Args:
            args (Namespace): reads embedding_size, num_embeddings,
                padding_idx and device; learning_rate is used for both
                optimizers when present (default 0.001)
            pretrained_embeddings (numpy.ndarray or None): initial weights
                of shape (num_embeddings, embedding_size); None gives a
                randomly initialised embedding
        """
        super(TextRnnVAE, self).__init__()

        self.args = args
        if pretrained_embeddings is not None:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
        self.emb = nn.Embedding(embedding_dim=args.embedding_size,
                                num_embeddings=args.num_embeddings,
                                padding_idx=args.padding_idx,
                                _weight=pretrained_embeddings)

        self.encoder = Encoder(args, self.emb)
        self.decoder = Decoder(args, self.emb)
        self.encoder.to(args.device)
        self.decoder.to(args.device)

        learning_rate = getattr(args, 'learning_rate', 0.001)
        self.encoder_optimizer = optim.Adam(self.encoder.parameters(), lr=learning_rate)
        self.decoder_optimizer = optim.Adam(self.decoder.parameters(), lr=learning_rate)
        # Keep the schedulers on the instance so a training loop can call
        # `.step(metric)` on them; as locals they were simply discarded.
        self.encoder_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=self.encoder_optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)
        self.decoder_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=self.decoder_optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

    def train(self, mode=True):
        """Set training (or evaluation) mode on the model and its children.

        Keeps the `nn.Module.train(mode)` signature so that `model.eval()`,
        which calls `self.train(False)`, keeps working.

        Returns:
            self
        """
        return super(TextRnnVAE, self).train(mode)

    def train_epoch(self, epoch, batch_generator):
        """Run one pass over `batch_generator` and return the summed batch losses.

        Args:
            epoch (int): zero-based epoch number (only used for logging)
            batch_generator (iterable): yields dicts with an 'x_data' entry
                of token indices shaped (batch_size, seq_len)
        Returns:
            float: sum of the per-batch losses returned by `train_batch`
        """
        epoch_loss = 0.0

        for batch_index, batch_dict in enumerate(batch_generator):
            # Get size of batch (can differ between batches)
            inputs = batch_dict['x_data']
            batch_size = inputs.shape[0]

            # as_tensor + to() avoids the copy (and warning) that
            # torch.tensor() performs on an existing tensor.
            inputs = torch.as_tensor(inputs).to(self.args.device)

            # Train batch and get batch loss
            batch_loss = self.train_batch(inputs)

            # Accumulate the epoch loss
            epoch_loss += batch_loss

            # NOTE(review): this prints the current batch loss divided by
            # the number of samples seen so far; a running mean of
            # epoch_loss may have been intended — confirm.
            print('Epoch: {} #batches {}, loss: {:.8f}'.format(epoch + 1, batch_index + 1, (batch_loss / ((batch_index + 1) * batch_size))))

        print()
        return epoch_loss

    def train_batch(self, inputs):
        """Run one optimisation step on a batch.

        Args:
            inputs (torch.Tensor): token indices, shape (batch_size, num_steps)
        Returns:
            float: (reconstruction loss + 0.1 * KL divergence) / num_steps
        """
        batch_size, num_steps = inputs.shape

        # Start every batch from a fresh encoder hidden state
        self.encoder.hidden = self.encoder.init_hidden(batch_size)

        # Zero gradients of both optimizers
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        mean, logv, z = self.encoder(inputs)

        loss, _ = self.decoder(inputs, z)

        # KL divergence between N(mean, exp(logv)) and N(0, I), averaged
        # over the batch
        kld_loss = (-0.5 * torch.sum((logv - torch.pow(mean, 2) - torch.exp(logv) + 1), 1)).mean()

        # Fixed KL weight of 0.1
        loss += (kld_loss * 0.1)

        # Backpropagation. The graph is used only once per batch (the
        # hidden state is re-initialised above), so it need not be retained.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), 0.5)
        torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), 0.5)
        self.encoder_optimizer.step()
        self.decoder_optimizer.step()

        return loss.item() / num_steps


class Encoder(nn.Module):
    """Bidirectional single-layer GRU encoder producing the latent mean, log-variance and a sample."""

    # Fixed layer sizes. The Decoder class hard-codes the same values, so
    # they must be changed together.
    HIDDEN_SIZE = 100
    Z_DIM = 1024

    def __init__(self, args, embedding):
        """
        Args:
            args (Namespace): only `args.device` is read
            embedding (nn.Embedding): embedding layer (shared with the decoder)
        """
        super(Encoder, self).__init__()
        self.args = args

        # Embedding layer
        self.embedding = embedding

        # RNN layer
        self.num_directions = 2
        self.num_hidden_states = 1

        # The GRU input width follows the embedding actually passed in,
        # rather than assuming 100-dimensional vectors.
        self.rnn = nn.GRU(input_size=embedding.embedding_dim,
                          hidden_size=self.HIDDEN_SIZE,
                          num_layers=1,
                          batch_first=True,
                          bidirectional=True)

        # Hidden state; callers normally reset it via init_hidden()
        self.hidden = None

        # Flattened size of the final hidden state (directions * layers * hidden)
        self.linear_dims = [self.HIDDEN_SIZE * 2 * 1 * 1]

        # Projections from the flattened hidden state to the latent parameters
        self.hidden_to_mean = nn.Linear(self.linear_dims[-1], self.Z_DIM)
        self.hidden_to_logv = nn.Linear(self.linear_dims[-1], self.Z_DIM)

        self._init_weights()

    def init_hidden(self, batch_size):
        """Return a zero hidden state of shape (2, batch_size, HIDDEN_SIZE) on args.device."""
        return torch.zeros(1 * 2, batch_size, self.HIDDEN_SIZE).to(self.args.device)

    def _init_weights(self):
        # Xavier-uniform weights and a small constant bias for linear layers
        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.xavier_uniform_(m.weight)
                m.bias.data.fill_(0.01)

    def _sample(self, mean, logv):
        """Reparameterised sample z = mean + std * eps with eps ~ N(0, 1)."""
        std = torch.exp(0.5 * logv)
        eps = torch.randn_like(std)
        return mean + std * eps

    def forward(self, inputs):
        """Encode a batch of token indices.

        Args:
            inputs (torch.Tensor): token indices, shape (batch_size, seq_len)
        Returns:
            (mean, logv, z), each of shape (batch_size, Z_DIM)
        """
        batch_size, _ = inputs.shape

        # A hidden state left over from a batch of another size cannot be
        # fed to the GRU; start from zeros instead of raising.
        if self.hidden is not None and self.hidden.size(1) != batch_size:
            self.hidden = self.init_hidden(batch_size)

        # Embedding ==> (batch_size, seq_len, embed_dim)
        X = self.embedding(inputs)

        # Only the final hidden state is used
        _, self.hidden = self.rnn(X, self.hidden)

        X = self._flatten(self.hidden, batch_size)

        mean = self.hidden_to_mean(X)
        logv = self.hidden_to_logv(X)

        z = self._sample(mean, logv)

        return mean, logv, z

    def _flatten(self, h, batch_size):
        # (num_layers*num_directions, batch_size, hidden_dim)  ==>
        # (batch_size, num_directions*num_layers, hidden_dim)  ==>
        # (batch_size, num_directions*num_layers*hidden_dim)
        return h.transpose(0, 1).contiguous().view(batch_size, -1)
    
    
class Decoder(nn.Module):
    """Unidirectional GRU decoder that reconstructs a token sequence from a latent vector."""

    # Fixed layer sizes. The Encoder class hard-codes the same values, so
    # they must be changed together.
    HIDDEN_SIZE = 100   # per-direction hidden size of the encoder
    Z_DIM = 1024

    def __init__(self, args, embedding):
        """
        Args:
            args (Namespace): reads num_embeddings, device and
                vectorizer.text_vocab (begin/end sequence indices)
            embedding (nn.Embedding): embedding layer (shared with the encoder)
        """
        super(Decoder, self).__init__()

        # CrossEntropyLoss applies log-softmax itself; `_step` already
        # returns log-probabilities, which is redundant but gives the
        # same value.
        self.criterion = nn.CrossEntropyLoss()
        self.args = args

        # Embedding layer
        self.embedding = embedding

        # The input width follows the embedding actually passed in; the
        # hidden size is twice the encoder's because both encoder
        # directions are concatenated.
        self.rnn = nn.GRU(embedding.embedding_dim,
                       2 * self.HIDDEN_SIZE,
                       num_layers=1,
                       batch_first=True)
        self.linear_dims = [self.HIDDEN_SIZE * 2 * 1 * 1]
        self.z_to_hidden = nn.Linear(self.Z_DIM, self.linear_dims[0])

        self.out = nn.Linear(self.HIDDEN_SIZE * 2, self.args.num_embeddings)

    def forward(self, inputs, z, return_outputs=False):
        """Greedily decode from `z` and accumulate the reconstruction loss.

        At every step the decoder is fed its own previous prediction
        (starting from the BEGIN token), and the cross-entropy against
        `inputs[:, i]` is added to the loss.

        Args:
            inputs (torch.Tensor): target token indices, (batch_size, num_steps)
            z (torch.Tensor): latent vectors, (batch_size, Z_DIM)
            return_outputs (bool): unused; kept for interface compatibility
        Returns:
            (loss, outputs): the summed step losses and the predicted token
            indices, shape (batch_size, num_steps); positions after an
            early stop stay 0
        """
        batch_size, num_steps = inputs.shape
        vocab = self.args.vectorizer.text_vocab
        device = self.args.device

        X = self.z_to_hidden(z)

        # Unflatten, then merge the two encoder directions into one
        # decoder hidden state
        hidden = self._unflatten(X, batch_size)
        hidden = self._init_hidden_state(hidden)

        # BEGIN token as first input for every sequence: (batch_size, 1)
        step_input = torch.full((batch_size, 1), vocab.begin_seq_index,
                                dtype=torch.long).to(device)

        loss = 0
        outputs = torch.zeros((batch_size, num_steps), dtype=torch.long).to(device)
        # Tracks which sequences have already emitted the END token
        finished = torch.zeros(batch_size, dtype=torch.bool).to(device)

        for i in range(num_steps):
            output, hidden = self._step(step_input, hidden)
            _, topi = output.topk(1)

            step_input = topi.detach()
            predicted = step_input.squeeze(1)
            outputs[:, i] = predicted

            loss += self.criterion(output, inputs[:, i])

            # Stop only once EVERY sequence has produced END. The previous
            # check compared sample 0 against the BEGIN token, which cut
            # the loss short for the whole batch.
            finished |= (predicted == vocab.end_seq_index)
            if finished.all():
                break

        return loss, outputs

    def _unflatten(self, X, batch_size):
        # (batch_size, num_directions*num_layers*hidden_dim)    ==>
        # (batch_size, num_directions * num_layers, hidden_dim) ==>
        # (num_layers * num_directions, batch_size, hidden_dim)
        return X.view(batch_size, 1 * 2, self.HIDDEN_SIZE).transpose(0, 1).contiguous()

    def _init_hidden_state(self, encoder_hidden):
        return self._concat_directions(encoder_hidden)

    def _concat_directions(self, hidden):
        # Pair even (forward) and odd (backward) slices along the feature
        # axis: (2, B, H) ==> (1, B, 2H)
        hidden = torch.cat([hidden[0:hidden.size(0):2], hidden[1:hidden.size(0):2]], 2)
        return hidden

    def _step(self, input, hidden):
        """Run one decoding step.

        Returns:
            (output, hidden): log-probabilities of shape
            (batch_size, vocab_size) and the new hidden state
        """
        # Embedding of the current input token: (batch_size, 1, embed_dim)
        X = self.embedding(input)

        # Push input word through rnn layer with current hidden state
        output, hidden = self.rnn(X, hidden)

        # Project to vocabulary size and normalise
        output = F.log_softmax(self.out(output.squeeze(dim=1)), dim=1)

        return output, hidden

In [7]:
import torch
import os
import numpy as np

def make_train_state(args):
    """Create the initial training-state dictionary.

    Args:
        args: object providing `learning_rate` and `model_state_file`
    Returns:
        dict with early-stopping bookkeeping, per-epoch metric lists and
        placeholder test metrics
    """
    state = dict(
        stop_early=False,
        early_stopping_step=0,
        early_stopping_best_val=1e8,
        learning_rate=args.learning_rate,
        epoch_index=0,
        train_loss=[],
        train_acc=[],
        val_loss=[],
        val_acc=[],
        test_loss=-1,
        test_acc=-1,
        model_filename=args.model_state_file,
    )
    return state

def update_train_state(args, model, train_state):
    """Handle the training state updates.

    Components:
     - Early Stopping: stop once the validation loss has failed to improve
       on the best value for `args.early_stopping_criteria` epochs in a row.
     - Model Checkpoint: the model is saved whenever the validation loss
       improves on the best value seen so far (and always at epoch 0).

    :param args: main arguments (reads `early_stopping_criteria`)
    :param model: model to train (its `state_dict()` is saved)
    :param train_state: a dictionary representing the training state values
    :returns:
        the same train_state dictionary, updated in place
    """

    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False
        # The checkpoint just written corresponds to the latest validation
        # loss, if one has been recorded.
        if train_state['val_loss']:
            train_state['early_stopping_best_val'] = train_state['val_loss'][-1]

    elif train_state['epoch_index'] >= 1:
        loss_t = train_state['val_loss'][-1]

        # No improvement over the best loss so far
        if loss_t >= train_state['early_stopping_best_val']:
            train_state['early_stopping_step'] += 1
        # New best loss
        else:
            torch.save(model.state_dict(), train_state['model_filename'])
            # Remember the new best value; without this every epoch
            # compares against the initial 1e8 and early stopping can
            # never trigger.
            train_state['early_stopping_best_val'] = loss_t
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    """Return the percentage of rows of `y_pred` whose arg-max over dim 1 equals `y_target`."""
    predicted = y_pred.max(dim=1)[1]
    hits = (predicted == y_target).sum().item()
    return hits / len(predicted) * 100

def set_seed_everywhere(seed, cuda):
    """Seed the random number generators this notebook can draw from.

    Args:
        seed (int): the seed value
        cuda (bool): also seed all CUDA devices when True
    """
    # Local import keeps the function self-contained regardless of which
    # notebook cells have been executed.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    """Create `dirpath` (including missing parents) if it does not exist yet.

    `exist_ok=True` removes the check-then-create race of testing
    `os.path.exists` first. A path that exists but is not a directory
    raises FileExistsError.
    """
    os.makedirs(dirpath, exist_ok=True)

In [8]:
from argparse import Namespace
import torch
import os
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm_notebook


In [9]:
# Path of the dataset CSV on the mounted Drive.
# NOTE(review): the same path is stored as args.news_csv in the next cell,
# and the later cells shown here read only args.news_csv.
news_csv="/content/drive/MyDrive/FND using Unsupervised Learning /politifact/data/complete_processed_dataset_only_real_train.csv"

In [10]:
# Single configuration object shared by every later cell.
# NOTE(review): the Encoder/Decoder classes in this notebook hard-code
# their layer sizes (100 / 1024), so rnn_hidden_dim, z_dim, num_layers and
# bidirectional_encoder below appear to be informational only — confirm.
args = Namespace(
    # Data and Path hyper parameters
    news_csv="/content/drive/MyDrive/FND using Unsupervised Learning /politifact/data/complete_processed_dataset_only_real_train.csv",
    vectorizer_file="vectorizer.json",
    model_state_file='/content/drive/MyDrive/FND using Unsupervised Learning /data/politifact/auto_model.pth',
    save_dir='/content/drive/MyDrive/FND using Unsupervised Learning /data/politifactmodels/classification',
    # Model hyper parameters
    glove_filepath='/content/drive/MyDrive/FND using Unsupervised Learning /data/politifact/glove.6B.100d.txt', 
    use_glove=True,
    embedding_size=100, 
    rnn_hidden_dim=100, 
    num_channels=100, 
    # Training hyper parameter
    seed=1337, 
    learning_rate=0.001, 
    dropout_p=0.1, 
    batch_size=64, 
    num_epochs=10, 
    early_stopping_criteria=5, 
    # Runtime option
    cuda=True, 
    catch_keyboard_interrupt=True, 
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
    # New params
    # num_embeddings, num_classes and vectorizer are filled in by the
    # dataset-loading cell once the vocabulary is known.
    num_embeddings = None,
    num_classes = None,
    padding_idx = 0,
    linear_dims = [],
    z_dim = 1024,
    num_layers = 1,
    bidirectional_encoder = True,
    vectorizer = None,
    dropout = 0.5, 
) 

In [11]:
# Prefix the vectorizer and model file names with save_dir.
# os.path.join discards the earlier components when a later one is an
# absolute path, so the absolute model_state_file is left unchanged
# (visible in the printed output below).
if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))

Expanded filepaths: 
	/content/drive/MyDrive/FND using Unsupervised Learning /data/politifactmodels/classification/vectorizer.json
	/content/drive/MyDrive/FND using Unsupervised Learning /data/politifact/auto_model.pth


In [12]:
# Check CUDA: fall back to the CPU when no GPU is available, then record
# the resulting torch.device on args for the model and batch generator.
if not torch.cuda.is_available():
    args.cuda = False
    
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

Using CUDA: False


In [13]:
# Set seed for reproducibility (must run before the dataset, embeddings
# and model are created)
set_seed_everywhere(args.seed, args.cuda)

# handle dirs: make sure the output directory exists before anything is
# saved into it
handle_dirs(args.save_dir)

In [14]:
# Build the dataset and its vectorizer, then publish the vocabulary-derived
# values on args for the model classes to read.
if args.reload_from_files:
    # training from a checkpoint
    dataset = NewsDataset.load_dataset_and_load_vectorizer(args.news_csv, args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = NewsDataset.load_dataset_and_make_vectorizer(args.news_csv)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()
# The Decoder reads the begin-of-sequence index through args.vectorizer.
args.vectorizer = vectorizer
args.num_embeddings = len(vectorizer.text_vocab)
args.num_classes = len(vectorizer.type_vocab)

FileNotFoundError: ignored

In [None]:
# Use GloVe or randomly initialized embeddings
if args.use_glove:
    # Iterating the token->index dict yields tokens in insertion order,
    # which is also their index order, so row i of the matrix belongs to
    # token index i.
    words = vectorizer.text_vocab._token_to_idx.keys()
    embeddings = make_embedding_matrix(glove_filepath=args.glove_filepath, 
                                       words=words)
    print("Using pre-trained embeddings")
else:
    print("Not using pre-trained embeddings")
    embeddings = None

In [None]:
# Instantiate the VAE with the embedding matrix prepared above.
model =  TextRnnVAE(args,embeddings)

#print(model)

# Move the whole module to the selected device.
model = model.to(args.device)

In [None]:
# Train for args.num_epochs epochs on the 'train' split and collect the
# per-epoch loss returned by train_epoch (plotted in a later cell).
losses = []
# Put the encoder and decoder into training mode.
model.train()

for epoch in range(args.num_epochs):
    dataset.set_split('train')
    # A new generator per epoch, so the data is reshuffled each time
    # (generate_batches defaults to shuffle=True).
    batch_generator = generate_batches(dataset, 
                                      batch_size=args.batch_size, 
                                      device=args.device)
    epoch_loss = model.train_epoch(epoch, batch_generator)
    print(epoch_loss)
    losses.append(epoch_loss)

In [None]:
# 1-based epoch numbers, used as the x axis of the loss plot.
epochs = list(range(1, args.num_epochs + 1))
epochs

In [None]:
import matplotlib.pyplot as plt

# Training-loss curve: one point per epoch. `epochs` and `losses` must have
# the same length, i.e. the training loop has to have run to completion.
plt.plot(epochs, losses)
plt.xlabel("No. of epochs")  # add X-axis label
plt.ylabel("Loss")  # add Y-axis label
plt.show()

In [None]:
# Saves the entire model object (pickled), not just its state_dict, so
# loading it later requires the same class definitions to be available.
# NOTE(review): consider saving model.state_dict() instead — confirm how
# the file is consumed downstream.
torch.save(model,"/content/drive/MyDrive/FND using Unsupervised Learning /politifact/models/trained_model.pth")

In [None]:
def calc_reconstruction_error(txt_vector):
    """Return the decoder's reconstruction loss for already-vectorized text.

    Uses the notebook-level `model`. The encoder and decoder are switched
    to evaluation mode and the computation runs without gradient tracking,
    so no autograd graph is built while scoring.

    Args:
        txt_vector (torch.Tensor): token indices of shape
            (batch_size, seq_len), already on the model's device
    Returns:
        torch.Tensor: scalar loss summed over the decoded steps

    NOTE(review): the encoder still samples z with random noise here, so
    repeated calls on the same text can give different values — confirm
    whether scoring with the latent mean was intended.
    """
    model.encoder.eval()
    model.decoder.eval()
    with torch.no_grad():
        # Size the initial hidden state from the input instead of assuming
        # a batch of one.
        model.encoder.hidden = model.encoder.init_hidden(txt_vector.shape[0])
        mean, logv, z = model.encoder(txt_vector)
        loss, _ = model.decoder(txt_vector, z)
    return loss

In [None]:
# Score every row of the test split with the trained model and bucket the
# reconstruction losses by label.
dataset.set_split("test")

fake_losses = []
real_losses = []
fake_indices = []
real_indices = []

for ind in dataset.test_df.index:
    text = dataset.test_df['text'][ind]
    label = dataset.test_df['type'][ind]
    
    # NOTE(review): 3062 is presumably the padded sequence length used for
    # training (dataset._max_seq_length) — confirm it matches.
    text = vectorizer.vectorize(text,3062)
    text = torch.tensor(text)
    # Add a leading batch dimension: (seq_len,) -> (1, seq_len)
    text = torch.unsqueeze(text,dim=0)
    text = text.to(device=args.device)

    loss = calc_reconstruction_error(text)
    loss = loss.item()
    # NOTE(review): assumes the `type` column is numeric with 0 meaning
    # fake; any other value is counted as real — verify against the CSV.
    if(label == 0):
        fake_losses.append(loss)
        fake_indices.append(ind)
    else:
        real_losses.append(loss)
        real_indices.append(ind)      


In [None]:
# Reconstruction error per test sample, plotted against its dataframe
# index: blue = samples bucketed as fake, red = samples bucketed as real.
plt.scatter(fake_indices, fake_losses, c ="blue")
plt.scatter(real_indices, real_losses, c ="red")

plt.xlabel("Sample index")
plt.ylabel("Reconstruction error")
plt.show()