This notebook shows one possible approach to exercise two, mainly how to preprocess the data and how to build and train the CBOW model(s). It shows in general how the questions for Part II can be answered, but it does not contain code that answers each individual question.

In [None]:
#based on sample code from github
# Mount Google Drive inside the Colab runtime; the review file read in the
# "File loading" cell lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Package Import

In [None]:
import os
from argparse import Namespace
from collections import Counter
import json
import re
import string

import numpy as np
import pandas as pd
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import torch.optim as optim
from tqdm import tqdm_notebook
import nltk
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
# Fetch the NLTK resources used below: the punkt sentence/word tokenizer models
# and the stop word lists.
# NOTE(review): recent NLTK releases may additionally require the 'punkt_tab'
# resource - confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Options read by the window-building and train/validation split cells below.
args = Namespace(
    window_size=2,  # context tokens taken on each side of the target word (CBOW2)
    window_size_CBOW5 = 5,  # context tokens on each side for the wider CBOW5 variant
    train_proportion=0.7,  # share of the rows labelled 'train'
    val_proportion=0.3,  # share of the rows labelled 'val'
    seed=1337  # random seed value (the training configuration sets the same value again)
)


# Preprocessing Function

In [None]:
def preprocess(text, stop_words=None):
    """Normalise a piece of raw text for CBOW training.

    The text is lower-cased, the punctuation marks ``. , ! ?`` and newlines are
    replaced by spaces, every remaining character that is neither an ASCII
    letter nor whitespace is dropped, and stop words are removed.

    Args:
        text (str): the raw text, e.g. one sentence from the sentence tokenizer
        stop_words (collection of str, optional): words to remove; defaults to
            NLTK's English stop word list. Passing a prebuilt set avoids
            re-reading that list on every call.
    Returns:
        str: the surviving words joined by single spaces ('' if none survive)
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))

    text = text.lower()
    # Punctuation becomes a space (rather than being deleted) so that
    # "good,bad" stays two separate words.
    text = re.sub(r"[.,!?\n]", " ", text)
    # Everything is lower-case and punctuation-free by now, so only a-z and
    # whitespace need to be kept.
    text = re.sub(r"[^a-z\s]+", "", text)

    # split() without an argument splits on any run of whitespace: no empty
    # tokens from consecutive spaces, and stop words next to tabs are caught.
    return ' '.join(word for word in text.split() if word not in stop_words)

## File loading

In [None]:
# Read the hotel review file as one long string, split it into sentences with
# the punkt sentence tokenizer and clean every sentence.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
with open("/content/drive/My Drive/tripadvisor_hotel_reviews.csv", "r", encoding="utf-8") as file:
    x1 = list(file)  # one entry per line of the file, line breaks included
x = ' '.join(x1)
sentences = tokenizer.tokenize(x)
cleaned_sentences = list(map(preprocess, sentences))

In [None]:
# Global vars
MASK_TOKEN = "<MASK>"


def flatten(outer_list):
    """Concatenate a list of lists into one flat list."""
    return [item for inner_list in outer_list for item in inner_list]


def make_windows(sentences, window_size):
    """Slide a window of ``2 * window_size + 1`` tokens over every sentence.

    Each sentence is word-tokenized and padded with ``window_size`` MASK tokens
    on both sides, so every real token is the centre of exactly one window.

    Args:
        sentences (iterable of str): the cleaned sentences
        window_size (int): number of context positions on each side of the centre
    Returns:
        list of tuple: all windows of all sentences, in corpus order
    """
    padding = [MASK_TOKEN] * window_size
    return flatten([list(nltk.ngrams(padding + word_tokenize(sentence) + padding, window_size * 2 + 1))
                    for sentence in tqdm_notebook(sentences)])


def make_cbow_pairs(token_windows, window_size):
    """Turn token windows into ``[context, target]`` training rows.

    The target is the centre token; the context is every other non-MASK token
    of the window, joined by single spaces.

    Args:
        token_windows (iterable of tuple): windows from ``make_windows``
        window_size (int): the window size the windows were built with
    Returns:
        list of [str, str]: one ``[context, target]`` row per usable window
    """
    pairs = []
    for window in tqdm_notebook(token_windows):
        context = [token for position, token in enumerate(window)
                   if position != window_size and token != MASK_TOKEN]
        if not context:
            # A one-word sentence has no context at all; such a row carries no
            # training signal and would put an empty-string token in the vocabulary.
            continue
        pairs.append([' '.join(context), window[window_size]])
    return pairs


# Create windows
windows = make_windows(cleaned_sentences, args.window_size)
windows2 = make_windows(cleaned_sentences, args.window_size_CBOW5)

# Create cbow data
data = make_cbow_pairs(windows, args.window_size)
data2 = make_cbow_pairs(windows2, args.window_size_CBOW5)

# Convert to dataframe for datasource for CBOW2 & CBOW5
cbow_data = pd.DataFrame(data, columns=["context", "target"])
cbow_data_CBOW5 = pd.DataFrame(data2, columns=["context", "target"])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/376 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/376 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  del sys.path[0]


  0%|          | 0/2017305 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/2017305 [00:00<?, ?it/s]

## Splitting train and validation data from dataframe

In [None]:
# Create split data
def get_split(row_num, n_rows=None):
    """Return the split label for the row at 0-based position ``row_num``.

    The first ``train_proportion`` of the rows are 'train', the following
    ``val_proportion`` are 'val'. Positions are 0-based, so the comparisons are
    strict: with 10 rows and a 0.7 proportion exactly rows 0..6 are 'train'.

    Args:
        row_num (int): 0-based position of the row
        n_rows (int, optional): total number of rows; defaults to the global ``n``
    Returns:
        str or None: 'train', 'val', or None when the row lies beyond both
            proportions (only possible if they sum to less than 1)
    """
    total = n if n_rows is None else n_rows
    train_end = total * args.train_proportion
    val_end = train_end + total * args.val_proportion
    if row_num < train_end:
        return 'train'
    if row_num < val_end:
        return 'val'
    return None


def add_split_column(cbow_df):
    """Add a 'split' column that labels the rows of ``cbow_df`` by position.

    Args:
        cbow_df (pandas.DataFrame): frame to label; modified in place
    Returns:
        pandas.DataFrame: the same frame, for convenience
    """
    total = len(cbow_df)
    # A plain list is much cheaper than DataFrame.apply(axis=1), which builds a
    # Series object for every one of the rows.
    cbow_df['split'] = [get_split(position, total) for position in range(total)]
    return cbow_df


n = len(cbow_data)
add_split_column(cbow_data)

n = len(cbow_data_CBOW5)
add_split_column(cbow_data_CBOW5)

In [None]:
# Number of (context, target) rows in the CBOW2 frame.
len(cbow_data)

2017305

# Vocabulary Class

In [None]:
class Vocabulary(object):
    """Two-way mapping between string tokens and consecutive integer indices."""

    def __init__(self, token_to_idx=None, mask_token="<MASK>", add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): an existing token -> index map to start from;
                a new empty map is used when omitted
            mask_token (str): the MASK token, always added to the Vocabulary
            add_unk (bool): whether the UNK token is added as well
            unk_token (str): the UNK token to add when ``add_unk`` is set
        """
        self._token_to_idx = {} if token_to_idx is None else token_to_idx
        # Reverse map, kept in sync by add_token.
        self._idx_to_token = dict((idx, token) for token, idx in self._token_to_idx.items())

        self._add_unk = add_unk
        self._unk_token = unk_token
        self._mask_token = mask_token

        self.mask_index = self.add_token(self._mask_token)
        # -1 marks "no UNK token": lookup_token then fails on unknown tokens.
        self.unk_index = self.add_token(unk_token) if add_unk else -1

    def add_token(self, token):
        """Register ``token`` if it is new and return its index.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the index of the token (existing or newly assigned)
        """
        mapping = self._token_to_idx
        if token not in mapping:
            # New tokens get the next free index, i.e. the current size.
            mapping[token] = len(mapping)
            self._idx_to_token[mapping[token]] = token
        return mapping[token]

    def add_many(self, tokens):
        """Register every token of a list.

        Args:
            tokens (list): a list of string tokens
        Returns:
            indices (list): the indices of the tokens, in the same order
        """
        indices = []
        for token in tokens:
            indices.append(self.add_token(token))
        return indices

    def lookup_token(self, token):
        """Return the index of ``token``, falling back to the UNK index.

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index of the token, or ``unk_index`` for an
                unknown token when the UNK token was added
        Notes:
            Without an UNK token (``unk_index`` < 0) an unknown token raises
            a KeyError.
        """
        if self.unk_index < 0:
            return self._token_to_idx[token]
        return self._token_to_idx.get(token, self.unk_index)

    def lookup_index(self, index):
        """Return the token stored at ``index``.

        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index in self._idx_to_token:
            return self._idx_to_token[index]
        raise KeyError("the index (%d) is not in the Vocabulary" % index)

    def __len__(self):
        """Number of tokens in the Vocabulary (MASK and UNK included)."""
        return len(self._token_to_idx)

# CBOW Vectorizer Class

In [None]:
class CBOWVectorizer(object):
    """ The Vectorizer which coordinates the Vocabularies and puts them to use"""
    def __init__(self, cbow_vocab):
        """
        Args:
            cbow_vocab (Vocabulary): maps words to integers
        """
        self.cbow_vocab = cbow_vocab

    def vectorize(self, context, vector_length=-1):
        """Convert a context string into a fixed-length vector of token indices.

        Args:
            context (str): the context words separated by whitespace; an empty
                string means "no context words"
            vector_length (int): length of the returned vector; a negative
                value means "exactly as long as the context"
        Returns:
            numpy.ndarray: int64 vector holding the token indices, padded at
                the end with the vocabulary's mask index
        Raises:
            ValueError: if the context has more tokens than ``vector_length``
        """
        # split() without an argument ignores empty pieces, so an empty context
        # is padding only instead of one bogus '' token.
        indices = [self.cbow_vocab.lookup_token(token) for token in context.split()]
        if vector_length < 0:
            vector_length = len(indices)

        out_vector = np.full(vector_length, self.cbow_vocab.mask_index, dtype=np.int64)
        if indices:
            out_vector[:len(indices)] = indices

        return out_vector

    @classmethod
    def from_dataframe(cls, cbow_df):
        """Instantiate the vectorizer from the dataset dataframe

        Tokens are added in row order, context tokens before the target.

        Args:
            cbow_df (pandas.DataFrame): the target dataset with the string
                columns ``context`` and ``target``
        Returns:
            an instance of the CBOWVectorizer
        """
        cbow_vocab = Vocabulary()
        # Iterating the two columns directly avoids iterrows(), which builds a
        # Series per row and is very slow on millions of rows.
        for context, target in zip(cbow_df.context, cbow_df.target):
            for token in context.split():
                cbow_vocab.add_token(token)
            cbow_vocab.add_token(target)

        return cls(cbow_vocab)

# CBOWDataset Class

In [None]:
class CBOWDataset(Dataset):
    """PyTorch dataset over a CBOW dataframe with a selectable active split."""

    def __init__(self, cbow_df, vectorizer):
        """
        Args:
            cbow_df (pandas.DataFrame): the dataset, with the columns
                ``context``, ``target`` and ``split``
            vectorizer (CBOWVectorizer): vectorizer instantiated from the dataset
        """
        self.cbow_df = cbow_df
        self._vectorizer = vectorizer

        # Longest context, counted in space-separated tokens; every context
        # vector is padded to this length.
        self._max_seq_length = max(len(context.split(" ")) for context in cbow_df.context)

        split_labels = self.cbow_df.split

        self.train_df = self.cbow_df[split_labels == 'train']
        self.train_size = len(self.train_df)

        self.val_df = self.cbow_df[split_labels == 'val']
        self.validation_size = len(self.val_df)

        self.test_df = self.cbow_df[split_labels == 'test']
        self.test_size = len(self.test_df)

        self._lookup_dict = dict(train=(self.train_df, self.train_size),
                                 val=(self.val_df, self.validation_size),
                                 test=(self.test_df, self.test_size))

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, cbow_df):
        """Build a dataset together with a fresh vectorizer.

        The vectorizer (and its vocabulary) is created from the 'train' rows only.

        Args:
            cbow_df (pandas.DataFrame): the full dataset including the ``split`` column
        Returns:
            an instance of CBOWDataset
        """
        train_rows = cbow_df[cbow_df.split == 'train']
        vectorizer = CBOWVectorizer.from_dataframe(train_rows)
        return cls(cbow_df, vectorizer)

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer

    def __len__(self):
        """Number of rows in the currently selected split."""
        return self._target_size

    def set_split(self, split="train"):
        """Select which split ('train', 'val' or 'test') the dataset serves."""
        frame, size = self._lookup_dict[split]
        self._target_split = split
        self._target_df = frame
        self._target_size = size

    def __getitem__(self, index):
        """The primary entry point method for PyTorch datasets.

        Args:
            index (int): position of the data point within the active split
        Returns:
            a dictionary holding the data point's features (x_data) and label (y_target)
        """
        sample = self._target_df.iloc[index]
        x_data = self._vectorizer.vectorize(sample.context, self._max_seq_length)
        y_target = self._vectorizer.cbow_vocab.lookup_token(sample.target)
        return {'x_data': x_data, 'y_target': y_target}

    def get_num_batches(self, batch_size):
        """Number of full batches of ``batch_size`` rows in the active split.

        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size

# CBOW Classifier Class

In [None]:
class CBOWClassifier(nn.Module): # Simplified cbow Model
    """CBOW model: summed context embeddings -> hidden layer -> vocabulary scores."""

    def __init__(self, vocabulary_size, embedding_size, padding_idx=0):
        """
        Args:
            vocabulary_size (int): number of vocabulary items, controls the
                number of embeddings and prediction vector size
            embedding_size (int): size of the embeddings
            padding_idx (int): default 0; Embedding will not use this index
        """
        super(CBOWClassifier, self).__init__()

        self.embedding = nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=embedding_size, padding_idx=padding_idx)
        self.fc1 = nn.Linear(in_features=embedding_size, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=vocabulary_size)

    def forward(self, x_in, apply_softmax=False):
        """The forward pass of the classifier

        Args:
            x_in (torch.Tensor): an input data tensor of token indices.
                x_in.shape should be (batch, input_dim)
            apply_softmax (bool): a flag for the softmax activation
                should be false if used with the Cross Entropy losses
        Returns:
            the resulting tensor. tensor.shape should be (batch, output_dim)
        """
        x_embedded_sum = self.embedding(x_in).sum(dim=1)
        # F.dropout defaults to training=True, so the module's mode must be
        # passed explicitly; otherwise dropout stays active after .eval() and
        # validation losses and predictions become noisy.
        x_embedded_sum = F.dropout(x_embedded_sum, p=0.3, training=self.training)
        y_out = F.relu(self.fc1(x_embedded_sum))
        y_out = self.fc2(y_out)

        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)

        return y_out

# Auxiliary Functions

In [None]:
def make_train_state(args):
    """Create the bookkeeping dictionary for one training run.

    Args:
        args: object with a ``learning_rate`` attribute
    Returns:
        dict: early-stopping fields, the learning rate, the epoch index, empty
            per-epoch loss/accuracy lists and -1 placeholders for test metrics
    """
    train_state = dict(stop_early=False,
                       early_stopping_step=0,
                       early_stopping_best_val=1e8,
                       learning_rate=args.learning_rate,
                       epoch_index=0)
    # One independent history list per tracked metric.
    for history_key in ('train_loss', 'train_acc', 'val_loss', 'val_acc'):
        train_state[history_key] = []
    train_state['test_loss'] = -1
    train_state['test_acc'] = -1
    return train_state

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    A generator function which wraps the PyTorch DataLoader and moves every
      tensor of each batch to the requested device.

    Args:
        dataset: the dataset handed to the DataLoader
        batch_size (int): number of samples per batch
        shuffle (bool): whether the DataLoader shuffles the samples
        drop_last (bool): whether an incomplete final batch is dropped
        device: target passed to ``Tensor.to``
    Yields:
        dict: one batch, mapping each field name to its tensor on ``device``
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        yield {name: tensor.to(device) for name, tensor in batch.items()}
        

def compute_accuracy(y_pred, y_target):
    """Percentage of rows whose highest-scoring class equals the target.

    Args:
        y_pred (torch.Tensor): scores, one row per sample and one column per class
        y_target (torch.Tensor): the expected class index of every sample
    Returns:
        float: accuracy in percent (0-100)
    """
    predicted_classes = torch.max(y_pred, dim=1)[1]
    matches = torch.eq(predicted_classes, y_target)
    n_correct = matches.sum().item()
    return n_correct / len(predicted_classes) * 100

In [None]:
def set_seed_everywhere(seed, cuda):
    """Seed the NumPy and PyTorch random number generators.

    Args:
        seed (int): the seed value
        cuda (bool): when true, the CUDA generators are seeded as well
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    if not cuda:
        return
    torch.cuda.manual_seed_all(seed)

def pretty_print(results):
    """
    Print one line per (word, distance) entry, distance first with two decimals.
    """
    for entry in results:
        word, distance = entry[0], entry[1]
        print ("...[%.2f] - %s"%(distance, word))

def get_closest(target_word, word_to_idx, embeddings, n=5):
    """
    Get the n closest words to your word, nearest first.

    Args:
        target_word (str): the query word; it is lower-cased before the lookup
        word_to_idx (dict): maps every vocabulary token to its embedding row
        embeddings (torch.Tensor): the embedding matrix, one row per token
        n (int): how many neighbours to return
    Returns:
        list of (str, torch.Tensor): up to ``n`` (word, distance) pairs sorted
            by ascending distance as computed by ``torch.dist``
    Raises:
        KeyError: if the lower-cased word is not in ``word_to_idx``
    """
    # The embedding is looked up for the lower-cased word, so that same form
    # must be excluded from the candidates.
    target_word = target_word.lower()
    word_embedding = embeddings[word_to_idx[target_word]]

    # Calculate distances to all other words
    distances = [(word, torch.dist(word_embedding, embeddings[index]))
                 for word, index in word_to_idx.items()
                 if word != "<MASK>" and word != target_word]

    # The query word is already excluded, so the ranking starts at position 0
    # and exactly n entries are kept.
    return sorted(distances, key=lambda pair: pair[1])[:max(n, 0)]

# CUDA Configuration

In [None]:
# Rebinding `args` to a brand-new Namespace would discard the preprocessing
# options (window sizes, split proportions) set earlier, so re-running the
# window or split cells would fail with an AttributeError. The existing options
# are therefore carried over; when this cell runs on its own there are none.
try:
    _previous_options = dict(vars(args))
except NameError:
    _previous_options = {}

_training_options = dict(
    # Model hyper parameters
    embedding_size=50,
    # Training hyper parameters
    seed=1337,
    num_epochs=15,
    learning_rate=0.02,
    learning_rate_CBOW5=0.03,
    batch_size=2000,
    # Runtime options
    cuda=True,
    catch_keyboard_interrupt=True
)

# Training options win over any earlier option of the same name.
args = Namespace(**{**_previous_options, **_training_options})


# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

Using CUDA: True


# Vectorizing the context

In [None]:
print("Loading dataset and creating vectorizer")
# One dataset/vectorizer pair per window size. Each vectorizer's vocabulary is
# built from the 'train' rows of its own frame only.
dataset = CBOWDataset.load_dataset_and_make_vectorizer(cbow_data)
dataset_CBOW = CBOWDataset.load_dataset_and_make_vectorizer(cbow_data_CBOW5)
# Left disabled: CBOWDataset as defined in this notebook has no save_vectorizer
# method and `args` carries no vectorizer_file option.
#dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()
vectorizer_CBOW5 = dataset_CBOW.get_vectorizer()

Loading dataset and creating vectorizer


# Classifiers for CBOW2 & CBOW5

In [None]:
# One model per window size; embedding table and output layer are sized to the
# vocabulary of the matching vectorizer.
classifier = CBOWClassifier(vocabulary_size=len(vectorizer.cbow_vocab), embedding_size=args.embedding_size)
classifier_CBOW5 = CBOWClassifier(vocabulary_size=len(vectorizer_CBOW5.cbow_vocab), embedding_size=args.embedding_size)

## CBOW2 Training

In [None]:
# Train the CBOW2 model: every epoch runs once over the 'train' split (with
# parameter updates) and once over the 'val' split (evaluation only). The
# per-epoch running averages of loss and accuracy are stored in `train_state`.
classifier = classifier.to(args.device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
# Halve the learning rate when the validation loss stops improving.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=1)
train_state = make_train_state(args)

epoch_bar = tqdm_notebook(desc='training routine', total=args.num_epochs,position=0)

dataset.set_split('train')
train_bar = tqdm_notebook(desc='split=train', total=dataset.get_num_batches(args.batch_size), position=1, leave=True)
dataset.set_split('val')
val_bar = tqdm_notebook(desc='split=val', total=dataset.get_num_batches(args.batch_size), position=1,leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on
        dataset.set_split('train')
        batch_generator = generate_batches(dataset,batch_size=args.batch_size,device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = classifier(x_in=batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            # incremental mean over the batches seen so far in this epoch
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        # No parameter update happens here, so gradient tracking is switched
        # off; this avoids building an autograd graph for every batch.
        with torch.no_grad():
            for batch_index, batch_dict in enumerate(batch_generator):

                # compute the output
                y_pred = classifier(x_in=batch_dict['x_data'])

                # compute the loss
                loss = loss_func(y_pred, batch_dict['y_target'])
                loss_t = loss.item()
                running_loss += (loss_t - running_loss) / (batch_index + 1)

                # compute the accuracy
                acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
                running_acc += (acc_t - running_acc) / (batch_index + 1)
                val_bar.set_postfix(loss=running_loss, epoch=epoch_index)
                val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        #train_state = update_train_state(args=args, model=classifier, train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        #if train_state['stop_early']:
        #    break

        # rewind the per-split bars for the next epoch
        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


training routine:   0%|          | 0/15 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


split=train:   0%|          | 0/706 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  del sys.path[0]


split=val:   0%|          | 0/302 [00:00<?, ?it/s]

In [None]:
# Per-epoch average losses recorded by the CBOW2 training loop.
for line_template, history_key in (("Train loss: {};", 'train_loss'),
                                   ("Validation loss: {}", 'val_loss')):
    print(line_template.format(train_state[history_key]))

Train loss: [7.596256497899135, 7.176365122241266, 7.036511109165702, 6.953114656145782, 6.897846250966323, 6.740995705634266, 6.694492983750514, 6.602790655562972, 6.5786231293556705, 6.5266511575358415, 6.514086153959753, 6.48680519517352, 6.479241725067914, 6.4657297857084615, 6.461969693726927];
Validation loss: [7.599337768870478, 7.55161019508412, 7.545923684606494, 7.594780295100428, 7.642247848952846, 7.597471873491809, 7.607452103633752, 7.610273239628368, 7.636446397036121, 7.6426038947326465, 7.6677843577025, 7.663557486818326, 7.6796144397053485, 7.684438100713771, 7.682676585304817]


In [None]:
# Inspect the learned CBOW2 embeddings: print the nearest vocabulary words for
# a handful of probe words.
target_words = ['good', 'best', 'soldier', 'lord', 'eyes', 'gods', 'let', 'gone', 'sing']

embeddings = classifier.embedding.weight.data
word_to_idx = vectorizer.cbow_vocab._token_to_idx

for target_word in target_words:
    print(f"======={target_word}=======")
    if target_word in word_to_idx:
        pretty_print(get_closest(target_word, word_to_idx, embeddings, n=5))
    else:
        print("Not in vocabulary")

...[1.71] - excellent
...[2.02] - ok
...[2.03] - best
...[2.09] - better
...[2.11] - decent
...[2.14] - fantastic
...[1.91] - loved
...[2.00] - better
...[2.03] - liked
...[2.03] - wonderful
...[2.03] - good
...[2.03] - fantastic
Not in vocabulary
...[6.61] - policy
...[6.65] - ill
...[6.78] - everytime
...[6.80] - fly
...[6.84] - care
...[6.87] - timeshare
...[3.56] - feeling
...[3.63] - bit
...[3.67] - gets
...[3.70] - obviously
...[3.76] - nothing
...[3.77] - looked
...[7.55] - adams
...[7.57] - desert
...[7.58] - massage
...[7.82] - timeshare
...[7.83] - postcards
...[7.83] - landscape
...[2.11] - said
...[2.13] - problems
...[2.19] - guests
...[2.21] - tell
...[2.29] - problem
...[2.32] - soon
...[2.75] - got
...[2.77] - usually
...[2.84] - come
...[2.91] - started
...[2.93] - wanted
...[2.94] - unfortunately
...[5.71] - sat
...[5.78] - stage
...[5.83] - drunk
...[5.85] - tables
...[5.94] - senor
...[5.97] - party


## CBOW5 Training

In [None]:
# Train the CBOW5 model. It must be fed from `dataset_CBOW` (the window-size-5
# data): the model's embedding table is indexed by the vocabulary of
# `vectorizer_CBOW5`, which assigns other indices than the CBOW2 vocabulary.
classifier2 = classifier_CBOW5.to(args.device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier2.parameters(), lr=args.learning_rate_CBOW5)
# Halve the learning rate when the validation loss stops improving.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=1)
train_state_CBOW = make_train_state(args)
# make_train_state records args.learning_rate; this run uses the CBOW5 rate.
train_state_CBOW['learning_rate'] = args.learning_rate_CBOW5

epoch_bar = tqdm_notebook(desc='training routine', total=args.num_epochs,position=0)

dataset_CBOW.set_split('train')
train_bar = tqdm_notebook(desc='split=train', total=dataset_CBOW.get_num_batches(args.batch_size), position=1, leave=True)
dataset_CBOW.set_split('val')
val_bar = tqdm_notebook(desc='split=val', total=dataset_CBOW.get_num_batches(args.batch_size), position=1,leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state_CBOW['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on
        dataset_CBOW.set_split('train')
        batch_generator = generate_batches(dataset_CBOW,batch_size=args.batch_size,device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier2.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = classifier2(x_in=batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            # incremental mean over the batches seen so far in this epoch
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            train_bar.update()

        train_state_CBOW['train_loss'].append(running_loss)
        train_state_CBOW['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset_CBOW.set_split('val')
        batch_generator = generate_batches(dataset_CBOW, batch_size=args.batch_size, device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier2.eval()

        # No parameter update happens here, so gradient tracking is switched
        # off; this avoids building an autograd graph for every batch.
        with torch.no_grad():
            for batch_index, batch_dict in enumerate(batch_generator):

                # compute the output
                y_pred = classifier2(x_in=batch_dict['x_data'])

                # compute the loss
                loss = loss_func(y_pred, batch_dict['y_target'])
                loss_t = loss.item()
                running_loss += (loss_t - running_loss) / (batch_index + 1)

                # compute the accuracy
                acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
                running_acc += (acc_t - running_acc) / (batch_index + 1)
                val_bar.set_postfix(loss=running_loss, epoch=epoch_index)
                val_bar.update()

        train_state_CBOW['val_loss'].append(running_loss)
        train_state_CBOW['val_acc'].append(running_acc)

        #train_state = update_train_state(args=args, model=classifier, train_state=train_state)

        scheduler.step(train_state_CBOW['val_loss'][-1])

        #if train_state['stop_early']:
        #    break

        # rewind the per-split bars for the next epoch
        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, description='training routine', max=15.0, style=ProgressStyle(descript…

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


HBox(children=(FloatProgress(value=0.0, description='split=train', max=252.0, style=ProgressStyle(description_…

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  del sys.path[0]


HBox(children=(FloatProgress(value=0.0, description='split=val', max=108.0, style=ProgressStyle(description_wi…

In [None]:
# Per-epoch average losses recorded by the CBOW5 training loop.
for line_template, history_key in (("Train loss: {};", 'train_loss'),
                                   ("Validation loss: {}", 'val_loss')):
    print(line_template.format(train_state_CBOW[history_key]))

Train loss: [7.843617418455699, 7.411543138443474, 7.24949736443777, 7.150697257783683, 7.083108451631334, 6.921937535679532, 6.869814691089449, 6.833199533205183, 6.739535240899948, 6.71452906207433, 6.659168056079319, 6.644527898894417, 6.615019459573047, 6.606392949346515, 6.589446397054766];
Validation loss: [7.707993984222413, 7.618619481722515, 7.613894811383, 7.623677717314825, 7.627056157147444, 7.599351957992269, 7.649143064463577, 7.656499262209294, 7.667968136292916, 7.682904583436472, 7.6908665983765205, 7.708601669028952, 7.722981651624044, 7.72583708498213, 7.736171082214073]


In [None]:
# Inspect the learned CBOW5 embeddings: print the nearest vocabulary words for
# the same probe words as for CBOW2.
target_words = ['good', 'best', 'soldier', 'lord', 'eyes', 'gods', 'let', 'gone', 'sing']

embeddings = classifier2.embedding.weight.data
word_to_idx = vectorizer_CBOW5.cbow_vocab._token_to_idx

for target_word in target_words:
    print(f"======={target_word}=======")
    if target_word in word_to_idx:
        pretty_print(get_closest(target_word, word_to_idx, embeddings, n=5))
    else:
        print("Not in vocabulary")

...[2.03] - left
...[2.04] - day
...[2.33] - went
...[2.33] - think
...[2.39] - used
...[2.52] - wanted
...[1.73] - fantastic
...[1.86] - wonderful
...[1.97] - loved
...[1.99] - different
...[2.06] - great
...[2.06] - nice
Not in vocabulary
...[6.67] - meat
...[6.70] - difficult
...[6.71] - people
...[6.77] - trying
...[6.78] - bitten
...[6.84] - let
...[5.39] - traditional
...[5.44] - rest
...[5.47] - soon
...[5.52] - total
...[5.54] - mentioned
...[5.56] - company
...[7.70] - decide
...[7.86] - driving
...[7.89] - bought
...[7.92] - bavaro
...[7.94] - cruise
...[7.95] - worth
...[2.83] - know
...[2.85] - tell
...[2.88] - people
...[2.97] - need
...[2.99] - sure
...[3.05] - getting
...[3.94] - let
...[4.07] - good
...[4.07] - bring
...[4.07] - said
...[4.08] - wanted
...[4.11] - want
...[6.20] - hair
...[6.42] - venus
...[6.63] - eyebrows
...[6.65] - candles
...[6.71] - crowd
...[6.76] - continued
