# Task 2: Word2Vec - Data Preparation and Word2Vec Embeddings
-------------------------------------------------------------------------------

This notebook includes the word2vec embedding creation and preparation of the data using the embedding model followed by an examination of resulting embeddings.

## Imports

In [None]:
#For dataset I/O
import pandas as pd
import numpy as np
import pickle, csv
from sklearn.utils import shuffle
import random
import project2Lib

## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

#for text pre-processing
from nltk.tokenize import word_tokenize
import nltk

#for word clouds
from wordcloud import WordCloud

# Word2Vec
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.callbacks import CallbackAny2Vec


#For Keras Deep Learning Models
from tensorflow.keras import models, layers, preprocessing
from tensorflow.keras import backend as K

np.random.seed(1)

## Loading preprocessed data

In [None]:
# Preprocessing variant to load; it selects the file-name suffix below.
mode = 1

# mode -> suffix embedded in the PreprocessedData / TrainedModels file names
suffix_by_mode = {
    0: "lemmatization_noph",
    1: "lemmatization",
    2: "_noph",
    3: "_",
    4: "stemming_noph",
    5: "stemming",
}

# Any other mode leaves the suffix empty.
suffix = suffix_by_mode.get(mode, "")
    

In [None]:
# Load the train/dev/test splits for the selected preprocessing variant
# and drop every row that contains a missing value.
train_data = pd.read_csv( f'PreprocessedData/train_{suffix}.csv').dropna()
dev_data   = pd.read_csv( f'PreprocessedData/dev_{suffix}.csv'  ).dropna()
test_data  = pd.read_csv( f'PreprocessedData/test_{suffix}.csv' ).dropna()

In [None]:
# Preview the first three training rows.
train_data.head(3)

## Splitting sentences into tokens

In [None]:
# Tokenise the preprocessed text of every split into a new "tokens" column.
for frame in (train_data, dev_data, test_data):
    frame["tokens"] = frame["preprocess"].apply(nltk.word_tokenize)

### Inspecting sentence length distribution

In [None]:
def words_per_sent(txt):
    """Return the number of whitespace-separated words in ``txt``."""
    words = txt.split()
    return len(words)

# Number of tokens in each training sentence.
train_data['sent_len'] = train_data["tokens"].apply(len)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

# Left panel: overall distribution of sentence lengths.
train_data['sent_len'].hist(ax=ax1, bins=20)
ax1.set_title('Sentence Length Distribution')

# Right panel: sentence length per label; the tick names follow the label codes 0..4.
sns.boxplot(x="label", y="sent_len", data=train_data, ax=ax2)
ax2.set_title('Sentence Length By Label')
ax2.set_xticks(np.arange(5) , labels=["BACKGROUND", "OBJECTIVE", "METHODS", "RESULTS", "CONCLUSIONS"])

plt.show()

print(f"Maximum sentence length: {train_data['sent_len'].max()}")



### We see that while variance in sentence length is different for different labels, the average sentence length does not appear informative. 
- including sentence length as a variable could help a model decide that e.g. longer sentences are more likely to be "RESULTS" than "CONCLUSIONS" but the effect does not appear significant other than for outliers.
- the great majority of sentences are shorter than 50, let alone 100, words/tokens. When processing sentences sequentially we will need a maximum length to pad sequences to. A value such as 100 or 150 will be a good compromise between preserving information from sentences with outlier length and keeping the dataset size manageable. 

## Visualising most frequent words for each sentence type

We visualize the most frequent words found in sentences of each type in word clouds. Top 10 most frequent words are not visualised as they are similar for most labels. Both tf-idf and Word2Vec vectorizers have settings to deal with too common / uninformative words so this change is not needed for those models. 

In [None]:
# Skip the most frequent `ignore_top` words of each label before drawing its cloud.
ignore_top = 10


labels = ['BACKGROUND', 'OBJECTIVE','METHODS', 'RESULTS','CONCLUSIONS']
fig, axs = plt.subplots(3, 2, figsize=(15,25))

for i, label_name in enumerate(labels):

    ax = axs[i//2, i%2]

    # Flatten the token lists of this label into one array and count occurrences.
    # value_counts() sorts by descending count, so a positional slice drops the top words.
    label_tokens = np.concatenate(train_data.loc[train_data.label == i, 'tokens'].values)
    freqs = pd.Series(label_tokens).value_counts().iloc[ignore_top:]

    # generate word cloud of words with highest counts
    wordcloud = WordCloud(height=400, max_words=100, background_color="white").generate_from_frequencies(freqs)
    ax.set_title(label_name, fontsize=20 )
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")

# Five labels on a 3x2 grid: drop the unused sixth axes.
axs[2, 1].remove()
# plt.show() rather than fig.show(): the latter needs a GUI backend and only warns in a notebook.
plt.show()

## Interpretation of Word Clouds

Looking at each word cloud, we can identify certain words that seem intuitive for the respective sentence types.

### Background and Objective

These two labels display some similarities in word distribution, as we will also observe in the t-SNE plot of the averaged sentence embeddings later in the notebook. The word "aim" appears frequently in both while not appearing in the word clouds of other labels. However, these two labels are still differentiated by other words such as "whether", "assess", "investigate" which are more common for the OBJECTIVE label. We could perhaps attribute this to these words being used in establishing a formal hypothesis, which we would expect to see as an objective sentence.


### Methods

We see that words such as "day", "month", "year", "placebo", "measure" and "trial" are prominent for this label, which is highly intuitive given that a research paper methodology based on clinical trials will likely have a scheduling / timing component as well as possibly a standard procedure for drug administration. 


### Results and Conclusion

For these two labels we observe quantity comparison / interpretation related words such as "increase", "mean", "high", "rate", "reduce". It appears intuitive that a RESULTS sentence would uncover or confirm a relationship between quantities, possibly using numerical data and its description. A conclusion sentence may be referring to the previously declared results or may be restating them. It appears difficult to find an intuitive way to distinguish the word clouds for these two labels. 

# Word2Vec Model

A pretrained Gensim Word2Vec model with 200-dimensional embeddings is given in the repository. After running the constants cell, you can choose to either skip the model creation cell and load keyed vectors directly, or run the model and then load the keyed vectors from the new saved model.

## Constants

In [None]:
# Word2Vec hyper-parameters; they are passed to the Word2Vec constructor in the training cells.
epochs=15
vector_dim = 200  # embedding dimensionality (passed as vector_size)
window = 10  # context window
min_count = 10  # min word count filter
# Path the trained vectors are exported to and loaded from.
save_name = f'./TrainedModels/w2v_{vector_dim}_{suffix}.bin'

#choosing skip-gram over CBOW
sg = 1
#for using hierarchical softmax in word2vec model
hs = 1

# callback function for observing loss after each epoch

class callback(CallbackAny2Vec):
    """Training callback that reports the loss once per epoch."""

    def __init__(self):
        # accum_loss is initialised here but not read anywhere in this class.
        self.accum_loss = 0
        self.epoch = 0

    def on_epoch_end(self, model):
        """Print the loss the model reports for the epoch that just ended."""
        epoch_loss = model.get_latest_training_loss()
        # Reset the model's running loss so the next reading starts again from zero.
        model.running_training_loss = 0

        print(f'Loss at epoch {self.epoch}: {epoch_loss}')
        self.epoch = self.epoch + 1


In [None]:
# shuffling data as Gensim does not do it automatically
shuffled_token_sents = train_data["tokens"].copy().sample(frac=1).values.tolist()

wv_model = Word2Vec(shuffled_token_sents, workers = 20, 
               vector_size=vector_dim,  # vector dim    
               min_count =  min_count, # min word count filter 
               window = window , # context window      
               sg = sg,
               hs = hs,
               callbacks=[callback()],
               epochs = epochs,
               compute_loss=True
               )


print("Model vocabulary size: " + str(len(wv_model.wv.key_to_index)))

# Save Model
# L2-normalise the vectors in place before exporting them. This replaces
# `wv_model.init_sims(replace=True)`, which is deprecated in Gensim 4;
# `unit_normalize_all()` is the operation it delegated to.
wv_model.wv.unit_normalize_all()
wv_model.wv.save_word2vec_format(save_name, binary=True)
# Reload from disk so `kv` is exactly what later runs will load.
kv = KeyedVectors.load_word2vec_format(save_name, binary=True) 

## To load existing model keyed vectors:

In [None]:
# Load the exported vectors from save_name (binary word2vec format) without retraining.
kv = KeyedVectors.load_word2vec_format(save_name, binary=True) 

# Analysing semantic relationships between embeddings 

### Observing similarity relationships

In [None]:
# Words whose embeddings are most similar to "doctor".
kv.most_similar(positive="doctor")

In [None]:
# Words whose embeddings are most similar to "patient".
kv.most_similar(positive="patient")

In [None]:
# Words whose embeddings are most similar to "cancer".
kv.most_similar(positive="cancer")

In [None]:
# Words whose embeddings are most similar to "diabetes".
kv.most_similar(positive="diabetes")

Now, to compare word similarity performance to previous studies using Word2Vec, we use the word aspirin that was used in the following paper:

Miñarro-Giménez, J. A., Marín-Alonso, O., and Samwald, M., “Applying deep learning techniques on medical corpora from the World Wide Web: a prototypical system and evaluation”, <i>arXiv e-prints</i>, 2015.


In [None]:
# Words whose embeddings are most similar to "aspirin", the query word taken from the cited paper.
kv.most_similar(positive="aspirin")

The top results appear to be mostly the same words with variations in cosine similarity value.

## Observing Analogy Relationships
-------------------------------------------------
In this section we will inspect whether binary semantic relationships between words are captured by Word2Vec embeddings and whether these relationships are directly applicable to similar expected semantic relationships between other word pairs.

As the model was trained on medical research paper abstracts, the vocabulary does not lend itself to a wide variety of analogies as a non-medical corpus may. However, we still expect to uncover reasonable examples of intuitive relationships. Simple relationships may be captured from words that are common in general language structures, however data may not be diverse enough to capture relationships such as "king is to man as queen is to woman".

_**Word 1** is to **Word 2** as **New Word 1** is to **New Word 2**_

w2v( **Word 1** ) - w2v( **New Word 1** )  =  w2v( **Word 2** ) - w2v( **New Word 2** )

In [None]:
def analogy(kv, word1, word2, new_word1):
    """Answer "word1 is to word2 as new_word1 is to ?" with the embeddings in ``kv``.

    Queries ``kv.most_similar`` with ``word2`` and ``new_word1`` as positive
    terms and ``word1`` as the negative term, i.e. it looks for words close to
    ``w2v(word2) - w2v(word1) + w2v(new_word1)``.

    Args:
        kv: object exposing ``most_similar(positive=..., negative=...)``
            (the notebook passes the loaded KeyedVectors).
        word1: first word of the reference pair.
        word2: second word of the reference pair.
        new_word1: first word of the pair to complete.

    Returns:
        Whatever ``kv.most_similar`` returns for that query.
    """
    return kv.most_similar(negative=[word1], positive=[word2, new_word1])

In [None]:
# "good" is to "bad" as "successful" is to ...?
analogy(kv, "good","bad", "successful")

In [None]:
# "healthy" is to "ill" as "recover" is to ...?
analogy(kv, "healthy","ill", "recover")

In [None]:
# "significantly" is to "slightly" as "severe" is to ...?
analogy(kv, "significantly","slightly", "severe")

In all three of the examples above, an opposite / dialectic relationship is captured in a significant portion of the response words.

In [None]:
# "breakfast" is to "morning" as "dinner" is to ...?
analogy(kv, "breakfast","morning", "dinner")

In [None]:
# "fever" is to "paracetamol" as "schizophrenia" is to ...?
analogy(kv, "fever","paracetamol", "schizophrenia")

These two examples demonstrate different relationships based on associations. The relationship between time of day and meal type is captured in the first one, while the relationship between a condition and a corresponding treatment option is captured in the last example.

# t-SNE Plot of Word2Vec Embeddings

In [None]:
def tsne_word2vec(kv, num_vecs = 20000, num_annotations = 100):
    """Plot a 2-D t-SNE projection of the first ``num_vecs`` word vectors in ``kv``.

    Args:
        kv: keyed vectors exposing ``key_to_index`` and ``vectors``.
        num_vecs: number of leading vectors to project.
        num_annotations: number of randomly chosen points that get a word label
            (capped at the number of projected points).
    """
    labels = list(kv.key_to_index.keys())[:num_vecs]

    tsne_model = TSNE(perplexity=30, n_components=2, verbose=1,
                      init='pca', n_iter=300, random_state=1)
    data = tsne_model.fit_transform(kv.vectors[:num_vecs])

    x = data[:,0]
    y = data[:,1]

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request for small vocabularies.
    n_annotate = min(num_annotations, len(x))
    to_annotate = sorted(random.sample(range(0, len(x)), n_annotate))

    plt.figure(figsize=(16, 16))
    # Draw every point with one call: one scatter artist per point is very slow
    # for thousands of points, and `cmap` has no effect without colour data.
    plt.scatter(x, y)
    for i in to_annotate:
        plt.annotate(labels[i],
                     xy=(x[i]+0.25, y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom',
                     size='medium',
                     backgroundcolor="white"
                    )
    plt.show()

In [None]:
# Project and plot the word vectors with the function's default num_vecs.
tsne_word2vec(kv)

It is difficult to make intuitive interpretations based on the word embedding t-SNE produced here, partially since we annotate a random sample of embedded words. Interpreting analogies and most similar words to single words is a better way of observing the semantic quality of the model.  

# Preparing data for classifiers

## To get embedding matrix lookup index sequences for each sentence - for sequential classifiers

Instead of saving the Word2Vec embeddings for each word of each sentence directly in the dataset, we save the vectors' lookup indices corresponding to the Word2Vec model's vector matrix. Later, this matrix will be placed in a Keras embedding layer that will retrieve the word vectors on the fly. This way we save memory.

In [None]:
# Replace each split with the output of project2Lib.words_to_idx, which is given the
# keyed vectors and a save_name; presumably it adds the lookup-index sequences and
# writes the frame to save_name — confirm in project2Lib.
# NOTE(review): the vectorize_dataset calls later in the notebook use the same
# save_name paths; verify whether that later save overwrites these files.
dev_data   = project2Lib.words_to_idx(dev_data,   kv, save_name = f"PreprocessedData/dev_{suffix}_w2v")
test_data  = project2Lib.words_to_idx(test_data,  kv, save_name = f"PreprocessedData/test_{suffix}_w2v")
train_data = project2Lib.words_to_idx(train_data, kv, save_name = f"PreprocessedData/train_{suffix}_w2v")

In [None]:
# read data
#train_data = pd.read_pickle (f"PreprocessedData/train_{suffix}_w2v")
#dev_data = pd.read_pickle (f"PreprocessedData/dev_{suffix}_w2v")
#test_data = pd.read_pickle (f"PreprocessedData/test_{suffix}_w2v")

## Averaging words to obtain sentence embeddings - for non sequential models:

For classifiers that cannot process word embeddings sequentially, we need some type of aggregation to obtain sentence embeddings. One way of doing this is averaging word embeddings, which we carry out below. 

In [None]:
# Replace each split with the output of project2Lib.vectorize_dataset; presumably this
# adds the averaged sentence vectors (the "avg_vectors" column read by
# tsne_sentence_vec) and saves the frame to save_name — confirm in project2Lib.
# NOTE(review): the save_name paths are identical to those passed to words_to_idx above.
dev_data   = project2Lib.vectorize_dataset(dev_data,   kv, save_name = f"PreprocessedData/dev_{suffix}_w2v")
test_data  = project2Lib.vectorize_dataset(test_data,  kv, save_name = f"PreprocessedData/test_{suffix}_w2v")
train_data = project2Lib.vectorize_dataset(train_data, kv, save_name = f"PreprocessedData/train_{suffix}_w2v")

# Visualizing average sentence vectors with t-SNE

To get an intuition about how well the average sentence vectors cluster or whether averaging retains enough discriminative information, we observe the t-SNE plot of our averaged sentence vectors for each sentence label. To keep the computation time reasonable, we run t-SNE on the dev set instead of the training set.

In [None]:

def tsne_sentence_vec(df):
    """Project the averaged sentence vectors in ``df`` to 2-D with t-SNE and plot them by label.

    Reads the ``avg_vectors`` and ``label`` columns, writes the two t-SNE
    coordinates back into ``df`` as ``tsne-1`` / ``tsne-2``, saves the figure as
    ``tsne_avg_sent_{suffix}.png`` (``suffix`` is the notebook-level variable)
    and then shows it.
    """
    class_names = ["BACKGROUND", "OBJECTIVE", "METHODS", "RESULTS", "CONCLUSIONS"]

    projector = TSNE(perplexity=30, n_components=2, init='pca',
                     n_iter=500, random_state=1, learning_rate='auto', verbose=1)
    sentence_matrix = np.stack(df["avg_vectors"].values)
    embedded = projector.fit_transform(sentence_matrix)

    # Store the coordinates on the frame so seaborn can address them by column name.
    df['tsne-1'] = embedded[:, 0]
    df['tsne-2'] = embedded[:, 1]

    plt.figure(figsize=(16, 16))
    ax = sns.scatterplot(
        data=df,
        x='tsne-1', y='tsne-2',
        hue="label",
        palette=sns.color_palette("hls", 5),
        legend=True,
        alpha=0.9,
    )

    # Keep seaborn's legend handles but show the class names instead of the label codes.
    handles, _ = ax.get_legend_handles_labels()
    ax.legend(handles=handles, title='Classes', loc='upper right', labels=class_names)

    plt.gcf().savefig(f'tsne_avg_sent_{suffix}.png')

    plt.show()

In [None]:
# Plot the dev split's averaged sentence vectors (this also adds tsne-1 / tsne-2 columns to dev_data).
tsne_sentence_vec(dev_data)

On the averaged sentence vector t-SNE plot we can make the following observations:
- RESULTS and METHODS sentences both make up a large portion of the data and form more visibly separable regions compared to other sentence types

- BACKGROUND and CONCLUSION appear to be clustering in very similar regions

- OBJECTIVE does not display an apparent clustering behavior other than being relatively separable from RESULTS and, to a lesser degree, from METHODS

- We know that conclusions tend to come last in English writing in general, while background information is likely to be towards the beginning; therefore the separability issue we observe can be mitigated by introducing the relative position of the sentence in the abstract as a feature. Indeed, for both averaged-sentence-vector-based models and sequence classification models in the other notebooks, we will see that a significant performance boost comes with this change. 

# For comparison with smaller dataset:

The dataset preparation and Word2Vec related steps above are repeated below for the smaller 20k dataset, later to be used in the classifiers as well. We aim to identify whether working with the larger dataset makes a noticeable difference in classifier performance.

In [None]:
# Load the train/dev/test splits of the smaller dataset ("_small" files) for the same
# preprocessing variant, dropping every row that contains a missing value.
train_data_small = pd.read_csv( f'PreprocessedData/train_{suffix}_small.csv').dropna()
dev_data_small   = pd.read_csv( f'PreprocessedData/dev_{suffix}_small.csv'  ).dropna()
test_data_small  = pd.read_csv( f'PreprocessedData/test_{suffix}_small.csv' ).dropna()

In [None]:
# Tokenise the preprocessed text of every small split into a new "tokens" column.
for frame in (train_data_small, dev_data_small, test_data_small):
    frame["tokens"] = frame["preprocess"].apply(nltk.word_tokenize)

In [None]:
# Export path for the vectors trained on the smaller dataset.
save_name_small = f'./TrainedModels/w2v_{vector_dim}_{suffix}_small.bin'

In [None]:
# shuffling data as Gensim does not do it automatically
shuffled_token_sents_small = train_data_small["tokens"].copy().sample(frac=1).values.tolist()

wv_model_small = Word2Vec(shuffled_token_sents_small, workers = 20, 
               vector_size=vector_dim,  # vector dim    
               min_count =  min_count, # min word count filter 
               window = window , # context window      
               sg = sg,
               hs = hs,
               callbacks=[callback()],
               epochs = epochs,
               compute_loss=True
               )


print("Model vocabulary size: " + str(len(wv_model_small.wv.key_to_index)))

# Save Model
# L2-normalise the vectors in place before exporting them. This replaces
# `wv_model_small.init_sims(replace=True)`, which is deprecated in Gensim 4;
# `unit_normalize_all()` is the operation it delegated to.
wv_model_small.wv.unit_normalize_all()
wv_model_small.wv.save_word2vec_format(save_name_small, binary=True)
# Reload from disk so `kv_small` is exactly what later runs will load.
kv_small = KeyedVectors.load_word2vec_format(save_name_small, binary=True) 

In [None]:
# Same index-lookup preparation as for the full dataset, using the vectors trained on the
# small dataset (kv_small) and "_small" output paths.
# NOTE(review): the vectorize_dataset calls in the next cell reuse these save_name paths —
# verify whether that later save overwrites these files.
dev_data_small   = project2Lib.words_to_idx(dev_data_small  , kv_small, save_name = f"PreprocessedData/dev_{suffix}_w2v_small")
test_data_small  = project2Lib.words_to_idx(test_data_small,  kv_small, save_name = f"PreprocessedData/test_{suffix}_w2v_small")
train_data_small = project2Lib.words_to_idx(train_data_small, kv_small, save_name = f"PreprocessedData/train_{suffix}_w2v_small")

In [None]:
# Same project2Lib.vectorize_dataset step as for the full dataset, applied to the small
# splits with kv_small; presumably adds averaged sentence vectors and saves to save_name —
# confirm in project2Lib.
dev_data_small   = project2Lib.vectorize_dataset(dev_data_small,   kv_small, save_name = f"PreprocessedData/dev_{suffix}_w2v_small")
test_data_small  = project2Lib.vectorize_dataset(test_data_small,  kv_small, save_name = f"PreprocessedData/test_{suffix}_w2v_small")
train_data_small = project2Lib.vectorize_dataset(train_data_small, kv_small, save_name = f"PreprocessedData/train_{suffix}_w2v_small")