## Word Embeddings 

This file preprocesses our final dataset into a proper training set for Word2Vec and builds the model itself using TensorFlow.

In [None]:
import numpy as np
import tensorflow as tf
import pickle
import nltk
import collections
import itertools
import re

In this cell we load the pickled classical, romantic, Victorian, and modern datasets of words and sentences.

In [None]:
# loading datasets
data_folder = '/Users/vibhav/Desktop/rp-project-data'


def _load_pickle(file_name):
    """Unpickle ``file_name`` from ``data_folder`` and return the loaded object.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes (or fails), instead of being left open until garbage
    collection.
    """
    # NOTE: pickle can execute arbitrary code while loading; only point this
    # at files that were produced by a trusted source.
    with open(data_folder + '/' + file_name, mode='rb') as handle:
        return pickle.load(handle)


classical = _load_pickle('classical.pkl')
romantic = _load_pickle('romantic.pkl')
victorian = _load_pickle('victorian.pkl')
modern = _load_pickle('modern.pkl')

## Generating Training Data

In [None]:
# English stop-word list and Snowball stemmer from NLTK.
# Neither object is referenced again in the cells visible here.
stop_words = nltk.corpus.stopwords.words('english')
stemmer = nltk.stem.SnowballStemmer('english')

def standardization(sent):
    """Lower-case ``sent`` and strip every character that is not a letter or whitespace.

    Used as the ``standardize`` callable of the ``TextVectorization`` layers,
    so it is written with ``tf.strings`` ops rather than Python ``str`` methods.

    Args:
        sent: String input passed in by the vectorization layer.

    Returns:
        The lower-cased input with all characters outside ``[a-zA-Z\\s]`` removed.
    """
    sent = tf.strings.lower(sent)
    # Raw string: '\s' is not a valid Python escape, so the non-raw literal
    # triggered an invalid-escape warning. The pattern itself is unchanged.
    return tf.strings.regex_replace(sent, r'[^a-zA-Z\s]', '')

vocab_size = 20000
seq_length = 10


def _new_vectorizer():
    """Build one integer-output TextVectorization layer using the shared settings."""
    return tf.keras.layers.TextVectorization(
        standardize=standardization,
        output_mode='int',
        output_sequence_length=seq_length,
        split='whitespace',
    )


def _sentence_dataset(corpus):
    """Wrap the 'sentences' entry of a loaded corpus in a tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices(corpus['sentences'])


def _to_int_sentences(sentences, layer):
    """Map batches of 500 sentences through ``layer`` and unbatch the result."""
    batched = sentences.batch(500).prefetch(tf.data.AUTOTUNE)
    return batched.map(layer).unbatch()


# one text vectorizer per corpus
ro_vectorizer_layer = _new_vectorizer()
vi_vectorizer_layer = _new_vectorizer()
mo_vectorizer_layer = _new_vectorizer()

# raw sentence datasets used as training samples
ro_dataset = _sentence_dataset(romantic)
vi_dataset = _sentence_dataset(victorian)
mo_dataset = _sentence_dataset(modern)

# fit each vectorizer to its own corpus
ro_vectorizer_layer.adapt(ro_dataset.batch(500))
vi_vectorizer_layer.adapt(vi_dataset.batch(500))
mo_vectorizer_layer.adapt(mo_dataset.batch(500))

# vocabularies learned by each layer
ro_vocab = ro_vectorizer_layer.get_vocabulary()
vi_vocab = vi_vectorizer_layer.get_vocabulary()
mo_vocab = mo_vectorizer_layer.get_vocabulary()

# sentences encoded as fixed-length integer sequences
ro_int_sents = _to_int_sentences(ro_dataset, ro_vectorizer_layer)
vi_int_sents = _to_int_sentences(vi_dataset, vi_vectorizer_layer)
mo_int_sents = _to_int_sentences(mo_dataset, mo_vectorizer_layer)

# materialise every encoded sentence as a NumPy array
ro_seqs = list(ro_int_sents.as_numpy_iterator())
vi_seqs = list(vi_int_sents.as_numpy_iterator())
mo_seqs = list(mo_int_sents.as_numpy_iterator())

# number of sequences per corpus, one count per line
print(len(ro_seqs), len(vi_seqs), len(mo_seqs), sep='\n')

In [None]:
# generating skip-grams
def _skipgram_arrays(seqs, vocab_len):
    """Generate skip-gram samples for every sequence of one corpus.

    Args:
        seqs: Iterable of integer-encoded sentences.
        vocab_len: Vocabulary size of the corpus the sequences come from; passed
            to ``skipgrams`` as ``vocabulary_size``.

    Returns:
        A ``(samples, labels)`` pair of 1-D object arrays holding the
        concatenated, flattened ``skipgrams`` outputs of all sequences.
    """
    # Seeding both lists with an empty object array keeps the result dtype
    # (object) and the empty-corpus result identical to repeated np.append.
    empty = np.array([], dtype=object)
    sample_chunks = [empty]
    label_chunks = [empty]
    for seq in seqs:
        couples, couple_labels = tf.keras.preprocessing.sequence.skipgrams(
            seq, vocabulary_size=vocab_len, window_size=5, negative_samples=1.0)
        # NOTE(review): ravel flattens the (target, context) couples exactly as
        # np.append(..., axis=None) did, so `samples` ends up twice as long as
        # `labels`. Confirm this layout is what the training code expects.
        sample_chunks.append(np.ravel(couples))
        label_chunks.append(np.ravel(couple_labels))
    # Concatenate once instead of np.append per sentence, which re-copied the
    # whole accumulated array on every iteration.
    return np.concatenate(sample_chunks), np.concatenate(label_chunks)


# Each corpus uses its own vocabulary size, and mo_labels is now actually
# defined (it was previously never initialised, raising NameError).
ro_dataset, ro_labels = _skipgram_arrays(ro_seqs, len(ro_vocab))
vi_dataset, vi_labels = _skipgram_arrays(vi_seqs, len(vi_vocab))
mo_dataset, mo_labels = _skipgram_arrays(mo_seqs, len(mo_vocab))

    



## Building the Model

In [None]:
#word2vec subclassing with keras

class Word2Vec(tf.keras.Model):
    """Word2Vec model with separate target and context embedding tables.

    Args:
        vocab_len: Number of tokens in the vocabulary; sets the number of rows
            of both embedding tables.
        dim: Dimensionality of the embedding vectors.
    """

    def __init__(self, vocab_len, dim):
        super().__init__()
        # input layer/target matrix
        self.t_emb = tf.keras.layers.Embedding(vocab_len, dim, input_length=1, name="w2v_embedding")
        # hidden layer/context matrix; sized from vocab_len (not the
        # module-level vocab_size) so both tables cover the same token ids.
        self.c_emb = tf.keras.layers.Embedding(vocab_len, dim, input_length=6)

    def call(self, tuple):
        """Return the dot product of target and context embeddings.

        Args:
            tuple: Pair-like input where ``tuple[0]`` holds target token ids and
                ``tuple[1]`` holds context token ids. (The parameter name
                shadows the builtin; it is kept so existing keyword callers
                keep working.)

        Returns:
            Tensor of dot products taken over the embedding (last) axis.
        """
        target_vecs = self.t_emb(tuple[0])
        context_vecs = self.c_emb(tuple[1])
        # TensorFlow ops instead of np.dot: NumPy cannot consume symbolic
        # tensors during training and would cut the gradient path.
        # NOTE(review): assumes the two id tensors have broadcast-compatible
        # shapes (e.g. (batch,) with (batch,), or (batch, 1) with (batch, k))
        # - confirm against the training inputs.
        return tf.reduce_sum(target_vecs * context_vecs, axis=-1)
    

In [None]:
# One Word2Vec model per corpus, each sized to that corpus' vocabulary.
vec_dimension = 300
w2v_romantic = Word2Vec(len(ro_vocab), vec_dimension)
w2v_victorian = Word2Vec(len(vi_vocab), vec_dimension)
w2v_modern = Word2Vec(len(mo_vocab), vec_dimension)

#exemplar optimization and cosine loss
# CosineSimilarity has no `from_logits` parameter; passing it raised a
# TypeError before any model could be compiled, so it is dropped here.
# NOTE(review): if the model output is meant to be a logit for 0/1 skip-gram
# labels, BinaryCrossentropy(from_logits=True) may be the intended loss - confirm.
w2v_romantic.compile(optimizer='ftrl', loss=tf.keras.losses.CosineSimilarity(), metrics=['accuracy'])
w2v_victorian.compile(optimizer='ftrl', loss=tf.keras.losses.CosineSimilarity(), metrics=['accuracy'])
w2v_modern.compile(optimizer='ftrl', loss=tf.keras.losses.CosineSimilarity(), metrics=['accuracy'])