<a href="https://colab.research.google.com/github/Sarztak/nlp-authorship-attribution/blob/main/yelp_restaurant_sentiments_review_doc2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences, skipgrams, make_sampling_table
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, Input, Flatten, Dense
from tensorflow.keras import Sequential
from rich.traceback import install
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import io
from nltk.util import ngrams
import tensorflow as tf
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import CategoricalCrossentropy

install()
drive_path = Path('/content/drive/MyDrive/yelp-restaurant-reviews-sentiments')

In [2]:
sample_size = 2000
df = pd.read_csv(drive_path / "train.csv").sample(sample_size, random_state=1984).copy()

In [3]:
X, y = df.text.values, df.label.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1984)

In [4]:
vocab_size = 5000 # limit the vocabulary to the 5000 most common words
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
tokenizer.fit_on_texts(X_test)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [5]:
len(tokenizer.word_index)

15171

In [6]:
ngram_size = 2
contexts, docids, targets = [], [], []
for docid, s in enumerate(X_train_seq):
    docids.append(docid)
    ngram_list = list(ngrams(X_train_seq[docid], ngram_size))
    for i in range(len(ngram_list) - 1):
        contexts.append([docid, *ngram_list[i]])
        targets.append(ngram_list[i + 1][-1])

In [7]:
contexts[:4], targets[:3]

([[0, 149, 16], [0, 16, 31], [0, 31, 86], [0, 86, 50]], [31, 86, 50])

In [8]:
len(contexts), len(docids), len(targets)

(178620, 1400, 178620)

In [9]:
min(targets), max(targets), vocab_size, len(tokenizer.word_index)

(1, 4999, 5000, 15171)

In [10]:
targets = tf.one_hot(targets, depth=vocab_size + 1)

In [11]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
AUTOTUNE = tf.data.AUTOTUNE
dataset = tf.data.Dataset.from_tensor_slices((contexts, targets))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<_BatchDataset element_spec=(TensorSpec(shape=(1024, 3), dtype=tf.int32, name=None), TensorSpec(shape=(1024, 5001), dtype=tf.float32, name=None))>


In [12]:
class Doc2Vec(tf.keras.Model):
  def __init__(self, doc_size, vocab_size, embedding_dim):
    super().__init__()
    self.doc_embedding = Embedding(doc_size,
                                    embedding_dim,
                                    name="doc_embedding")
    self.context_embedding = Embedding(vocab_size,
                                       embedding_dim,
                                       name="vec_embedding")
    self.flatten = Flatten()
    self.dense = Dense(vocab_size)

  def call(self, x):
    docid = tf.reshape(x[:, 0], (x.shape[0], 1))
    _context = x[:, 1:]
    doc_emb = self.doc_embedding(docid) # (b, 1, emb_dim)
    context_emb = self.context_embedding(_context) # (b, ngram_size, emb_dim)
    input = tf.concat([doc_emb, context_emb], axis=1)
    y = self.flatten(input)
    y = self.dense(y)
    return y

In [16]:
embedding_dim = 128
doc_size = len(docids)
doc2vec = Doc2Vec(doc_size, vocab_size + 1, embedding_dim)
doc2vec.compile(optimizer="adam", loss=CategoricalCrossentropy(from_logits=True), metrics=["accuracy"])
doc2vec.fit(dataset, epochs=60)

Epoch 1/60
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 26ms/step - accuracy: 0.0526 - loss: 7.9542
Epoch 2/60
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - accuracy: 0.0932 - loss: 6.0149
Epoch 3/60
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.1171 - loss: 5.5975
Epoch 4/60
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.1359 - loss: 5.3326
Epoch 5/60
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.1487 - loss: 5.1102
Epoch 6/60
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 24ms/step - accuracy: 0.1628 - loss: 4.8930
Epoch 7/60
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.1747 - loss: 4.6931
Epoch 8/60
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step - accuracy: 0.1877 - loss: 4.4944
Epoch 9/60
[1m174/174[0m [32m

<keras.src.callbacks.history.History at 0x7cdcac9ac5c0>