In [None]:
import os

import nltk
import numpy as np
import pandas as pd
from gensim.models import FastText, Word2Vec
from nltk.tokenize import word_tokenize

nltk.download('punkt_tab')

In [None]:
# Base folder for this notebook's data and outputs (presumably the Colab
# Google Drive mount — confirm the drive is mounted before running).
# NOTE: the value ends with a trailing "/", which matters wherever paths are
# built from it by string concatenation.
drive_path = "/content/drive/MyDrive/Year 4/NLP notebooks and data/"

In [None]:
# Load the cleaned review dataset. The path is derived from `drive_path`
# instead of repeating the folder literal, so relocating the data folder only
# requires changing one variable.
df = pd.read_csv(os.path.join(drive_path, 'dataset_cleaned.csv'))

In [None]:
# Raw review strings, one per dataframe row (row order is preserved so the
# embeddings computed later stay aligned with `df`).
# Missing reviews are replaced by '' first: a bare astype(str) would turn NaN
# into the literal string 'nan', which would then be tokenized and learned as
# if it were a real word.
texts = df['review_text'].fillna('').astype(str).tolist()

# Function Definitions

In [None]:
def tokenize_sentences(name_tokenizer="nltk", sentences=None):
  """Lower-case and word-tokenize a collection of sentences.

  Args:
    name_tokenizer: Name of the tokenizer backend. Only "nltk" is supported.
    sentences: Iterable of strings to tokenize. When omitted, the
      module-level `texts` list is used (the original behaviour).

  Returns:
    A list with one list of tokens per input sentence.

  Raises:
    ValueError: If `name_tokenizer` is not a supported backend. Previously an
      unknown name fell through to the `return` and failed with an
      UnboundLocalError.
  """
  if name_tokenizer != "nltk":
    raise ValueError(
        f"Unsupported tokenizer {name_tokenizer!r}; expected 'nltk'"
    )
  if sentences is None:
    # Fall back to the notebook-level corpus so existing calls keep working.
    sentences = texts
  return [word_tokenize(sentence.lower()) for sentence in sentences]

In [None]:
# Function to compute mean word-embedding vectors per sentence
def mean_fasttext(tokenized_sentences, fasttext_model, embedding_dim=300):
    """Average the word vectors of every tokenized sentence.

    Args:
        tokenized_sentences: Iterable of token lists, one per sentence.
        fasttext_model: Trained model exposing `.wv` with `key_to_index` and
            item lookup by word (it is used here with gensim models).
        embedding_dim: Fallback dimensionality for the all-zero vector, used
            only when `fasttext_model.wv` does not expose `vector_size`.

    Returns:
        np.ndarray of shape (num_sentences, dim). A sentence with no
        in-vocabulary token is represented by a zero vector.
    """
    keyed_vectors = fasttext_model.wv
    known_words = keyed_vectors.key_to_index
    # Prefer the model's real dimensionality: the default of 300 does not
    # match a model trained with another vector_size, and a mismatched zero
    # vector would make the stacked result ragged.
    dim = getattr(keyed_vectors, "vector_size", embedding_dim)

    mean_vectors = []
    for sentence in tokenized_sentences:
        vectors = [keyed_vectors[word] for word in sentence if word in known_words]
        if vectors:
            mean_vectors.append(np.mean(vectors, axis=0))
        else:
            mean_vectors.append(np.zeros(dim))

    if not mean_vectors:
        # Keep the result two-dimensional even for an empty corpus.
        return np.empty((0, dim))
    return np.array(mean_vectors)

# NOTE: the embeddings are computed in the FastText section, once
# `tokenized_sentences` and `fasttext_model` exist. Calling the function in
# this cell raised NameError on a fresh kernel (Restart & Run All).

## Tokenizing Sentences

In [None]:
# Tokenize the whole review corpus with the NLTK backend.
tokenized_sentences = tokenize_sentences(name_tokenizer="nltk")

# FastText

In [None]:
# Instantiate an untrained FastText model, then build its vocabulary and
# train it on the tokenized corpus.
fasttext_model = FastText(vector_size=100, window=3, min_count=1)
fasttext_model.build_vocab(corpus_iterable=tokenized_sentences)
# total_examples comes from the model's own sentence count (set by
# build_vocab) rather than len(texts), so it always matches the corpus that
# is actually being trained on.
fasttext_model.train(
    corpus_iterable=tokenized_sentences,
    total_examples=fasttext_model.corpus_count,
    epochs=10,
)
# get vocab keys with indices
vocab = fasttext_model.wv.key_to_index

In [None]:
# Show the first ten vocabulary entries as a quick sanity check.
first_vocab_keys = list(fasttext_model.wv.key_to_index)[:10]
print(first_vocab_keys)

In [None]:
# One mean FastText vector per review. The dimensionality is read from the
# trained model instead of repeating the literal 100 used at construction.
mean_vectors = mean_fasttext(
    tokenized_sentences,
    fasttext_model,
    embedding_dim=fasttext_model.vector_size,
)
print(mean_vectors.shape)  # Shape: (num_sentences, vector_size)


In [None]:
# Words the FastText model ranks as most similar to 'amazing'.
fasttext_model.wv.most_similar(positive='amazing')

In [None]:
# Words the FastText model ranks as most similar to 'dirty'.
fasttext_model.wv.most_similar(positive='dirty')

In [None]:
# Words the FastText model ranks as most similar to 'shocked'.
fasttext_model.wv.most_similar(positive='shocked')

In [None]:
# Save the FastText sentence embeddings as a .npy file.
# os.path.join avoids the doubled "/" that string concatenation produced
# (drive_path already ends with a separator), and makedirs prevents np.save
# from failing when the output folder does not exist yet.
fasttext_out_dir = os.path.join(drive_path, 'dense_vectors')
os.makedirs(fasttext_out_dir, exist_ok=True)
np.save(os.path.join(fasttext_out_dir, 'fasttext_vectors.npy'), mean_vectors)

# Word2Vec

In [None]:
# Fit a Word2Vec model on the tokenized review corpus.
word2vec_model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [None]:
# Words the Word2Vec model ranks as most similar to "dirty".
w1 = "dirty"
word2vec_model.wv.most_similar(positive=w1)

In [None]:
# Words the Word2Vec model ranks as most similar to "polite".
w2 = "polite"
word2vec_model.wv.most_similar(positive=w2)

In [None]:
# Words the Word2Vec model ranks as most similar to "shocked".
w3 = "shocked"
word2vec_model.wv.most_similar(positive=w3)

In [None]:
# Get one mean Word2Vec vector per sentence.
# `get_text_vector` is not defined anywhere in this notebook, so the original
# line raised NameError on a fresh kernel. `mean_fasttext` only needs a model
# exposing `.wv`, so it is reused here for the Word2Vec model.
# NOTE(review): this assumes the missing helper also averaged word vectors —
# confirm against the original implementation if it is available.
embedding_dim = word2vec_model.vector_size
sentence_vectors = mean_fasttext(tokenized_sentences, word2vec_model, embedding_dim)

In [None]:
# Save the Word2Vec sentence embeddings as a .npy file.
# os.path.join avoids the doubled "/" that string concatenation produced
# (drive_path already ends with a separator), and makedirs prevents np.save
# from failing when the output folder does not exist yet.
word2vec_out_dir = os.path.join(drive_path, 'dense_vectors')
os.makedirs(word2vec_out_dir, exist_ok=True)
np.save(os.path.join(word2vec_out_dir, 'word2vec_vectors.npy'), np.array(sentence_vectors))