In [1]:
import sys
import os
# Make the parent of the current working directory importable so that
# project packages located there (such as `src`) can be imported below.
notebook_dir = os.path.abspath(os.getcwd())
project_root = os.path.split(notebook_dir)[0]
if sys.path.count(project_root) == 0:
    # Appended (not prepended), and only once even if the cell is re-run.
    sys.path.append(project_root)

import pandas as pd
from torchtext.data import get_tokenizer
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from tqdm import tqdm
import src.config as config



In [2]:
class CorpusIterator:
    """Restartable stream of tokenized reviews drawn from several DataFrames.

    Each call to ``iter()`` starts a fresh pass over ``dataframes``, so one
    instance can be consumed several times.

    Args:
        dataframes: Re-iterable collection (e.g. a list) of DataFrames that
            each contain the text column.
        tokenizer: Callable applied to every review string; whatever it
            returns is yielded unchanged.
        text_col: Name of the column holding the review text. ``None``
            (the default) means ``config.TEXT_COL`` is used.
        verbose: When True (the default), print a message at the start of
            every pass over the corpus.
    """

    def __init__(self, dataframes, tokenizer, text_col=None, verbose=True):
        self.dataframes = dataframes
        self.tokenizer = tokenizer
        self.text_col = text_col
        self.verbose = verbose

    def __iter__(self):
        """Yield ``tokenizer(review)`` for every string entry of the text column.

        Entries that are not ``str`` instances (for example NaN placeholders
        for missing values) are skipped.
        """
        if self.verbose:
            print("Starting corpus iteration...")
        for df in self.dataframes:
            # Resolve the default lazily so config is only consulted when no
            # explicit column was supplied.
            column = config.TEXT_COL if self.text_col is None else self.text_col
            for review in df[column]:
                if isinstance(review, str):
                    yield self.tokenizer(review)

In [3]:
# Paths of the cleaned CSV files that together form the embedding corpus.
files_to_load = [
    config.CLEAN_IMDB_TRAIN_PATH,
    config.CLEAN_IMDB_VAL_PATH,
    config.CLEAN_IMDB_TEST_PATH,
    config.CLEAN_IMDB_UNSUPERVISED_PATH,
    config.CLEAN_RT_TRAIN_PATH,
    config.CLEAN_RT_VAL_PATH,
    config.CLEAN_RT_TEST_PATH,
]

# Missing files are still skipped (best effort), but they are reported so the
# model is never trained on a reduced corpus without anyone noticing.
missing_files = [f_path for f_path in files_to_load if not os.path.exists(f_path)]
if missing_files:
    print(f"Warning: skipping {len(missing_files)} missing corpus file(s):")
    for f_path in missing_files:
        print(f"  - {f_path}")

all_dataframes = [pd.read_csv(f_path) for f_path in files_to_load if os.path.exists(f_path)]

# The iterator re-tokenizes the reviews on every pass instead of keeping a
# tokenized copy of the whole corpus in memory.
tokenizer = get_tokenizer("basic_english")
corpus_iterator = CorpusIterator(all_dataframes, tokenizer)

In [4]:
class TqdmCallback(CallbackAny2Vec):
    """Training callback that shows one tqdm progress-bar tick per epoch.

    Args:
        total_epochs: Number of epochs the bar should expect; used as the
            bar's ``total``.
    """

    def __init__(self, total_epochs):
        # The bar itself is created lazily when training starts.
        self.pbar = None
        self.total_epochs = total_epochs

    def on_train_begin(self, model):
        """Create the progress bar sized to ``total_epochs``."""
        progress = tqdm(desc="Training Word2Vec", total=self.total_epochs)
        self.pbar = progress

    def on_epoch_end(self, model):
        """Advance the bar by one step (tqdm's default increment is 1)."""
        self.pbar.update()

    def on_train_end(self, model):
        """Close the bar."""
        progress = self.pbar
        progress.close()

In [5]:
# Single source of truth for the epoch count: it feeds both the trainer and the
# progress bar, so the bar's total can never disagree with the real run.
N_EPOCHS = 7
# Ask for at most 16 worker threads, but never more than the machine reports.
N_WORKERS = min(16, os.cpu_count() or 1)

w2v_params = {
    'sentences': corpus_iterator,
    'workers': N_WORKERS,
    'vector_size': 256,
    'window': 5,
    'min_count': 3,
    'sg': 1, # Skip-gram
    'negative': 20,
    'ns_exponent': 0.75,
    'epochs': N_EPOCHS,
    'compute_loss': True,
    'callbacks': [TqdmCallback(total_epochs=N_EPOCHS)]
}
# Ensure the output directory exists before training, so a long run cannot
# fail at save time merely because the folder is missing.
os.makedirs(config.MODELS_DIR, exist_ok=True)
w2v_model = Word2Vec(**w2v_params)
# Save both the full model and the standalone vectors (`wv`).
w2v_model.save(str(config.W2V_MODEL_PATH))
w2v_model.wv.save(str(config.W2V_VECTORS_PATH))

print(f"Word2Vec model saved to: {config.MODELS_DIR}")

Starting corpus iteration...


Training Word2Vec:   0%|          | 0/7 [00:00<?, ?it/s]

Starting corpus iteration...


Training Word2Vec:  14%|█▍        | 1/7 [02:17<13:42, 137.01s/it]

Starting corpus iteration...


Training Word2Vec:  29%|██▊       | 2/7 [04:27<11:15, 135.19s/it]

Starting corpus iteration...


Training Word2Vec:  43%|████▎     | 3/7 [06:46<09:04, 136.23s/it]

Starting corpus iteration...


Training Word2Vec:  57%|█████▋    | 4/7 [09:08<06:53, 137.99s/it]

Starting corpus iteration...


Training Word2Vec:  71%|███████▏  | 5/7 [11:20<04:32, 136.21s/it]

Starting corpus iteration...


Training Word2Vec:  86%|████████▌ | 6/7 [13:32<02:14, 134.97s/it]

Starting corpus iteration...


Training Word2Vec: 100%|██████████| 7/7 [15:44<00:00, 134.96s/it]


Word2Vec model saved to: /home/projects/dharel/reuvensh/Sentiment-Analysis-Project/models
