# Mount Google Drive and Set Up Project Directories
This cell mounts your Google Drive and sets the project root directory along with subdirectories.
It also creates the necessary folders if they do not exist.

In [1]:
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Define project root directory on Google Drive
PROJECT_ROOT = "/content/drive/MyDrive/NPUA/NLP/word2vec/"

# Define subdirectories relative to the project root
RAW_DATA_DIR      = os.path.join(PROJECT_ROOT, "data", "raw")
PROCESSED_DATA_DIR = os.path.join(PROJECT_ROOT, "data", "processed")
VOCAB_DIR         = os.path.join(PROJECT_ROOT, "data", "vocab")
MODELS_DIR        = os.path.join(PROJECT_ROOT, "models")
EMBEDDINGS_DIR    = os.path.join(PROJECT_ROOT, "data", "embeddings")
PLOTS_DIR         = os.path.join(PROJECT_ROOT, "plots")

# Create directories if they do not exist
for directory in [RAW_DATA_DIR, PROCESSED_DATA_DIR, VOCAB_DIR, MODELS_DIR, EMBEDDINGS_DIR, PLOTS_DIR]:
    os.makedirs(directory, exist_ok=True)
    print(f"Checked/Created directory: {directory}")

Mounted at /content/drive
Checked/Created directory: /content/drive/MyDrive/NPUA/NLP/word2vec/data/raw
Checked/Created directory: /content/drive/MyDrive/NPUA/NLP/word2vec/data/processed
Checked/Created directory: /content/drive/MyDrive/NPUA/NLP/word2vec/data/vocab
Checked/Created directory: /content/drive/MyDrive/NPUA/NLP/word2vec/models
Checked/Created directory: /content/drive/MyDrive/NPUA/NLP/word2vec/data/embeddings
Checked/Created directory: /content/drive/MyDrive/NPUA/NLP/word2vec/plots


# Preprocessing Functions
This cell defines functions to preprocess Armenian text files.
The functions clean, normalize, tokenize, and split text into sentences.
Detailed documentation is provided for each function.

In [2]:
import re
import os

def preprocess_text_files(input_dir, output_base_dir):
    """
    Preprocesses all text files within the input directory and saves the processed content.

    The function iterates over 'train' and 'test' subdirectories, reads each .txt file within topic folders,
    normalizes the text (removing numbers and punctuation, normalizing characters), splits the text into sentences,
    tokenizes each sentence into words, and saves the processed tokens in a corresponding output directory.

    Args:
        input_dir (str): Absolute path to the directory containing raw text files.
        output_base_dir (str): Absolute path to the base directory where processed files will be saved.

    Returns:
        None
    """
    def normalize_text(text):
        """
        Cleans and normalizes Armenian text.

        This function removes digits, normalizes punctuation marks, and replaces certain Armenian ligatures.

        Args:
            text (str): The input text string.

        Returns:
            str: Normalized text.
        """
        # print("normalize_text: Starting normalization of text snippet:", text[:50])
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'[․,։«»՞!?()\[\]{}]', '', text)
        text = text.replace('ﬓ', 'մն').replace('և', 'եւ')
        # print("normalize_text: Completed normalization. Result snippet:", text[:50])
        return text

    def split_into_sentences(text):
        """
        Splits text into sentences using Armenian-specific punctuation patterns.

        Sentences are assumed to end with common delimiters such as "։", comma, or similar punctuation.

        Args:
            text (str): The normalized text.

        Returns:
            list[str]: List of sentence strings.
        """
        print("split_into_sentences: Splitting text of length:", len(text))
        sentence_enders = re.compile(r'([,.«»։՞՜՛՝]+)')
        sentences = []
        start = 0
        for match in sentence_enders.finditer(text):
            end = match.end()
            sentence = text[start:end].strip()
            if sentence:
                sentences.append(sentence)
                # print("split_into_sentences: Found sentence:", sentence)
            start = end
        if start < len(text):
            last_sentence = text[start:].strip()
            if last_sentence:
                sentences.append(last_sentence)
                # print("split_into_sentences: Found last sentence:", last_sentence)
        print("split_into_sentences: Total sentences found:", len(sentences))
        return sentences

    def tokenize_sentence(sentence):
        """
        Tokenizes a sentence into Armenian words using regex rules.

        Words are assumed to consist of Armenian characters and are separated by spaces or punctuation.

        Args:
            sentence (str): A single sentence string.

        Returns:
            list[str]: List of word tokens.
        """
        # print("tokenize_sentence: Tokenizing sentence:", sentence)
        tokens = re.findall(r'\b[Ա-ֆա-ֆևւ]+\b', sentence)
        # print("tokenize_sentence: Tokens found:", tokens)
        return tokens

    def preprocess_file(file_path):
        """
        Preprocesses a single text file.

        Reads file content, normalizes text, splits it into sentences, tokenizes sentences into words,
        and flattens the tokens into a single list of lowercased words.

        Args:
            file_path (str): Path to the text file.

        Returns:
            list[str]: List of tokens extracted from the file.
        """
        print("preprocess_file: Processing file:", file_path)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
            # print("preprocess_file: Successfully read file.")
        except Exception as e:
            print(f"preprocess_file: Error reading file {file_path}: {e}")
            return []
        text = normalize_text(text)
        sentences = split_into_sentences(text)
        tokens = [tokenize_sentence(sentence) for sentence in sentences]
        flat_tokens = [word.lower() for sublist in tokens for word in sublist]
        print("preprocess_file: Total tokens extracted:", len(flat_tokens))
        return flat_tokens

    input_dir = os.path.abspath(input_dir)
    output_base_dir = os.path.abspath(output_base_dir)
    # print("preprocess_text_files: Input directory resolved to:", input_dir)
    # print("preprocess_text_files: Output base directory resolved to:", output_base_dir)

    if not os.path.exists(input_dir):
        print(f"preprocess_text_files: Error - Input directory does not exist: {input_dir}")
        return

    total_files = 0
    saved_files = 0
    print("preprocess_text_files: Starting preprocessing...")

    for dataset in ["train", "test"]:
        dataset_path = os.path.join(input_dir, dataset)
        print("preprocess_text_files: Processing dataset folder:", dataset_path)
        if not os.path.exists(dataset_path):
            print(f"preprocess_text_files: Warning - Directory does not exist: {dataset_path}. Skipping...")
            continue
        # Iterate over topic folders in each train/test folder
        for topic in os.listdir(dataset_path):
            topic_path = os.path.join(dataset_path, topic)
            if os.path.isdir(topic_path):
                print("preprocess_text_files: Processing topic folder:", topic_path)
                for root, _, files in os.walk(topic_path):
                    txt_files = [fname for fname in files if fname.endswith('.txt')]
                    total_files += len(txt_files)
                    for fname in txt_files:
                        file_path = os.path.join(root, fname)
                        tokens = preprocess_file(file_path)
                        if tokens:
                            relative_path = os.path.relpath(file_path, dataset_path)
                            output_file_path = os.path.join(output_base_dir, dataset, relative_path)
                            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
                            with open(output_file_path, 'w', encoding='utf-8') as out_f:
                                out_f.write(' '.join(tokens))
                            saved_files += 1
                            print(f"preprocess_text_files: Saved processed file to: {output_file_path}")
    print(f"preprocess_text_files: Total .txt files found: {total_files}")
    print(f"preprocess_text_files: Total files processed and saved: {saved_files}")
    print("preprocess_text_files: Preprocessing complete.")

# preprocess_text_files(RAW_DATA_DIR, PROCESSED_DATA_DIR)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
split_into_sentences: Splitting text of length: 244
split_into_sentences: Total sentences found: 4
preprocess_file: Total tokens extracted: 31
preprocess_text_files: Saved processed file to: /content/drive/MyDrive/NPUA/NLP/word2vec/data/processed/test/accidents/text-11073.txt
preprocess_file: Processing file: /content/drive/MyDrive/NPUA/NLP/word2vec/data/raw/test/accidents/text-34283.txt
split_into_sentences: Splitting text of length: 553
split_into_sentences: Total sentences found: 3
preprocess_file: Total tokens extracted: 76
preprocess_text_files: Saved processed file to: /content/drive/MyDrive/NPUA/NLP/word2vec/data/processed/test/accidents/text-34283.txt
preprocess_file: Processing file: /content/drive/MyDrive/NPUA/NLP/word2vec/data/raw/test/accidents/text-12582.txt
split_into_sentences: Splitting text of length: 1267
split_into_sentences: Total sentences found: 11
preprocess_file: Total tokens extra

# Vocabulary Building Functions
This cell contains functions to build a vocabulary from the processed text files.
It reads through tokenized files from the 'train' and 'test' directories, counts word frequencies, and saves the vocabulary mappings (word-to-index and index-to-word) as JSON files.


In [3]:
import json
from collections import Counter

def preprocess_file_for_vocab(file_path):
    """
    Reads and tokenizes a file for vocabulary building.

    Each line in the file is split into tokens by whitespace.
    In case of an error, the function logs the error and returns an empty list.

    Args:
        file_path (str): Path to the text file.

    Returns:
        list[str]: List of tokens extracted from the file.
    """
    tokens = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                tokens.extend(line.strip().split())
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
    return tokens

def build_vocab(input_dir, output_vocab_dir):
    """
    Builds vocabulary mappings from processed text files and saves them as JSON files.

    The function iterates over 'train' and 'test' subdirectories of the input directory,
    tokenizes each file, and counts word frequencies. It then creates a word-to-index mapping
    (starting from index 1) and an index-to-word mapping, which are saved as 'word_to_index.json'
    and 'index_to_word.json' in the specified output directory.

    Args:
        input_dir (str): Absolute path to the directory containing processed text files.
        output_vocab_dir (str): Absolute path to the directory where vocabulary files will be saved.

    Returns:
        None
    """
    input_dir = os.path.abspath(input_dir)
    output_vocab_dir = os.path.abspath(output_vocab_dir)
    if not os.path.exists(input_dir):
        print(f"Error: Input directory does not exist: {input_dir}")
        return

    word_counter = Counter()
    print("Building vocabulary...")

    for dataset in ["train", "test"]:
        dataset_path = os.path.join(input_dir, dataset)
        if not os.path.exists(dataset_path):
            print(f"Warning: Directory does not exist: {dataset_path}. Skipping...")
            continue
        for category in os.listdir(dataset_path):
            category_path = os.path.join(dataset_path, category)
            if os.path.isdir(category_path):
                for root, _, files in os.walk(category_path):
                    txt_files = [fname for fname in files if fname.endswith('.txt')]
                    for fname in txt_files:
                        file_path = os.path.join(root, fname)
                        tokens = preprocess_file_for_vocab(file_path)
                        if tokens:
                            word_counter.update(tokens)

    os.makedirs(output_vocab_dir, exist_ok=True)
    vocab = {word: idx for idx, (word, _) in enumerate(word_counter.most_common(), start=1)}
    word_to_index = vocab
    index_to_word = {idx: word for word, idx in word_to_index.items()}

    word_to_index_path = os.path.join(output_vocab_dir, "word_to_index.json")
    index_to_word_path = os.path.join(output_vocab_dir, "index_to_word.json")

    with open(word_to_index_path, 'w', encoding='utf-8') as f:
        json.dump(word_to_index, f, ensure_ascii=False, indent=4)
    with open(index_to_word_path, 'w', encoding='utf-8') as f:
        json.dump(index_to_word, f, ensure_ascii=False, indent=4)

    print("Vocabulary built successfully!")
    print(f"Word-to-Index saved to: {word_to_index_path}")
    print(f"Index-to-Word saved to: {index_to_word_path}")

# build_vocab(PROCESSED_DATA_DIR, VOCAB_DIR)

Building vocabulary...
Vocabulary built successfully!
Word-to-Index saved to: /content/drive/MyDrive/NPUA/NLP/word2vec/data/vocab/word_to_index.json
Index-to-Word saved to: /content/drive/MyDrive/NPUA/NLP/word2vec/data/vocab/index_to_word.json


# Utility Functions for Model and File Management
This cell defines utility functions for loading and saving JSON dataշ ensuring directories exist, and loading a trained Word2Vec model.

In [2]:
import json
from pathlib import Path
import logging

logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

def save_json(data, file_path):
    """
    Saves a dictionary as a JSON file.

    Args:
        data (dict): The data to be saved.
        file_path (str): The destination file path for the JSON file.

    Returns:
        None
    """
    file_path = Path(file_path)
    with file_path.open("w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    logging.info(f"Saved JSON data to {file_path}")

def load_json(file_path):
    """
    Loads a JSON file and returns its content as a dictionary.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        dict: The loaded data, or an empty dict if the file is not found or decoding fails.
    """
    file_path = Path(file_path)
    if not file_path.exists():
        logging.error(f"JSON file not found: {file_path}")
        return {}
    try:
        with file_path.open("r", encoding="utf-8") as f:
            data = json.load(f)
        logging.info(f"Loaded JSON data from {file_path}")
        return data
    except json.JSONDecodeError as e:
        logging.error(f"Error decoding JSON file {file_path}: {e}")
        return {}

def load_model(model_path):
    """
    Loads a trained Word2Vec model from the specified path.

    Args:
        model_path (str): Path to the saved Word2Vec model.

    Returns:
        Word2Vec or None: The loaded model or None if loading fails.
    """
    from gensim.models import Word2Vec
    model_path = Path(model_path)
    if not model_path.exists():
        logging.error(f"Model file not found: {model_path}")
        return None
    logging.info(f"Loading model from {model_path}")
    return Word2Vec.load(str(model_path))

# Visualization Functions
This cell defines functions for visualizing Word2Vec model results.

Two visualization methods are provided:
1. Plotting nearest words using PCA.
2. Plotting a word similarity graph using NetworkX.

Each function saves its plot as a PNG file in the designated plots directory.

In [3]:
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from sklearn.decomposition import PCA
import networkx as nx
import random

def plot_nearest_words(model, word, top_n=10):
    """
    Finds and plots the nearest words to the given target word using PCA for dimensionality reduction.

    If the target word is not in the model's vocabulary, the function attempts to select an alternative.
    The resulting scatter plot highlights the target word and its nearest neighbors, and is saved as a PNG file.

    Args:
        model (Word2Vec): The trained Word2Vec model.
        word (str): The target word for which nearest words are visualized.
        top_n (int): Number of nearest words to display (default is 10).

    Returns:
        None
    """
    if word not in model.wv:
        logging.warning(f"Word '{word}' not found in vocabulary! Selecting a random word.")
        words = list(model.wv.index_to_key)
        if not words:
            logging.error("Vocabulary is empty!")
            return
        word = random.choice(words)

    similar_words = model.wv.most_similar(word, topn=top_n)
    words_list = [word] + [w for w, _ in similar_words]
    vectors = [model.wv[w] for w in words_list]

    pca = PCA(n_components=2)
    reduced = pca.fit_transform(vectors)

    plt.figure(figsize=(8,6))
    plt.scatter(reduced[0,0], reduced[0,1], color="navy", label=f"Target: {word}")
    plt.scatter(reduced[1:,0], reduced[1:,1], color="skyblue", label="Nearest Words")
    for i, w in enumerate(words_list):
        plt.annotate(w, (reduced[i,0], reduced[i,1]))
    plt.title(f"Nearest Words to '{word}'")
    plt.legend()
    plt.grid(color="lavender")

    plot_path = os.path.join(PLOTS_DIR, f"{word}_nearest.png")
    plt.savefig(plot_path)
    logging.info(f"Nearest words plot saved to {plot_path}")
    plt.close()

def plot_similarity_graph(model, word, top_n=10):
    """
    Plots a word similarity graph using NetworkX.

    The graph displays the target word, its nearest neighbors, and edges weighted by similarity scores.
    Dimensionality is reduced using PCA, and ellipses are drawn around each node to indicate size.
    The plot is saved as a PNG file.

    Args:
        model (Word2Vec): The trained Word2Vec model.
        word (str): The target word to visualize.
        top_n (int): Number of nearest words to include (default is 10).

    Returns:
        None
    """
    if word not in model.wv:
        logging.warning(f"Word '{word}' not found in vocabulary!")
        return

    similar_words = model.wv.most_similar(word, topn=top_n)
    G = nx.Graph()
    G.add_node(word)
    for neighbor, sim in similar_words:
        G.add_node(neighbor)
        G.add_edge(word, neighbor, weight=round(sim, 2))

    words_list = [word] + [w for w, _ in similar_words]
    vectors = [model.wv[w] for w in words_list]
    pca = PCA(n_components=2)
    reduced = pca.fit_transform(vectors)
    pos = {words_list[i]: reduced[i] for i in range(len(words_list))}

    plt.figure(figsize=(16,12))
    nx.draw(G, pos, with_labels=True, node_color="skyblue", edge_color="navy",
            node_size=1000, font_size=10, font_color="black")
    edge_labels = {(word, neighbor): f"{sim:.2f}" for neighbor, sim in similar_words}
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=9, font_color="black")

    ax = plt.gca()
    for i, w in enumerate(words_list):
        size_factor = len(model.wv[w])
        ellipse = Ellipse(xy=(reduced[i,0], reduced[i,1]),
                          width=size_factor * 0.007, height=size_factor * 0.005,
                          edgecolor="navy", facecolor="skyblue")
        ax.add_patch(ellipse)

    plt.scatter(reduced[0,0], reduced[0,1], color="navy", s=100, edgecolor="navy")
    plt.title(f"Word Similarity Graph for '{word}'")

    plot_path = os.path.join(PLOTS_DIR, f"{word}_graph.png")
    plt.savefig(plot_path)
    logging.info(f"Similarity graph saved to {plot_path}")
    plt.close()

# Word2Vec Training with Checkpoints and Model Management
This cell defines the training function for the Word2Vec model.

The training loop runs for a specified number of epochs, saving model checkpoints every 5 epochs.

It computes the training loss (using gensim's compute_loss feature) to track the best model.

After training, only the best model and the final (last) model are retained, and other checkpoints are deleted.

In [4]:
import multiprocessing
from gensim.models import Word2Vec
import shutil
import os
import logging

# Training parameters
EMBEDDING_SIZE = 100      # Dimensionality of word embeddings
WINDOW_SIZE = 5           # Context window size
MIN_COUNT = 5             # Minimum frequency count for words
SG = 1                    # Use Skip-gram (1) or CBOW (0)
EPOCHS = 100              # Total number of epochs for training
WORKERS = max(1, multiprocessing.cpu_count())  # Number of worker threads

# Learning rate schedule parameters
INITIAL_ALPHA = 0.025     # Starting learning rate
FINAL_ALPHA = 0.0001      # Final learning rate

def load_processed_data(data_dir):
    """
    Loads preprocessed text data from the specified directory.

    The function traverses all subdirectories and collects tokenized sentences from .txt files.

    Args:
        data_dir (str): Absolute path to the directory containing processed text files.

    Returns:
        list[list[str]]: A list where each element is a tokenized sentence (list of words).
    """
    from pathlib import Path
    data_dir = Path(data_dir)
    if not data_dir.exists():
        logging.error(f"Data directory does not exist: {data_dir}")
        return []
    sentences = []
    for file_path in data_dir.rglob("*.txt"):
        with file_path.open("r", encoding="utf-8") as f:
            for line in f:
                tokens = line.strip().split()
                if tokens:
                    sentences.append(tokens)
    logging.info(f"Loaded {len(sentences)} sentences from {data_dir}")
    return sentences

def train_word2vec():
    """
    Trains a Word2Vec model using the preprocessed text data.

    The training is conducted over a fixed number of epochs. After each epoch, the training loss is computed.
    Every 5 epochs, a checkpoint of the model is saved. The best model (with the lowest loss) is tracked and saved separately.
    After training, the final model is saved, and all intermediate checkpoint files (except the best and final)
    are deleted.

    The learning rate (alpha) is manually decayed after each epoch to avoid warnings regarding effective 'alpha' being higher
    than in previous training cycles.

    Returns:
        None
    """
    # Assumes PROCESSED_DATA_DIR and MODELS_DIR are defined globally
    sentences = load_processed_data(PROCESSED_DATA_DIR)
    if not sentences:
        logging.error("No sentences loaded. Training aborted.")
        return

    # Initialize Word2Vec model with compute_loss enabled and initial alpha values
    model = Word2Vec(vector_size=EMBEDDING_SIZE,
                     window=WINDOW_SIZE,
                     min_count=MIN_COUNT,
                     sg=SG,
                     workers=WORKERS,
                     compute_loss=True,
                     alpha=INITIAL_ALPHA,
                     min_alpha=FINAL_ALPHA)
    model.build_vocab(sentences)
    best_loss = float("inf")
    best_epoch = None
    best_model_path = os.path.join(MODELS_DIR, "word2vec_best.model")
    checkpoint_paths = []  # List to keep track of checkpoint file paths

    previous_loss = 0.0
    logging.info(f"Starting training for {EPOCHS} epochs...")

    for epoch in range(1, EPOCHS + 1):
        logging.info(f"Epoch {epoch} starting with alpha: {model.alpha:.6f}")
        model.train(sentences,
                    total_examples=model.corpus_count,
                    epochs=1,
                    compute_loss=True)
        current_loss = model.get_latest_training_loss() - previous_loss
        previous_loss = model.get_latest_training_loss()
        logging.info(f"Epoch {epoch}/{EPOCHS}, Loss: {current_loss:.6f}")

        # Update best model if current epoch has lower loss
        if current_loss < best_loss:
            best_loss = current_loss
            best_epoch = epoch
            model.save(best_model_path)
            logging.info(f"Best model updated at epoch {epoch} with loss {best_loss:.6f}")

        # Save checkpoint every 5 epochs
        if epoch % 5 == 0:
            checkpoint_path = os.path.join(MODELS_DIR, f"word2vec_epoch_{epoch}.model")
            model.save(checkpoint_path)
            checkpoint_paths.append(checkpoint_path)
            logging.info(f"Checkpoint saved at epoch {epoch}: {checkpoint_path}")

        # Update learning rate (alpha) to ensure it only decreases
        new_alpha = INITIAL_ALPHA - ((INITIAL_ALPHA - FINAL_ALPHA) * epoch / EPOCHS)
        model.alpha = new_alpha
        model.min_alpha = new_alpha
        logging.info(f"Epoch {epoch} updated alpha to: {new_alpha:.6f}")

    # Save the final (last) model
    final_model_path = os.path.join(MODELS_DIR, "word2vec_last.model")
    model.save(final_model_path)
    logging.info(f"Final model saved: {final_model_path}")

    # Delete checkpoint files except for the best and final models
    for cp in checkpoint_paths:
        if os.path.abspath(cp) not in [os.path.abspath(best_model_path), os.path.abspath(final_model_path)]:
            os.remove(cp)
            logging.info(f"Deleted checkpoint: {cp}")

# Example usage:
# train_word2vec()



# Model Evaluation
This cell defines the evaluation function which loads the trained Word2Vec model and visualizes the nearest words for a specified target word.

In [4]:
def evaluate_model(target_word="հայաստան"):
    """
    Evaluates the trained Word2Vec model by visualizing the nearest words for the target word.

    The function loads the final saved model, checks if the target word exists in the model's vocabulary,
    and calls the visualization function to plot the nearest words. The generated plot is saved in the plots directory.

    Args:
        target_word (str): The Armenian word to evaluate. Default is "հայաստան".

    Returns:
        None
    """
    model_path = os.path.join(MODELS_DIR, "word2vec_last.model")
    model = load_model(model_path)
    if not model:
        print("Model loading failed. Evaluation aborted.")
        return
    print(f"Evaluating model for target word: {target_word}")
    plot_nearest_words(model, target_word)

    plot_similarity_graph(model, target_word)

evaluate_model()

Evaluating model for target word: հայաստան


In [5]:
model_path = os.path.join(MODELS_DIR, "word2vec_last.model")
model = load_model(model_path)

def calculate_vector_analogy(model, king, man, woman):
    """
    Calculates the vector analogy king - man + woman = ? using a Word2Vec model.

    Args:
        model: The trained Word2Vec model.
        king: The word "king".
        man: The word "man".
        woman: The word "woman".

    Returns:
        The word closest to the resulting vector, or None if any of the words are not in the vocabulary.
    """
    try:
        result_vector = model.wv[king] - model.wv[man] + model.wv[woman]
        most_similar = model.wv.most_similar(positive=[result_vector], topn=10)
        return most_similar
    except KeyError as e:
        print(f"Word not in vocabulary: {e}")
        return None

print(calculate_vector_analogy(model, 'թագավոր', 'տղամարդ', 'կին'))

[('թագավոր', 0.7881953120231628), ('պոեզիա', 0.492830365896225), ('բո', 0.4807527959346771), ('ցին', 0.45675045251846313), ('խմիչքների', 0.44870278239250183), ('ասած', 0.4478115737438202), ('տի', 0.430141806602478), ('կույր', 0.423759788274765), ('անունն', 0.42037123441696167), ('երկուսից', 0.41899171471595764)]


In [6]:
def calculate_word_distance(model, word1, word2):
    """
    Calculates the cosine distance between two words using a Word2Vec model.

    Args:
        model: The trained Word2Vec model.
        word1: The first word.
        word2: The second word.

    Returns:
        The cosine distance between the word vectors, or None if either word is not in the vocabulary.
    """
    try:
        distance = model.wv.distance(word1, word2)  # Use model.wv.distance for cosine distance
        return distance
    except KeyError as e:
        print(f"Word not in vocabulary: {e}")
        return None

calculate_word_distance(model, 'սիրել', 'հարգել')

0.660455971956253