# :Word2Vec Implementation - CBow ( Gensim )

In [1]:
file_path = '/content/BERT-SST2-Dataset-Paper.txt'

## Reading the file

In [2]:
text_list = []

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        text_list.append(line.strip())

In [4]:
text_list[:5]

['Recursive Deep Models for Semantic Compositionality',
 'Over a Sentiment Treebank',
 'Recursive Deep Models for Semantic Compositionality',
 'Over a Sentiment Treebank',
 'Richard Socher, Alex Perelygin, Jean Y. Wu, Jason Chuang,']

## Basic Text Preprocessing using re

In [11]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

In [14]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [8]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [9]:
def preprocess_text(text_list):
    processed_text = []
    for line in text_list:
        # Lowercase
        line = line.lower()

        # Remove special characters and numbers
        line = re.sub(r'[^\w\s]', '', line)  # Remove punctuation
        line = re.sub(r'\d+', '', line)     # Remove digits

        #  Tokenization
        tokens = word_tokenize(line)

        # Remove stopwords
        tokens = [word for word in tokens if word not in stop_words]

        # Lemmatization
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

        # Append cleaned line to processed_text
        if tokens:
            processed_text.append(tokens)

    return processed_text

In [15]:
cleaned_text = preprocess_text(text_list)

In [24]:
cleaned_text[:5]

[['recursive', 'deep', 'model', 'semantic', 'compositionality'],
 ['sentiment', 'treebank'],
 ['recursive', 'deep', 'model', 'semantic', 'compositionality'],
 ['sentiment', 'treebank'],
 ['richard', 'socher', 'alex', 'perelygin', 'jean', 'wu', 'jason', 'chuang']]

In [27]:
cleaned_text[-5:]

[['andhar', 'estimating', 'linear', 'model', 'composi'],
 ['tional', 'distributional', 'semantics', 'coling'],
 ['l', 'zettlemoyer', 'collins', 'learning'],
 ['map', 'sentence', 'logical', 'form', 'structured', 'classiﬁca'],
 ['tion', 'probabilistic', 'categorial', 'grammar', 'uai']]

In [25]:
for i, line in enumerate(cleaned_text[:5]):
    print(f"Line {i + 1}: {line}")

Line 1: ['recursive', 'deep', 'model', 'semantic', 'compositionality']
Line 2: ['sentiment', 'treebank']
Line 3: ['recursive', 'deep', 'model', 'semantic', 'compositionality']
Line 4: ['sentiment', 'treebank']
Line 5: ['richard', 'socher', 'alex', 'perelygin', 'jean', 'wu', 'jason', 'chuang']


## Model Implementation

In [26]:
from gensim.models import Word2Vec
import numpy as np

In [37]:
model=Word2Vec(
    sentences=cleaned_text,  # List of tokenized sentences
    vector_size=50,         # Dimensionality of word vectors
    window=5,                # Context window size
    min_count=1,             # Ignores words with total frequency lower than this
    workers=4,               # Number of threads to use
    sg=0,                    # Training algorithm: 1 for Skip-gram, 0 for CBOW
    epochs=50     # Number of iterations (epochs) over the corpus , Def-5
)

In [38]:
model

<gensim.models.word2vec.Word2Vec at 0x795c6a7b65f0>

In [39]:
model.save("word2vec_model.model")
print("Word2Vec model trained and saved.")

Word2Vec model trained and saved.


In [40]:
try:
    vector = model.wv['recursive']
    print("\nVector for 'recursive':", vector)
except KeyError:
    print("\nWord 'recursive' not in vocabulary.")


Vector for 'recursive': [ 0.06935543  0.13170142  0.10687276  0.15769222  0.13224931 -0.2856876
  0.3528954   0.6148442  -0.55559856 -0.2673498  -0.16941503 -0.4929258
  0.20731372  0.10917935 -0.4000739   0.11671156  0.31624016  0.04454136
 -0.339645   -0.5185618   0.07258268  0.3300952   0.4397388  -0.20943202
 -0.00238743  0.07161313 -0.36169243 -0.13884637 -0.53377783  0.24488422
  0.18649049 -0.10522889 -0.27879453  0.05359181 -0.18614137  0.32020405
 -0.02709083 -0.23158936  0.27120125 -0.31521833  0.3308731  -0.25354984
 -0.3364355   0.16820334  1.0420135  -0.02295529 -0.4176643  -0.23842299
  0.3924205  -0.02856647]


In [42]:
try:
    similar_words = model.wv.most_similar('recursive', topn=5)
    print("\nMost similar words to 'recursive':", similar_words)
except KeyError:
    print("\nWord 'recursive' not in vocabulary.")


Most similar words to 'recursive': [('network', 0.9953770637512207), ('neural', 0.9944674968719482), ('tensor', 0.9888547658920288), ('net', 0.9852427244186401), ('layer', 0.970910370349884)]


In [46]:
model.wv['neural'].shape

(50,)

## Visual Representation

In [54]:
from sklearn.decomposition import PCA
import plotly.express as px
import pandas as pd

In [56]:
word = "tensor"
if word in model.wv:
    # Find 10 most similar words
    similar_words = model.wv.most_similar(word, topn=10)
    words = [word] + [w[0] for w in similar_words]  # Include the main word

    # Retrieve vectors for the words
    vectors = [model.wv[w] for w in words]

    # Apply PCA to reduce dimensions to 3D
    pca = PCA(n_components=3)
    reduced_vectors = pca.fit_transform(vectors)

    # Create a DataFrame for visualization using pandas
    df = pd.DataFrame({
        "Word": words,
        "x": reduced_vectors[:, 0],
        "y": reduced_vectors[:, 1],
        "z": reduced_vectors[:, 2],
    })

    # Plot using Plotly
    fig = px.scatter_3d(
        df, x="x", y="y", z="z", text="Word", title="Word Embeddings in 3D Space"
    )
    fig.update_traces(marker=dict(size=8, color='blue'), textposition="top center")
    fig.show()
else:
    print(f"The word '{word}' is not in the vocabulary.")


In [60]:
def get_text_vector(text, model):
    # Tokenize the text into words
    tokens = text.split()  # You can modify this for more sophisticated tokenization
    word_vectors = []

    # Get the word vectors for each word in the text
    for word in tokens:
        if word in model.wv:  # Check if the word is in the model's vocabulary
            word_vectors.append(model.wv[word])

    # If no valid word vectors are found, return a zero vector
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)

    # Compute the average of the word vectors to get the text vector
    text_vector = np.mean(word_vectors, axis=0)
    return text_vector

In [67]:
text = "neural networks are a key component of deep learning"
text_vector = get_text_vector(text, model)

print(f"Text vector (averaged word vectors): {text_vector}")

Text vector (averaged word vectors): [ 0.04107774  0.05352738 -0.00358233 -0.01027171  0.12870689 -0.3826387
  0.40182886  0.8824169  -0.71874243 -0.20300122 -0.18260777 -0.7469179
  0.17772482  0.19820136 -0.45528093  0.23836736  0.45089912  0.05216053
 -0.47686398 -0.5295954   0.20186345  0.4051478   0.54907876 -0.2714127
  0.21079917  0.12605186 -0.37924075 -0.19861372 -0.6310021   0.27575043
  0.26854673 -0.05207038 -0.3594527   0.17972702 -0.39708495  0.43614402
  0.13502096 -0.20737201  0.39290252 -0.58467096  0.4217185  -0.2244641
 -0.2847762   0.15596579  1.1449791   0.05445331 -0.38640216 -0.40179268
  0.33952656  0.05289399]


In [69]:
from sklearn.metrics.pairwise import cosine_similarity

# Get the vector for another text to compare with
another_text = "Statistics"
another_text_vector = get_text_vector(another_text, model)

# Compute the cosine similarity
similarity = cosine_similarity([text_vector], [another_text_vector])

print(f"Cosine similarity between the two texts: {similarity[0][0]}")


Cosine similarity between the two texts: 0.0


In [73]:
another_text_2 = "tensor"
another_text_vector_2 = get_text_vector(another_text_2, model)

cosine_similarity([text_vector], [another_text_vector_2])[0][0]

0.9901884