<a href="https://colab.research.google.com/github/SnehaTanwar006/NLP/blob/main/Embedding(TF_IDF_BoW__Word2Vec).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scikit-learn spacy
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import numpy as np

In [None]:
# Three one-sentence "movie" descriptions that every method below is compared on.
corpus = [
    # Movie 1: semantically about royalty/power
    "The king fought the other king",
    # Movie 2: also about royalty/power, but with different wording
    "The queen ruled the powerful realm",
    # Movie 3: a totally different topic
    "A dog ate the hot dog",
]

In [None]:
# --- Method 1: Bag-of-Words (BoW) ---
print("--- 1. Bag-of-Words (BoW) Analysis ---")
# Learn the vocabulary from the corpus first ...
vectorizer_bow = CountVectorizer().fit(corpus)
# ... then turn each sentence into a dense row of raw token counts
# (one row per movie, one column per vocabulary word).
bow_vectors = vectorizer_bow.transform(corpus).toarray()

--- 1. Bag-of-Words (BoW) Analysis ---


In [None]:
# Movie 1 ("king") against Movie 2 ("queen").
# Each count row is reshaped to (1, n_features) because cosine_similarity
# operates on 2-D inputs and returns a 2-D matrix (here 1x1).
sim_bow_1_2 = cosine_similarity(
    bow_vectors[0].reshape(1, -1),
    bow_vectors[1].reshape(1, -1),
)
# Expected to be LOW: in the sample corpus the two sentences share only the word "the".
print(f"BoW Similarity (King vs Queen): {sim_bow_1_2[0, 0]:.2f}")

BoW Similarity (King vs Queen): 0.45


In [None]:
# Movie 1 ("king") against Movie 3 ("dog") — the unrelated baseline pair.
# Rows are reshaped to (1, n_features) so cosine_similarity receives 2-D inputs.
sim_bow_1_3 = cosine_similarity(
    bow_vectors[0].reshape(1, -1),
    bow_vectors[2].reshape(1, -1),
)
print(f"BoW Similarity (King vs Dog):  {sim_bow_1_3[0, 0]:.2f}")

BoW Similarity (King vs Dog):  0.24


In [None]:
# --- Method 2: TF-IDF ---
print("\n--- 2. TF-IDF Analysis ---")
# Learn the vocabulary and the inverse-document-frequency weights from the corpus ...
vectorizer_tfidf = TfidfVectorizer().fit(corpus)
# ... then encode each sentence as a dense row of TF-IDF weights.
tfidf_vectors = vectorizer_tfidf.transform(corpus).toarray()


--- 2. TF-IDF Analysis ---


In [None]:
# Movie 1 ("king") against Movie 2 ("queen") using the TF-IDF rows.
# Rows are reshaped to (1, n_features) so cosine_similarity receives 2-D inputs.
sim_tfidf_1_2 = cosine_similarity(
    tfidf_vectors[0].reshape(1, -1),
    tfidf_vectors[1].reshape(1, -1),
)
# Expected to be LOW: TF-IDF still only matches on literal shared words.
print(f"TF-IDF Similarity (King vs Queen): {sim_tfidf_1_2[0, 0]:.2f}")

TF-IDF Similarity (King vs Queen): 0.22


In [None]:
# Movie 1 ("king") against Movie 3 ("dog") using the TF-IDF rows — the unrelated baseline pair.
# Rows are reshaped to (1, n_features) so cosine_similarity receives 2-D inputs.
sim_tfidf_1_3 = cosine_similarity(
    tfidf_vectors[0].reshape(1, -1),
    tfidf_vectors[2].reshape(1, -1),
)
print(f"TF-IDF Similarity (King vs Dog):  {sim_tfidf_1_3[0, 0]:.2f}")

TF-IDF Similarity (King vs Dog):  0.10


In [None]:
# --- Method 3: Word Embeddings (spaCy) ---
print("\n--- 3. Word Embedding Analysis ---")
# Load the large English pipeline downloaded in the install cell.
# NOTE(review): the installer output asks for a runtime restart after the download;
# if this load fails right after installing, restart the runtime and run again.
nlp = spacy.load("en_core_web_lg")


--- 3. Word Embedding Analysis ---


In [None]:
# Run each of the three movie sentences through the spaCy pipeline.
# The resulting Doc objects expose a document vector (spaCy averages the word vectors),
# which the similarity cells below rely on.
doc1, doc2, doc3 = (nlp(corpus[idx]) for idx in range(3))

In [None]:
# Compare Movie 1 ("king") vs Movie 2 ("queen")
# Doc.similarity compares the two document vectors, so related words such as
# "king" and "queen" can contribute even though the sentences share almost no tokens.
sim_embed_1_2 = doc1.similarity(doc2)
print(f"Embedding Similarity (King vs Queen): {sim_embed_1_2:.2f}") # Expect this to be HIGH

Embedding Similarity (King vs Queen): 0.84


In [None]:
# Compare Movie 1 ("king") vs Movie 3 ("dog")
# Baseline for an unrelated pair: this score should come out lower than King vs Queen.
# Read the two embedding scores relative to each other rather than as absolute values.
sim_embed_1_3 = doc1.similarity(doc3)
print(f"Embedding Similarity (King vs Dog):  {sim_embed_1_3:.2f}")

Embedding Similarity (King vs Dog):  0.60


In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [None]:
# --- Import necessary libraries ---
import string
from gensim.models import Word2Vec

In [None]:
# Small hand-written training corpus for the custom Word2Vec model.
# Despite the variable name, every sentence is about academic life
# (professors, students, research); the name is kept because the
# preprocessing cell reads `fantasy_corpus_raw`.
fantasy_corpus_raw = [
    # teaching and studying
    "The professor delivered a lecture on quantum physics in the main auditorium.",
    "Students gathered in the library to prepare for the final examination.",
    # research and publishing
    "The researcher published a groundbreaking paper in a renowned journal.",
    "A group of scholars organized a conference on artificial intelligence.",
    "The graduate student defended her thesis before the academic committee.",
    # funding and awards
    "The university awarded scholarships to outstanding students.",
    "The dean announced a new research grant for innovative projects.",
]


In [None]:
# --- Step 2: Preprocess the Text ---
print("--- Step 2: Preprocessing Corpus ---")

# Translation table that maps every character in string.punctuation to None.
# It does not depend on the sentence, so it is built once instead of once per sentence.
translator = str.maketrans('', '', string.punctuation)

# For each raw sentence: 1. lowercase, 2. strip punctuation, 3. split on whitespace.
# The result is a list of token lists, the input format Word2Vec expects.
processed_corpus = [
    sentence.lower().translate(translator).split()
    for sentence in fantasy_corpus_raw
]

# Let's see what our processed corpus looks like
print("Sample of processed corpus:")
print(processed_corpus[0])
print("-" * 20)

--- Step 2: Preprocessing Corpus ---
Sample of processed corpus:
['the', 'professor', 'delivered', 'a', 'lecture', 'on', 'quantum', 'physics', 'in', 'the', 'main', 'auditorium']
--------------------


In [None]:
# --- Step 3: Training the Word2Vec Model ---
print("--- Step 3: Training the Custom Word2Vec Model ---")

# The key parameters for the model:
# - sentences: Our processed corpus (a list of token lists).
# - vector_size: The number of dimensions for our word vectors.
# - window: The maximum distance between the current and predicted word within a sentence.
# - min_count: Ignores all words with a total frequency lower than this.
# - workers: Number of worker threads. Kept at 1 because multi-threaded training
#   processes examples in a non-deterministic order, and the corpus is tiny anyway.
# - seed: Fixed seed for the random number generator so re-running the notebook
#   gives comparable vectors.
#   NOTE(review): gensim's docs mention PYTHONHASHSEED may also need to be fixed for
#   bit-identical results across interpreter sessions — confirm if that matters here.
model = Word2Vec(
    sentences=processed_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

print("Model training complete!")
print(f"Vocabulary size: {len(model.wv.key_to_index)} words")
print("-" * 20)

--- Step 3: Training the Custom Word2Vec Model ---
Model training complete!
Vocabulary size: 51 words
--------------------


In [None]:
# --- Step 4: Exploring Our Custom Model ---
print("--- Step 4: Exploring the Model's Knowledge ---")


def _print_most_similar(word, topn=3):
    """Print and return the `topn` nearest neighbours of `word` in the trained model.

    Looks `word` up in the module-level `model`. If the word is not in the
    vocabulary, the KeyError is reported with the same "Error: ..." message
    as before and None is returned instead of a neighbour list.
    """
    try:
        neighbours = model.wv.most_similar(word, topn=topn)
    except KeyError as e:
        print(f"Error: {e}")
        return None
    print(f"Words most similar to '{word}':", neighbours)
    return neighbours


# NOTE: the training corpus has only seven sentences, so these neighbours are
# illustrative of the API rather than meaningful semantic relations.

# Find the most similar words to "student"
similar_to_student = _print_most_similar('student')

# Find the most similar words to "dean"
similar_to_dean = _print_most_similar('dean')



--- Step 4: Exploring the Model's Knowledge ---
Words most similar to 'student': [('groundbreaking', 0.2693989872932434), ('scholarships', 0.2060580551624298), ('dean', 0.1976974457502365)]
Words most similar to 'dean': [('researcher', 0.23828670382499695), ('student', 0.1976974606513977), ('published', 0.1565902978181839)]
