<a href="https://colab.research.google.com/github/RonitShetty/NLP-Labs/blob/main/C070_RonitShetty_NLPLab8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP Lab 8
****
**Aim:** Explore various word embedding techniques / vector space models

a) Implementation of the Word2Vec word embedding technique to observe the similarity between two words/sentences

b) Implementation of the GloVe word embedding technique to measure semantic similarity


**Roll No.:** C070  
**Name:** Ronit Shetty  
**SAP ID:** 70322000128  
**Division:** C  
**Batch:** C1  

In [None]:
# This cell handles all the necessary installations, imports, and data downloads.

# Install gensim for word embedding models
!pip install -q gensim

# Install nltk for text processing
!pip install -q nltk

print("Libraries installed.")

# --- Import Libraries ---
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import gensim
import gensim.downloader as api
import numpy as np
from scipy import spatial
import string # To help remove punctuation
import warnings

# Suppress deprecation warnings for cleaner output
warnings.filterwarnings("ignore", category=DeprecationWarning)

# --- Download NLTK Resources ---
# 'punkt' and 'punkt_tab' are tokenizer data packages for word_tokenize.
#   Which of the two is looked up depends on the installed NLTK release, so
#   both are fetched here to keep the notebook runnable from a fresh kernel.
# 'stopwords' is the list of common English stop words.
print("\nDownloading NLTK resources...")
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource, quiet=True)
print("NLTK resources downloaded.")

print("\nEnvironment is ready!")

Libraries installed.

Downloading NLTK resources...
NLTK resources downloaded.

Environment is ready!


In [None]:
# --- 1. Load a pre-trained Word2Vec model ---
# We are using Google's model trained on a massive Google News dataset.
# It includes 300-dimensional vectors for 3 million words and phrases.
# Note: This download is ~1.6GB and might take a few minutes.
print("Downloading the 'word2vec-google-news-300' model. This may take a moment...")
try:
    word2vec_model = api.load('word2vec-google-news-300')
    print("\nWord2Vec Model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")
    # Every statement below needs `word2vec_model`. Re-raise so the cell stops
    # here with the real cause instead of failing later with a NameError.
    raise

# Tokenizer data for word_tokenize (called by preprocess further down);
# presumably required by the installed NLTK version.
nltk.download('punkt_tab')

# --- 2. Exploring Word2Vec Capabilities ---
banner = "=" * 50
print("\n" + banner)
print("PART A: EXPLORING WORD2VEC WORD RELATIONSHIPS")
print(banner)

# Example 1: nearest neighbours of a single query word
print("\nWords most similar to 'programming':")
programming_neighbours = word2vec_model.most_similar('programming', topn=5)
print(programming_neighbours)

# Example 2: analogies solved by vector arithmetic.
# `positive` words are added and `negative` words subtracted before the
# nearest neighbour of the resulting vector is looked up.
print("\nTesting analogy: 'king' - 'man' + 'woman' approx ?")
royalty_answer = word2vec_model.most_similar(
    positive=['king', 'woman'],
    negative=['man'],
    topn=1,
)
print(royalty_answer)

print("\nTesting analogy: 'Paris' - 'France' + 'Germany' approx ?")
capital_answer = word2vec_model.most_similar(
    positive=['Paris', 'Germany'],
    negative=['France'],
    topn=1,
)
print(capital_answer)


# Example 3: similarity score for a handful of word pairs
print("\nMeasuring similarity scores between pairs:")
pairs = [
    ('cat', 'dog'),
    ('car', 'vehicle'),
    ('computer', 'keyboard'),
    ('banana', 'rocket'),
]
for left_word, right_word in pairs:
    pair_score = word2vec_model.similarity(left_word, right_word)
    print(f"Similarity between '{left_word}' and '{right_word}': {pair_score:.4f}")

# Example 4: ask the model which word does not belong with the others
print("\nWhich word doesn't match?")
word_list = ['breakfast', 'cereal', 'dinner', 'lunch', 'computer']
odd_one_out = word2vec_model.doesnt_match(word_list)
print(f"In the list {word_list}, the odd one out is: '{odd_one_out}'")


# --- 3. Find Similarity Between Two Documents ---
part_b_banner = "=" * 50
print("\n" + part_b_banner)
print("PART B: CALCULATING DOCUMENT SIMILARITY WITH WORD2VEC")
print(part_b_banner)

# Two document pairs: one on related topics, one on unrelated topics.
# Each entry carries a display title plus the two texts to compare.
doc_pairs = [
    dict(
        title="Similar Pair (AI/ML)",
        doc1="Natural Language Processing is a fascinating field of artificial intelligence.",
        doc2="AI and machine learning have revolutionized the way we interact with technology.",
    ),
    dict(
        title="Dissimilar Pair (Tech vs. Nature)",
        doc1="The new GPU offers amazing performance for deep learning tasks.",
        doc2="The river flows gently through the lush green valley.",
    ),
]

# English stop words, held in a set; read by preprocess() when filtering tokens.
english_stop_word_list = stopwords.words('english')
stop_words = set(english_stop_word_list)

def preprocess(text):
    """Lower-case and tokenize ``text``, dropping noise tokens.

    Args:
        text: The raw document string.

    Returns:
        A list of lower-cased tokens, in their original order, that consist
        only of alphabetic characters and are not in the module-level
        ``stop_words`` set.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        # isalpha() rejects punctuation and any token containing non-letters.
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

def get_document_vector(doc_tokens, model):
    """Build one vector for a document by averaging its word vectors.

    Args:
        doc_tokens: Iterable of token strings for the document.
        model: Embedding model exposing ``key_to_index`` (vocabulary lookup),
            ``vector_size`` and item access by word (``model[word]``).

    Returns:
        The element-wise mean of the vectors of all tokens found in the
        model's vocabulary, or an all-zero vector of length
        ``model.vector_size`` when none of the tokens is in the vocabulary.
    """
    known_vectors = []
    for token in doc_tokens:
        # Out-of-vocabulary tokens are skipped rather than raising KeyError.
        if token in model.key_to_index:
            known_vectors.append(model[token])

    if len(known_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Score every document pair with the Word2Vec model
for pair in doc_pairs:
    print(f"\n--- Comparing: {pair['title']} ---")
    first_text = pair['doc1']
    second_text = pair['doc2']
    print(f"Doc 1: '{first_text}'")
    print(f"Doc 2: '{second_text}'")

    # Clean + tokenize each text, then average its word vectors into one
    # document vector.
    first_vector = get_document_vector(preprocess(first_text), word2vec_model)
    second_vector = get_document_vector(preprocess(second_text), word2vec_model)

    # scipy returns the cosine *distance*; similarity is its complement.
    cosine_distance = spatial.distance.cosine(first_vector, second_vector)
    cosine_similarity = 1 - cosine_distance
    print(f"Cosine Similarity: {cosine_similarity:.4f}")

Downloading the 'word2vec-google-news-300' model. This may take a moment...

Word2Vec Model loaded successfully!


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



PART A: EXPLORING WORD2VEC WORD RELATIONSHIPS

Words most similar to 'programming':
[('programing', 0.8606171011924744), ('Programming', 0.6899746060371399), ('NLP_neuro_linguistic', 0.6174069046974182), ('broadcasts', 0.5984179377555847), ('primetime_programming', 0.5968459248542786)]

Testing analogy: 'king' - 'man' + 'woman' approx ?
[('queen', 0.7118193507194519)]

Testing analogy: 'Paris' - 'France' + 'Germany' approx ?
[('Berlin', 0.7644002437591553)]

Measuring similarity scores between pairs:
Similarity between 'cat' and 'dog': 0.7609
Similarity between 'car' and 'vehicle': 0.7821
Similarity between 'computer' and 'keyboard': 0.3964
Similarity between 'banana' and 'rocket': 0.0650

Which word doesn't match?
In the list ['breakfast', 'cereal', 'dinner', 'lunch', 'computer'], the odd one out is: 'computer'

PART B: CALCULATING DOCUMENT SIMILARITY WITH WORD2VEC

--- Comparing: Similar Pair (AI/ML) ---
Doc 1: 'Natural Language Processing is a fascinating field of artificial intell

In [None]:
# --- 1. Load a pre-trained GloVe model ---
# 'glove-wiki-gigaword-100' is trained on Wikipedia 2014 + Gigaword 5 and uses
# a 100-dimensional vector for each word. It is smaller and faster to load
# than the Word2Vec model above.
print("Downloading the 'glove-wiki-gigaword-100' model...")
try:
    glove_model = api.load('glove-wiki-gigaword-100')
    print("\nGloVe Model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")
    # Every statement below needs `glove_model`. Re-raise so the cell stops
    # here with the real cause instead of failing later with a NameError.
    raise

# --- 2. Exploring GloVe Capabilities ---
glove_banner = "=" * 50
print("\n" + glove_banner)
print("PART A: EXPLORING GLOVE WORD RELATIONSHIPS")
print(glove_banner)

# Example 1: nearest neighbours of a single query word
print("\nWords most similar to 'technology':")
technology_neighbours = glove_model.most_similar('technology', topn=5)
print(technology_neighbours)

# Example 2: analogies solved by vector arithmetic
# (all query words are written in lower case here).
print("\nTesting analogy: 'king' - 'man' + 'woman' approx ?")
royalty_answer = glove_model.most_similar(
    positive=['king', 'woman'],
    negative=['man'],
    topn=1,
)
print(royalty_answer)

print("\nTesting analogy: 'rome' - 'italy' + 'japan' approx ?")
capital_answer = glove_model.most_similar(
    positive=['rome', 'japan'],
    negative=['italy'],
    topn=1,
)
print(capital_answer)

# Example 3: similarity score for the same word pairs used with Word2Vec
print("\nMeasuring similarity scores between pairs:")
pairs = [
    ('cat', 'dog'),
    ('car', 'vehicle'),
    ('computer', 'keyboard'),
    ('banana', 'rocket'),
]
for left_word, right_word in pairs:
    pair_score = glove_model.similarity(left_word, right_word)
    print(f"Similarity between '{left_word}' and '{right_word}': {pair_score:.4f}")

# Example 4: ask the model which word does not belong with the others
print("\nWhich word doesn't match?")
word_list = ['apple', 'banana', 'orange', 'fruit', 'bicycle']
odd_one_out = glove_model.doesnt_match(word_list)
print(f"In the list {word_list}, the odd one out is: '{odd_one_out}'")


# --- 3. Find Similarity Between Two Documents using GloVe ---
glove_part_b_banner = "=" * 50
print("\n" + glove_part_b_banner)
print("PART B: CALCULATING DOCUMENT SIMILARITY WITH GLOVE")
print(glove_part_b_banner)

# The document pairs are the same texts used in the Word2Vec section,
# restated here so this cell's inputs are visible in place.
# The helper functions preprocess and get_document_vector are NOT redefined;
# they come from the Word2Vec cell, which must have been run first.

doc_pairs = [
    dict(
        title="Similar Pair (AI/ML)",
        doc1="Natural Language Processing is a fascinating field of artificial intelligence.",
        doc2="AI and machine learning have revolutionized the way we interact with technology.",
    ),
    dict(
        title="Dissimilar Pair (Tech vs. Nature)",
        doc1="The new GPU offers amazing performance for deep learning tasks.",
        doc2="The river flows gently through the lush green valley.",
    ),
]

# Score every document pair with the GloVe model
for pair in doc_pairs:
    print(f"\n--- Comparing: {pair['title']} ---")
    first_text = pair['doc1']
    second_text = pair['doc2']
    print(f"Doc 1: '{first_text}'")
    print(f"Doc 2: '{second_text}'")

    # Clean + tokenize each text, then average its GloVe word vectors into one
    # document vector.
    first_vector = get_document_vector(preprocess(first_text), glove_model)
    second_vector = get_document_vector(preprocess(second_text), glove_model)

    # scipy returns the cosine *distance*; similarity is its complement.
    cosine_distance = spatial.distance.cosine(first_vector, second_vector)
    cosine_similarity = 1 - cosine_distance
    print(f"Cosine Similarity (using GloVe): {cosine_similarity:.4f}")

Downloading the 'glove-wiki-gigaword-100' model...

GloVe Model loaded successfully!

PART A: EXPLORING GLOVE WORD RELATIONSHIPS

Words most similar to 'technology':
[('technologies', 0.8506267666816711), ('computer', 0.7642159461975098), ('tech', 0.7489413619041443), ('software', 0.7358859181404114), ('systems', 0.7292639017105103)]

Testing analogy: 'king' - 'man' + 'woman' approx ?
[('queen', 0.7698540687561035)]

Testing analogy: 'rome' - 'italy' + 'japan' approx ?
[('tokyo', 0.7762303948402405)]

Measuring similarity scores between pairs:
Similarity between 'cat' and 'dog': 0.8798
Similarity between 'car' and 'vehicle': 0.8631
Similarity between 'computer' and 'keyboard': 0.5418
Similarity between 'banana' and 'rocket': -0.0064

Which word doesn't match?
In the list ['apple', 'banana', 'orange', 'fruit', 'bicycle'], the odd one out is: 'bicycle'

PART B: CALCULATING DOCUMENT SIMILARITY WITH GLOVE

--- Comparing: Similar Pair (AI/ML) ---
Doc 1: 'Natural Language Processing is a fas