### ***Main.py***

In [None]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import numpy as np
import spacy
from nlp_module import *

# Demo driver: clean a few documents, embed them with spaCy, and print
# document-level and word-level cosine similarities.
nlp = spacy.load("en_core_web_sm")
# Use the vocab's reported vector width; 300 is the fallback when it is 0.
# NOTE(review): confirm 300 matches the width of the vectors this pipeline
# actually returns from `doc.vector`.
VECTOR_SIZE = nlp.vocab.vectors_length or 300

documents = [
    "news article",
    "ad sales boost time warner profit",
    "quarterly profits at us media giant timewarner jumped to bn m",
    "time warner said on friday that it now owns of searchengine google",
]

# --- Pre-processing -------------------------------------------------------
cleaned_docs = preprocess_docs(documents)

print("\nCleaned Documents:")
for i, doc in enumerate(cleaned_docs, start=1):
    print(f"{i}: {doc}")

# --- Single word vector ---------------------------------------------------
king_vec = get_word_vector(nlp, "king", VECTOR_SIZE)
print("\nWord Vector for 'king' (first 10 dims):")
print(king_vec[:10])

# --- Document embeddings and their pairwise similarity --------------------
doc_vectors = get_document_embeddings(nlp, cleaned_docs, VECTOR_SIZE)
print("\nDocument Embedding Shape:", doc_vectors.shape)

doc_sim = cosine_sim_embeddings(doc_vectors)
print("\nCosine Similarity Between Documents:")
print(np.round(doc_sim, 3))

# --- Word-to-word similarity ----------------------------------------------
sentence = "dog cat car skym apple"
sim_matrix, words_used = word_similarity(nlp, sentence, VECTOR_SIZE)

if sim_matrix is not None:
    print("\nWord Similarity Matrix (words with vectors):")
    # Only the upper triangle is printed: each unordered pair appears once.
    for i, w1 in enumerate(words_used):
        for j in range(i + 1, len(words_used)):
            w2 = words_used[j]
            print(f"{w1} ↔ {w2} : {sim_matrix[i, j]:.3f}")

# Fixed commentary; these lines are not derived from the numbers above.
observations = (
    "• 'dog' and 'cat' have high similarity due to both being animals.",
    "• 'car' is moderately similar to 'dog' and 'cat' due to physical object context.",
    "• 'skym' may be OOV → low similarity with other words.",
    "• Embeddings capture meaning beyond frequency (unlike TF-IDF).",
)
print("\nObservations:")
print(*observations, sep="\n")

### ***nlp_module.py***

In [None]:
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def clean_text(text):
    """Normalise *text* to lower-case letters separated by single spaces.

    Every character that is not ``a``-``z`` or whitespace is deleted (not
    replaced by a space), so ``"time-warner"`` becomes ``"timewarner"``.
    Runs of whitespace are then collapsed to one space and the ends trimmed.
    """
    letters_only = re.sub(r"[^a-z\s]", "", text.lower())
    return re.sub(r"\s+", " ", letters_only).strip()

def preprocess_docs(docs):
    """Clean every document and drop the ones that end up empty.

    Args:
        docs: Iterable of raw text strings.

    Returns:
        List of cleaned strings, in input order, with empty results removed.
    """
    # Filter on the *cleaned* text: inputs such as "123" or "!!!" are
    # non-blank before cleaning but reduce to "" afterwards, and an empty
    # document carries nothing to embed.
    cleaned = (clean_text(d) for d in docs)
    return [text for text in cleaned if text]

def get_word_vector(nlp, word, vector_size):
    """Return the vector the pipeline assigns to *word*.

    ``nlp`` is called on ``word``; if the result reports ``has_vector`` its
    ``vector`` is returned, otherwise a zero array of length ``vector_size``.
    """
    parsed = nlp(word)
    return parsed.vector if parsed.has_vector else np.zeros(vector_size)

def get_document_embeddings(nlp, docs, vector_size):
    """Embed each document and stack the vectors into one 2-D array.

    Args:
        nlp: Pipeline object exposing ``pipe(docs)``; each yielded item must
            provide ``has_vector`` and ``vector``.
        docs: Iterable of text strings.
        vector_size: Width used for the result when no document yields a
            vector (including when ``docs`` is empty).

    Returns:
        Array with one row per document. Documents without a vector get a
        row of zeros.
    """
    # None marks a document without a vector; it is filled in once the real
    # vector width is known.
    vectors = [doc.vector if doc.has_vector else None for doc in nlp.pipe(docs)]

    # np.vstack raises on an empty sequence, so return an empty matrix.
    if not vectors:
        return np.zeros((0, vector_size))

    # Size the zero rows from the vectors actually produced so the rows
    # always stack, even if `vector_size` differs from the real width.
    dim = next((len(v) for v in vectors if v is not None), vector_size)
    return np.vstack([np.zeros(dim) if v is None else v for v in vectors])

def cosine_sim_embeddings(vectors):
    """Return the cosine-similarity matrix for ``vectors``.

    Thin wrapper: passes ``vectors`` as the only argument to scikit-learn's
    ``cosine_similarity`` and returns its result unchanged.
    """
    # NOTE(review): presumably `vectors` holds one embedding per row, so the
    # result compares every row with every other row — confirm against caller.
    return cosine_similarity(vectors)

def word_similarity(nlp, sentence, vector_size):
    """Compute pairwise cosine similarity between the words of *sentence*.

    The sentence is split on whitespace and each word is looked up with
    ``get_word_vector``. Words whose vector is all zeros are skipped.

    Returns:
        ``(sim_matrix, valid_words)``, where ``valid_words`` lists the kept
        words in order. ``sim_matrix`` is ``None`` when fewer than two words
        were kept; otherwise it is the cosine-similarity matrix of their
        vectors.
    """
    looked_up = (
        (token, get_word_vector(nlp, token, vector_size))
        for token in sentence.split()
    )
    # An all-zero vector is the "no embedding" marker, so drop those words.
    kept = [(token, vec) for token, vec in looked_up if np.any(vec)]
    valid_words = [token for token, _ in kept]

    # A similarity matrix needs at least one pair of words.
    if len(kept) < 2:
        return None, valid_words

    stacked = np.vstack([vec for _, vec in kept])
    return cosine_similarity(stacked), valid_words