In [None]:
!pip install gensim



In [None]:
# Block 1: Imports and sample documents
import gensim
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk import word_tokenize
import nltk
#nltk.download('all')

# Three short sample documents: two about language/ML and one unrelated topic.
doc1 = "Natural language processing is a field of artificial intelligence."
doc2 = "Machine learning helps in processing human language and speech."
doc3 = "Football is a popular sport played worldwide."

documents = [doc1, doc2, doc3]

# Fetch only the NLTK resources this notebook uses, and only when they are not
# already installed, so a fresh kernel can run top to bottom without the blanket
# nltk.download('all'). Newer NLTK releases read the tokenizer model from
# 'punkt_tab' while older ones use 'punkt'; a failed download of either returns
# False rather than raising, so asking for both is safe.
for resource_path, package in (
    ("corpora/stopwords", "stopwords"),
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package, quiet=True)

# English stop words, stored as a set for constant-time membership checks.
stop_words = set(stopwords.words('english'))
def preprocess(doc):
    """Turn a raw document string into a list of cleaned tokens.

    The text is lowercased and tokenized with `word_tokenize`; tokens that are
    not purely alphabetic, or that appear in the module-level `stop_words`
    set, are discarded. Token order is preserved.
    """
    tokens = word_tokenize(doc.lower())
    return [tok for tok in tokens if tok.isalpha() and tok not in stop_words]



In [None]:
# Load the job postings and tokenize the free-text 'Responsibilities' column.
jobs_raw = pd.read_csv('job_dataset.csv')

# A missing cell is read as a float NaN, which preprocess() cannot lowercase,
# so drop those rows before tokenizing. With no missing values this is a no-op.
documents = jobs_raw['Responsibilities'].dropna()
docs = [preprocess(doc) for doc in documents]

# Show the corpus size and a small sample instead of dumping every document.
print("Number of tokenized documents:", len(docs))
print("Tokenized Documents:", docs[:3])

Tokenized Documents: [['assist', 'coding', 'debugging', 'applications', 'learn', 'apply', 'framework', 'core', 'fundamentals', 'support', 'team', 'building', 'mvc', 'web', 'applications', 'write', 'basic', 'sql', 'queries', 'work', 'entity', 'framework', 'collaborate', 'peers', 'solve', 'issues', 'participate', 'code', 'reviews', 'learning', 'follow', 'best', 'practices', 'coding', 'work', 'version', 'control', 'git'], ['write', 'simple', 'c', 'programs', 'guidance', 'support', 'development', 'mvc', 'applications', 'implement', 'razor', 'views', 'logic', 'assist', 'database', 'query', 'writing', 'participate', 'unit', 'testing', 'tasks', 'learn', 'apply', 'linq', 'data', 'operations', 'work', 'mentors', 'code', 'corrections'], ['contribute', 'development', 'small', 'modules', 'assist', 'bug', 'fixing', 'debugging', 'learn', 'implement', 'mvc', 'patterns', 'support', 'database', 'integration', 'tasks', 'understand', 'version', 'control', 'basics', 'work', 'minor', 'testing', 'scripts', 

In [None]:
# Block 2: Fit a Word2Vec model on the tokenized documents
model = Word2Vec(
    sentences=docs,
    vector_size=300,  # dimensionality of each word vector
    window=10,        # context window size
    min_count=1,      # keep every token, even those seen only once
    workers=4,        # training threads
)

# Peek at the first ten vocabulary entries.
vocab_preview = list(model.wv.index_to_key)[:10]
print("Word2Vec Vocabulary:", vocab_preview)

Word2Vec Vocabulary: ['collaborate', 'lead', 'teams', 'implement', 'design', 'assist', 'manage', 'support', 'develop', 'mentor']


In [None]:
# Block 3: Compute document vectors by averaging word embeddings
def document_vector(doc):
    """Return the mean Word2Vec embedding of the in-vocabulary tokens of `doc`.

    Args:
        doc: iterable of token strings.

    Returns:
        A 1-D array of length `model.wv.vector_size`. If none of the tokens is
        in the vocabulary (including an empty `doc`), a zero vector is returned,
        mirroring `get_glove_doc_vector`.
    """
    known = [word for word in doc if word in model.wv]
    if not known:
        # Indexing model.wv with an empty list raises
        # "ValueError: need at least one array to concatenate".
        return np.zeros(model.wv.vector_size)
    return np.mean(model.wv[known], axis=0)

# Build one averaged embedding per tokenized document. `docs` is the token list
# produced by the preprocessing cell (the same input the GloVe cells use).
doc_vectors = np.array([document_vector(tokens) for tokens in docs])
print("Shape of document vectors:", doc_vectors.shape)


ValueError: need at least one array to concatenate

In [None]:
# Block 4: Pairwise cosine similarity between the averaged document vectors
similarity_matrix = cosine_similarity(X=doc_vectors)
print("Cosine Similarity Matrix:\n", similarity_matrix)

Cosine Similarity Matrix:
 [[ 1.0000002   0.30129337 -0.02288702]
 [ 0.30129337  0.99999964 -0.02391089]
 [-0.02288702 -0.02391089  0.99999994]]


In [None]:
# Block 5: Word-level similarity examples
# Each pair drives both the printed label and the lookup, so the label can no
# longer drift away from the words that are actually compared.
word_pairs = [("processing", "language"), ("peers", "follow")]
for first, second in word_pairs:
    missing = [word for word in (first, second) if word not in model.wv]
    if missing:
        # The vocabulary depends on the training corpus; report the gap instead
        # of letting the lookup raise KeyError.
        print(f"Cannot compare '{first}' and '{second}': not in vocabulary: {missing}")
        continue
    print(f"Similarity between '{first}' and '{second}':", model.wv.similarity(first, second))

Similarity between 'processing' and 'language': -0.0118171135
Similarity between 'peers' and 'follow': -0.04430993


In [None]:
# Try GloVe and compare it against the Word2Vec model trained above.
# Block 6: Load pretrained GloVe embeddings through the gensim downloader
from gensim.downloader import load
import numpy as np

glove_name = "glove-wiki-gigaword-50"

print("Loading GloVe (50d)... This may take a few seconds.")
glove_model = load(glove_name)
print("GloVe model loaded successfully!")


Loading GloVe (50d)... This may take a few seconds.
GloVe model loaded successfully!


In [None]:
# Block 7: Average the GloVe vectors of a document's tokens
def get_glove_doc_vector(glove_model, doc_tokens):
    """Return the mean embedding of the tokens of `doc_tokens` found in `glove_model`.

    Args:
        glove_model: embedding lookup supporting `token in glove_model`,
            `glove_model[token]` and a `vector_size` attribute.
        doc_tokens: iterable of token strings.

    Returns:
        The element-wise mean of the vectors of all known tokens, or a zero
        vector of length `glove_model.vector_size` when no token is known.
    """
    known_vectors = [glove_model[tok] for tok in doc_tokens if tok in glove_model]
    if not known_vectors:
        return np.zeros(glove_model.vector_size)
    return np.mean(known_vectors, axis=0)


In [None]:
# Block 8: Build one averaged GloVe embedding per tokenized document
glove_doc_vectors = np.array(
    [get_glove_doc_vector(glove_model, tokens) for tokens in docs]
)
print("Shape of GloVe document embeddings:", glove_doc_vectors.shape)


Shape of GloVe document embeddings: (3, 50)


In [None]:
# Block 9: Pairwise cosine similarity of the GloVe document embeddings
from sklearn.metrics.pairwise import cosine_similarity

glove_similarity = cosine_similarity(X=glove_doc_vectors)

# Round to three decimals for a compact printout.
print("\nCosine Similarity Matrix (GloVe):")
print(glove_similarity.round(3))



Cosine Similarity Matrix (GloVe):
[[1.    0.902 0.581]
 [0.902 1.    0.588]
 [0.581 0.588 1.   ]]


In [None]:
# Block 10: Compare Word2Vec and GloVe similarity matrices side-by-side
import pandas as pd


def _labelled_similarity(matrix):
    """Wrap a square similarity matrix in a DataFrame labelled Doc1..DocN.

    The labels are derived from the matrix size, so the table works for any
    number of documents rather than exactly three.
    """
    labels = [f"Doc{i}" for i in range(1, len(matrix) + 1)]
    return pd.DataFrame(matrix, columns=labels, index=labels)


w2v_similarity_df = _labelled_similarity(similarity_matrix)
glove_similarity_df = _labelled_similarity(glove_similarity)

print("\n--- Word2Vec Similarity ---")
display(w2v_similarity_df)

print("\n--- GloVe Similarity ---")
display(glove_similarity_df)



--- Word2Vec Similarity ---


Unnamed: 0,Doc1,Doc2,Doc3
Doc1,1.0,0.301293,-0.022887
Doc2,0.301293,1.0,-0.023911
Doc3,-0.022887,-0.023911,1.0



--- GloVe Similarity ---


Unnamed: 0,Doc1,Doc2,Doc3
Doc1,1.0,0.902362,0.581222
Doc2,0.902362,1.0,0.588385
Doc3,0.581222,0.588385,1.0
