In [1]:
# --- Standard library ---
import warnings

# --- Third-party: numerics and plotting ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Third-party: NLP / embeddings ---
import gensim
from gensim.models import Word2Vec, FastText
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# --- Third-party: scikit-learn ---
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

# Register the local 'nltk_packages' directory as an NLTK data location and
# download the 'punkt_tab' tokenizer resource into it (no-op when up to date).
nltk.data.path.append('nltk_packages')
nltk.download('punkt_tab', download_dir='nltk_packages')

# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation warnings; narrow the filter if they matter.
warnings.filterwarnings(action='ignore')

[nltk_data] Downloading package punkt_tab to nltk_packages...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Toy corpus; later cells in this notebook reuse `documents`.
documents = [
    "The cat sat on the mat.",
    "The dog sat on the log.",
    "Cats and dogs are great pets.",
]

# Bag of Words: learn the vocabulary and count each term per document.
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(documents)

# One row per document, one column per vocabulary term.
bow_array = X_bow.toarray()
feature_names = vectorizer.get_feature_names_out()

print("Bag of Words representation:", bow_array, sep="\n")
print("Feature Names:", feature_names)

Bag of Words representation:
[[0 0 1 0 0 0 0 0 1 1 0 1 2]
 [0 0 0 0 1 0 0 1 0 1 0 1 2]
 [1 1 0 1 0 1 1 0 0 0 1 0 0]]
Feature Names: ['and' 'are' 'cat' 'cats' 'dog' 'dogs' 'great' 'log' 'mat' 'on' 'pets'
 'sat' 'the']


In [3]:
# TF-IDF: weight each term's count in a document by how rare the term is
# across the whole corpus.
# TfidfVectorizer is imported in the notebook's first cell, so it is not
# re-imported here (all imports live in one place).
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(documents)

# Convert to a dense array for display; fine for this three-document corpus.
tfidf_array = X_tfidf.toarray()
print("\nTF-IDF representation:")
print(tfidf_array)
print("Feature Names:", tfidf_vectorizer.get_feature_names_out())


TF-IDF representation:
[[0.         0.         0.42755362 0.         0.         0.
  0.         0.         0.42755362 0.32516555 0.         0.32516555
  0.6503311 ]
 [0.         0.         0.         0.         0.42755362 0.
  0.         0.42755362 0.         0.32516555 0.         0.32516555
  0.6503311 ]
 [0.40824829 0.40824829 0.         0.40824829 0.         0.40824829
  0.40824829 0.         0.         0.         0.40824829 0.
  0.        ]]
Feature Names: ['and' 'are' 'cat' 'cats' 'dog' 'dogs' 'great' 'log' 'mat' 'on' 'pets'
 'sat' 'the']


In [4]:
# Lowercase and tokenize each document; the model is trained on a list of
# token lists, one per document.
tokenized_docs = [word_tokenize(doc.lower()) for doc in documents]

# Train a Word2Vec model with the CBOW architecture (sg=0).
# min_count=1 keeps every token, since words in this toy corpus occur only
# once or twice.
# Reproducibility: the seed is pinned explicitly (1 is gensim's default) and
# training is restricted to a single worker thread, because multi-threaded
# training makes the resulting vectors vary between runs.
# NOTE(review): gensim's docs also mention PYTHONHASHSEED for fully
# deterministic runs -- confirm whether that matters for the installed version.
model_w2v = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
    seed=1,
    workers=1,
)

# Look up the learned 100-dimensional vector for one vocabulary word.
word_vector = model_w2v.wv['cat']
print("\nWord2Vec representation for 'cat':", word_vector)

# Nearest neighbours of 'cat' in the embedding space. With only three training
# sentences the ranking is essentially noise; this only demonstrates the API.
similar_words = model_w2v.wv.most_similar('cat', topn=3)
print("Most similar words to 'cat':", similar_words)


Word2Vec representation for 'cat': [-0.00950012  0.00956222 -0.00777076 -0.00264551 -0.00490641 -0.0049667
 -0.00802359 -0.00778358 -0.00455321 -0.00127536 -0.00510299  0.00614054
 -0.00951662 -0.0053071   0.00943715  0.00699133  0.00767582  0.00423474
  0.00050709 -0.00598114  0.00601878  0.00263503  0.00769943  0.00639384
  0.00794257  0.00865741 -0.00989575 -0.0067557   0.00133757  0.0064403
  0.00737382  0.00551698  0.00766163 -0.00512557  0.00658441 -0.00410837
 -0.00905534  0.00914168  0.0013314  -0.00275968 -0.00247784 -0.00422048
  0.00481234  0.00440022 -0.00265336 -0.00734188 -0.00356585 -0.00033661
  0.00609589 -0.00283734 -0.00012089  0.00087973 -0.00709565  0.002065
 -0.00143242  0.00280215  0.00484222 -0.00135202 -0.00278014  0.00773865
  0.0050456   0.00671352  0.00451564  0.00866716  0.00747497 -0.00108189
  0.00874764  0.00460172  0.00544063 -0.00138608 -0.00204132 -0.00442435
 -0.0085152   0.00303773  0.00888319  0.00891974 -0.00194235  0.00608616
  0.00377972 -0.004

In [5]:
# Similarity score between the trained Word2Vec vectors of 'cat' and 'dogs'.
w2v_keyed_vectors = model_w2v.wv
similarity = w2v_keyed_vectors.similarity('cat', 'dogs')
print("Cosine similarity between 'cat' and 'dogs':", similarity)

Cosine similarity between 'cat' and 'dogs': -0.025461027


In [6]:
# Load pre-trained GloVe vectors into a {word: float32 vector} dictionary.
# Requires 'glove.6B.100d.txt' to be present at GLOVE_PATH. Each non-empty
# line is parsed as: <word> <float> <float> ...
# numpy (`np`) comes from the notebook's first cell, so it is not re-imported.
GLOVE_PATH = "datasets/nlp_pre_trained_word_embeddings/glove.6b/glove.6B.100d.txt"

glove_vectors = {}
with open(GLOVE_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line would otherwise raise IndexError on values[0].
            continue
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_vectors[word] = vector

# Get vector for a specific word (.get returns None if the word is absent).
glove_vector_cat = glove_vectors.get('cat')
print("\nGloVe representation for 'cat':", glove_vector_cat)


GloVe representation for 'cat': [ 0.23088    0.28283    0.6318    -0.59411   -0.58599    0.63255
  0.24402   -0.14108    0.060815  -0.7898    -0.29102    0.14287
  0.72274    0.20428    0.1407     0.98757    0.52533    0.097456
  0.8822     0.51221    0.40204    0.21169   -0.013109  -0.71616
  0.55387    1.1452    -0.88044   -0.50216   -0.22814    0.023885
  0.1072     0.083739   0.55015    0.58479    0.75816    0.45706
 -0.28001    0.25225    0.68965   -0.60972    0.19578    0.044209
 -0.31136   -0.68826   -0.22721    0.46185   -0.77162    0.10208
  0.55636    0.067417  -0.57207    0.23735    0.4717     0.82765
 -0.29263   -1.3422    -0.099277   0.28139    0.41604    0.10583
  0.62203    0.89496   -0.23446    0.51349    0.99379    1.1846
 -0.16364    0.20653    0.73854    0.24059   -0.96473    0.13481
 -0.0072484  0.33016   -0.12365    0.27191   -0.40951    0.021909
 -0.6069     0.40755    0.19566   -0.41802    0.18636   -0.032652
 -0.78571   -0.13847    0.044007  -0.084423   0.04911

In [7]:
# Train a FastText model on the tokenized toy corpus.
# min_count=1 keeps every token, since words in this corpus occur only once
# or twice.
# Reproducibility: the seed is pinned explicitly (1 is gensim's default) and
# training is restricted to a single worker thread, because multi-threaded
# training makes the resulting vectors vary between runs.
model_ft = FastText(
    sentences=tokenized_docs,
    vector_size=100,
    window=5,
    min_count=1,
    seed=1,
    workers=1,
)

# Look up the learned vector for one vocabulary word.
fasttext_vector = model_ft.wv['dog']
print("\nFastText representation for 'dog':", fasttext_vector)

# Nearest neighbours of 'dog' in the embedding space. On a three-sentence
# corpus the ranking is not meaningful; this only demonstrates the API.
similar_fasttext_words = model_ft.wv.most_similar('dog', topn=3)
print("Most similar words to 'dog':", similar_fasttext_words)


FastText representation for 'dog': [-1.1338070e-03  2.0002944e-03 -2.5797717e-03  1.4003406e-03
 -2.4381576e-03 -3.4200454e-03  5.9867237e-04 -7.3052529e-04
  1.1329837e-03  1.7838406e-04  3.0663244e-03  7.0723880e-04
  1.7959451e-03  4.8864083e-03  4.6188425e-04 -2.9081232e-03
 -4.3916734e-04  7.4553618e-04 -7.3311763e-04 -2.0251889e-03
  1.4080647e-03 -8.4824621e-04 -1.9739866e-03 -2.5472145e-03
 -4.6196635e-04  1.7570242e-03  1.7598222e-03  3.7484278e-03
  2.1523852e-03 -1.4960765e-03 -5.4163342e-05 -1.9794821e-03
  1.2051348e-05 -1.0916854e-03 -2.7709417e-03  2.5547245e-03
  3.1367261e-03  1.1269423e-03 -4.6792114e-03 -6.3573674e-04
 -2.6462893e-03  1.9067300e-03  2.7983184e-03  1.9837550e-03
 -7.1428862e-04 -3.0612983e-03  1.8265255e-03  3.6188185e-03
 -1.3605955e-03 -6.1967788e-04  3.3724245e-03  8.3654711e-04
  4.9976283e-03  2.9725847e-03 -2.8033410e-03  2.1066701e-03
 -1.7428529e-03 -1.3199404e-03  2.0273766e-03  1.1573492e-03
  3.0376336e-03  4.5514925e-04  1.8565302e-03  4.

## Detailed Explanation of Techniques:
 - **Bag of Words:** This method creates a sparse matrix where each row represents a document and each column represents a unique word from the corpus. The values are counts of how often each word appears in each document.

  - **TF-IDF:** This method builds on BoW by calculating how important a word is to a document relative to its frequency across all documents. It helps reduce the weight of common words that may not be informative.

  - **Word2Vec:** This technique uses neural networks to learn embeddings that reflect semantic relationships based on context. It can be trained using two architectures: Continuous Bag of Words (CBOW) or Skip-Gram.
  - **GloVe:** Unlike Word2Vec, which uses local context windows, GloVe captures global statistical information by factorizing a matrix of word co-occurrences.
  - **FastText:** This method improves upon Word2Vec by considering subword information (character n-grams), which allows it to generate embeddings for words not seen during training.

These techniques collectively enhance our ability to analyze and understand text data semantically and contextually, enabling more effective natural language processing applications.