# Count-Based methods

In [1]:
Doc_1= "The cat in the hat"
Doc_2= "The quick brown fox"
Doc_3= "The hat is blue"

Docs =[Doc_1,Doc_2,Doc_3]

# **Term Frequency (TF)**

* **TF(t,d) is the term frequency of term t in document d (how often the term appears in the document).**

In [2]:
lst = []
for d in Docs:
    lst.extend(d.lower().split(' '))
wrds = set(lst) # remove duplicate words
wrds

{'blue', 'brown', 'cat', 'fox', 'hat', 'in', 'is', 'quick', 'the'}

In [3]:
#form a dataframe to represent TF for each word in each Document where columns are words and rows are documents
import pandas as pd
def count_wrd_Doc(wrd,doc):
    i=0
    for w in doc.lower().split(' '):
        if wrd == w:
            i = i+1
    return i/len(doc.lower().split(' '))
    
tf_df = pd.DataFrame(columns=list(wrds)) #empty dataframe initialized with words column headers
freq_lst=[] #empty list for each column to save word frequencies in each document
for c in tf_df.columns:
    freq_lst=[]#empty the list
    for d in Docs:
        freq_lst.append(count_wrd_Doc(c,d))#append the frequency of word in document d
    tf_df[c]=freq_lst #assign values to column
tf_df #display the dataframe of TF for each word in each document

Unnamed: 0,fox,the,brown,blue,hat,in,quick,is,cat
0,0.0,0.4,0.0,0.0,0.2,0.2,0.0,0.0,0.2
1,0.25,0.25,0.25,0.0,0.0,0.0,0.25,0.0,0.0
2,0.0,0.25,0.0,0.25,0.25,0.0,0.0,0.25,0.0


# Document Frequency (DF)

* **Calculate Document Frequency (DF): the word appears in how many documents**

In [4]:
df_df = pd.DataFrame(columns=list(wrds)) #empty dataframe initialized with words column headers
for c in df_df.columns:
    df_df[c] = [sum(1 for doc in Docs if c in doc.lower().split(' '))]
df_df #display the dataframe of DF for each word 

Unnamed: 0,fox,the,brown,blue,hat,in,quick,is,cat
0,1,3,1,1,2,1,1,1,1


# Inverse Document Frequency (IDF)

* **IDF(t,D) is the inverse document frequency of term t in the entire document set D (logarithmically scaled inverse fraction of the documents that contain the term).**

In [5]:
import math
idf_df = pd.DataFrame(columns=list(wrds)) #empty dataframe initialized with words column headers
for c in idf_df.columns:
    N = 3 #No of documents
    df = df_df[c].iloc[0] # DF of word
    idf_df[c] = [math.log((N+1) / (df+1))+1]#IDF = log (no. of documents/DF(word)) 
idf_df #display the dataframe of idf for each word 

Unnamed: 0,fox,the,brown,blue,hat,in,quick,is,cat
0,1.693147,1.0,1.693147,1.693147,1.287682,1.693147,1.693147,1.693147,1.693147


# Term Frequency - Inverse Document Frequency (TF-IDF)

*** TF-IDF = TF * IDF**

In [6]:
tfidf_df = pd.DataFrame(columns=list(wrds)) #empty dataframe initialized with words column headers
tfidf_lst=[]  #empty list for each column 
for c in tfidf_df.columns:
    tfidf_lst=[] #empty list for each column
    for i in range(0,len(Docs)):
        tf_idf_d1 = tf_df[c].iloc[i]*idf_df[c].iloc[0] #append tf of word in i th document to idf of word
        tfidf_lst.append(tf_idf_d1)
    tfidf_df[c]=tfidf_lst#assign tfidf values for each word
tfidf_df #display the dataframe of tf-idf for all words

Unnamed: 0,fox,the,brown,blue,hat,in,quick,is,cat
0,0.0,0.4,0.0,0.0,0.257536,0.338629,0.0,0.0,0.338629
1,0.423287,0.25,0.423287,0.0,0.0,0.0,0.423287,0.0,0.0
2,0.0,0.25,0.0,0.423287,0.321921,0.0,0.0,0.423287,0.0


# L2 Normalization

* **L2 normalization, also known as Euclidean normalization or L2 norm normalization, is a technique used to scale vectors (or arrays) in such a way that their Euclidean norm becomes equal to 1.**

In [7]:
normalized_df = pd.DataFrame(columns=tfidf_df.columns)

# Apply L2 normalization to each document's TF-IDF values
for i,row in enumerate(tfidf_df.iterrows()):
    # Extract TF-IDF values    
    tfidf_values_list = list(tfidf_df.iloc[i].values)
    # Calculate L2 norm
    l2_norm = math.sqrt(sum(val**2 for val in tfidf_values_list))
    # Normalize TF-IDF values using L2 norm
    normalized_tfidf = [val / l2_norm for val in list(tfidf_df.iloc[i].values)]
    new_row = pd.Series(normalized_tfidf, index=tfidf_df.columns)
    normalized_df.loc[len(normalized_df)] = new_row
    
normalized_df

Unnamed: 0,fox,the,brown,blue,hat,in,quick,is,cat
0,0.0,0.592567,0.0,0.0,0.381519,0.501651,0.0,0.0,0.501651
1,0.546454,0.322745,0.546454,0.0,0.0,0.0,0.546454,0.0,0.0
2,0.0,0.345205,0.0,0.584483,0.444514,0.0,0.0,0.584483,0.0


# TfidfVectorizer Python Library

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(norm='l2',smooth_idf=True)

# Fit the documents and transform them into a TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(Docs)

# Get the feature names (terms) from the vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

df_tfidf = pd.DataFrame(data=tfidf_matrix.toarray(), columns=feature_names)
df_tfidf

Unnamed: 0,blue,brown,cat,fox,hat,in,is,quick,the
0,0.0,0.0,0.501651,0.0,0.381519,0.501651,0.0,0.0,0.592567
1,0.0,0.546454,0.0,0.546454,0.0,0.0,0.0,0.546454,0.322745
2,0.584483,0.0,0.0,0.0,0.444514,0.0,0.584483,0.0,0.345205


* **The TfidfVectorizer in scikit-learn, by default, adds a smoothing term to the denominator of the IDF calculation to avoid division by zero. This is done to handle the case where a term is present in all documents, ensuring that the IDF is not undefined.**
* **L2 normalization, also known as Euclidean normalization or L2 norm normalization, is a technique used to scale vectors (or arrays) in such a way that their Euclidean norm becomes equal to 1.**

# Unigram

* **A unigram, in the context of natural language processing (NLP) and linguistics, refers to a single unit or token of a word. It is the simplest form of linguistic analysis where text is broken down into individual words. In other words, a unigram is a term used to describe a single word in a sequence of words.**
* **Unigrams are the building blocks for more complex linguistic analyses, such as bigrams (pairs of consecutive words), trigrams (triplets of consecutive words), and n-grams in general.**

In [9]:
#Probability of unigram P(w)=C(w)/m same idea of TF
def count_wrd_Doc(wrd,doc):
    i=0
    for w in doc.lower().split(' '):
        if wrd == w:
            i = i+1
    return i
    
unigram_df = pd.DataFrame(columns=list(wrds)) #empty dataframe initialized with words column headers
freq_lst=[] #empty list for each column to save word frequencies in each document
for c in tf_df.columns:
    freq_lst=[]#empty the list
    for d in Docs:
        freq_lst.append(count_wrd_Doc(c,d))#append the frequency of word in document d
    unigram_df[c]=freq_lst #assign values to column
unigram_df #display the dataframe of TF for each word in each document

Unnamed: 0,fox,the,brown,blue,hat,in,quick,is,cat
0,0,2,0,0,1,1,0,0,1
1,1,1,1,0,0,0,1,0,0
2,0,1,0,1,1,0,0,1,0


# Unigrams python function

In [10]:
from nltk import ngrams
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist, MLEProbDist

for d in Docs:
    words = word_tokenize(d.lower())
    result = list(ngrams(words, 1))
    # Calculate frequency distribution of bigrams
    ngram_freq = FreqDist(result)
    for word, frequency in ngram_freq.items():
        print(f"{word}: {frequency}")

('the',): 2
('cat',): 1
('in',): 1
('hat',): 1
('the',): 1
('quick',): 1
('brown',): 1
('fox',): 1
('the',): 1
('hat',): 1
('is',): 1
('blue',): 1


# Bigram

* **A bigram, in the context of natural language processing (NLP) and linguistics, refers to an ordered pair of consecutive words within a text or sequence of words. It is a type of n-gram, where "n" represents the number of words in the sequence.**

In [11]:
#get bi-grams of input sentence
def bi_lst(doc):
    wrds = doc.lower().split(' ')
    bi_lst = []
    for j in range(0,len(wrds)-1):
        bi_lst.append(wrds[j:j+2])
    return bi_lst

lst = []
for d in Docs:
    lst.extend(bi_lst(d))
unique_list = []
unique_list = [item for item in lst if item not in unique_list]

def count_biwrd_Doc(st,doc):
    i=0    
    for s in bi_lst(doc):
        if s == st.split(' '):
            i = i+1
    return i
bigram_df = pd.DataFrame(columns=list((' '.join(x) for x in unique_list))) #empty dataframe initialized with words column headers
freq_lst=[] #empty list for each column to save word frequencies in each document
for c in bigram_df.columns:
    freq_lst=[]#empty the list
    for d in Docs:
        freq_lst.append(count_biwrd_Doc(c,d))#append the frequency of word in document d
    bigram_df[c]=freq_lst #assign values to column
bigram_df #display the dataframe of TF for each word in each document

Unnamed: 0,the cat,cat in,in the,the hat,the quick,quick brown,brown fox,the hat.1,hat is,is blue
0,1,1,1,1,0,0,0,1,0,0
1,0,0,0,0,1,1,1,0,0,0
2,0,0,0,1,0,0,0,1,1,1


# Bigrams python function

In [12]:
for d in Docs:
    words = word_tokenize(d.lower())
    result = list(ngrams(words, 2))
    # Calculate frequency distribution of bigrams
    ngram_freq = FreqDist(result)
    for word, frequency in ngram_freq.items():
        print(f"{word}: {frequency}")

('the', 'cat'): 1
('cat', 'in'): 1
('in', 'the'): 1
('the', 'hat'): 1
('the', 'quick'): 1
('quick', 'brown'): 1
('brown', 'fox'): 1
('the', 'hat'): 1
('hat', 'is'): 1
('is', 'blue'): 1


# Count Bag of Words (CBOW)

In [13]:
def count_wrd_Doc(wrd,doc):
    i=0
    for w in doc.lower().split(' '):
        if wrd == w:
            i = i+1
    return i
    
cw_df = pd.DataFrame(columns=list(wrds)) #empty dataframe initialized with words column headers
freq_lst=[] #empty list for each column to save word frequencies in each document
for c in tf_df.columns:
    freq_lst=[]#empty the list
    for d in Docs:
        freq_lst.append(count_wrd_Doc(c,d))#append the frequency of word in document d
    cw_df[c]=freq_lst #assign values to column
cw_df #display the datafram

Unnamed: 0,fox,the,brown,blue,hat,in,quick,is,cat
0,0,2,0,0,1,1,0,0,1
1,1,1,1,0,0,0,1,0,0
2,0,1,0,1,1,0,0,1,0


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# Create a CountVectorizer instance
vectorizer = CountVectorizer()

# Fit and transform the documents to create the Bag of Words representation
X_bow = vectorizer.fit_transform(Docs)
feature_names = vectorizer.get_feature_names_out()
# Print the Bag of Words representation
print("Bag of Words representation:")
print(X_bow.toarray())
print("Feature names:")
print(feature_names)

Bag of Words representation:
[[0 0 1 0 1 1 0 0 2]
 [0 1 0 1 0 0 0 1 1]
 [1 0 0 0 1 0 1 0 1]]
Feature names:
['blue' 'brown' 'cat' 'fox' 'hat' 'in' 'is' 'quick' 'the']


# Skip-Grams

In [15]:
import numpy as np
# Create a vocabulary (unique words with indices)
vocab = []
for d in Docs:
    vocab.append({word: idx for idx, word in enumerate(d.lower().split())})
vocab

[{'the': 3, 'cat': 1, 'in': 2, 'hat': 4},
 {'the': 0, 'quick': 1, 'brown': 2, 'fox': 3},
 {'the': 0, 'hat': 1, 'is': 2, 'blue': 3}]

In [16]:
# Generate training pairs (target word, context word)
window_size = 2
training_pairs = []
context_words = []
for d in Docs:
    t=[]
    c=[]
    for i, target_word in enumerate(d.lower().split(' ')):
        start = max(0, i - window_size)
        end = min(len(d.lower().split(' ')), i + window_size + 1)
        c = [d.lower().split(' ')[j] for j in range(start, end) if j != i]
        for context_word in c:
            t.append((target_word, context_word))
    training_pairs.append(t)
    context_words.append(c)

In [17]:
training_pairs

[[('the', 'cat'),
  ('the', 'in'),
  ('cat', 'the'),
  ('cat', 'in'),
  ('cat', 'the'),
  ('in', 'the'),
  ('in', 'cat'),
  ('in', 'the'),
  ('in', 'hat'),
  ('the', 'cat'),
  ('the', 'in'),
  ('the', 'hat'),
  ('hat', 'in'),
  ('hat', 'the')],
 [('the', 'quick'),
  ('the', 'brown'),
  ('quick', 'the'),
  ('quick', 'brown'),
  ('quick', 'fox'),
  ('brown', 'the'),
  ('brown', 'quick'),
  ('brown', 'fox'),
  ('fox', 'quick'),
  ('fox', 'brown')],
 [('the', 'hat'),
  ('the', 'is'),
  ('hat', 'the'),
  ('hat', 'is'),
  ('hat', 'blue'),
  ('is', 'the'),
  ('is', 'hat'),
  ('is', 'blue'),
  ('blue', 'hat'),
  ('blue', 'is')]]

In [18]:
context_words

[['in', 'the'], ['quick', 'brown'], ['hat', 'is']]

In [19]:
# Initialize word vectors randomly
embedding_dim = 10
learning_rate = 0.01
epochs = 10000
word_vectors = []
for v in vocab:
    word_vectors.append({word: np.random.rand(embedding_dim) for word in v})#initialize random values vector for each word 
word_vectors

[{'the': array([0.84529526, 0.69903951, 0.28539354, 0.11736014, 0.65626588,
         0.08263976, 0.99767143, 0.28758688, 0.26407711, 0.13148117]),
  'cat': array([0.48804358, 0.59325503, 0.71037303, 0.40248577, 0.63705859,
         0.03943421, 0.89622593, 0.74615319, 0.63991473, 0.04905593]),
  'in': array([0.30902045, 0.68482678, 0.16373436, 0.84309403, 0.36842233,
         0.3958363 , 0.13988066, 0.20406499, 0.45517246, 0.17487886]),
  'hat': array([0.80510322, 0.86534605, 0.53292194, 0.9258283 , 0.38611018,
         0.70490979, 0.58199763, 0.57832354, 0.01628012, 0.85001542])},
 {'the': array([0.50691032, 0.29632617, 0.11346918, 0.4250841 , 0.46255449,
         0.56645798, 0.60185999, 0.63028282, 0.62756915, 0.14307344]),
  'quick': array([0.81222574, 0.46387643, 0.82730484, 0.68975084, 0.75186311,
         0.49662962, 0.93318671, 0.74401174, 0.81626842, 0.91329522]),
  'brown': array([0.77325203, 0.29572043, 0.40950794, 0.40860489, 0.66149743,
         0.5108963 , 0.84747143, 0.335

In [20]:
for i in range(0,len(training_pairs)):
    # Train the Skip-gram model
    for epoch in range(epochs):
    
        for target_word, context_word in training_pairs[i]:
            # Forward pass
            input_vector = word_vectors[i][target_word]
            output_vector = word_vectors[i][context_word]

            # Calculate loss (using negative log likelihood)
            error = -np.log(np.exp(np.dot(input_vector, output_vector)))

            # Backward pass (update word vectors using gradient descent)
            gradient = input_vector * np.exp(np.dot(input_vector, output_vector)) / (1 + np.exp(np.dot(input_vector, output_vector)))
            word_vectors[i][target_word] -= learning_rate * gradient
            word_vectors[i][context_word] -= learning_rate * gradient

        if epoch % 1000 == 0:
            print(f"Epoch {epoch}, Loss: {error}")
    print('-------------------------------')

Epoch 0, Loss: -2.455161845659121
Epoch 1000, Loss: 2.895171525320865e-06
Epoch 2000, Loss: 1.2455481091044738e-11
Epoch 3000, Loss: -0.0
Epoch 4000, Loss: -0.0
Epoch 5000, Loss: -0.0
Epoch 6000, Loss: -0.0
Epoch 7000, Loss: -0.0
Epoch 8000, Loss: -0.0
Epoch 9000, Loss: -0.0
-------------------------------
Epoch 0, Loss: -2.911662972370301
Epoch 1000, Loss: 7.762651243338787e-05
Epoch 2000, Loss: 3.656747671577919e-08
Epoch 3000, Loss: 1.726285781004556e-11
Epoch 4000, Loss: 8.104628079763676e-15
Epoch 5000, Loss: -0.0
Epoch 6000, Loss: -0.0
Epoch 7000, Loss: -0.0
Epoch 8000, Loss: -0.0
Epoch 9000, Loss: -0.0
-------------------------------
Epoch 0, Loss: -1.7799870869561538
Epoch 1000, Loss: 0.00016867172622069302
Epoch 2000, Loss: 7.955753705880423e-08
Epoch 3000, Loss: 3.755784572305218e-11
Epoch 4000, Loss: 1.7763568394002662e-14
Epoch 5000, Loss: -0.0
Epoch 6000, Loss: -0.0
Epoch 7000, Loss: -0.0
Epoch 8000, Loss: -0.0
Epoch 9000, Loss: -0.0
-------------------------------


In [21]:
#word vectors
for i in range(0,len(training_pairs)):
    for word, vector in word_vectors[i].items():
        print(f"Vector for '{word}': {vector}")
    print('-------------------------------')

Vector for 'the': [-2.44113355e-28 -2.87758094e-28 -3.37613298e-28 -3.64563012e-28
 -1.54601888e-28 -2.29515671e-28 -2.59581436e-28 -3.61020069e-28
 -8.32105908e-29 -3.04462084e-28]
Vector for 'cat': [3.43064380e-28 4.04400457e-28 4.74464401e-28 5.12338147e-28
 2.17269558e-28 3.22549544e-28 3.64802428e-28 5.07359076e-28
 1.16939894e-28 4.27875387e-28]
Vector for 'in': [-1.17035864e-28 -1.37960569e-28 -1.61862772e-28 -1.74783339e-28
 -7.41211623e-29 -1.10037261e-28 -1.24451765e-28 -1.73084737e-28
 -3.98938577e-29 -1.45969004e-28]
Vector for 'hat': [4.63675245e-28 5.46575197e-28 6.41271465e-28 6.92460452e-28
 2.93654840e-28 4.35948025e-28 4.93055720e-28 6.85730894e-28
 1.58052357e-28 5.78303188e-28]
-------------------------------
Vector for 'the': [ 3.11506050e-18  1.72387911e-18  1.91063019e-18  5.17514425e-18
  3.97560809e-18  1.31159958e-18  1.71048919e-18  5.87997419e-18
  5.42181347e-18 -4.98986794e-18]
Vector for 'quick': [-1.91077471e-18 -1.05742557e-18 -1.17197847e-18 -3.1744278