# Count-Based methods

In [1]:
Doc_1= "The cat in the hat"
Doc_2= "The quick brown fox"
Doc_3= "The hat is blue"

# **TF**

* **TF(t,d) is the term frequency of term t in document d (how often the term appears in the document).**

In [2]:
lst = Doc_1.lower().split(' ')
lst.extend(Doc_2.lower().split(' '))
lst.extend(Doc_3.lower().split(' '))
wrds = set(lst) # remove duplicate words

In [3]:
#form a dataframe to represent TF for each word in each Document where columns are words and rows are documents
import pandas as pd
def count_wrd_Doc(wrd,doc):
    i=0
    for w in doc.lower().split(' '):
        if wrd == w:
            i = i+1
    return i/len(doc.lower().split(' '))
    
tf_df = pd.DataFrame(columns=list(wrds)) #empty dataframe initialized with words column headers
freq_lst=[] #empty list for each column to save word frequencies in each document
for c in tf_df.columns:
    freq_lst=[]#empty the list
    freq_lst.append(count_wrd_Doc(c,Doc_1))#append the frequency of word in document 1
    freq_lst.append(count_wrd_Doc(c,Doc_2))#append the frequency of word in document 2
    freq_lst.append(count_wrd_Doc(c,Doc_3))#append the frequency of word in document 3
    tf_df[c]=freq_lst #assign values to column
tf_df #display the dataframe of TF for each word in each document

Unnamed: 0,cat,the,is,hat,brown,quick,fox,blue,in
0,0.2,0.4,0.0,0.2,0.0,0.0,0.0,0.0,0.2
1,0.0,0.25,0.0,0.0,0.25,0.25,0.25,0.0,0.0
2,0.0,0.25,0.25,0.25,0.0,0.0,0.0,0.25,0.0


# DF

* **Calculate Document Frequency (DF): the word appears in how many documents**

In [4]:
df_df = pd.DataFrame(columns=list(wrds)) #empty dataframe initialized with words column headers
for c in df_df.columns:
    df_df[c] = [sum(1 for doc in [Doc_1,Doc_2,Doc_3] if c in doc.lower().split(' '))]
df_df #display the dataframe of DF for each word 

Unnamed: 0,cat,the,is,hat,brown,quick,fox,blue,in
0,1,3,1,2,1,1,1,1,1


# IDF

* **IDF(t,D) is the inverse document frequency of term t in the entire document set D (logarithmically scaled inverse fraction of the documents that contain the term).**

In [5]:
import math
idf_df = pd.DataFrame(columns=list(wrds)) #empty dataframe initialized with words column headers
for c in idf_df.columns:
    N = 3 #No of documents
    df = df_df[c].iloc[0] # DF of word
    idf_df[c] = [math.log((N+1) / (df+1))+1]#IDF = log (no. of documents/DF(word)) 
idf_df #display the dataframe of idf for each word 

Unnamed: 0,cat,the,is,hat,brown,quick,fox,blue,in
0,1.693147,1.0,1.693147,1.287682,1.693147,1.693147,1.693147,1.693147,1.693147


# TF-IDF

*** TF-IDF = TF * IDF**

In [6]:
tfidf_df = pd.DataFrame(columns=list(wrds)) #empty dataframe initialized with words column headers
tfidf_lst=[]  #empty list for each column 
for c in tfidf_df.columns:
    tfidf_lst=[] #empty list for each column
    tf_idf_d1 = tf_df[c].iloc[0]*idf_df[c].iloc[0] #append tf of word in first document to idf of word
    tf_idf_d2 = tf_df[c].iloc[1]*idf_df[c].iloc[0] #append tf of word in second document to idf of word
    tf_idf_d3 = tf_df[c].iloc[2]*idf_df[c].iloc[0] #append tf of word in third document to idf of word
    tfidf_df[c] = [tf_idf_d1,tf_idf_d2,tf_idf_d3]
tfidf_df #display the dataframe of tf-idf for all words

Unnamed: 0,cat,the,is,hat,brown,quick,fox,blue,in
0,0.338629,0.4,0.0,0.257536,0.0,0.0,0.0,0.0,0.338629
1,0.0,0.25,0.0,0.0,0.423287,0.423287,0.423287,0.0,0.0
2,0.0,0.25,0.423287,0.321921,0.0,0.0,0.0,0.423287,0.0


# L2 Normalization

In [7]:
normalized_df = pd.DataFrame(columns=tfidf_df.columns)

# Apply L2 normalization to each document's TF-IDF values
for i,row in enumerate(tfidf_df.iterrows()):
    # Extract TF-IDF values    
    tfidf_values_list = list(tfidf_df.iloc[i].values)
    # Calculate L2 norm
    l2_norm = math.sqrt(sum(val**2 for val in tfidf_values_list))
    # Normalize TF-IDF values using L2 norm
    normalized_tfidf = [val / l2_norm for val in list(tfidf_df.iloc[i].values)]
    new_row = pd.Series(normalized_tfidf, index=tfidf_df.columns)
    normalized_df.loc[len(normalized_df)] = new_row
    
normalized_df

Unnamed: 0,cat,the,is,hat,brown,quick,fox,blue,in
0,0.501651,0.592567,0.0,0.381519,0.0,0.0,0.0,0.0,0.501651
1,0.0,0.322745,0.0,0.0,0.546454,0.546454,0.546454,0.0,0.0
2,0.0,0.345205,0.584483,0.444514,0.0,0.0,0.0,0.584483,0.0


# TfidfVectorizer Python Library

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
documents = [Doc_1,Doc_2,Doc_3]

# Create the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(norm='l2',smooth_idf=True)

# Fit the documents and transform them into a TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Get the feature names (terms) from the vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

df_tfidf = pd.DataFrame(data=tfidf_matrix.toarray(), columns=feature_names)
df_tfidf

Unnamed: 0,blue,brown,cat,fox,hat,in,is,quick,the
0,0.0,0.0,0.501651,0.0,0.381519,0.501651,0.0,0.0,0.592567
1,0.0,0.546454,0.0,0.546454,0.0,0.0,0.0,0.546454,0.322745
2,0.584483,0.0,0.0,0.0,0.444514,0.0,0.584483,0.0,0.345205


* **The TfidfVectorizer in scikit-learn, by default, adds a smoothing term to the denominator of the IDF calculation to avoid division by zero. This is done to handle the case where a term is present in all documents, ensuring that the IDF is not undefined.**
* **L2 normalization, also known as Euclidean normalization or L2 norm normalization, is a technique used to scale vectors (or arrays) in such a way that their Euclidean norm becomes equal to 1.**

# Unigram

In [9]:
#Probability of unigram P(w)=C(w)/m same idea of TF
def count_wrd_Doc(wrd,doc):
    i=0
    for w in doc.lower().split(' '):
        if wrd == w:
            i = i+1
    return i
    
unigram_df = pd.DataFrame(columns=list(wrds)) #empty dataframe initialized with words column headers
freq_lst=[] #empty list for each column to save word frequencies in each document
for c in tf_df.columns:
    freq_lst=[]#empty the list
    freq_lst.append(count_wrd_Doc(c,Doc_1))#append the frequency of word in document 1
    freq_lst.append(count_wrd_Doc(c,Doc_2))#append the frequency of word in document 2
    freq_lst.append(count_wrd_Doc(c,Doc_3))#append the frequency of word in document 3
    unigram_df[c]=freq_lst #assign values to column
unigram_df #display the dataframe of TF for each word in each document

Unnamed: 0,cat,the,is,hat,brown,quick,fox,blue,in
0,1,2,0,1,0,0,0,0,1
1,0,1,0,0,1,1,1,0,0
2,0,1,1,1,0,0,0,1,0


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [Doc_1.lower(),Doc_2.lower(),Doc_3.lower()]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
unigram_names = vectorizer.get_feature_names_out()
print(unigram_names)

['blue' 'brown' 'cat' 'fox' 'hat' 'in' 'is' 'quick' 'the']


In [11]:
print(X)

  (0, 8)	2
  (0, 2)	1
  (0, 5)	1
  (0, 4)	1
  (1, 8)	1
  (1, 7)	1
  (1, 1)	1
  (1, 3)	1
  (2, 8)	1
  (2, 4)	1
  (2, 6)	1
  (2, 0)	1


# Bigram

In [12]:
def bi_lst(doc):
    wrds = doc.lower().split(' ')
    bi_lst = []
    for j in range(0,len(wrds)-1):
        bi_lst.append(wrds[j:j+2])
    return bi_lst
#get bi-grams of input sentence

lst = bi_lst(Doc_1)
lst.extend(bi_lst(Doc_2))
lst.extend(bi_lst(Doc_3))
unique_list = []
unique_list = [item for item in lst if item not in unique_list]
unique_list

[['the', 'cat'],
 ['cat', 'in'],
 ['in', 'the'],
 ['the', 'hat'],
 ['the', 'quick'],
 ['quick', 'brown'],
 ['brown', 'fox'],
 ['the', 'hat'],
 ['hat', 'is'],
 ['is', 'blue']]

In [13]:
def count_biwrd_Doc(st,doc):
    i=0    
    for s in bi_lst(doc):
        if s == st.split(' '):
            i = i+1
    return i
bigram_df = pd.DataFrame(columns=list((' '.join(x) for x in unique_list))) #empty dataframe initialized with words column headers
freq_lst=[] #empty list for each column to save word frequencies in each document
for c in bigram_df.columns:
    freq_lst=[]#empty the list
    for d in [Doc_1,Doc_2,Doc_3]:
        freq_lst.append(count_biwrd_Doc(c,d))#append the frequency of word in document d
    bigram_df[c]=freq_lst #assign values to column
bigram_df #display the dataframe of TF for each word in each document

Unnamed: 0,the cat,cat in,in the,the hat,the quick,quick brown,brown fox,the hat.1,hat is,is blue
0,1,1,1,1,0,0,0,1,0,0
1,0,0,0,0,1,1,1,0,0,0
2,0,0,0,1,0,0,0,1,1,1


# Bigram in Python

In [14]:
from nltk import bigrams
from nltk.tokenize import word_tokenize
for d in [Doc_1,Doc_2,Doc_3]:
    words = word_tokenize(d.lower())
    result = list(bigrams(words))
    print("document:",result)

document: [('the', 'cat'), ('cat', 'in'), ('in', 'the'), ('the', 'hat')]
document: [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]
document: [('the', 'hat'), ('hat', 'is'), ('is', 'blue')]
