<a href="https://colab.research.google.com/github/Ransaka/data-ai-inspire/blob/main/Text_vectorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bag-of-Words (BoW) Implementation

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [3]:
# Toy corpus shared by the CountVectorizer and TfidfVectorizer cells.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Learn the vocabulary and build the sparse document-term count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names_out()

# One row per document and one column per vocabulary term; the source text is
# appended as a final column so each row of counts can be read against its sentence.
result_df = pd.DataFrame(X.toarray(), columns=feature_names).assign(text=corpus)

In [13]:
# Show the bag-of-words counts next to the sentence each row was built from.
result_df

Unnamed: 0,and,document,first,is,one,second,the,third,this,text
0,0,1,1,1,0,0,1,0,1,This is the first document.
1,0,2,0,1,0,1,1,0,1,This document is the second document.
2,1,0,0,1,1,0,1,1,1,And this is the third one.
3,0,1,1,1,0,0,1,0,1,Is this the first document?


# TF-IDF Implementation (Sklearn)




In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary and weights on the corpus, then project the same
# documents into that space (two explicit steps instead of fit_transform).
vectorizer = TfidfVectorizer().fit(corpus)
X = vectorizer.transform(corpus)
feature_names = vectorizer.get_feature_names_out()

In [17]:
# Densify the sparse TF-IDF matrix and label each column with its vocabulary term.
tf_idf_results = pd.DataFrame(data=X.toarray(), columns=feature_names)

In [18]:
# Show the scikit-learn TF-IDF weights (one row per document, one column per term).
tf_idf_results

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085
1,0.0,0.687624,0.0,0.281089,0.0,0.538648,0.281089,0.0,0.281089
2,0.511849,0.0,0.0,0.267104,0.511849,0.0,0.267104,0.511849,0.267104
3,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085


# TF-IDF Implementation (from Scratch)




In [23]:
import math

In [19]:
# Same four sentences as before, already lower-cased and stripped of
# punctuation so that a plain str.split() is enough to tokenize them.
corpus = [
    'this is the first document',
    'this document is the second document',
    'and this is the third one',
    'is this the first document',
]

## Step 1: Tokenization and Vocabulary Construction

In [20]:
# The vocabulary is the set of distinct whitespace-separated tokens
# appearing anywhere in the corpus.
vocabulary = {token for sentence in corpus for token in sentence.split()}

## Step 2: Calculate TF for each term in each document

In [21]:

# Term frequency: how often a term occurs in a document, divided by the
# number of tokens in that document.
# Layout: tf_matrix[document_index][term] -> float.
tf_matrix = {}
for i, document in enumerate(corpus):
    words = document.split()
    word_count = len(words)
    # An empty (or whitespace-only) document has no tokens; report 0.0 for
    # every term instead of raising ZeroDivisionError.
    tf_matrix[i] = {
        word: (words.count(word) / word_count if word_count else 0.0)
        for word in vocabulary
    }

## Step 3: Calculate IDF for each term

In [24]:
# Inverse document frequency: idf(term) = log(N / (1 + df(term))), where N is
# the number of documents and df(term) is how many documents contain the term.
#
# Document frequency is counted over each document's *tokens*. Testing
# `word in document` against the raw string would be a substring match
# (e.g. "is" is found inside "this") and could over-count df.
#
# NOTE: with this smoothing a term that occurs in every document gets
# log(N / (N + 1)) < 0, so its TF-IDF score is negative. scikit-learn's default
# (smooth_idf=True) uses log((1 + N) / (1 + df)) + 1, which stays positive.
tokenized_corpus = [set(document.split()) for document in corpus]
total_documents = len(corpus)

idf_matrix = {}
for word in vocabulary:
    document_frequency = sum(1 for tokens in tokenized_corpus if word in tokens)
    idf_matrix[word] = math.log(total_documents / (1 + document_frequency))

## Step 4: Calculate TF-IDF

In [25]:
# TF-IDF score = term frequency within the document * inverse document frequency
# of the term. Layout mirrors tf_matrix: tfidf_matrix[document_index][term] -> float.
tfidf_matrix = {
    doc_index: {
        term: tf_matrix[doc_index][term] * idf_matrix[term]
        for term in vocabulary
    }
    for doc_index in range(len(corpus))
}

In [26]:
# Report every term's TF-IDF score, grouped by document (numbered from 1),
# with a blank line after each document's block.
for i in range(len(corpus)):
    scores_for_document = tfidf_matrix[i]
    print(f"Document {i + 1}:")
    for word, tfidf_score in scores_for_document.items():
        print(f"{word}: {tfidf_score}")
    print()

Document 1:
is: -0.044628710262841945
second: 0.0
one: 0.0
the: -0.044628710262841945
this: -0.044628710262841945
third: 0.0
first: 0.05753641449035617
document: 0.0
and: 0.0

Document 2:
is: -0.03719059188570162
second: 0.11552453009332421
one: 0.0
the: -0.03719059188570162
this: -0.03719059188570162
third: 0.0
first: 0.0
document: 0.0
and: 0.0

Document 3:
is: -0.03719059188570162
second: 0.0
one: 0.11552453009332421
the: -0.03719059188570162
this: -0.03719059188570162
third: 0.11552453009332421
first: 0.0
document: 0.0
and: 0.11552453009332421

Document 4:
is: -0.044628710262841945
second: 0.0
one: 0.0
the: -0.044628710262841945
this: -0.044628710262841945
third: 0.0
first: 0.05753641449035617
document: 0.0
and: 0.0

