## Importing the libraries

In [2]:
import pandas as pd
import math
from nltk.tokenize import word_tokenize
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

## Defining the documents (corpus) dictionary

In [4]:
corpus = {
    "doc_1": "Software engineering at Damascus university Software",
    "doc_2": "Information retrieval at Damascus university",
    "doc_3": "Indexing Information retrieval"
}

df = pd.DataFrame(corpus, index=["Document"])
df

Unnamed: 0,doc_1,doc_2,doc_3
Document,Software engineering at Damascus university So...,Information retrieval at Damascus university,Indexing Information retrieval


## Tokenizing & storing the indexing terms

In [5]:
indexing_terms = set()

for doc in corpus.values():
    # TODO : PROCESS THE TEXT BEFORE CREATING THE INDEXING TERMS: STEMMING, LEMMATIZATION, LOWER CASE, ..ETC
    indexing_terms.update(word_tokenize(doc))
    
indexing_terms_df = pd.DataFrame(indexing_terms)
indexing_terms_df

Unnamed: 0,0
0,Damascus
1,Indexing
2,engineering
3,university
4,Information
5,Software
6,at
7,retrieval


## Create the inverted index

In [6]:
inverted_index = defaultdict(list)

for docId, doc in corpus.items():
    doc_terms = set(word_tokenize(doc))
    for term in doc_terms:
        inverted_index[term].append(docId)

dict(inverted_index)

{'Damascus': ['doc_1', 'doc_2'],
 'engineering': ['doc_1'],
 'university': ['doc_1', 'doc_2'],
 'Software': ['doc_1'],
 'at': ['doc_1', 'doc_2'],
 'Information': ['doc_2', 'doc_3'],
 'retrieval': ['doc_2', 'doc_3'],
 'Indexing': ['doc_3']}

## Calculating the terms frequencies in a document

In [7]:
def calculate_tf(doc:str):
    tf = {}
    doc_terms = word_tokenize(doc)
    for term in doc_terms:
        tf[term] = (doc_terms.count(term) / len(doc_terms)) 
    return tf

tf_df = pd.DataFrame(calculate_tf(corpus['doc_1']), index=["tf"])
tf_df

Unnamed: 0,Software,engineering,at,Damascus,university
tf,0.333333,0.166667,0.166667,0.166667,0.166667


## Calculating the inverse document frequencies

In [187]:
def calculate_idf():
    idf = {}
    
    docs_count = len(corpus)

    for term, doc_ids in inverted_index.items():
        idf[term] = math.log((docs_count / len(doc_ids)) + 1)
    
    return idf
        
                 
idf_df = pd.DataFrame(calculate_idf(), index=["idf"])
idf_df

Unnamed: 0,university,Damascus,Software,engineering,at,retrieval,Information,Indexing
idf,0.916291,0.916291,1.386294,1.386294,0.916291,0.916291,0.916291,1.386294


## Calculating tf-idf for the document

In [188]:
tf_idf = {}

doc_terms = word_tokenize(corpus['doc_1'])

doc_tf = calculate_tf(corpus['doc_1'])

idf = calculate_idf()

for term in doc_terms:
    tf_idf[term] = doc_tf[term] * idf[term]

tf_idf_df = pd.DataFrame(tf_idf, index=["tf_idf"])
tf_idf_df

Unnamed: 0,Software,engineering,at,Damascus,university
tf_idf,0.398812,0.213698,0.141247,0.141247,0.141247


## Calculate tf-idf using Scikit Learn TfidfVectorizer

In [189]:
documents = list(corpus.values())

# TODO : ADD YOUR OWN TOKENIZER & PREPROCESSOR !
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the documents
tfidf_matrix = vectorizer.fit_transform(documents)

df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out(), index=corpus.keys())

df

Unnamed: 0,at,damascus,engineering,indexing,information,retrieval,software,university
doc_1,0.293048,0.293048,0.385323,0.0,0.0,0.0,0.770646,0.293048
doc_2,0.447214,0.447214,0.0,0.0,0.447214,0.447214,0.0,0.447214
doc_3,0.0,0.0,0.0,0.680919,0.517856,0.517856,0.0,0.0
