Importing Counter to keep track of frequency of words
Importing math for mathematical operations

In [None]:
from collections import Counter
import math

Function to read from the text files

In [None]:
def read_text_file(path, encoding="utf-8"):
    """Return the entire contents of a text file as a single string.

    Args:
        path: Location of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The file contents as one ``str``.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
        UnicodeDecodeError: If the file's bytes are not valid in ``encoding``.
    """
    with open(path, "r", encoding=encoding) as f:
        return f.read()

Function to tokenize the text to words:
1. Splitting and putting words in lowercase
2. Stripping leading and trailing punctuation (`.`, `,`, `?`) from each word

In [None]:
def tokenize(text, punctuation=".,?"):
    """Split text into lowercase word tokens with edge punctuation removed.

    The text is lowercased and split on whitespace; then any characters in
    ``punctuation`` are stripped from the start and end of each piece.
    Punctuation inside a word (e.g. the apostrophe in "don't") is kept.

    Args:
        text: The string to tokenize.
        punctuation: Characters to strip from both ends of every token.

    Returns:
        A list of non-empty tokens in their original order.
    """
    # str.lower is a method and must be *called*; the bare attribute has no
    # .split() and would raise AttributeError.
    stripped = (piece.strip(punctuation) for piece in text.lower().split())
    # A piece made only of punctuation (e.g. "...") strips down to "", which
    # is not a word and would otherwise be counted as a term.
    return [token for token in stripped if token]

Function to calculate TF(term frequency)
Calculates the relative frequency of each word (its count divided by the total number of words) and stores it in a dictionary

In [None]:
def tf_calc(text):
    """Compute the term frequency of every word in ``text``.

    The text is tokenized with ``tokenize``; each distinct token is mapped to
    its number of occurrences divided by the total number of tokens.

    Args:
        text: The document to analyse.

    Returns:
        A dict mapping each token to its relative frequency. Empty when the
        text yields no tokens.
    """
    words = tokenize(text)
    n_words = len(words)
    frequencies = {}
    for term, occurrences in Counter(words).items():
        frequencies[term] = occurrences / n_words
    return frequencies

Function to calculate IDF(inverse document frequency)

In [None]:
def idf_calc(docs):
    """Compute a smoothed inverse document frequency for every word in ``docs``.

    Uses ``idf(w) = ln((1 + N) / (1 + df(w))) + 1`` where ``N`` is the number
    of documents and ``df(w)`` is the number of documents containing ``w``.

    The one-sided form ``ln(N / (1 + df))`` is avoided on purpose: it gives 0
    for a word found in ``N - 1`` documents and a negative weight for a word
    found in all of them. With only two documents, every word unique to one
    document would weigh nothing. The smoothed form is always >= 1.

    Args:
        docs: A sequence of document strings.

    Returns:
        A dict mapping each word that occurs in at least one document to its
        IDF weight. Empty when ``docs`` is empty.
    """
    doc_freq = Counter()
    for doc in docs:
        # A set ensures each document contributes at most 1 to a word's count.
        doc_freq.update(set(tokenize(doc)))
    doc_num = len(docs)
    return {
        word: math.log((1 + doc_num) / (1 + freq)) + 1.0
        for word, freq in doc_freq.items()
    }

Function to calculate TF-IDF (Term Frequency-Inverse Document Frequency)

In [None]:
def tfidf_calc(text, idf):
    """Compute the TF-IDF weight of every word in ``text``.

    Args:
        text: The document to analyse.
        idf: A dict mapping words to their inverse document frequency.

    Returns:
        A dict mapping each word of ``text`` to ``tf * idf``. Words missing
        from ``idf`` get a weight of 0.0.
    """
    tf = tf_calc(text)
    # Look up with a default so a word absent from the IDF vocabulary
    # contributes nothing instead of raising KeyError.
    return {word: freq * idf.get(word, 0.0) for word, freq in tf.items()}


Calculate cosine similarity between the two TF-IDF vectors

In [None]:
def cosine_similarity(tfidf1, tfidf2):
    """Compute the cosine similarity between two sparse weight vectors.

    Each vector is a dict mapping a word to its weight; a word missing from a
    dict is treated as having weight 0.

    Args:
        tfidf1: First word-to-weight mapping.
        tfidf2: Second word-to-weight mapping.

    Returns:
        The dot product divided by the product of the two Euclidean norms, or
        0.0 when that product is zero (an empty or all-zero vector has no
        direction, so the similarity is reported as 0).
    """
    # Only words present in both vectors contribute to the dot product.
    shared_words = set(tfidf1) & set(tfidf2)
    dot_product = sum(tfidf1[word] * tfidf2[word] for word in shared_words)
    mag1 = math.sqrt(sum(value ** 2 for value in tfidf1.values()))
    mag2 = math.sqrt(sum(value ** 2 for value in tfidf2.values()))
    denominator = mag1 * mag2
    if denominator == 0:
        # Avoid ZeroDivisionError for empty or all-zero vectors.
        return 0.0
    return dot_product / denominator

Calling other functions

In [None]:
# Paths of the two documents to compare.
f1, f2 = "t1.txt", "t2.txt"

# Load each document into memory as a single string.
text1 = read_text_file(f1)
text2 = read_text_file(f2)

# Echo the raw contents so the inputs are visible next to the results.
print(f"First text contents:\n {text1}")
print(f"Second text contents:\n {text2}")

In [None]:
# The corpus consists of just the two documents being compared.
docs = [text1, text2]

# IDF weights are computed once over the whole corpus and reused below.
idf = idf_calc(docs)
print(f"IDF is:\n {idf}")

In [None]:
# Weight each document's term frequencies by the shared corpus IDF.
tfidf1 = tfidf_calc(text1, idf)
tfidf2 = tfidf_calc(text2, idf)

print(f"TF-IDF for 1st text is:\n {tfidf1}")
print(f"TF-IDF for 2nd text is:\n {tfidf2}")
      

In [None]:
# Compare the two TF-IDF vectors.
similarity = cosine_similarity(tfidf1, tfidf2)
print(f"Cosine similarity between the 2 texts:\n {similarity}")