# Text Similarity
This notebook measures the similarity between two documents using TF-IDF vectors and cosine similarity.

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK resources used below: 'punkt' for word_tokenize and
# 'stopwords' for the English stop-word list.
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)

def preprocess_text(text):
    ''' Turn raw text into a single string of stemmed, stop-word-free tokens.

        Steps, in order:
        - The text is lower-cased and split into tokens with word_tokenize
        - Tokens found in NLTK's English stop-word list are discarded
        - Each remaining token is reduced to its stem with PorterStemmer
        - The stems are joined with single spaces and returned as one string
        '''

    words = word_tokenize(text.lower())
    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    stems = []
    for word in words:
        # Stop words are dropped before stemming, so they are matched in
        # their original (lower-cased) form.
        if word in english_stop_words:
            continue
        stems.append(porter.stem(word))

    return ' '.join(stems)

# Read the two text files, each containing a different definition of NLP.
raw_documents = []
for source_path in ('doc1.txt', 'doc2.txt'):
    with open(source_path, 'r') as handle:
        raw_documents.append(handle.read())
document1, document2 = raw_documents

# Normalize both documents (tokenize, drop stop words, stem) before vectorizing.
doc1, doc2 = (preprocess_text(raw_text) for raw_text in raw_documents)

# Fit TF-IDF on the two preprocessed documents; row 0 is doc1, row 1 is doc2.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform([doc1, doc2])
first_vector = tfidf_matrix[0]
second_vector = tfidf_matrix[1]

# Compare the two row vectors; the result is 2-D, so the single score
# is read from position [0][0].
similarity = cosine_similarity(first_vector, second_vector)
score = similarity[0][0]

print("Cosine similarity between the text documents is:", score)

Cosine similarity between the text documents is: 0.2244472271602016


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pinal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pinal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
