In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Imported under an alias on purpose: a later cell in this notebook defines its
# own `cosine_similarity` for token lists. Sharing the bare name would make this
# cell call the wrong function when it is re-run after that cell.
from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine_similarity

# Input documents
doc1 = "Information retrieval deals with the representation and retrieval of information."
doc2 = "Information retrieval is the process of obtaining relevant information."

# Store documents in a list (row i of the TF-IDF matrix corresponds to documents[i])
documents = [doc1, doc2]

# Create TF-IDF Vectorizer (library defaults)
vectorizer = TfidfVectorizer()

# Fit on the documents and convert them into a TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(documents)

# Compute cosine similarity between row 0 and row 1.
# The 0:1 / 1:2 slices keep each operand two-dimensional (one row each),
# so the result is a 1x1 array holding the single pairwise score.
similarity_score = sklearn_cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

# Display result
print("Cosine Similarity between Document 1 and Document 2 is:")
print(similarity_score[0][0])

Cosine Similarity between Document 1 and Document 2 is:
0.48981813079169867


In [2]:
import nltk
import math
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this cell relies on: the 'punkt_tab' tokenizer
# data and the 'stopwords' corpus. Only needs to succeed once per environment.
for _resource in ('punkt_tab', 'stopwords'):
    nltk.download(_resource)

def preprocess(text):
    """Turn raw text into a list of lowercase content tokens.

    The text is lowercased and split with NLTK's ``word_tokenize``. A token
    is kept only if it is purely alphanumeric (so punctuation tokens are
    dropped) and is not in NLTK's English stopword list.

    Args:
        text: The input string to process.

    Returns:
        A list of the surviving tokens, in their original order; repeated
        words are kept as repeated entries.
    """
    # Tokenise the case-folded text first, then load the stopword list
    candidates = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))

    kept = []
    for token in candidates:
        if not token.isalnum():
            # Punctuation and other non-alphanumeric tokens are discarded
            continue
        if token in english_stopwords:
            continue
        kept.append(token)
    return kept

def cosine_similarity(doc1, doc2):
    """Cosine similarity of two token sequences using raw term counts.

    Each sequence is turned into a term-frequency vector; the score is the
    dot product of the two vectors divided by the product of their
    Euclidean norms.

    Args:
        doc1: Iterable of hashable tokens for the first document.
        doc2: Iterable of hashable tokens for the second document.

    Returns:
        The similarity as a float, or 0.0 if either document yields a
        zero-length vector (e.g. it has no tokens).
    """
    freq_a, freq_b = Counter(doc1), Counter(doc2)

    # Euclidean length of each term-frequency vector
    norm_a = math.sqrt(sum(count ** 2 for count in freq_a.values()))
    norm_b = math.sqrt(sum(count ** 2 for count in freq_b.values()))

    # A zero vector has no direction; define its similarity as 0
    if not (norm_a and norm_b):
        return 0.0

    # Only terms present in both documents contribute to the dot product
    shared_terms = freq_a.keys() & freq_b.keys()
    dot = sum(freq_a[term] * freq_b[term] for term in shared_terms)

    return dot / (norm_a * norm_b)

# Two sample documents to compare
doc1 = "Information retrieval is the process of obtaining information from large repositories."
doc2 = "Information retrieval deals with searching and extracting information from databases."

# Reduce each document to its list of content tokens (first doc1, then doc2)
tokens1, tokens2 = (preprocess(document) for document in (doc1, doc2))

# Score the two token lists against each other using raw term counts
similarity = cosine_similarity(tokens1, tokens2)

# Show the tokens that survived preprocessing, followed by the score
print("Document 1 Tokens:", tokens1)
print("Document 2 Tokens:", tokens2)
print("Cosine Similarity:", similarity)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Document 1 Tokens: ['information', 'retrieval', 'process', 'obtaining', 'information', 'large', 'repositories']
Document 2 Tokens: ['information', 'retrieval', 'deals', 'searching', 'extracting', 'information', 'databases']
Cosine Similarity: 0.5555555555555556
