In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Fetch the NLTK resources used below: the Punkt tokenizer data needed by
# word_tokenize and the English stop-word list.
# Recent NLTK releases (3.9+) load the tokenizer from "punkt_tab" rather than
# the older pickled "punkt" package, so both are requested to keep the
# notebook runnable across NLTK versions.
# quiet=True keeps the download log (which prints the local nltk_data path)
# out of the saved cell output.
failed_resources = [
    resource
    for resource in ("punkt", "punkt_tab", "stopwords")
    if not nltk.download(resource, quiet=True)
]
if failed_resources:
    # nltk.download returns False instead of raising; surface it here so a
    # later LookupError from word_tokenize/stopwords is easier to trace.
    print("Could not download NLTK resources:", failed_resources)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PRIYANSHU\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PRIYANSHU\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Sample text 1 (economic downturn / financial climate), one sentence per
# source line; adjacent literals are concatenated into a single string.
document1 = (
    "The recent economic downturn has significantly impacted the global financial landscape. "
    "Stock markets have experienced volatility, leading to investor concerns and potential losses. "
    "Central banks are implementing quantitative easing measures to stimulate economic growth, but their effectiveness remains under debate. "
    "Furthermore, rising inflation puts pressure on household budgets and raises concerns about economic stability. "
    "Overall, the current financial climate presents both challenges and opportunities for individuals and institutions, requiring careful analysis and informed decision-making."
)

In [4]:
# Sample text 2 (fintech innovation in the financial sector), one sentence per
# source line; adjacent literals are concatenated into a single string.
document2 = (
    "The emergence of new fintech solutions is revolutionizing the financial sector. "
    "Blockchain technology enables secure and transparent financial transactions, disrupting traditional institutions. "
    "Robo-advisors provide automated investment management services, democratizing access to wealth management. "
    "Crowdfunding platforms offer alternative pathways for businesses to raise capital. "
    "These innovations challenge existing financial structures and create new opportunities for financial inclusion and efficiency. "
    "However, regulatory frameworks need to adapt to accommodate these advancements while ensuring consumer protection and financial stability."
)

#### Cosine Similarity (set-based, binary term vectors)

In [5]:
def preprocess_text(text):
    """Reduce raw text to its set of distinct content words.

    The text is lower-cased and split with ``word_tokenize``. A token is kept
    only if it is purely alphanumeric (so punctuation and hyphenated tokens
    are dropped) and is not in NLTK's English stop-word list.

    Returns:
        set: the surviving tokens; duplicates and word order are discarded.
    """
    english_stops = set(stopwords.words('english'))
    kept = set()
    for token in word_tokenize(text.lower()):
        if not token.isalnum():
            continue
        if token in english_stops:
            continue
        kept.add(token)
    return kept

In [8]:
def cosine_similarity(document1, document2):
    """Set-based cosine similarity between two raw text documents.

    Each document is reduced to its set of content words with
    ``preprocess_text``; the score is ``|A & B| / sqrt(|A| * |B|)``, i.e. the
    cosine of the two binary (present/absent) term vectors. Term frequencies
    are ignored.

    Args:
        document1: first document as a string.
        document2: second document as a string.

    Returns:
        float: a value in [0, 1]. Returns 0.0 when either document has no
        tokens left after preprocessing (empty text, or only stop words and
        punctuation) instead of raising ZeroDivisionError.
    """
    doc1_tokens = preprocess_text(document1)
    doc2_tokens = preprocess_text(document2)
    # An empty token set has a zero-length vector; define its similarity as 0.
    if not doc1_tokens or not doc2_tokens:
        return 0.0
    shared_tokens = doc1_tokens & doc2_tokens
    return len(shared_tokens) / (len(doc1_tokens) * len(doc2_tokens)) ** 0.5

In [9]:
# Score the two sample documents with the set-based cosine_similarity defined
# in this notebook.
# NOTE: a later cell imports sklearn's cosine_similarity under the same name;
# re-running this cell after that import calls the sklearn function instead.
cosine_sim = cosine_similarity(document1, document2)
print("Cosine Similarity:", cosine_sim)

Cosine Similarity: 0.08001600480160055


#### Jaccard Similarity

In [10]:
def jaccard_similarity(doc1, doc2):
    """Jaccard similarity between the content-word sets of two documents.

    Both documents are reduced to token sets with ``preprocess_text``; the
    score is ``|A & B| / |A | B|``.

    Args:
        doc1: first document as a string.
        doc2: second document as a string.

    Returns:
        float: a value in [0, 1]. Returns 0.0 when neither document has any
        tokens left after preprocessing (the union is empty) instead of
        raising ZeroDivisionError.
    """
    doc1_tokens = preprocess_text(doc1)
    doc2_tokens = preprocess_text(doc2)

    union = doc1_tokens | doc2_tokens
    # Both sets empty: the ratio is undefined, so treat it as "no similarity".
    if not union:
        return 0.0
    intersection = doc1_tokens & doc2_tokens
    return len(intersection) / len(union)

In [11]:
# Jaccard score of the two sample documents' content-word sets.
jaccard_sim = jaccard_similarity(document1, document2)
print("Jaccard Similarity:", jaccard_sim)

Jaccard Similarity: 0.041666666666666664


#### TF-IDF

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
def tfidf_similarity(doc1, doc2):
    """Cosine similarity of the TF-IDF vectors of two raw text documents.

    A fresh ``TfidfVectorizer`` (default settings) is fitted on just these two
    documents, so the IDF weights come from this two-document corpus only.

    Args:
        doc1: first document as a string.
        doc2: second document as a string.

    Returns:
        The single entry of the 1x1 similarity matrix returned by sklearn's
        ``cosine_similarity`` for the two TF-IDF rows.
    """
    # Imported locally under an alias: this notebook also defines its own
    # set-based `cosine_similarity`, and whichever cell ran last owns the
    # global name. The alias guarantees the sklearn implementation is used.
    from sklearn.metrics.pairwise import cosine_similarity as sk_cosine_similarity

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    similarity = sk_cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    return similarity[0][0]

In [14]:
# TF-IDF cosine score of the two sample documents; the raw strings are passed
# straight to tfidf_similarity, which hands them to the vectorizer.
tfidf_sim = tfidf_similarity(document1, document2)
print("TF-IDF Similarity:", tfidf_sim)

TF-IDF Similarity: 0.2658574875231505


#### Doc2Vec

In [27]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from scipy import spatial

In [28]:
# Tokenize documents
tokenized_doc1 = word_tokenize(document1.lower())
tokenized_doc2 = word_tokenize(document2.lower())

In [29]:
# Tag documents
tagged_doc1 = TaggedDocument(words=tokenized_doc1, tags=[0])
tagged_doc2 = TaggedDocument(words=tokenized_doc2, tags=[1])

In [30]:
# Train Doc2Vec model
documents = [tagged_doc1, tagged_doc2]
model = Doc2Vec(documents, vector_size=100, window=5, min_count=1, workers=4)

In [31]:
def doc_vector_similarity(doc1, doc2, model):
    """Cosine similarity between the vectors a model infers for two documents.

    Args:
        doc1: first document, passed as-is to ``model.infer_vector``.
        doc2: second document, passed as-is to ``model.infer_vector``.
        model: object exposing ``infer_vector(doc)`` that returns a vector.

    Returns:
        ``1 - cosine_distance`` of the two inferred vectors, as computed by
        ``scipy.spatial.distance.cosine``.
    """
    # Infer in argument order (doc1 first, then doc2).
    first_vec, second_vec = (model.infer_vector(tokens) for tokens in (doc1, doc2))
    cosine_distance = spatial.distance.cosine(first_vec, second_vec)
    return 1 - cosine_distance

In [32]:
# Calculate similarity
doc2vec_sim = doc_vector_similarity(tokenized_doc1, tokenized_doc2, model)
print("Doc2Vec Similarity:", doc2vec_sim)

Doc2Vec Similarity: 0.459432989358902
