In [1]:
# Import necessary libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [2]:

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')



[nltk_data] Downloading package punkt to /Users/alexs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/alexs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/alexs/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/alexs/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
article1 = open('article1.txt', 'r').read()
article2 = open('article2.txt', 'r').read()
article3 = open('article3.txt', 'r').read()

print("Article 1 length:", len(article1))
print("Article 2 length:", len(article2))
print("Article 3 length:", len(article3))

Article 1 length: 2700
Article 2 length: 2698
Article 3 length: 4539


In [4]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    # Tokenize and remove stopwords
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text.lower())
    # Lemmatize and filter words
    return [lemmatizer.lemmatize(word) for word in words if word.isalnum() and word not in stop_words]

# Preprocess articles
preprocessed_article1 = preprocess(article1)
preprocessed_article2 = preprocess(article2)
preprocessed_article3 = preprocess(article3)

print("\nNumber of words in Article 1:", len(preprocessed_article1))
print("Number of words in Article 2:", len(preprocessed_article2))
print("Number of words in Article 3:", len(preprocessed_article3))


Number of words in Article 1: 263
Number of words in Article 2: 255
Number of words in Article 3: 404


In [5]:
# Convert preprocessed word lists to strings for TfidfVectorizer
article1_text = ' '.join(preprocessed_article1)
article2_text = ' '.join(preprocessed_article2)
article3_text = ' '.join(preprocessed_article3)

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the articles
tfidf_matrix = vectorizer.fit_transform([article1_text, article2_text, article3_text])

# Calculate cosine similarity
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Find the most and least similar article pairs
article_pairs = [(0, 1), (0, 2), (1, 2)]
similarities = [cosine_similarities[i, j] for i, j in article_pairs]

most_similar_pair = article_pairs[similarities.index(max(similarities))]
least_similar_pair = article_pairs[similarities.index(min(similarities))]

print("\nArticle similarity results:")
print(f"Most similar articles: {most_similar_pair[0] + 1} and {most_similar_pair[1] + 1}")
print(f"Similarity score: {max(similarities):.4f}")
print(f"Least similar articles: {least_similar_pair[0] + 1} and {least_similar_pair[1] + 1}")
print(f"Similarity score: {min(similarities):.4f}")

# Print similarity matrix
print("\nSimilarity matrix:")
for i in range(3):
    for j in range(3):
        print(f"{cosine_similarities[i, j]:.4f}", end="\t")
    print()


Article similarity results:
Most similar articles: 1 and 2
Similarity score: 0.3592
Least similar articles: 2 and 3
Similarity score: 0.0115

Similarity matrix:
1.0000	0.3592	0.0159	
0.3592	1.0000	0.0115	
0.0159	0.0115	1.0000	


In [6]:
# Use the most similar pair of articles found earlier
article_a = globals()[f"article{most_similar_pair[0] + 1}"]
article_b = globals()[f"article{most_similar_pair[1] + 1}"]

# Combine sentences from the two most similar articles
same_topic_sentences = [preprocess(sent) for sent in sent_tokenize(article_a) + sent_tokenize(article_b)]

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the sentences
tfidf_matrix = vectorizer.fit_transform([' '.join(sent) for sent in same_topic_sentences])

# Calculate cosine similarity
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)
np.fill_diagonal(cosine_similarities, 0)  # Exclude self-similarity

# Find the most similar sentence pair with one sentence from each article
max_similarity = 0
max_similarity_indices = (-1, -1)
len_article_a = len(sent_tokenize(article_a))
for i in range(len_article_a):
    for j in range(len_article_a, len(same_topic_sentences)):
        if cosine_similarities[i, j] > max_similarity:
            max_similarity = cosine_similarities[i, j]
            max_similarity_indices = (i, j)

# Get the original sentences
sentence1 = sent_tokenize(article_a)[max_similarity_indices[0]]
sentence2 = sent_tokenize(article_b)[max_similarity_indices[1] - len_article_a]

print(f"Two most similar sentences from articles {most_similar_pair[0] + 1} and {most_similar_pair[1] + 1}:")
print(f"Sentence 1 (from article {most_similar_pair[0] + 1}): {sentence1}\n")
print(f"Sentence 2 (from article {most_similar_pair[1] + 1}): {sentence2}")
print(f"Similarity score: {max_similarity:.4f}")


Two most similar sentences from articles 1 and 2:
Sentence 1 (from article 1): Aerial footage shows Asheville, North Carolina before and after Helene's devastation

Aerial footage is capturing the extent of Hurricane Helene's cataclysmic impact on Asheville, North Carolina.

Sentence 2 (from article 2): Before-and-after images show Helene wiped parts of North Carolina off the map


A river now flows where a North Carolina home and road once stood before Hurricane Helene.
Similarity score: 0.2903


In [48]:

article_a = globals()[f"article{least_similar_pair[0] + 1}"]
article_b = globals()[f"article{least_similar_pair[1] + 1}"]
# Demonstrate difference between articles on different topics
different_topic_sentences = [preprocess(sent) for sent in sent_tokenize(article_a) + sent_tokenize(article_b)]

# Create new TF-IDF vectorizer for different topics
vectorizer_diff = TfidfVectorizer(min_df=1, stop_words=None)

# Fit and transform the sentences
tfidf_matrix_diff = vectorizer_diff.fit_transform([' '.join(sent) for sent in different_topic_sentences])

# Calculate cosine similarity
cosine_similarities_diff = cosine_similarity(tfidf_matrix_diff, tfidf_matrix_diff)

# Find the least similar sentence pair with one sentence from each article
min_similarity = 1
min_similarity_indices = (-1, -1)
for i in range(len(sent_tokenize(article1))):
    for j in range(len(sent_tokenize(article1)), len(different_topic_sentences)):
        if cosine_similarities_diff[i, j] < min_similarity:
            min_similarity = cosine_similarities_diff[i, j]
            min_similarity_indices = (i, j)

# Get the original sentences
sentence_diff1 = sent_tokenize(article1)[min_similarity_indices[0]]
sentence_diff2 = sent_tokenize(article3)[min_similarity_indices[1] - len(sent_tokenize(article1))]

print(f"\nTwo least similar sentences from article {least_similar_pair[0] + 1} and article {least_similar_pair[1] + 1}:")
print(f"Sentence 1 (from article {least_similar_pair[0] + 1}): {sentence_diff1}\n")
print(f"Sentence 2 (from article {least_similar_pair[1] + 1}): {sentence_diff2}")



Two least similar sentences from article 2 and article 3:
Sentence 1 (from article 2): Aerial footage shows Asheville, North Carolina before and after Helene's devastation

Aerial footage is capturing the extent of Hurricane Helene's cataclysmic impact on Asheville, North Carolina.

Sentence 2 (from article 3): This could translate to meaningful cost savings for high-volume enterprise deployments, as businesses may achieve better performance on specific tasks with fewer compute resources.
