In [59]:
# Import necessary libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [60]:

# Download the NLTK data packages this notebook relies on.
# Each call logs its progress; packages that are already present locally are
# reported as up-to-date. The last call is a bare expression, so its return
# value is what the cell displays as its output.
nltk.download('punkt')      # presumably the tokenizer models behind word_tokenize / sent_tokenize -- confirm
nltk.download('stopwords')  # corpus read via stopwords.words('english') in preprocess()
nltk.download('punkt_tab')  # NOTE(review): likely the tokenizer tables newer NLTK releases look for -- confirm
nltk.download('wordnet')    # presumably the lexical data used by WordNetLemmatizer -- confirm



[nltk_data] Downloading package punkt to /Users/alexs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/alexs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/alexs/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/alexs/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [61]:
def read_article(path):
    """Return the full text of the file at ``path``.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as the contents have been read, rather than being left open until the
    garbage collector happens to reclaim it.

    Parameters
    ----------
    path : str
        Location of a text file, relative to the current working directory.

    Returns
    -------
    str
        The decoded contents of the file.

    Raises
    ------
    OSError
        If the file cannot be opened (for example, it does not exist).
    UnicodeDecodeError
        If the file is not valid UTF-8.
    """
    # An explicit encoding keeps the result independent of the platform default.
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read()


article1 = read_article('article1.txt')
article2 = read_article('article2.txt')
article3 = read_article('article3.txt')

# Lengths are in characters, not bytes or words
print("Article 1 length:", len(article1))
print("Article 2 length:", len(article2))
print("Article 3 length:", len(article3))

Article 1 length: 2700
Article 2 length: 2698
Article 3 length: 4539


In [62]:

# Initialize the lemmatizer once; every preprocess() call reuses it
lemmatizer = WordNetLemmatizer()

# Build the English stop-word set once. Previously it was rebuilt from the
# corpus on every preprocess() call, i.e. once per sentence.
_STOP_WORDS = set(stopwords.words('english'))


def preprocess(text):
    """Normalize ``text`` into a space-separated string of lemmatized tokens.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens are
    kept only if they are purely alphanumeric (``str.isalnum``) and are not in
    the English stop-word set; each kept token is then lemmatized.

    Parameters
    ----------
    text : str
        Raw text to normalize (the notebook passes one sentence at a time).

    Returns
    -------
    str
        The kept, lemmatized tokens joined by single spaces. This is an empty
        string when no token survives the filter.
    """
    tokens = word_tokenize(text.lower())
    kept = [
        lemmatizer.lemmatize(token)
        for token in tokens
        # isalnum() drops punctuation and any token containing a
        # non-alphanumeric character (e.g. hyphens or apostrophes)
        if token.isalnum() and token not in _STOP_WORDS
    ]
    return ' '.join(kept)

def _clean_sentences(article):
    """Split ``article`` into sentences and preprocess each non-blank one.

    Returns a list with one preprocessed string per sentence, in document order.
    """
    cleaned = []
    for sentence in sent_tokenize(article):
        if not sentence.strip():
            continue  # ignore whitespace-only sentences
        cleaned.append(preprocess(sentence))
    return cleaned


# Sentence-level preprocessing for each of the three articles
preprocessed_article1 = _clean_sentences(article1)
preprocessed_article2 = _clean_sentences(article2)
preprocessed_article3 = _clean_sentences(article3)

# Report how many sentences each article produced
print("\nNumber of sentences in Article 1:", len(preprocessed_article1))
print("Number of sentences in Article 2:", len(preprocessed_article2))
print("Number of sentences in Article 3:", len(preprocessed_article3))



Number of sentences in Article 1: 16
Number of sentences in Article 2: 22
Number of sentences in Article 3: 27


In [65]:

# Combine sentences from articles 1 and 2 (same topic).
# Rows [0, n_article1) belong to article 1, the remaining rows to article 2.
same_topic_sentences = preprocessed_article1 + preprocessed_article2
n_article1 = len(preprocessed_article1)

# Fit a single TF-IDF vocabulary over both articles so their vectors are comparable
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(same_topic_sentences)

# Pairwise cosine similarity between every pair of sentences
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)
np.fill_diagonal(cosine_similarities, 0)  # Exclude self-similarity

# Restrict the search to cross-article pairs:
# rows are article 1 sentences, columns are article 2 sentences.
cross_similarities = cosine_similarities[:n_article1, n_article1:]
if cross_similarities.size == 0:
    raise ValueError("Each article must contain at least one sentence to compare.")

# argmax always yields a valid position, even when every cross-article
# similarity is 0. The previous loop left the (-1, -1) sentinel in that case
# and then indexed the sentence lists with it. Ties resolve to the first pair
# in row-major order, which is the same pair the strict `>` loop selected.
best_row, best_col = np.unravel_index(
    np.argmax(cross_similarities), cross_similarities.shape
)
max_similarity = cross_similarities[best_row, best_col]
# Second index is expressed in the combined list, as before
max_similarity_indices = (int(best_row), int(best_col) + n_article1)

# Get the original sentences. The same non-blank filter used to build the
# preprocessed lists is applied so positions line up by construction.
original_sentences1 = [sent for sent in sent_tokenize(article1) if sent.strip()]
original_sentences2 = [sent for sent in sent_tokenize(article2) if sent.strip()]
sentence1 = original_sentences1[max_similarity_indices[0]]
sentence2 = original_sentences2[max_similarity_indices[1] - n_article1]

print("Two most similar sentences from articles with the same topic:")
print(f"Sentence 1 (from article 1): {sentence1}\n")
print(f"Sentence 2 (from article 2): {sentence2}")


Two most similar sentences from articles with the same topic:
Sentence 1 (from article 1): Aerial footage shows Asheville, North Carolina before and after Helene's devastation

Aerial footage is capturing the extent of Hurricane Helene's cataclysmic impact on Asheville, North Carolina.

Sentence 2 (from article 2): Before-and-after images show Helene wiped parts of North Carolina off the map


A river now flows where a North Carolina home and road once stood before Hurricane Helene.


In [66]:

# Demonstrate difference between articles on different topics.
# Rows [0, n_article1_diff) belong to article 1, the remaining rows to article 3.
different_topic_sentences = preprocessed_article1 + preprocessed_article3
n_article1_diff = len(preprocessed_article1)

# Create new TF-IDF vectorizer for different topics
vectorizer_diff = TfidfVectorizer(min_df=1, stop_words=None)

# Fit and transform the sentences
tfidf_matrix_diff = vectorizer_diff.fit_transform(different_topic_sentences)

# Pairwise cosine similarity between every pair of sentences
cosine_similarities_diff = cosine_similarity(tfidf_matrix_diff, tfidf_matrix_diff)

# Restrict the search to cross-article pairs:
# rows are article 1 sentences, columns are article 3 sentences.
cross_similarities_diff = cosine_similarities_diff[:n_article1_diff, n_article1_diff:]
if cross_similarities_diff.size == 0:
    raise ValueError("Each article must contain at least one sentence to compare.")

# argmin always yields a valid position. The previous loop started from
# min_similarity = 1 with a strict `<`, so it kept the (-1, -1) sentinel
# whenever no pair scored below 1. Ties resolve to the first pair in
# row-major order, matching the loop; when several pairs share the minimum,
# the reported pair is simply the earliest one.
worst_row, worst_col = np.unravel_index(
    np.argmin(cross_similarities_diff), cross_similarities_diff.shape
)
min_similarity = cross_similarities_diff[worst_row, worst_col]
# Second index is expressed in the combined list, as before
min_similarity_indices = (int(worst_row), int(worst_col) + n_article1_diff)

# Get the original sentences. The same non-blank filter used to build the
# preprocessed lists is applied so positions line up by construction.
original_sentences_diff1 = [sent for sent in sent_tokenize(article1) if sent.strip()]
original_sentences_diff3 = [sent for sent in sent_tokenize(article3) if sent.strip()]
sentence_diff1 = original_sentences_diff1[min_similarity_indices[0]]
sentence_diff2 = original_sentences_diff3[min_similarity_indices[1] - n_article1_diff]

print("\nTwo least similar sentences from articles with different topics:")
print(f"Sentence 1 (from article 1): {sentence_diff1}\n")
print(f"Sentence 2 (from article 3): {sentence_diff2}")


Two least similar sentences from articles with different topics:
Sentence 1 (from article 1): Aerial footage shows Asheville, North Carolina before and after Helene's devastation

Aerial footage is capturing the extent of Hurricane Helene's cataclysmic impact on Asheville, North Carolina.

Sentence 2 (from article 3): Cohere just made it way easier for companies to create their own AI language models

Join our daily and weekly newsletters for the latest updates and exclusive content on industry-leading AI coverage.
