In [26]:
import requests
import nltk
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
import warnings
import requests
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [27]:
URL = "https://www.washingtonpost.com/world/2025/06/13/air-india-plane-crash-survivor-vishwash-kumar-ramesh/"
UA = {"User-Agent": "Mozilla/5.0"}

def fetch(url):
    try:
        r = requests.get(url, headers=UA, timeout=25)
        r.raise_for_status()
        return r.text
    except Exception:
        mirror = "https://r.jina.ai/http://" + url.replace("https://", "")
        r = requests.get(mirror, headers=UA, timeout=25)
        r.raise_for_status()
        return r.text

raw_text = fetch(URL)
first_700 = raw_text[:700]
print("First 700 chars:\n", first_700, "\n")

First 700 chars:
 Title: The ‘miracle’ of the sole passenger who survived the Air India flight

URL Source: http://www.washingtonpost.com/world/2025/06/13/air-india-plane-crash-survivor-vishwash-kumar-ramesh/

Published Time: 2025-06-13T12:52:23.989Z

Markdown Content:
Air India crash: Sole survivor in seat 11A says it’s a ‘miracle’ he survived - The Washington Post


[Accessibility statement](http://www.washingtonpost.com/accessibility/)[Skip to main content](http://www.washingtonpost.com/world/2025/06/13/air-india-plane-crash-survivor-vishwash-kumar-ramesh/#main-content)

[Democracy Dies in Darkness](https://www.washingtonpost.com/)

[Subscribe](https://subscribe.washingtonpost.com/acquisiti 



In [28]:
#Spliting the text into sentences
nltk.download('punkt', quiet=True)
sentences = sent_tokenize(first_700)
print(f"Total sentences extracted: {len(sentences)}")
print("\nSentences:")
for i, sentence in enumerate(sentences, 1):
    print(f"{i}. {sentence}")

Total sentences extracted: 1

Sentences:
1. Title: The ‘miracle’ of the sole passenger who survived the Air India flight

URL Source: http://www.washingtonpost.com/world/2025/06/13/air-india-plane-crash-survivor-vishwash-kumar-ramesh/

Published Time: 2025-06-13T12:52:23.989Z

Markdown Content:
Air India crash: Sole survivor in seat 11A says it’s a ‘miracle’ he survived - The Washington Post


[Accessibility statement](http://www.washingtonpost.com/accessibility/)[Skip to main content](http://www.washingtonpost.com/world/2025/06/13/air-india-plane-crash-survivor-vishwash-kumar-ramesh/#main-content)

[Democracy Dies in Darkness](https://www.washingtonpost.com/)

[Subscribe](https://subscribe.washingtonpost.com/acquisiti


In [29]:
#looading the pre-trained embedding model and apply TF-IDF
print("\nLoading sentence transformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded successfully")

# TF-IDF for first 10 sentences
first_ten_sentences = sentences[:10] if len(sentences) >= 10 else sentences
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(first_ten_sentences)

print(f"\nTF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Feature names (first 10): {list(tfidf_vectorizer.get_feature_names_out())[:10]}")


Loading sentence transformer model...
Model loaded successfully

TF-IDF matrix shape: (1, 47)
Feature names (first 10): ['06', '11a', '13', '13t12', '2025', '23', '52', '989z', 'accessibility', 'acquisiti']


In [30]:
#generating embeddings for each sentence
embeddings = model.encode(sentences)
print(f"\nGenerated embeddings for {len(embeddings)} sentences")
print(f"Embedding shape for first sentence: {embeddings[0].shape}")
print(f"First sentence: {sentences[0]}")
print(f"First 10 embedding values: {embeddings[0][:10]}")


Generated embeddings for 1 sentences
Embedding shape for first sentence: (384,)
First sentence: Title: The ‘miracle’ of the sole passenger who survived the Air India flight

URL Source: http://www.washingtonpost.com/world/2025/06/13/air-india-plane-crash-survivor-vishwash-kumar-ramesh/

Published Time: 2025-06-13T12:52:23.989Z

Markdown Content:
Air India crash: Sole survivor in seat 11A says it’s a ‘miracle’ he survived - The Washington Post


[Accessibility statement](http://www.washingtonpost.com/accessibility/)[Skip to main content](http://www.washingtonpost.com/world/2025/06/13/air-india-plane-crash-survivor-vishwash-kumar-ramesh/#main-content)

[Democracy Dies in Darkness](https://www.washingtonpost.com/)

[Subscribe](https://subscribe.washingtonpost.com/acquisiti
First 10 embedding values: [-0.02481343  0.02235181 -0.0053752   0.01985435  0.0802741   0.02732606
  0.02073626  0.03164691 -0.03613216  0.0383233 ]


In [31]:
#Computing the cosine similarity between first and second sentences
if len(sentences) >= 2:
    similarity_score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    print(f"\nCosine similarity between first and second sentences: {similarity_score:.6f}")
    
    print(f"\nFirst sentence: {sentences[0]}")
    print(f"Second sentence: {sentences[1]}")
    
    print(f"\nSimilarity analysis:")
    if similarity_score > 0.7:
        print("High similarity - sentences are closely related")
    elif similarity_score > 0.4:
        print("Moderate similarity - some common themes")
    else:
        print("Low similarity - different content")
else:
    print("Need at least 2 sentences for similarity comparison")

Need at least 2 sentences for similarity comparison
