In [1]:
!pip install nltk scikit-learn



In [14]:
import nltk
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize

# Fetch the 'punkt_tab' tokenizer data up front; presumably this is the
# resource sent_tokenize loads below - confirm for the installed NLTK version.
nltk.download('punkt_tab')


def summarize_text_tfidf(text, num_sentences=3):
    """Build an extractive summary by clustering TF-IDF sentence vectors.

    The text is split into sentences, each sentence is turned into a TF-IDF
    vector, the vectors are grouped with K-Means (seeded with
    ``random_state=0``), and the sentence nearest to each cluster centre is
    kept.

    Args:
        text: The passage to summarise.
        num_sentences: Upper bound on the number of summary sentences; also
            used as the number of K-Means clusters.

    Returns:
        A list of sentences in their original order. When the text has no
        more than ``num_sentences`` sentences they are all returned
        unchanged. An empty list is returned when ``num_sentences`` is 0.

    Raises:
        ValueError: If ``num_sentences`` is negative.
    """
    # Reject impossible sizes before any work; K-Means cannot fit fewer than
    # one cluster, so these cases used to fail deep inside scikit-learn.
    if num_sentences < 0:
        raise ValueError("num_sentences must be non-negative")
    if num_sentences == 0:
        return []

    # Step 1: Split text into sentences
    sentences = sent_tokenize(text)
    if len(sentences) <= num_sentences:
        # Nothing to condense: the text is already short enough.
        return sentences

    # Step 2: Convert sentences to TF-IDF vectors
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences).toarray()

    # Step 3: Cluster sentences using K-Means
    kmeans = KMeans(n_clusters=num_sentences, random_state=0, n_init='auto')
    kmeans.fit(X)
    cluster_centers = kmeans.cluster_centers_

    # Step 4: Select the sentence closest to each cluster center
    summary_sentences = []
    for i in range(num_sentences):
        cluster_indices = np.where(kmeans.labels_ == i)[0]
        if len(cluster_indices) == 0:
            # A cluster may end up with no members; it contributes nothing.
            continue
        closest_index = min(
            cluster_indices,
            key=lambda idx: np.linalg.norm(X[idx] - cluster_centers[i])
        )
        summary_sentences.append((closest_index, sentences[closest_index]))

    # Step 5: Sort by original order (tuples sort on the sentence index first)
    summary_sentences.sort()
    return [sent for idx, sent in summary_sentences]


if __name__ == "__main__":
    print("=== Simple TF-IDF Text Summarizer ===")
    print("Paste your paragraph below (End with an empty line):\n")

    # Collect lines until the user enters a blank line or the input ends.
    lines = []
    while True:
        try:
            line = input()
        except EOFError:
            # Piped or redirected input can end without a trailing blank
            # line; treat that the same as the blank-line terminator.
            break
        if not line.strip():
            break
        lines.append(line)

    full_text = " ".join(lines)

    try:
        summary = summarize_text_tfidf(full_text, num_sentences=3)
        print("\n--- SUMMARY ---\n")
        for sent in summary:
            print(sent)
    except Exception as e:
        # Report summarizer failures as a short message instead of a traceback.
        print(f"\n⚠️ Error: {e}")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


=== Simple TF-IDF Text Summarizer ===
Paste your paragraph below (End with an empty line):

Climate change is one of the most pressing global challenges of our time. Rising temperatures, melting ice caps, and extreme weather events are becoming increasingly common. Scientists have linked these changes to greenhouse gas emissions caused by human activities, such as burning fossil fuels and deforestation. In response, governments around the world are investing in renewable energy sources like solar and wind power. Individuals are also encouraged to reduce their carbon footprint by adopting sustainable practices. Despite these efforts, progress has been slow, and many experts warn that more aggressive action is needed to avoid irreversible damage to the planet.


--- SUMMARY ---

Climate change is one of the most pressing global challenges of our time.
Rising temperatures, melting ice caps, and extreme weather events are becoming increasingly common.
Scientists have linked these changes t

In [9]:
import nltk
# NLTK search-path entries must be data roots (the directory that contains
# 'tokenizers/'), not an individual package directory such as punkt_tab.
# Guarded so that re-running the cell does not add duplicate entries.
if '/root/nltk_data' not in nltk.data.path:
    nltk.data.path.append('/root/nltk_data')


In [11]:
import nltk
# Print the list of directories NLTK searches for its data packages
nltk_search_path = nltk.data.path
print(nltk_search_path)


['/root/nltk_data', '/usr/nltk_data', '/usr/share/nltk_data', '/usr/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data', '/root/nltk_data', '/root/nltk_data/tokenizers', '/root/nltk_data/tokenizers/punkt_tab/english', '/root/nltk_data/tokenizers/punkt_tab']


In [12]:
import nltk
# Ensure NLTK looks in this data root. The append is skipped when the entry
# is already listed, so re-running the cell does not pile up duplicates.
if '/root/nltk_data' not in nltk.data.path:
    nltk.data.path.append('/root/nltk_data')


In [19]:
import nltk
import numpy as np
import string
import pickle
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

# Download required NLTK resources.
# Recent NLTK releases load the sentence tokenizer from 'punkt_tab'; 'punkt'
# is kept for older releases. Fetching both lets this cell run on a fresh
# kernel without relying on an earlier cell's download.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


def _capitalize_first(sentence):
    """Strip surrounding whitespace and upper-case only the first character."""
    cleaned = sentence.strip()
    # str.capitalize() would also lower-case the rest of the sentence and
    # mangle acronyms and proper nouns ("NASA" -> "nasa"), so slice instead.
    return cleaned[:1].upper() + cleaned[1:]


def summarize_text_tfidf(text, summary_ratio=0.25):
    """Build an extractive summary sized as a fraction of the input.

    The text is split into sentences, each sentence is turned into a TF-IDF
    vector (English stop words removed, alphabetic tokens of three or more
    letters only), the vectors are grouped with K-Means, and the sentence
    nearest to each cluster centre is kept.

    Args:
        text: The passage to summarise.
        summary_ratio: Fraction of the sentence count to keep. The result is
            truncated to an integer and is never less than one sentence.

    Returns:
        A list of summary sentences in their original order, each stripped
        of surrounding whitespace and with its first character upper-cased.
        An empty list is returned when the text contains no sentences; all
        sentences are returned when the target size covers the whole text.
    """
    # Step 1: Split text into sentences
    sentences = sent_tokenize(text)
    total_sentences = len(sentences)

    if total_sentences == 0:
        return []

    # Determine the number of summary sentences (at least 1)
    num_sentences = max(1, int(total_sentences * summary_ratio))

    if total_sentences <= num_sentences:
        # Nothing to condense: return every sentence, cleaned up.
        return [_capitalize_first(s) for s in sentences]

    # Step 2: Convert sentences to TF-IDF vectors
    vectorizer = TfidfVectorizer(
        stop_words=stopwords.words('english'),
        lowercase=True,
        token_pattern=r'\b[a-zA-Z]{3,}\b'
    )
    X = vectorizer.fit_transform(sentences).toarray()

    # Step 3: Cluster sentences using K-Means
    kmeans = KMeans(n_clusters=num_sentences, random_state=0, n_init='auto')
    kmeans.fit(X)
    cluster_centers = kmeans.cluster_centers_

    # Step 4: Select the sentence closest to each cluster center
    summary_sentences = []
    for i in range(num_sentences):
        cluster_indices = np.where(kmeans.labels_ == i)[0]
        if len(cluster_indices) == 0:
            # A cluster may end up with no members; it contributes nothing.
            continue
        closest_index = min(
            cluster_indices,
            key=lambda idx: np.linalg.norm(X[idx] - cluster_centers[i])
        )
        summary_sentences.append((closest_index, sentences[closest_index]))

    # Step 5: Sort by original order and clean up
    summary_sentences.sort()
    return [_capitalize_first(sent) for idx, sent in summary_sentences]


# === Save Model (Vectorizer + KMeans) ===
def save_model(text, summary_ratio=0.25, filename="tfidf_summarizer_model_final.pkl"):
    """Fit a TF-IDF vectorizer and a K-Means model on ``text`` and pickle both.

    Args:
        text: Passage whose sentences are used as the training documents.
        summary_ratio: Fraction of the sentence count used as the number of
            clusters (at least 1, at most the number of sentences).
        filename: Path of the pickle file to write. The file holds a dict
            with the keys ``'vectorizer'`` and ``'kmeans'``.

    Raises:
        ValueError: If ``text`` contains no sentences.
    """
    sentences = sent_tokenize(text)
    total_sentences = len(sentences)
    if total_sentences == 0:
        # Fail early with a clear message instead of an opaque error from
        # the vectorizer.
        raise ValueError("Cannot fit a model on empty text: no sentences found")

    # K-Means cannot fit more clusters than samples, so cap the cluster
    # count at the number of sentences (matters when summary_ratio > 1).
    num_sentences = min(total_sentences, max(1, int(total_sentences * summary_ratio)))

    vectorizer = TfidfVectorizer(
        stop_words=stopwords.words('english'),
        lowercase=True,
        token_pattern=r'\b[a-zA-Z]{3,}\b'
    )
    X = vectorizer.fit_transform(sentences).toarray()

    kmeans = KMeans(n_clusters=num_sentences, random_state=0, n_init='auto')
    kmeans.fit(X)

    # NOTE: the output is a pickle. Only load it back from a trusted
    # location, because unpickling can execute arbitrary code.
    with open(filename, "wb") as f:
        pickle.dump({'vectorizer': vectorizer, 'kmeans': kmeans}, f)

    print(f"\n✅ Model saved as '{filename}'")


# === Main ===
if __name__ == "__main__":
    print("=== Simple TF-IDF Text Summarizer ===")
    print("Paste your paragraph below (End with an empty line):\n")

    # Collect lines until the user enters a blank line or the input ends.
    lines = []
    while True:
        try:
            line = input()
            if not line.strip():
                break
            lines.append(line)
        except EOFError:
            # Input ended without a trailing blank line.
            break

    full_text = " ".join(lines)

    try:
        summary = summarize_text_tfidf(full_text, summary_ratio=0.25)
        print("\n--- SUMMARY (25%) ---\n")
        for sent in summary:
            print(sent)

        # Save model (optional)
        save_model(full_text, summary_ratio=0.25)

    except Exception as e:
        # Report summarizer or save failures as a short message instead of
        # a traceback.
        print(f"\n⚠️ Error: {e}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


=== Simple TF-IDF Text Summarizer ===
Paste your paragraph below (End with an empty line):

Lifelong learning is the continuous pursuit of knowledge throughout one’s life. It goes beyond formal education and extends into personal and professional development. People engage in lifelong learning to stay relevant in a rapidly changing world. Technology evolves quickly, and keeping up requires constant effort. Learning new skills can boost self-confidence. It can also improve mental sharpness and delay cognitive decline. Many employers value candidates who show a commitment to learning. This makes lifelong learners more competitive in the job market. It’s not limited to academic subjects either. People learn music, art, coding, languages, and more. Online platforms have made access to education easier than ever. You can take free courses from top universities while sitting at home. Books and podcasts also offer great learning opportunities. Curiosity often drives the desire to keep learnin