# Text Summarizer
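A simple extractive text summarizer built on NLTK: it ranks sentences by the frequency of the words they contain and returns the top-scoring sentences in their original order.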

In [21]:
# All imports live in this first cell so the notebook survives
# "Restart Kernel -> Run All"; the cells below rely on every name imported here.

# Standard library
from collections import defaultdict
from string import punctuation

# Third-party
import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the NLTK resources used by the tokenizers and the stop-word filter.
nltk.download('punkt')  # sentence tokenizer models
nltk.download('stopwords')  # stop-word lists
nltk.download('punkt_tab')

# Verify the downloads: nltk.data.find raises LookupError for a missing resource.
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
    print("All required NLTK data is downloaded and available!")
except LookupError as e:
    print("Error: Some required NLTK data is missing:", e)

All required NLTK data is downloaded and available!


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [14]:
def create_frequency_table(text):
    """Count how often each meaningful word occurs in ``text``.

    The text is lowercased and tokenized with ``word_tokenize``. English
    stop words and tokens that contain no letter or digit are ignored.

    Args:
        text: The raw text to analyse.

    Returns:
        A ``defaultdict(int)`` mapping each remaining lowercased token to the
        number of times it occurs.
    """
    stop_words = set(stopwords.words('english'))

    freq_table = defaultdict(int)
    for word in word_tokenize(text.lower()):
        # Skip stop words, and skip punctuation-only tokens. Checking for an
        # alphanumeric character (rather than membership in a list of single
        # punctuation characters) also drops multi-character tokens such as
        # "``", "''", "..." and "--".
        if word in stop_words or not any(ch.isalnum() for ch in word):
            continue
        freq_table[word] += 1

    return freq_table

In [15]:
def score_sentences(sentences, freq_table, max_words=30):
    """Score sentences by the average frequency of the words they contain.

    Args:
        sentences: Iterable of sentence strings to score.
        freq_table: Mapping from lowercased token to its frequency count.
        max_words: Sentences with more tokens than this are skipped and get
            no entry in the result. Defaults to 30.

    Returns:
        A ``defaultdict(int)`` mapping each scored sentence to the sum of the
        frequencies of its tokens found in ``freq_table``, divided by the
        sentence's token count.
    """
    sentence_scores = defaultdict(int)

    for sentence in sentences:
        word_count = len(word_tokenize(sentence))
        # Skip overly long sentences, and empty ones that would otherwise
        # cause a division by zero during normalisation.
        if word_count == 0 or word_count > max_words:
            continue

        total = sum(
            freq_table[word]
            for word in word_tokenize(sentence.lower())
            if word in freq_table
        )

        # Assign instead of accumulating into the dict: a sentence that
        # appears more than once must not have its already-normalised score
        # added to and divided again.
        sentence_scores[sentence] = total / word_count

    return sentence_scores

In [16]:
def summarize_text(text, num_sentences=3):
    """Build an extractive summary from the highest-scoring sentences.

    Args:
        text: The text to summarise.
        num_sentences: Maximum number of sentences to keep. Values of zero or
            less produce an empty summary.

    Returns:
        The selected sentences joined by single spaces, in the order in which
        they appear in ``text``.
    """
    sentences = sent_tokenize(text)
    freq_table = create_frequency_table(text)
    sentence_scores = score_sentences(sentences, freq_table)

    # Clamp the request: a negative slice bound would count from the end and
    # silently return "all but the last N" sentences.
    limit = max(num_sentences, 0)
    top_scored = sorted(
        sentence_scores.items(),
        key=lambda item: item[1],
        reverse=True,
    )[:limit]

    # Record the first position of every sentence once, instead of scanning
    # the sentence list again for each selected sentence.
    first_position = {}
    for position, sentence in enumerate(sentences):
        first_position.setdefault(sentence, position)

    # Restore document order so the summary reads naturally.
    top_scored.sort(key=lambda item: first_position[item[0]])

    return ' '.join(sentence for sentence, _ in top_scored)

## Test the summarizer with sample text

In [19]:
# Sample paragraph used to exercise the summarizer
text = """
Machine learning is a field of artificial intelligence that focuses on the use of data and algorithms to imitate the way that humans learn, gradually improving its accuracy. Machine learning is an important component of the growing field of data science. Through the use of statistical methods, algorithms are trained to make classifications or predictions, uncovering key insights within data mining projects. These insights subsequently drive decision making within applications and businesses, ideally impacting key growth metrics. As big data continues to expand and grow, the market demand for data scientists will increase. They will be required to help identify the most relevant business questions and subsequently the data to answer them.
"""

# Show the input first, then the heading for the two-sentence summary
print("Original Text:", text, "\nSummary (2 sentences):", sep="\n")

summary = summarize_text(text, num_sentences=2)
print(summary)

Original Text:

Machine learning is a field of artificial intelligence that focuses on the use of data and algorithms to imitate the way that humans learn, gradually improving its accuracy. Machine learning is an important component of the growing field of data science. Through the use of statistical methods, algorithms are trained to make classifications or predictions, uncovering key insights within data mining projects. These insights subsequently drive decision making within applications and businesses, ideally impacting key growth metrics. As big data continues to expand and grow, the market demand for data scientists will increase. They will be required to help identify the most relevant business questions and subsequently the data to answer them.


Summary (2 sentences):
Machine learning is an important component of the growing field of data science. As big data continues to expand and grow, the market demand for data scientists will increase.


In [20]:
# Second sample: a short, two-sentence product description
your_text = """
Elasticsearch is an open source distributed, RESTful search and analytics engine, scalable data store, and vector database capable of addressing a growing number of use cases. As the heart of the Elastic Stack, it centrally stores your data for lightning-fast search, fine‑tuned relevancy, and powerful analytics that scale with ease.
"""

# The sample holds only two sentences, so asking for three returns both of them.
summary = summarize_text(text=your_text, num_sentences=3)
print(summary)


Elasticsearch is an open source distributed, RESTful search and analytics engine, scalable data store, and vector database capable of addressing a growing number of use cases. As the heart of the Elastic Stack, it centrally stores your data for lightning-fast search, fine‑tuned relevancy, and powerful analytics that scale with ease.
