Mini NLP Application

In [2]:
import nltk
import re
import numpy as np
import networkx as nx
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from heapq import nlargest

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')  # sometimes needed in newer NLTK

stop_words = set(stopwords.words('english'))

text = """
Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to intelligence displayed by humans or by other animals. 
"Intelligence" encompasses the "ability to learn" and to apply knowledge in order to solve problems. 
AI applications include natural language processing, speech recognition, machine vision, and many more. 
Machine learning is a core sub-area of AI; it is the study of computer algorithms that improve automatically through experience and by the use of data. 
Deep learning is a modern variation of machine learning that uses artificial neural networks with many layers. 
These networks are inspired by biological neural networks. 
In recent years, deep learning has led to breakthroughs in computer vision, natural language understanding, speech synthesis, game playing, and protein folding prediction. 
Large language models such as GPT, LLaMA, and Gemini are based on the transformer architecture and have shown remarkable abilities in generating human-like text, answering questions, writing code, and more. 
However, they still struggle with reasoning, factual accuracy, long-context understanding, and hallucinations. 
The field of AI is advancing rapidly, with implications for society, ethics, jobs, education, healthcare, transportation, and national security.
"""

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
def preprocess_text(text):
    # Lowercase and remove special chars (keep basic punctuation for sentence splitting)
    text = re.sub(r'\s+', ' ', text.strip())
    sentences = sent_tokenize(text)
    return sentences

sentences = preprocess_text(text)
print(f"Original number of sentences: {len(sentences)}")
print(f"Original character length   : {len(text)}\n")

Original number of sentences: 10
Original character length   : 1311



In [4]:
# Frequency-based summarization
def frequency_based_summary(text, num_sentences=3):
    sentences = sent_tokenize(text)
    
    # Word frequency
    word_freq = defaultdict(int)
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word.isalnum() and word not in stop_words:
                word_freq[word] += 1
    
    if not word_freq:
        return "No meaningful words found."
    
    # Normalize frequencies
    max_freq = max(word_freq.values())
    for word in word_freq:
        word_freq[word] /= max_freq
    
    # Sentence scores = sum of normalized freq of words in sentence
    sentence_scores = defaultdict(float)
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word in word_freq:
                sentence_scores[sentence] += word_freq[word]
    
    # Select top N sentences (ordered by original appearance)
    summary_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    
    # Re-order to preserve original sequence
    summary_sentences.sort(key=lambda s: sentences.index(s))
    
    return ' '.join(summary_sentences)

# Generate
freq_summary = frequency_based_summary(text, num_sentences=4)

print("=== Frequency-based Summary ===")
print(freq_summary)
print("\nSummary length (chars) :", len(freq_summary))
print("Summary sentences      :", len(sent_tokenize(freq_summary)))

=== Frequency-based Summary ===

Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to intelligence displayed by humans or by other animals. AI applications include natural language processing, speech recognition, machine vision, and many more. Deep learning is a modern variation of machine learning that uses artificial neural networks with many layers. In recent years, deep learning has led to breakthroughs in computer vision, natural language understanding, speech synthesis, game playing, and protein folding prediction.

Summary length (chars) : 526
Summary sentences      : 4


In [5]:
#graph-based summarization
def textrank_summary(text, num_sentences=3):
    sentences = sent_tokenize(text)
    if len(sentences) <= num_sentences:
        return text
    
    clean_sentences = []
    for s in sentences:
        words = [w.lower() for w in word_tokenize(s) if w.isalnum() and w.lower() not in stop_words]
        clean_sentences.append(' '.join(words))
    
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
    
    for i in range(len(clean_sentences)):
        for j in range(len(clean_sentences)):
            if i == j:
                continue
            words_i = set(clean_sentences[i].split())
            words_j = set(clean_sentences[j].split())
            if len(words_i) == 0 or len(words_j) == 0:
                continue
            intersection = len(words_i.intersection(words_j))
            similarity = intersection / (len(words_i) + len(words_j) - intersection)
            similarity_matrix[i][j] = similarity
    
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph, max_iter=100, tol=1e-6)
    
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    
    top_sentences = [s for _, s in ranked_sentences[:num_sentences]]
    top_sentences.sort(key=lambda s: sentences.index(s))
    
    return ' '.join(top_sentences)

In [6]:
tr_summary = textrank_summary(text, num_sentences=4)

print("\n=== TextRank Summary ===")
print(tr_summary)
print("\nSummary length (chars) :", len(tr_summary))
print("Summary sentences      :", len(sent_tokenize(tr_summary)))


=== TextRank Summary ===
AI applications include natural language processing, speech recognition, machine vision, and many more. Machine learning is a core sub-area of AI; it is the study of computer algorithms that improve automatically through experience and by the use of data. Deep learning is a modern variation of machine learning that uses artificial neural networks with many layers. In recent years, deep learning has led to breakthroughs in computer vision, natural language understanding, speech synthesis, game playing, and protein folding prediction.

Summary length (chars) : 538
Summary sentences      : 4


In [7]:
# ────────────────────────────────────────────────
# Comparison Table
# ────────────────────────────────────────────────
print("\n" + "="*70)
print("Comparison:")
print(f"Original text")
print(f"  • Sentences : {len(sentences)}")
print(f"  • Characters: {len(text):,}")
print("-"*40)
print(f"Frequency-based summary ({len(sent_tokenize(freq_summary))} sent)")
print(f"  • Compression ratio: {len(freq_summary)/len(text):.2%}")
print("-"*40)
print(f"TextRank summary        ({len(sent_tokenize(tr_summary))} sent)")
print(f"  • Compression ratio: {len(tr_summary)/len(text):.2%}")
print("="*70)


Comparison:
Original text
  • Sentences : 10
  • Characters: 1,311
----------------------------------------
Frequency-based summary (4 sent)
  • Compression ratio: 40.12%
----------------------------------------
TextRank summary        (4 sent)
  • Compression ratio: 41.04%


In [8]:
from IPython.display import display, Markdown

display(Markdown("### Original Text"))
print(text)

display(Markdown("### Frequency-based Summary"))
print(freq_summary)

display(Markdown("### TextRank Summary"))
print(tr_summary)

### Original Text


Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to intelligence displayed by humans or by other animals. 
"Intelligence" encompasses the "ability to learn" and to apply knowledge in order to solve problems. 
AI applications include natural language processing, speech recognition, machine vision, and many more. 
Machine learning is a core sub-area of AI; it is the study of computer algorithms that improve automatically through experience and by the use of data. 
Deep learning is a modern variation of machine learning that uses artificial neural networks with many layers. 
These networks are inspired by biological neural networks. 
In recent years, deep learning has led to breakthroughs in computer vision, natural language understanding, speech synthesis, game playing, and protein folding prediction. 
Large language models such as GPT, LLaMA, and Gemini are based on the transformer architecture and have shown remarkable abilities in generating human-lik

### Frequency-based Summary


Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to intelligence displayed by humans or by other animals. AI applications include natural language processing, speech recognition, machine vision, and many more. Deep learning is a modern variation of machine learning that uses artificial neural networks with many layers. In recent years, deep learning has led to breakthroughs in computer vision, natural language understanding, speech synthesis, game playing, and protein folding prediction.


### TextRank Summary

AI applications include natural language processing, speech recognition, machine vision, and many more. Machine learning is a core sub-area of AI; it is the study of computer algorithms that improve automatically through experience and by the use of data. Deep learning is a modern variation of machine learning that uses artificial neural networks with many layers. In recent years, deep learning has led to breakthroughs in computer vision, natural language understanding, speech synthesis, game playing, and protein folding prediction.
