# Install Required Packages
Reinstall the necessary packages using pip to ensure we have a clean start with all dependencies.

In [None]:
# Install Required Packages
!pip install --upgrade --force-reinstall numpy pandas nltk scikit-learn

# Import Libraries
Import the libraries used in this notebook (NumPy, pandas, NLTK, and scikit-learn), then download the NLTK tokenizer and stopword data.

In [None]:
# Import Libraries
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download NLTK data
# 'punkt' is what older NLTK releases load for word_tokenize/sent_tokenize.
# NLTK 3.9+ (which the `--upgrade` install above pulls in) loads the same
# models from 'punkt_tab' instead and raises LookupError if it is missing,
# so both packages are fetched.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Text Preprocessing Functions
Create functions to clean and preprocess text, such as tokenization, stopword removal, and sentence segmentation.

In [None]:
# Text Preprocessing Functions

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import string

# Function to clean and preprocess text
def preprocess_text(text):
    """Return the lowercase, punctuation-free, non-stopword tokens of ``text``.

    Args:
        text: The raw string to clean.

    Returns:
        A list of word tokens in their original order. ASCII punctuation
        (``string.punctuation``) is deleted before tokenizing, and tokens
        found in NLTK's English stopword list are dropped.
    """
    # One translation table deletes every ASCII punctuation character.
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(punctuation_stripper)

    tokens = word_tokenize(normalized)

    # A set gives constant-time membership tests while filtering.
    ignored = set(stopwords.words('english'))
    return [token for token in tokens if token not in ignored]

# Function to segment text into sentences
def segment_sentences(text):
    """Split ``text`` into sentences with NLTK's ``sent_tokenize``.

    Args:
        text: The string to segment.

    Returns:
        The list of sentence strings produced by ``sent_tokenize``.
    """
    return sent_tokenize(text)

# Example usage: run both preprocessing helpers on a short two-sentence string.
example_text = "This is an example sentence. This is another sentence."
# Word-level view: lowercase tokens with punctuation and stopwords removed.
cleaned_words = preprocess_text(example_text)
# Sentence-level view: the original text split into sentence strings.
segmented_sentences = segment_sentences(example_text)

# Show both results so the effect of each helper can be inspected.
print("Cleaned Words:", cleaned_words)
print("Segmented Sentences:", segmented_sentences)

# Sentence Scoring Algorithm
Implement algorithms to score sentences based on metrics like TF-IDF, sentence position, or word frequency.

In [None]:
# Sentence Scoring Algorithm

# Function to compute TF-IDF scores for sentences
def compute_tfidf_scores(sentences):
    """Fit a default ``TfidfVectorizer`` on ``sentences`` and transform them.

    Args:
        sentences: An iterable of sentence strings; each one is treated as a
            separate document.

    Returns:
        The matrix returned by ``TfidfVectorizer.fit_transform``, with one
        row per sentence.
    """
    return TfidfVectorizer().fit_transform(sentences)

# Function to score sentences based on word frequency
def score_sentences_by_word_frequency(sentences):
    """Score each sentence by the corpus-wide frequency of its content words.

    A content word is a lowercase token that is neither an NLTK English
    stopword nor made up entirely of ASCII punctuation. Word frequencies are
    counted across all of ``sentences``; a sentence's score is the sum of the
    frequencies of the content words it contains (repeated words count each
    time they occur).

    Args:
        sentences: An iterable of sentence strings.

    Returns:
        A dict mapping sentence text to its integer score. Sentences that
        contain no content words get no entry. If the same sentence text
        occurs more than once, its occurrences accumulate into one entry.
    """
    # Local import keeps this cell self-contained.
    from collections import Counter

    # Materialize once so generators can safely be traversed twice below.
    sentences = list(sentences)

    # Load the stopword list a single time; a set makes each lookup O(1)
    # instead of re-reading and scanning the list for every token.
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    def is_content_word(token):
        # Check character by character: a substring test against
        # string.punctuation would let multi-character punctuation tokens
        # such as "``", "''" or "..." through as if they were words.
        if token in stop_words:
            return False
        return not all(char in punctuation for char in token)

    # Tokenize each sentence once and reuse the tokens for both passes.
    tokenized = [word_tokenize(sentence.lower()) for sentence in sentences]

    word_frequencies = Counter(
        token
        for tokens in tokenized
        for token in tokens
        if is_content_word(token)
    )

    sentence_scores = {}
    for sentence, tokens in zip(sentences, tokenized):
        for token in tokens:
            if token in word_frequencies:
                sentence_scores[sentence] = (
                    sentence_scores.get(sentence, 0) + word_frequencies[token]
                )

    return sentence_scores

# Function to score sentences based on their position in the text
def score_sentences_by_position(sentences):
    """Score sentences so that earlier ones rank higher.

    The sentence at index ``i`` receives ``len(sentences) - i``: the first
    sentence gets ``len(sentences)`` and the last gets 1.

    Args:
        sentences: A sequence of sentence strings.

    Returns:
        A dict mapping sentence text to its position score. If the same text
        appears more than once, the score of its last occurrence is kept.
    """
    total = len(sentences)
    return {sentence: total - index for index, sentence in enumerate(sentences)}

# Example usage: score the sentences of a three-sentence sample with each method.
example_text = "This is an example sentence. This is another sentence. This is yet another example of a sentence."
segmented_sentences = segment_sentences(example_text)

# Compute TF-IDF scores
# The result is converted with .toarray() so the full matrix is printed.
tfidf_scores = compute_tfidf_scores(segmented_sentences)
print("TF-IDF Scores:\n", tfidf_scores.toarray())

# Score sentences by word frequency
# The result is a dict keyed by sentence text.
word_freq_scores = score_sentences_by_word_frequency(segmented_sentences)
print("Word Frequency Scores:\n", word_freq_scores)

# Score sentences by position
# Earlier sentences receive higher scores.
position_scores = score_sentences_by_position(segmented_sentences)
print("Position Scores:\n", position_scores)

# Extractive Summary Generation
Develop a function that selects the highest-scoring sentences to form the summary of the input text.

In [None]:
# Extractive Summary Generation

# Function to generate extractive summary
def generate_extractive_summary(text, num_sentences=2):
    """Build an extractive summary from the highest-scoring sentences of ``text``.

    Each sentence is ranked by the sum of its word-frequency score and its
    position score. The ``num_sentences`` best-ranked sentences are selected
    and returned in the order they appear in ``text``, so the summary reads
    naturally. TF-IDF is not part of the ranking, so it is not computed here.

    Args:
        text: The document to summarize.
        num_sentences: Maximum number of sentences to include (default 2).

    Returns:
        The selected sentences joined by single spaces. An empty string is
        returned when ``text`` contains no sentences or ``num_sentences`` is
        not positive. A sentence whose text appears more than once is
        considered, and emitted, only once.
    """
    sentences = segment_sentences(text)

    # Guard the degenerate cases explicitly: a negative count would otherwise
    # slice from the end of the ranking instead of selecting nothing.
    if not sentences or num_sentences <= 0:
        return ''

    word_freq_scores = score_sentences_by_word_frequency(sentences)
    position_scores = score_sentences_by_position(sentences)

    # Combine scores (simple sum of scores for demonstration purposes).
    # Keying by sentence text de-duplicates repeats, and dict insertion order
    # keeps the first-occurrence (document) order of the sentences.
    combined_scores = {
        sentence: word_freq_scores.get(sentence, 0) + position_scores.get(sentence, 0)
        for sentence in sentences
    }

    # sorted() is stable, so tied sentences keep their document order.
    ranked = sorted(combined_scores, key=combined_scores.get, reverse=True)
    selected = set(ranked[:num_sentences])

    # Emit the winners in document order rather than score order.
    return ' '.join(
        sentence for sentence in combined_scores if sentence in selected
    )

# Example usage: summarize a four-sentence sample down to two sentences.
example_text = "This is an example sentence. This is another sentence. This is yet another example of a sentence. This is the fourth sentence in the example."
summary = generate_extractive_summary(example_text, num_sentences=2)
print("Extractive Summary:\n", summary)

# Testing the Summarizer
Test the summarizer on different texts and evaluate its performance using metrics such as ROUGE scores.

In [None]:
# Testing the Summarizer

# Function to evaluate the summarizer using ROUGE scores
from rouge import Rouge

def evaluate_summary(reference_summary, generated_summary):
    """Score ``generated_summary`` against ``reference_summary`` with ROUGE.

    Args:
        reference_summary: The reference summary text.
        generated_summary: The summary text to evaluate.

    Returns:
        The result of ``Rouge.get_scores`` called with ``avg=True``.
    """
    scorer = Rouge()
    # get_scores takes the hypothesis first and the reference second, which
    # is the reverse of this function's own parameter order.
    return scorer.get_scores(generated_summary, reference_summary, avg=True)

# Example texts for testing
# Each entry pairs a source "text" with a hand-written "reference_summary"
# that the generated summary is scored against.
test_texts = [
    {
        "text": "Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language. The ultimate goal of NLP is to enable computers to understand, interpret, and generate human languages in a way that is both meaningful and useful.",
        "reference_summary": "NLP is a field of AI focused on the interaction between computers and humans through natural language, aiming to enable computers to understand, interpret, and generate human languages."
    },
    {
        "text": "Machine learning is a method of data analysis that automates analytical model building. It is a branch of artificial intelligence based on the idea that systems can learn from data, identify patterns, and make decisions with minimal human intervention.",
        "reference_summary": "Machine learning automates analytical model building, allowing systems to learn from data, identify patterns, and make decisions with minimal human intervention."
    }
]

# Test the summarizer and evaluate its performance
# For every entry: print the source text, summarize it to at most two
# sentences, then print the summary and its ROUGE scores against the reference.
for i, test in enumerate(test_texts):
    print(f"Test {i+1}:")
    print("Original Text:", test["text"])
    generated_summary = generate_extractive_summary(test["text"], num_sentences=2)
    print("Generated Summary:", generated_summary)
    scores = evaluate_summary(test["reference_summary"], generated_summary)
    print("ROUGE Scores:", scores)
    print("\n")