# Document Summarization using TextRank
# This notebook demonstrates extractive document summarization: sentences are ranked with PageRank over a sentence-similarity graph

In [None]:
# Import required libraries
import re
from string import punctuation

import networkx as nx
import nltk
import numpy as np
import pandas as pd
from nltk.cluster.util import cosine_distance
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

  and should_run_async(code)


In [None]:
# Download required NLTK data.
# Newer NLTK releases load the sentence tokenizer from the 'punkt_tab'
# package instead of the pickled 'punkt' models, so both are fetched to keep
# sent_tokenize / word_tokenize working on a fresh kernel with either version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

  and should_run_async(code)
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def read_article(text):
    """
    Split the text into sentences and normalise each one for scoring.

    Every sentence produced by ``sent_tokenize`` has all characters other than
    ASCII letters, digits and whitespace removed and is then lower-cased.
    Exactly one cleaned sentence is returned per tokenised sentence, in the
    same order, so positions line up with ``sent_tokenize(text)``.

    Args:
        text (str): Input text to be summarized

    Returns:
        list: List of cleaned, lower-cased sentences (a sentence consisting
            only of special characters becomes an empty string)
    """
    # Split text into sentences
    sentences = sent_tokenize(text)

    # str.replace() only matches literal substrings, so the character class
    # must go through re.sub() to actually strip special characters. The raw
    # string keeps "\s" from being parsed as an invalid string escape.
    clean_sentences = [re.sub(r"[^a-zA-Z0-9\s]", "", sentence).lower()
                       for sentence in sentences]

    return clean_sentences

  and should_run_async(code)
  clean_sentences = [sentence.replace("[^a-zA-Z0-9\s]", "").lower()


In [None]:
def create_sentence_vectors(sentences):
    """
    Create a bag-of-words vector for each sentence.

    Each sentence is tokenised with ``word_tokenize``; tokens that are English
    stop words or single punctuation characters are dropped and the remaining
    tokens are counted.

    Args:
        sentences (list): List of sentences

    Returns:
        list: List of sentence vectors, one dict per input sentence mapping
            each kept token to the number of times it occurs in that sentence
    """
    # Tokens to ignore: English stop words plus individual punctuation marks.
    # Matching is exact (case-sensitive), so callers should pass sentences in
    # a consistent case, as read_article does by lower-casing.
    stop_words = set(stopwords.words('english') + list(punctuation))

    # Count the kept tokens of every sentence in a single tokenisation pass
    sentence_vectors = []
    for sentence in sentences:
        sentence_dict = {}
        for word in word_tokenize(sentence):
            if word in stop_words:
                continue
            sentence_dict[word] = sentence_dict.get(word, 0) + 1
        sentence_vectors.append(sentence_dict)

    return sentence_vectors

  and should_run_async(code)


In [None]:
def calculate_sentence_similarity(sent1, sent2):
    """
    Compute the cosine similarity of two sentence vectors.

    Args:
        sent1 (dict): First sentence vector (token -> weight)
        sent2 (dict): Second sentence vector (token -> weight)

    Returns:
        float: Cosine similarity of the two vectors; 0.0 when either
            vector's weights sum to zero or the norm product is zero
    """
    # Shared vocabulary of both sentences
    vocabulary = set(sent1.keys()) | set(sent2.keys())

    # Project both sentences onto the shared vocabulary
    weights_a = [sent1.get(term, 0) for term in vocabulary]
    weights_b = [sent2.get(term, 0) for term in vocabulary]

    # A sentence without any weight cannot be similar to anything
    if not sum(weights_a) or not sum(weights_b):
        return 0.0

    dot_product = sum(x * y for x, y in zip(weights_a, weights_b))
    norm_a = sum(x * x for x in weights_a) ** 0.5
    norm_b = sum(y * y for y in weights_b) ** 0.5
    magnitude = norm_a * norm_b

    if magnitude == 0:
        return 0.0
    return dot_product / magnitude

  and should_run_async(code)


In [None]:
def build_similarity_matrix(sentences):
    """
    Compute the pairwise similarity of all sentence vectors.

    Args:
        sentences (list): List of sentence vectors

    Returns:
        numpy.ndarray: Square matrix whose entry (i, j) is the similarity of
            sentence i and sentence j; the diagonal is left at zero
    """
    count = len(sentences)
    matrix = np.zeros((count, count))

    for row, row_vector in enumerate(sentences):
        for col, col_vector in enumerate(sentences):
            # A sentence is never compared with itself
            if row == col:
                continue
            matrix[row, col] = calculate_sentence_similarity(row_vector, col_vector)

    return matrix

  and should_run_async(code)


In [None]:
def generate_summary(text, num_sentences=3):
    """
    Generate an extractive text summary using the TextRank algorithm.

    Sentences are scored by running PageRank on a graph built from the
    pairwise sentence-similarity matrix. The highest-scoring sentences are
    returned in the order in which they appear in the text.

    Args:
        text (str): Input text to be summarized
        num_sentences (int): Maximum number of sentences in the summary;
            zero or a negative value yields an empty summary

    Returns:
        str: The selected sentences in their original wording, stripped of
            surrounding whitespace and joined by single spaces
    """
    # A negative count would otherwise slice "all but the last k" sentences
    if num_sentences <= 0:
        return ""

    # Cleaned sentences drive the scoring
    sentences = read_article(text)
    if not sentences:
        return ""

    # The summary itself is built from the untouched sentences so that casing
    # and punctuation survive. read_article yields one entry per tokenised
    # sentence; fall back to its output if that ever stops being the case.
    display_sentences = [sentence.strip() for sentence in sent_tokenize(text)]
    if len(display_sentences) != len(sentences):
        display_sentences = sentences

    sentence_vectors = create_sentence_vectors(sentences)

    # Build similarity matrix
    similarity_matrix = build_similarity_matrix(sentence_vectors)

    # Create graph and calculate scores
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)

    # Rank sentence positions by score; working with positions instead of
    # sentence text keeps duplicate sentences distinct and avoids the
    # quadratic list.index() lookups when restoring document order
    ranked_indices = sorted(range(len(sentences)),
                            key=lambda index: scores[index],
                            reverse=True)

    # Select the top positions and put them back into document order
    selected_indices = sorted(ranked_indices[:num_sentences])

    return " ".join(display_sentences[index] for index in selected_indices)

  and should_run_async(code)


In [None]:
# Example usage: summarise a short sample paragraph and print both the
# original text and the generated summary
if __name__ == "__main__":
    # Sample text for summarization (six sentences, one per line)
    sample_text = """
    Natural Language Processing (NLP) is a branch of artificial intelligence that helps computers understand, interpret, and manipulate human language.
    NLP draws from many disciplines, including computer science and computational linguistics.
    It enables computers to perform various language-related tasks like speech recognition, machine translation, and text summarization.
    Modern NLP applications use machine learning, especially deep learning, to achieve better results.
    These applications can be found in virtual assistants, chatbots, and various text analysis tools.
    The field of NLP continues to evolve with new research and technological advancement.
    """

    # Generate a summary limited to three sentences
    summary = generate_summary(sample_text, num_sentences=3)
    # print() separates its arguments with a space, so each body starts
    # after "\n " in the output
    print("Original Text:\n", sample_text)
    print("\nGenerated Summary:\n", summary)

Original Text:
 
    Natural Language Processing (NLP) is a branch of artificial intelligence that helps computers understand, interpret, and manipulate human language. 
    NLP draws from many disciplines, including computer science and computational linguistics. 
    It enables computers to perform various language-related tasks like speech recognition, machine translation, and text summarization. 
    Modern NLP applications use machine learning, especially deep learning, to achieve better results. 
    These applications can be found in virtual assistants, chatbots, and various text analysis tools. 
    The field of NLP continues to evolve with new research and technological advancement.
    

Generated Summary:
 
    natural language processing (nlp) is a branch of artificial intelligence that helps computers understand, interpret, and manipulate human language. it enables computers to perform various language-related tasks like speech recognition, machine translation, and text su

  and should_run_async(code)
