# Extractive Summarization Using TextRank

In [33]:
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset and output CSV paths used
# later in this notebook live under '/content/drive/My Drive/'.
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Install Libraries

In [34]:
! pip install pandas numpy nltk networkx




## Load and Preprocess Data

In [35]:
import pandas as pd  # Importing pandas library for data manipulation and analysis
import numpy as np   # Importing numpy library for numerical operations
import nltk          # Importing NLTK (Natural Language Toolkit) for natural language processing tasks
nltk.download('punkt')  # Downloading the 'punkt' tokenizer models used by NLTK for sentence and word tokenization
from nltk.tokenize import sent_tokenize, word_tokenize  # Importing specific tokenization functions from NLTK
from nltk.corpus import stopwords  # Importing NLTK's stopwords corpus for filtering out common words
import networkx as nx  # Importing NetworkX library for graph-based algorithms like PageRank




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [36]:
# Download the NLTK data files this notebook relies on (if not already downloaded):
# 'punkt' is the tokenizer model used for sentence/word tokenization and
# 'stopwords' supplies the list read via stopwords.words('english').
nltk.download('punkt')  # Download NLTK's punkt tokenizer
nltk.download('stopwords')  # Download NLTK's stopwords


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load Dataset and Tokenize Text into Sentences

In [37]:
# Function to read the dataset
def read_dataset(file_path):
    """Load a CSV file into a pandas DataFrame.

    Args:
        file_path: Path (or any other input accepted by ``pd.read_csv``)
            of the CSV file to load.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    dataset = pd.read_csv(file_path)
    return dataset

# Function to preprocess text into sentences
def preprocess_text(text):
    """Split raw text into sentences.

    Args:
        text: The document to split.

    Returns:
        The list of sentences returned by NLTK's ``sent_tokenize``.
    """
    return sent_tokenize(text)


## Build Similarity Matrix

In [38]:
# Function to calculate similarity between two sentences using cosine similarity
# Stopword sets keyed by language, filled lazily so the set is built once
# instead of on every pairwise sentence comparison.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stopword set for `language`, building it only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def sentence_similarity(sent1, sent2):
    """Cosine similarity between the bag-of-words vectors of two sentences.

    Both sentences are word-tokenized, lower-cased and stripped of English
    stopwords; each is then represented as a term-count vector over the
    union vocabulary of the two sentences.

    Args:
        sent1: First sentence (string).
        sent2: Second sentence (string).

    Returns:
        A float cosine similarity. Returns 0.0 when either sentence has no
        tokens left after stopword removal (its count vector would be all
        zeros, for which the cosine is undefined).
    """
    stop_words = _get_stop_words()

    # Tokenize and remove stopwords
    tokens1 = [w.lower() for w in word_tokenize(sent1) if w.lower() not in stop_words]
    tokens2 = [w.lower() for w in word_tokenize(sent2) if w.lower() not in stop_words]

    # A zero vector has no direction; define the similarity as 0.0.
    if not tokens1 or not tokens2:
        return 0.0

    # Map each unique word to a column index (dict lookup instead of the
    # quadratic list.index scan).
    vocabulary = {}
    for word in tokens1 + tokens2:
        vocabulary.setdefault(word, len(vocabulary))

    # Count word occurrences in each sentence
    vector1 = np.zeros(len(vocabulary))
    vector2 = np.zeros(len(vocabulary))
    for word in tokens1:
        vector1[vocabulary[word]] += 1
    for word in tokens2:
        vector2[vocabulary[word]] += 1

    # Cosine similarity computed with NumPy: the notebook never imports a
    # `cosine_similarity` function, so calling one fails on a fresh kernel.
    denominator = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    return float(np.dot(vector1, vector2) / denominator)
# Function to build a sentence-similarity matrix, keeping only similarities above a threshold
def build_similarity_matrix(sentences, threshold=0.2):
    """Build a thresholded pairwise similarity matrix for a list of sentences.

    Args:
        sentences: List of sentence strings.
        threshold: Similarities less than or equal to this value are left
            as 0 in the matrix.

    Returns:
        A 2D NumPy array of shape (len(sentences), len(sentences)). Entry
        [i][j] holds the similarity of sentences i and j when it exceeds
        `threshold`, otherwise 0. The diagonal is always 0.
    """
    count = len(sentences)
    similarity_matrix = np.zeros((count, count))

    # Cosine similarity is symmetric, so compute each unordered pair once
    # (upper triangle) and mirror it, halving the number of comparisons.
    for i in range(count):
        for j in range(i + 1, count):
            similarity = sentence_similarity(sentences[i], sentences[j])
            if similarity > threshold:
                similarity_matrix[i][j] = similarity
                similarity_matrix[j][i] = similarity

    return similarity_matrix



## Apply TextRank Algorithm

In [39]:
# Function to generate summary using TextRank algorithm
def textrank_summarize(text, top_n=5):
    """Produce an extractive summary of `text` from its top-ranked sentences.

    The text is split into sentences, a sentence-similarity matrix is turned
    into a graph, and PageRank scores on that graph rank the sentences.

    Args:
        text: The document to summarize.
        top_n: Maximum number of sentences to include; capped at the number
            of sentences available.

    Returns:
        A string of the selected sentences joined by single spaces, ordered
        from highest to lowest PageRank score (ties fall back to comparing
        the sentence text, since (score, sentence) tuples are sorted).
    """
    sentences = preprocess_text(text)

    # Sentence graph: nodes are sentence indices, edge weights are similarities.
    graph = nx.from_numpy_array(build_similarity_matrix(sentences))
    scores = nx.pagerank(graph)

    # Pair every sentence with its score and order best-first.
    ranked = [(scores[idx], sentence) for idx, sentence in enumerate(sentences)]
    ranked.sort(reverse=True)

    # Never take more sentences than exist; a non-positive top_n selects none.
    limit = max(0, min(top_n, len(ranked)))
    return ' '.join(sentence for _, sentence in ranked[:limit])


## Generate and Evaluate Summaries

In [40]:
from tqdm import tqdm  # Import tqdm for progress bar visualizat
# Function to evaluate the generated summary against a reference summary
def evaluate_summary(generated_summary, reference_summary):
    """Score a generated summary against a reference by unique-token overlap.

    Both summaries are word-tokenized and reduced to sets, so repeated tokens
    count once and matching is case-sensitive.

    Args:
        generated_summary: The summary produced by the summarizer.
        reference_summary: The reference summary to compare against.

    Returns:
        A (precision, recall, f1) tuple. All three are 0 when either summary
        yields no tokens; f1 is 0 when precision and recall are both 0.
    """
    candidate_vocab = set(word_tokenize(generated_summary))
    reference_vocab = set(word_tokenize(reference_summary))

    # Guard clause: nothing to compare if either side is empty.
    if not candidate_vocab or not reference_vocab:
        return 0, 0, 0

    overlap = len(candidate_vocab & reference_vocab)
    precision = overlap / len(candidate_vocab)
    recall = overlap / len(reference_vocab)

    # Harmonic mean of precision and recall, avoiding division by zero.
    total = precision + recall
    f1 = 2 * (precision * recall) / total if total != 0 else 0

    return precision, recall, f1

# Main function to summarize texts in the dataset and evaluate the summaries
def main(file_path='/content/drive/My Drive/train_data1.csv',
         output_path='/content/drive/My Drive/summarized_train_data1.csv'):
    """Summarize every article in a CSV dataset and report overlap scores.

    Reads the dataset, generates a TextRank summary for each row's 'article'
    column, scores it against the row's 'highlight' column, writes the
    dataset plus a new 'summary' column to `output_path`, and prints the
    average precision, recall and F1.

    Args:
        file_path: Path of the input CSV; must contain 'article' and
            'highlight' columns.
        output_path: Path of the CSV to write with the added 'summary' column.

    Returns:
        None. Results are written to `output_path` and printed.
    """
    # Load the dataset here instead of relying on a `df` left over in the
    # kernel from an earlier, unrecorded cell.
    df = read_dataset(file_path)

    summaries = []  # Generated summaries, one per row
    precisions = []  # Precision score per row
    recalls = []  # Recall score per row
    f1s = []  # F1 score per row

    # Iterate through each row in the dataset
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Summarizing"):
        summary = textrank_summarize(row['article'], top_n=5)
        summaries.append(summary)

        # Evaluate generated summary against the reference in 'highlight'
        precision, recall, f1 = evaluate_summary(summary, row['highlight'])
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # Add generated summaries to the DataFrame and persist it
    df['summary'] = summaries
    df.to_csv(output_path, index=False)
    # Report the path actually written (the old message named another file).
    print(f"Summarization complete. Summarized data saved to '{output_path}'.")

    # An empty dataset has no scores to average; avoid dividing by zero.
    if not precisions:
        print("No records were summarized; average scores are unavailable.")
        return

    # Calculate average precision, recall, and F1 score
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    avg_f1 = sum(f1s) / len(f1s)

    # Print average evaluation scores
    print(f"Average Precision: {avg_precision}")
    print(f"Average Recall: {avg_recall}")
    print(f"Average F1 Score: {avg_f1}")

# Run main() when this code is the entry point. Inside a notebook cell
# __name__ is "__main__", so executing this cell starts the full run.
if __name__ == "__main__":
    main()


Summarizing: 100%|██████████| 11490/11490 [00:55<00:00, 206.62it/s]


Summarization complete. Summarized data saved to 'summarized_train_new1.csv'.
Average Precision: 0.11943901651820747
Average Recall: 0.8127170619817642
Average F1 Score: 0.20283625093778002


In [43]:
import pandas as pd

# Function to display summarized data
def display_summarized_data(file_path, num_records=2):
    """Print the article, generated summary and reference summary of the first rows.

    Args:
        file_path: Path of a CSV containing 'article', 'summary' and
            'highlight' columns.
        num_records: Maximum number of leading rows to print. Values larger
            than the number of rows are clamped (previously this raised
            KeyError); non-positive values print no records.

    Returns:
        None. Output is printed.
    """
    # Read the summarized CSV file into a DataFrame
    df = pd.read_csv(file_path)

    # Never index past the end of the file.
    shown = max(0, min(num_records, len(df)))

    print(f"Displaying first {shown} records from {file_path}:")
    for i in range(shown):
        # Positional access per column keeps each value's original dtype.
        print(f"\nRecord {i+1}:")
        print(f"Article:\n{df['article'].iloc[i]}")
        print(f"\nGenerated Summary:\n{df['summary'].iloc[i]}")
        print(f"\nReference Summary:\n{df['highlight'].iloc[i]}")
        print("\n---")

# Example usage: print the first records of the summarized CSV that main()
# writes. Note that `file_path` here is a notebook-global variable.
file_path = '/content/drive/My Drive/summarized_train_data1.csv'  # Replace with your actual file path
display_summarized_data(file_path)


Displaying first 2 records from /content/drive/My Drive/summarized_train_data1.csv:

Record 1:
Article:
ever noticed plane seats appear getting smaller smaller increasing numbers people taking skies experts questioning packed planes putting passengers risk say shrinking space aeroplanes uncomfortable putting health safety danger squabbling arm rest shrinking space planes putting health safety danger week us consumer advisory group set department transportation said public hearing government happy set standards animals flying planes doesnt stipulate minimum amount space humans world animals rights space food humans said charlie leocha consumer representative committee time dot faa take stand humane treatment passengers could crowding planes lead serious issues fighting space overhead lockers crashing elbows seat back kicking tests conducted faa use planes 31 inch pitch standard airlines decreased many economy seats united airlines 30 inches room airlines offer little 28 inches cynthia c