<a href="https://colab.research.google.com/github/RoaaM/summarize_text/blob/main/summarize_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Requirements


In [None]:
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
import pandas as pd
import networkx as nx
from rouge import Rouge
import re
from collections import Counter
from math import sqrt

# Upload data

In [None]:
# set the path to the CSV file (a relative path, resolved against the current working directory)
csv_path = "Dataset.csv"

# read the data from the CSV file into a DataFrame;
# later cells read its 'Text' and 'Summary' columns
df = pd.read_csv(csv_path)

# preview the first rows of the DataFrame
df.head()

Unnamed: 0,Text,Summary
0,Ad sales boost Time Warner profit\n\nQuarterly...,TimeWarner said fourth quarter sales rose 2% t...
1,Dollar gains on Greenspan speech\n\nThe dollar...,The dollar has hit its highest level against t...
2,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos' owner Menatep Group says it will ask Ro...
3,High fuel prices hit BA's profits\n\nBritish A...,"Rod Eddington, BA's chief executive, said the ..."
4,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod has reduced the debt it took on to fund...


In [None]:
# (number of rows, number of columns) of the loaded DataFrame
df.shape

(2225, 2)

# Explore samples

In [None]:
# 'Text' value of the row with index label 0
df['Text'][0]

'Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.\n\nThe firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.\n\nTime Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to si

In [None]:
# length of that 'Text' value
len(df['Text'][0])

2560

In [None]:
# 'Summary' value of the row with index label 0
df['Summary'][0]

"TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn.Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues.Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins.It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters.Time Warner's fourth quarter profits were slightly better than analysts' expectations."

In [None]:
# length of that 'Summary' value
len(df['Summary'][0])

901

# create similarity matching function

In [None]:
def calculate_similarity(sentence1, sentence2):
    """Return the bag-of-words cosine similarity of two pieces of text.

    Each input is lower-cased, stripped of every character that is neither a
    word character nor whitespace, split on whitespace, and filtered against a
    small fixed stop-word list. The remaining term counts are then compared.

    Args:
        sentence1: First text (str).
        sentence2: Second text (str).

    Returns:
        The cosine similarity of the two term-count vectors, or ``0.0`` when
        either text has no terms left after pre-processing.
    """
    ignored = {'a', 'an', 'the', 'of', 'to', 'in', 'for', 'on', 'that', 'this', 'it', 'with', 'and', 'or', 'as', 'at', 'by'}

    def term_counts(text):
        # lowercase -> drop punctuation -> tokenize -> drop stop words -> count
        cleaned = re.sub(r'[^\w\s]', '', text.lower())
        return Counter(token for token in cleaned.split() if token not in ignored)

    counts_a = term_counts(sentence1)
    counts_b = term_counts(sentence2)

    norm_a = sqrt(sum(freq ** 2 for freq in counts_a.values()))
    norm_b = sqrt(sum(freq ** 2 for freq in counts_b.values()))

    # A text with no remaining terms has a zero-length vector; avoid dividing by zero.
    if not (norm_a > 0 and norm_b > 0):
        return 0.0

    # Only terms present in both texts contribute to the dot product.
    overlap = sum(freq * counts_b[term] for term, freq in counts_a.items() if term in counts_b)
    return overlap / (norm_a * norm_b)

# If we want to explore the summary as a graph, we can use this

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# define a function to create the hypergraph from a text
def create_hypergraph(text, threshold=0.5):
    """Build a sentence-similarity graph for ``text``.

    The text is split on ``". "``. Every distinct, non-blank fragment becomes a
    node, and an undirected edge joins each pair of different fragments whose
    ``calculate_similarity`` score is strictly greater than ``threshold``.
    Despite the function name, the result is an ordinary ``nx.Graph`` with
    pairwise edges.

    Args:
        text: Document to split into sentences (str).
        threshold: Similarity a pair must exceed to be connected.
            Defaults to 0.5.

    Returns:
        An ``nx.Graph`` whose nodes are the sentence strings.
    """
    # Drop blank fragments (e.g. the empty tail left when the text ends with ". ")
    # and collapse repeated sentences while keeping their first-seen order.
    sentences = list(dict.fromkeys(
        fragment for fragment in text.split(". ") if fragment.strip()
    ))

    # create a graph to represent the text, one node per sentence
    graph = nx.Graph()
    graph.add_nodes_from(sentences)

    # Score every unordered pair exactly once. calculate_similarity is symmetric,
    # and comparing a sentence with itself would only add a self-loop.
    for position, sentence1 in enumerate(sentences):
        for sentence2 in sentences[position + 1:]:
            if calculate_similarity(sentence1, sentence2) > threshold:
                graph.add_edge(sentence1, sentence2)

    return graph


# generate summary

In [None]:
# define a function to generate a summary from a hypergraph
def generate_summary(graph, num_sentences=3):
    """Build an extractive summary from a sentence graph using PageRank.

    Args:
        graph: Graph whose nodes are sentence strings, as produced by
            ``create_hypergraph``.
        num_sentences: Maximum number of top-ranked sentences to keep.
            Defaults to 3.

    Returns:
        The selected sentences, highest score first, joined with ``". "`` and
        terminated by a single ``"."``.
    """
    # rank the sentences by their importance in the graph
    scores = nx.pagerank(graph)

    # highest score first; sorted() is stable, so ties keep the graph's node order
    ranked_sentences = sorted(scores, key=scores.get, reverse=True)
    summary_sentences = ranked_sentences[:num_sentences]

    # create_hypergraph splits on ". ", so the final sentence of a document still
    # carries its own full stop. Remove it so that re-joining does not produce "..".
    trimmed = []
    for sentence in summary_sentences:
        sentence = sentence.rstrip()
        if sentence.endswith("."):
            sentence = sentence[:-1]
        trimmed.append(sentence)

    # convert the summary back into text format
    return ". ".join(trimmed) + "."

In [None]:
# similarity between the 'Text' and 'Summary' values of the row with index label 0
calculate_similarity(df['Text'][0], df['Summary'][0])

0.6884660574217999

In [None]:
# Build the sentence graph for the first article before summarising it;
# `graph` is not defined anywhere else at notebook level.
graph = create_hypergraph(df['Text'][0])
summary = generate_summary(graph)
summary

'Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.\n\nThe firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.\n\nTime Warner said on Friday that it now owns 8% of search-engine Google.'

In [None]:
# length of the summary returned by generate_summary
len(summary)

566

In [None]:
# similarity between the summary from generate_summary and the 'Summary' value of row 0
calculate_similarity(summary , df['Summary'][0])

0.5956398291872697

In [None]:
# apply the hypergraph-based summarization to each row in the DataFrame and store the results in a new column
# df["Model Summary"] = df["Text"].apply(lambda text: generate_summary(create_hypergraph(text)))

In [None]:
# # create a new DataFrame with only the summary and model summary columns
# output_df = pd.DataFrame({
# "Summary": df["Summary"],
# "Model Summary": df["Model Summary"]
# })

In [None]:
# output_df.to_csv("output.csv", index=False)

In [None]:
# rouge_scores = []
# for index, row in output_df.iterrows():
#     rouge_scores.append(Rouge().get_scores(row["Model Summary"], row["Summary"]))

#     rouge_2_scores = [score[0]["rouge-2"]["f"] for score in rouge_scores]
#     rouge_2_avg = sum(rouge_2_scores) / len(rouge_2_scores)

# print("ROUGE-2 F1 score: {:.2f}%".format(rouge_2_avg * 100))

## Transformer


In [None]:
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Load the pre-trained model and tokenizer
model_name = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Define input text to summarize
input_text = df['Text'][0]

# Preprocess the input text.
# Truncate to the tokenizer's maximum input length so that articles longer than
# the model can accept are cut off instead of failing inside the model.
inputs = tokenizer.encode(
    input_text,
    return_tensors='pt',
    truncation=True,
    max_length=tokenizer.model_max_length,
)

# Generate summary output with beam search
outputs = model.generate(inputs, max_length=1024, min_length=30, length_penalty=2.0, num_beams=4)
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the generated summary
print(summary)


Time Warner profits up 76% to $1.13bn for the three months to December. Firm is one of the biggest investors in Google and owns 8% of the search engine.


In [None]:
# length of the summary decoded from the BART output
len(summary)

152

In [None]:
# similarity between the BART summary and the 'Summary' value of row 0
calculate_similarity(summary, df['Summary'][0])

0.24210006235312612

# graph


In [None]:
import pandas as pd
import re
import networkx as nx
from collections import Counter
from math import sqrt


def calculate_similarity(sentence1, sentence2):
    """Return the bag-of-words cosine similarity of two pieces of text.

    NOTE: this redefines the identical ``calculate_similarity`` from an earlier
    cell of the notebook; the later definition is the one in effect afterwards.

    Each input is lower-cased, stripped of every character that is neither a
    word character nor whitespace, split on whitespace, and filtered against a
    small fixed stop-word list. The remaining term counts are then compared.

    Args:
        sentence1: First text (str).
        sentence2: Second text (str).

    Returns:
        The cosine similarity of the two term-count vectors, or ``0.0`` when
        either text has no terms left after pre-processing.
    """
    ignored = {'a', 'an', 'the', 'of', 'to', 'in', 'for', 'on', 'that', 'this', 'it', 'with', 'and', 'or', 'as', 'at', 'by'}

    def term_counts(text):
        # lowercase -> drop punctuation -> tokenize -> drop stop words -> count
        cleaned = re.sub(r'[^\w\s]', '', text.lower())
        return Counter(token for token in cleaned.split() if token not in ignored)

    counts_a = term_counts(sentence1)
    counts_b = term_counts(sentence2)

    norm_a = sqrt(sum(freq ** 2 for freq in counts_a.values()))
    norm_b = sqrt(sum(freq ** 2 for freq in counts_b.values()))

    # A text with no remaining terms has a zero-length vector; avoid dividing by zero.
    if not (norm_a > 0 and norm_b > 0):
        return 0.0

    # Only terms present in both texts contribute to the dot product.
    overlap = sum(freq * counts_b[term] for term, freq in counts_a.items() if term in counts_b)
    return overlap / (norm_a * norm_b)

# define a function to create the hypergraph from a text
def create_hypergraph(text, threshold=0.5):
    """Build a sentence-similarity graph for ``text``.

    NOTE: this redefines the ``create_hypergraph`` from an earlier cell of the
    notebook; the later definition is the one in effect afterwards.

    The text is split on ``". "``. Every distinct, non-blank fragment becomes a
    node, and an undirected edge joins each pair of different fragments whose
    ``calculate_similarity`` score is strictly greater than ``threshold``.
    Despite the function name, the result is an ordinary ``nx.Graph`` with
    pairwise edges.

    Args:
        text: Document to split into sentences (str).
        threshold: Similarity a pair must exceed to be connected.
            Defaults to 0.5.

    Returns:
        An ``nx.Graph`` whose nodes are the sentence strings.
    """
    # Drop blank fragments (e.g. the empty tail left when the text ends with ". ")
    # and collapse repeated sentences while keeping their first-seen order.
    sentences = list(dict.fromkeys(
        fragment for fragment in text.split(". ") if fragment.strip()
    ))

    # create a graph to represent the text, one node per sentence
    graph = nx.Graph()
    graph.add_nodes_from(sentences)

    # Score every unordered pair exactly once. calculate_similarity is symmetric,
    # and comparing a sentence with itself would only add a self-loop.
    for position, sentence1 in enumerate(sentences):
        for sentence2 in sentences[position + 1:]:
            if calculate_similarity(sentence1, sentence2) > threshold:
                graph.add_edge(sentence1, sentence2)

    return graph

# define a function to read the data from a CSV file and generate model summaries
def generate_model_summaries(filename, output_path='output.csv'):
    """Summarise every article in a CSV file and save the results.

    Reads ``filename`` with pandas, builds a sentence graph for each value of
    the ``Text`` column, generates a summary from it, stores the summaries in
    a new ``model_summary`` column, and writes the table to ``output_path``
    without the index.

    Args:
        filename: Path of the input CSV; it must contain a ``Text`` column.
        output_path: Path of the CSV to write. Defaults to ``'output.csv'``.

    Returns:
        None. The result is written to ``output_path``.
    """
    # read the data from the CSV file
    data = pd.read_csv(filename)

    # one model summary per article, in row order
    data['model_summary'] = [
        generate_summary(create_hypergraph(text)) for text in data['Text']
    ]

    # write the updated data to a new CSV file
    data.to_csv(output_path, index=False)


In [None]:
# summarise every row of Dataset.csv; the function writes its result to output.csv
generate_model_summaries('Dataset.csv')

# rouge metric

In [None]:
import csv
from rouge import Rouge

# Load the data from the output CSV file.
# newline='' lets the csv module handle line breaks inside quoted fields itself,
# and utf-8 matches the default encoding pandas uses when writing the file.
with open('output.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    summaries = []
    model_summaries = []
    for row in reader:
        summaries.append(row['Summary'])
        model_summaries.append(row['model_summary'])

# Calculate the ROUGE scores.
# Rouge.get_scores expects (hypotheses, references): the model output goes first
# and the human-written summaries second. Passing them the other way round swaps
# the reported precision and recall.
rouge = Rouge()
scores = rouge.get_scores(model_summaries, summaries, avg=True)

# Print the scores
metric_keys = ('rouge-1', 'rouge-2', 'rouge-l')

# F1 as a percentage, one line per metric
for metric_key in metric_keys:
    print("{}: {:.2f}".format(metric_key.upper(), scores[metric_key]['f'] * 100))

# raw precision / recall / F1 for each metric
for metric_key in metric_keys:
    metric_label = metric_key.upper()
    print(metric_label + ' Precision:', scores[metric_key]['p'])
    print(metric_label + ' Recall:', scores[metric_key]['r'])
    print(metric_label + ' F1 Score:', scores[metric_key]['f'])


ROUGE-1: 53.29
ROUGE-2: 41.17
ROUGE-L: 52.28
ROUGE-1 Precision: 0.4736573958273568
ROUGE-1 Recall: 0.6625698746950424
ROUGE-1 F1 Score: 0.5329401070857893
ROUGE-2 Precision: 0.36139382037173146
ROUGE-2 Recall: 0.5381073099109446
ROUGE-2 F1 Score: 0.41167324512414943
ROUGE-L Precision: 0.4648487289325803
ROUGE-L Recall: 0.6495815081683715
ROUGE-L F1 Score: 0.5227836400964512


In [None]:
# ROUGE-1 measures the overlap between unigrams (single words) in the generated summary and the reference summaries.
# ROUGE-2 measures the overlap between bigrams (pairs of adjacent words) in the generated summary and the reference summaries.
# ROUGE-L measures the longest common subsequence between the generated summary and the reference summaries, which captures the
# amount of information that is common to both.