In [None]:
%pip install transformers
%pip install bert-extractive-summarizer
%pip install --upgrade transformers
%pip install --upgrade bert-extractive-summarizer
%pip install pandas

In [2]:
# Load & check Data
import pandas as pd

# Toggle: set to True to mount Google Drive and read the dataset from there.
colab = False

# Work out where the dataset lives first, then read it with a single call.
if not colab:
    csv_path = 'bbc-news-data.csv'
else:
    from google.colab import drive
    drive.mount('/content/drive')
    csv_path = 'drive/My Drive/COMP SCI 539/bbc-news-data.csv'

# The file is parsed as tab-separated despite its .csv extension.
data = pd.read_csv(csv_path, delimiter='\t')

# Preview the first rows as the cell's output.
data.head()

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [3]:
# Data Preprocessing

# Report how many values are missing in each column.
print(data.isnull().sum())

# Drop rows whose title or content is missing. The cleaning helpers applied
# further down call str methods (e.g. `.split()`) on every value and would
# raise on NaN. Resetting the index keeps row labels equal to row positions.
data = data.dropna(subset=["title", "content"]).reset_index(drop=True)

# Lowercase both text columns; the contraction keys used later are lowercase.
data["title"] = data["title"].str.lower()
data["content"] = data["content"].str.lower()
# Expand contractions into their full forms.
# TODO: extend the mapping with further contractions found in the corpus.
import re

contraction_dict = {"can't": "cannot", "didn't": "did not", "aren't": "are not", "she'd": "she would", "he'd": "he would", "they'd": "they would", "they've": "they have",
                    "shouldn't": "should not", "shouldn't've": "should not have", "she'll": "she will", "he'll": "he will", "they'll": "they will"
                    }

# One alternation over all keys, longest first, so that "shouldn't've" wins
# over its prefix "shouldn't". The lookarounds require the contraction to stand
# alone as a word: not glued to a preceding word character (so "he'd" does not
# fire inside "she'd") and not followed by a word character or a further
# apostrophe suffix (so an unknown form such as "can't've" is left untouched).
# Adjacent punctuation ("can't," / "can't.") does not prevent a match.
# Re-run this cell after editing contraction_dict so the pattern is rebuilt.
_CONTRACTION_PATTERN = re.compile(
    r"(?<!\w)(?:"
    + "|".join(re.escape(key)
               for key in sorted(contraction_dict, key=len, reverse=True))
    + r")(?!\w|'\w)"
)


def contraction_replacer(text):
    """Expand every known contraction in ``text``.

    Args:
        text: Input string. Matching is case-sensitive and the keys of
            ``contraction_dict`` are lowercase, so lowercase the text first.

    Returns:
        A copy of ``text`` in which each standalone occurrence of a key of
        ``contraction_dict`` is replaced by its expansion. All other
        characters, including surrounding punctuation, are left unchanged.
    """
    return _CONTRACTION_PATTERN.sub(
        lambda match: contraction_dict[match.group(0)], text)


# Expand contractions in both text columns, one column at a time.
for text_column in ("title", "content"):
    data[text_column] = data[text_column].apply(contraction_replacer)

# Strip digits from the text.
# NOTE: despite the helper's name, only the characters listed in `numbers`
# are removed; punctuation is left untouched.

numbers = '0123456789'


def punctuation_numbers_remover(text):
    """Return ``text`` with every character found in ``numbers`` deleted.

    Args:
        text: Input string.

    Returns:
        The string without any of the ASCII digits 0-9. Every other
        character, punctuation and whitespace included, is kept as is.
    """
    # A translation table mapping each digit to "delete" removes them all in
    # a single pass over the string.
    return text.translate(str.maketrans('', '', numbers))


# Run the digit-stripping helper over both text columns, one column at a time.
for text_column in ("title", "content"):
    data[text_column] = data[text_column].apply(punctuation_numbers_remover)

# Mid-cell expression: it is evaluated, but only a cell's last expression is
# rendered, so this preview is not displayed.
data.head()

# Remove outliers: articles whose content is longer than MAX_CONTENT_LENGTH
# whitespace-separated words.
MAX_CONTENT_LENGTH = 1000
MAX_TITLE_LENGTH = 10  # defined here but not used in this cell

# Word count of every article, in row order.
content_lengths = [len(content.split()) for content in data["content"]]

# Row positions of the articles that exceed the limit (kept for inspection).
outliers = [position for position, length in enumerate(content_lengths)
            if length > MAX_CONTENT_LENGTH]

# Filter with a boolean mask aligned to data's own index. Passing the
# enumerate() positions to DataFrame.drop would treat them as index labels,
# which is only correct while the index is a default 0..n-1 range.
within_limit = pd.Series(content_lengths, index=data.index) <= MAX_CONTENT_LENGTH
data = data.loc[within_limit].reset_index(drop=True)

# Word counts of the remaining articles, taken from the counts already
# computed instead of splitting every article a second time.
content_lengths = [length for length in content_lengths
                   if length <= MAX_CONTENT_LENGTH]

category    0
filename    0
title       0
content     0
dtype: int64


In [5]:
from summarizer import Summarizer
import warnings

# Silence every warning from here on (applies to the whole kernel session).
warnings.filterwarnings("ignore")

# Create a BERT Summarizer with the library's default settings.
bert_model = Summarizer()
predictions = []

# Summarize at most the first NUM_ARTICLES articles. Capping at len(data)
# avoids a lookup failure when fewer rows are left after preprocessing.
NUM_ARTICLES = 500
num_to_summarize = min(NUM_ARTICLES, len(data))

for i in range(num_to_summarize):
    # 1-based progress counter, rewritten in place via the carriage return.
    print(f'Article {i + 1}/{num_to_summarize}', end='\r')
    # One-sentence extractive summary; .iloc selects by row position so the
    # loop does not depend on the index labels being 0..n-1.
    summary = bert_model(data["content"].iloc[i], num_sentences=1)
    # predictions[i] therefore corresponds to row i of data.
    predictions.append(summary)


Article 499/500

In [6]:
from rouge_score import rouge_scorer
# Evaluation: mean ROUGE-1 and ROUGE-L F-measure of each generated summary
# against the article title, which serves as the reference text.

# Build the scorer once; constructing it inside calculate_rouge repeated the
# same setup for every single article.
_ROUGE_SCORER = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)


def calculate_rouge(reference, generated):
    """Score a generated text against a reference text.

    Args:
        reference: The reference (target) string.
        generated: The generated (predicted) string.

    Returns:
        The scorer's result mapping with the keys 'rouge1' and 'rougeL';
        each value exposes an ``fmeasure`` attribute.
    """
    return _ROUGE_SCORER.score(reference, generated)


# Fail with a clear message instead of a bare ZeroDivisionError below.
if not predictions:
    raise ValueError("No predictions to evaluate; run the summarization cell first.")

rouge1 = 0.0
rougeL = 0.0

# predictions[i] belongs to row i of data, so look the title up by position.
for i, generated in enumerate(predictions):
    sc = calculate_rouge(data["title"].iloc[i], generated)
    rouge1 += sc["rouge1"].fmeasure
    rougeL += sc["rougeL"].fmeasure

# Turn the running sums into averages over the evaluated articles.
rouge1 /= len(predictions)
rougeL /= len(predictions)

print("ROUGE-1: ", rouge1)
print("ROUGE-L: ", rougeL)

ROUGE-1:  0.22259510941951052
ROUGE-L:  0.18755025677527778
