In [1]:
# Create the .kaggle directory
!mkdir -p ~/.kaggle

# Copy kaggle.json to the .kaggle directory
!cp kaggle.json ~/.kaggle/

# Set permissions for the Kaggle API token
!chmod 600 ~/.kaggle/kaggle.json

# Confirm Kaggle API setup
!kaggle datasets list -s "bbc-news-summary"

ref                                                           title                                                 size  lastUpdated          downloadCount  voteCount  usabilityRating  
------------------------------------------------------------  --------------------------------------------------  ------  -------------------  -------------  ---------  ---------------  
pariza/bbc-news-summary                                       BBC News Summary                                       9MB  2018-05-06 11:08:19          14966        177  0.75             
jacopoferretti/bbc-articles-dataset                           BBC Articles Dataset with Extra Features               4MB  2024-11-11 17:50:09           1235         34  1.0              
bhavikjikadara/bbc-news-articles                              BBC News Articles                                      3MB  2024-07-04 08:45:12            564         17  1.0              
dignity45/bbc-news-summarycsv-format                          BBC

In [2]:
!kaggle datasets download -d pariza/bbc-news-summary

Dataset URL: https://www.kaggle.com/datasets/pariza/bbc-news-summary
License(s): CC0-1.0
bbc-news-summary.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
!unzip bbc-news-summary.zip -d bbc-news-summary

Archive:  bbc-news-summary.zip
replace bbc-news-summary/BBC News Summary/News Articles/business/001.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [4]:
! pip install rouge-score



In [5]:
! pip install joblib



### Extractive summarization

In [6]:
import os
import time
import numpy as np
import networkx as nx
import nltk, re
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from joblib import Parallel, delayed
from nltk.tokenize import sent_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
import logging
from tqdm import tqdm
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()]
)
logger = logging.getLogger(__name__)


### Directory paths

In [8]:
articles_dir = "/content/bbc-news-summary/BBC News Summary/News Articles/"
summaries_dir = "/content/bbc-news-summary/BBC News Summary/Summaries/"

### Read one text file under the business folder

In [9]:
# Open the file in read mode
with open("/content/bbc-news-summary/BBC News Summary/News Articles/business/001.txt", "r") as file:
    content = file.read()

# Print the content
print(content)

Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL

### Load articles and summaries


In [23]:
# # Load articles and summaries with encoding handling
# def load_text_files(directory):
#     data = {}
#     for root, _, files in os.walk(directory):
#         for file in files:
#             filepath = os.path.join(root, file)
#             try:
#                 # Attempt to read with UTF-8 encoding
#                 with open(filepath, 'r', encoding='utf-8') as f:
#                     data[file] = f.read().strip()
#             except UnicodeDecodeError:
#                 # Fallback to a different encoding if UTF-8 fails
#                 with open(filepath, 'r', encoding='ISO-8859-1') as f:
#                     data[file] = f.read().strip()
#     return data

# # Load articles and summaries
# articles = load_text_files(articles_dir)
# summaries = load_text_files(summaries_dir)

# print(f"Loaded {len(articles)} articles and {len(summaries)} summaries.")

In [24]:
# Function to load all articles or summaries from multiple subfolders
def load_text_files_from_all_categories(base_directory):
    data = {}
    for category in os.listdir(base_directory):  # Loop through categories
        category_path = os.path.join(base_directory, category)
        if os.path.isdir(category_path):  # Check if it's a directory
            for file in os.listdir(category_path):  # Loop through files
                file_path = os.path.join(category_path, file)
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    data[f"{category}/{file}"] = f.read().strip()
    return data

# Load all articles and summaries
articles = load_text_files_from_all_categories(articles_dir)
summaries = load_text_files_from_all_categories(summaries_dir)

print(f"Loaded {len(articles)} articles and {len(summaries)} summaries from all categories.")

Loaded 2225 articles and 2225 summaries from all categories.


In [25]:
def read_text(txt: str = ""):
  sentences = []
  sentences = sent_tokenize(txt)
  for sentence in sentences:
    # remove everthing in the text that is not alphanumeric i.e. letters or numbers
    sentence.replace("[^a-zA-Z0-9]", " ")
  return sentences

### Extractive summarization

Cosine Distance:

    Cosine distance is often used as a dissimilarity measure. It's defined as:

Cosine Distance=1−Cosine Similarity
Cosine Distance=1−Cosine Similarity

    This transformation shifts the range of the cosine similarity to [0, 2]:
        0 means the vectors are identical (perfect match),
        1 means the vectors are orthogonal (no similarity),
        2 means the vectors are opposite (completely dissimilar).

In [26]:
def sentence_similarity(sentence1, sentence2, stopwords = []):
  """
  This function computes the cosine similarity between two sentences
  by representing them as vectors of word occurrences.
  """
  sentence1 = [word.lower() for word in sentence1]
  sentence2 = [word.lower() for word in sentence2]
  all_words = list(set(sentence1 + sentence2))
  vector1 = [0] * len(all_words)
  vector2 = [0] * len(all_words)
  # First sentence vector
  for word in sentence1:
    if not word in stopwords:
      vector1[all_words.index(word)] += 1
  # Second sentence vector
  for word in sentence2:
    if not word in stopwords:
      vector2[all_words.index(word)] += 1
  # Vectors cosine similarity
  return 1 - cosine_distance(vector1, vector2)

In [27]:
def sentences_similarity_matrix(sentences, stopwords_):
  """
  This function will output a matrix where each element (i,j)(i,j) represents the similarity score
  between the i-th and j-th sentences, excluding the specified stopwords.
  We can use this matrix to observe sentences which have the highest similarity scores.
  """
  similarity_matrix = np.zeros((len(sentences), len(sentences))) # N on N
  for i in range(len(sentences)):
      for j in range(len(sentences)):
        if i != j:
          similarity_matrix[i][j] = sentence_similarity(sentences[i], sentences[j], stopwords_)
  return similarity_matrix

In [28]:
def sentence_similarity_after_stopword_removal(txt):
  # nltk.download('stopwords')
  # nltk.download('punkt_tab')
  final_stopwords = stopwords.words('english')
  # Read and tokenize txt
  sentences = read_text(txt)
  # Get similarity matrix by passing the stopwords
  sentence_similarity_matrix = sentences_similarity_matrix(sentences, final_stopwords)
  return sentence_similarity_matrix, sentences

In [29]:
def summarize(sentence_similarity_matrix, top_n, sentences):
  """
  The provided code snippet implements an extractive text summarization technique
  using a graph-based ranking algorithm TextRank:
  In this graph, each node represents a sentence,
  and edges between nodes are weighted by the similarity scores from the matrix.
  """
  summarized_text = []
  # Rank sentences in the given similarity matrix
  # convert similarity matrix into a graph structure using numpy
  sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
  # rank sentences based on their similarity to other sentences using TextRank
  scores = nx.pagerank(sentence_similarity_graph)
  # Sort the rank of top sentences
  ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse = True)
  # Get the top n number of sentences based on rank
  for i in range(top_n):
    summarized_text.append(ranked_sentences[i][1])
  # Output the summarized version
  summary = " ".join(summarized_text)
  return summary, len(sentences), ranked_sentences

In [30]:
# Function to calculate BLEU for summaries (already defined)
def calculate_bleu_for_summary_4gram(reference, candidate):
    smoothing = SmoothingFunction().method1
    total_score = 0
    for cand_sentence in candidate:
        tokenized_cand = cand_sentence.split()
        sentence_scores = [
            sentence_bleu([ref.split()], tokenized_cand, smoothing_function=smoothing,
                          weights=(0.25, 0.25, 0.25, 0.25))
            for ref in reference
        ]
        best_score = max(sentence_scores)
        total_score += best_score
    average_score = total_score / len(candidate)
    return average_score

# Function to process a single article
def process_single_article(file, article, summaries, stopwords_list, top_n=3):
    logger.info(f"Processing file: {file}")  # Log the file being processed
    # Tokenize reference summary
    reference_summary = summaries.get(file, "").splitlines()

    # Tokenize and calculate similarity matrix
    sentences = sent_tokenize(article)
    similarity_matrix = sentences_similarity_matrix(sentences, stopwords_list)


    final_summary, sent_len, ranked_sentences = summarize(similarity_matrix, top_n, sentences)

    # Calculate BLEU
    bleu_score = calculate_bleu_for_summary_4gram(reference_summary, final_summary.splitlines())
    logger.info(f"Finished processing file: {file}, BLEU Score: {bleu_score:.4f}")
    return {
        "file": file,
        "summary": final_summary,
        "bleu": bleu_score
    }

# parallel processing all articles
def process_all_articles_parallel(articles, summaries, top_n=3, n_jobs=-1):
  stopwords_list = stopwords.words('english')
  logger.info(f"Starting parallel processing for {len(articles)} articles...")

  # Add tqdm progress bar
  with tqdm(total=len(articles), desc="Processing Articles") as pbar:
    results = Parallel(n_jobs=n_jobs)(
        delayed(lambda file, article: process_single_article(
            file, article, summaries, stopwords_list, top_n
            ))(file, article) for file, article in articles.items()
        )
    pbar.update(1)  # Update progress bar

  logger.info(f"Completed processing {len(results)} articles.")
  return results


In [31]:
# Function to process all articles
def process_all_articles(articles, summaries, top_n=3):
    stopwords_list = stopwords.words('english')
    final_results = []
    total_bleu = 0
    st = time.time()
    for file, article in articles.items():
         # Tokenize reference summary
        reference_summary = summaries.get(file, "").splitlines()
        # Tokenize and calculate similarity matrix
        sentences = sent_tokenize(article)
        similarity_matrix = sentences_similarity_matrix(sentences, stopwords_list)

        # Generate summary
        final_summary, sent_len, ranked_sentences = summarize(similarity_matrix, top_n, sentences)

        # Calculate BLEU
        bleu_score = calculate_bleu_for_summary_4gram(reference_summary, final_summary.splitlines())

        # Store results
        final_results.append({
            "file": file,
            "summary": final_summary,
            "bleu": bleu_score
        })

        # Accumulate BLEU scores
        total_bleu += bleu_score
    print("Time taken is :", time.time() - st)

    # Compute average BLEU
    average_bleu = total_bleu / len(final_results)

    return final_results, average_bleu

In [32]:
# Function to process articles for all categories
results, avg_bleu = process_all_articles(articles, summaries, top_n=3)

# Group results by category
category_results = {}
for result in results:
    category = result['file'].split('/')[0]  # Extract category
    if category not in category_results:
        category_results[category] = []
    category_results[category].append(result)

# Calculate BLEU scores for each category
category_bleu_scores = {}
for category, category_data in category_results.items():
    category_bleu = sum(r['bleu'] for r in category_data) / len(category_data)
    category_bleu_scores[category] = category_bleu
    print(f"Category: {category}, Articles: {len(category_data)}, Average BLEU Score: {category_bleu:.4f}")


Time taken is : 767.3139472007751
Category: entertainment, Articles: 386, Average BLEU Score: 0.1971
Category: sport, Articles: 511, Average BLEU Score: 0.2132
Category: tech, Articles: 401, Average BLEU Score: 0.1287
Category: business, Articles: 510, Average BLEU Score: 0.1990
Category: politics, Articles: 417, Average BLEU Score: 0.1568


### n_gram = 2 for Business articles

In [34]:
# Load articles and summaries with encoding handling
def load_text_files(directory):
    data = {}
    for root, _, files in os.walk(directory):
        for file in files:
            filepath = os.path.join(root, file)
            try:
                # Attempt to read with UTF-8 encoding
                with open(filepath, 'r', encoding='utf-8') as f:
                    data[file] = f.read().strip()
            except UnicodeDecodeError:
                # Fallback to a different encoding if UTF-8 fails
                with open(filepath, 'r', encoding='ISO-8859-1') as f:
                    data[file] = f.read().strip()
    return data

# Load articles and summaries
articles_business = load_text_files(articles_dir)
summaries_business = load_text_files(summaries_dir)

print(f"Loaded {len(articles_business)} articles and {len(summaries_business)} summaries.")

Loaded 511 articles and 511 summaries.


In [36]:
def calculate_bleu_for_summary_bigram(reference, candidate, weights=(0.5, 0.5, 0, 0)):
    smoothing = SmoothingFunction().method1
    total_score = 0
    for cand_sentence in candidate:
        tokenized_cand = cand_sentence.split()
        sentence_scores = [
            sentence_bleu([ref.split()], tokenized_cand, smoothing_function=smoothing, weights=weights)
            for ref in reference
        ]
        best_score = max(sentence_scores)  # Take the best match
        total_score += best_score
    average_score = total_score / len(candidate)  # Normalize by the number of summary sentences
    return average_score

In [38]:
weights = (0, 1, 0, 0)
bleu_score = calculate_bleu_for_summary_bigram(summaries_business, articles_business, weights)
print(bleu_score)

0.1000000000000009


### Evaluate using Rouge

In [39]:
# Function to process all articles with ROUGE calculation
def process_all_articles_with_rouge(articles, summaries, top_n=3):
    stopwords_list = stopwords.words('english')
    final_results = []
    st = time.time()
    for file, article in articles.items():
        # Tokenize reference summary
        reference_summary = summaries.get(file, "")
        # Tokenize and calculate similarity matrix
        sentences = sent_tokenize(article)
        similarity_matrix = sentences_similarity_matrix(sentences, stopwords_list)

        # Generate summary
        final_summary, sent_len, ranked_sentences = summarize(similarity_matrix, top_n, sentences)

        # Calculate ROUGE
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        rouge_scores = scorer.score(reference_summary, final_summary)

        # Store results
        final_results.append({
            "file": file,
            "summary": final_summary,
            "rouge1": rouge_scores['rouge1'].fmeasure,
            "rouge2": rouge_scores['rouge2'].fmeasure,
            "rougeL": rouge_scores['rougeL'].fmeasure
        })

    print("Time taken is :", time.time() - st)
    return final_results


In [40]:
# Process articles and calculate ROUGE scores
results = process_all_articles_with_rouge(articles, summaries, top_n=3)

# Group results by category
category_results = {}
for result in results:
    category = result['file'].split('/')[0]  # Extract category
    if category not in category_results:
        category_results[category] = []
    category_results[category].append(result)

# Calculate average ROUGE scores for each category
category_rouge_scores = {}
for category, category_data in category_results.items():
    rouge1_avg = sum(r['rouge1'] for r in category_data) / len(category_data)
    rouge2_avg = sum(r['rouge2'] for r in category_data) / len(category_data)
    rougeL_avg = sum(r['rougeL'] for r in category_data) / len(category_data)

    category_rouge_scores[category] = {
        "rouge1": rouge1_avg,
        "rouge2": rouge2_avg,
        "rougeL": rougeL_avg
    }

    print(f"Category: {category}, Articles: {len(category_data)}")
    print(f"  ROUGE-1: {rouge1_avg:.4f}")
    print(f"  ROUGE-2: {rouge2_avg:.4f}")
    print(f"  ROUGE-L: {rougeL_avg:.4f}")

Time taken is : 797.9454424381256
Category: entertainment, Articles: 386
  ROUGE-1: 0.5005
  ROUGE-2: 0.3829
  ROUGE-L: 0.3748
Category: sport, Articles: 511
  ROUGE-1: 0.4933
  ROUGE-2: 0.3858
  ROUGE-L: 0.3671
Category: tech, Articles: 401
  ROUGE-1: 0.4360
  ROUGE-2: 0.3218
  ROUGE-L: 0.3172
Category: business, Articles: 510
  ROUGE-1: 0.5054
  ROUGE-2: 0.3889
  ROUGE-L: 0.3721
Category: politics, Articles: 417
  ROUGE-1: 0.4697
  ROUGE-2: 0.3590
  ROUGE-L: 0.3411


In [None]:
###