## Loading data into dataframe for ease of access

In [None]:

import os
import pandas as pd
# Walk the Kaggle input tree and record, for every file found, its full path,
# its file name, and the last two components of the directory that holds it.
path_, filename_, category_, article_or_summary_ = [], [], [], []
for dirname, _, filenames in os.walk('/kaggle/input'):
    # The final directory component is stored as the category and the one
    # before it as the article/summary marker.
    dir_parts = dirname.split("/")
    for filename in filenames:
        path_.append(os.path.join(dirname, filename))
        filename_.append(filename)
        category_.append(dir_parts[-1])
        article_or_summary_.append(dir_parts[-2])

In [None]:
# Assemble the per-file lists collected above into one table (one row per file).
column_order = ["path", "filename", "category", "article_or_summary"]
column_values = [path_, filename_, category_, article_or_summary_]
df = pd.DataFrame(dict(zip(column_order, column_values)), columns=column_order)
df


<a id='6'></a>

# <p style="padding:10px;background-color:#6600cc ;margin:10;color:#ccff99;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 10px 10px ;overflow:hidden;font-weight:50">Sentence Tokenization</p>

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import numpy as np
import networkx as nx
import re

In [None]:
def sent_tokenizer(text, strip_non_alnum=False):
    """Split ``text`` into a list of sentence strings.

    Args:
        text: The raw document text to split.
        strip_non_alnum: When True, every character outside ``[a-zA-Z0-9]``
            (including spaces and punctuation) is replaced by a single space
            in each returned sentence. Defaults to False, which returns the
            sentences exactly as the tokenizer produced them.

    Returns:
        list[str]: The sentences of ``text`` in document order.

    Note:
        ``str.replace`` matches its first argument literally and returns a new
        string, so it cannot be used for pattern-based cleaning; ``re.sub`` is
        used instead and its result is kept.
    """
    sentences = sent_tokenize(text)
    if strip_non_alnum:
        sentences = [re.sub(r"[^a-zA-Z0-9]", " ", sentence) for sentence in sentences]
    return sentences

def read_articles_and_summary(df, category, encoding="utf-8", errors="replace"):
    """Read every article and reference summary that belongs to ``category``.

    Args:
        df: DataFrame with the columns ``path``, ``filename``, ``category``
            and ``article_or_summary``.
        category: Value of the ``category`` column to select.
        encoding: Text encoding used to decode each file.
        errors: Decoding error policy passed to ``open``. The default
            ``"replace"`` substitutes undecodable bytes instead of aborting
            the whole category on a single bad byte.

    Returns:
        tuple[dict, dict]: ``(articles, summaries)``. Both map
        ``(category, filename)`` to the file content. Rows whose
        ``article_or_summary`` is ``'News Articles'`` go to ``articles`` and
        rows marked ``"Summaries"`` go to ``summaries``.
    """
    articles = {}
    summaries = {}
    destinations = {'News Articles': articles, "Summaries": summaries}

    selected = df[df['category'] == category]
    rows = zip(selected['path'], selected['filename'], selected['article_or_summary'])
    for file_path, file_name, file_type in rows:
        destination = destinations.get(file_type)
        if destination is None:
            # Neither an article nor a summary: nothing would be stored, so
            # do not open the file at all.
            continue
        with open(file_path, "r", encoding=encoding, errors=errors) as f:
            destination[(category, file_name)] = f.read()
    return articles, summaries
    

## Spell Correction 

In [None]:
from textblob import TextBlob
import tensorflow_hub as hub


# Load the Universal Sentence Encoder (version 4) module from TensorFlow Hub.
# `embed` is passed to generate_summaries further down as the embedding callable.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")


def spell_correction(sent_tok):
    """Spell-correct each sentence and return them joined into one string.

    Args:
        sent_tok: Iterable of sentence strings.

    Returns:
        str: The corrected sentences, in order, separated by single spaces.
    """
    return " ".join(str(TextBlob(sentence).correct()) for sentence in sent_tok)

def sentence_similarity(sent1, sent2, embed):
    """Return the cosine similarity between the embeddings of two sentences.

    Args:
        sent1: First sentence.
        sent2: Second sentence.
        embed: Callable that takes a list of sentences and returns an
            indexable collection with one embedding vector per sentence.

    Returns:
        float: ``dot(A, B) / (|A| * |B|)``. Higher means more similar.
        Returns 0.0 when either embedding has zero norm, so the division
        never produces NaN.
    """
    vec_a = np.asarray(embed([sent1])[0], dtype=float)
    vec_b = np.asarray(embed([sent2])[0], dtype=float)
    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denominator == 0:
        return 0.0
    # The quotient itself is the similarity; subtracting it from 1 would turn
    # it into a distance and reward dissimilar sentence pairs.
    return float(np.dot(vec_a, vec_b) / denominator)

def build_similarity_matrix(sentences, embeds):
    """Build the pairwise cosine-similarity matrix for ``sentences``.

    All sentences are embedded in a single call to ``embeds`` and the matrix
    is computed with NumPy, instead of re-embedding both sentences for every
    pair.

    Args:
        sentences: Sequence of sentence strings.
        embeds: Callable that takes a list of sentences and returns one
            embedding vector per sentence (convertible with ``np.asarray``).

    Returns:
        numpy.ndarray: Array of shape ``(len(sentences), len(sentences))``
        whose ``[i][j]`` entry is the cosine similarity of sentences ``i``
        and ``j``. The diagonal is 0. An empty input yields a ``(0, 0)`` array.
    """
    count = len(sentences)
    if count == 0:
        return np.zeros((0, 0))

    vectors = np.asarray(embeds(list(sentences)), dtype=float)
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # A zero vector has no direction; dividing by 1 keeps its row at zero
    # similarity instead of producing NaN.
    norms[norms == 0] = 1.0
    unit_vectors = vectors / norms

    similarity_matrix = unit_vectors @ unit_vectors.T
    # Self-similarity is excluded.
    np.fill_diagonal(similarity_matrix, 0.0)
    return similarity_matrix


# sim_mat = build_similarity_matrix(mod_sent, embed)

In [None]:
!pip install rouge

In [None]:
from rouge import Rouge

# construct a dataframe

# Function to compute ROUGE scores
def compute_rouge_scores(generate, reference):
    """Score a generated summary against a reference summary with ROUGE.

    Args:
        generate: The generated (hypothesis) summary text.
        reference: The reference summary text.

    Returns:
        pandas.Series: Nine values indexed ``rouge{1,2,L}_{precision,recall,fmeasure}``,
        taken from the ``p``/``r``/``f`` entries of the averaged ``rouge-1``,
        ``rouge-2`` and ``rouge-l`` scores.
    """
    averaged = Rouge().get_scores(generate, reference, avg=True)
    # (output label, metric key, statistic key) in the order the Series is laid out.
    layout = (
        ('rouge1_precision', 'rouge-1', 'p'),
        ('rouge1_recall', 'rouge-1', 'r'),
        ('rouge1_fmeasure', 'rouge-1', 'f'),
        ('rouge2_precision', 'rouge-2', 'p'),
        ('rouge2_recall', 'rouge-2', 'r'),
        ('rouge2_fmeasure', 'rouge-2', 'f'),
        ('rougeL_precision', 'rouge-l', 'p'),
        ('rougeL_recall', 'rouge-l', 'r'),
        ('rougeL_fmeasure', 'rouge-l', 'f'),
    )
    return pd.Series({label: averaged[metric][stat] for label, metric, stat in layout})


def generate_single_summary(text,top_n,embeds):
    """Build an extractive summary from the highest-ranked sentences of ``text``.

    The text is split into sentences, a sentence-to-sentence matrix is turned
    into a graph, and PageRank scores are used to order the sentences.

    Args:
        text: Document text to summarize.
        top_n: Maximum number of sentences to keep.
        embeds: Embedding callable forwarded to ``build_similarity_matrix``.

    Returns:
        str: Up to ``top_n`` sentences, highest score first, joined by spaces.
    """
    sentences = sent_tokenizer(text)
    graph = nx.from_numpy_array(build_similarity_matrix(sentences, embeds))
    scores = nx.pagerank(graph)

    # Pair each sentence with its score and order the pairs from best to worst.
    ranked = [(scores[position], sentence) for position, sentence in enumerate(sentences)]
    ranked.sort(reverse=True)

    keep = min(len(ranked), top_n)
    return " ".join(ranked[position][1] for position in range(keep))


# def generate_summaries(df, category):
#     generated_sumaries = {}
#     rouge_scores = {}
#     # get articles and summaries
#     articles, summaries = read_articles_and_summary(df, category)
#     print("Articles and Summaries are Loaded")
    
#     for (category, file_name), text in articles.items():
#         print(f"Category: {category}, Article: {file_name}")
#         sent_tok = sent_tokenizer(text)
#         original_text = spell_correction(sent_tok)
#         summarized_text = generate_single_summary(original_text, top_n=5, embeds=embed)
#         generated_sumaries[(category, file_name)] = summarized_text
        
#         if (category, file_name) in summaries:
#             article_summaries = summaries[(category, file_name)]
#             rouge_score = compute_rouge(summarized_text, article_summaries)
            
#     return generated_summaries, rouge_scores

#     for (category, article_name), scores in rouge_scores.items():
#         print(f"Category: {category}, Article: {article_name}")
#         for idx, score in enumerate(scores):
#             print(f"  Summary {idx+1}: {score}")

# Function that generates summaries and computes their ROUGE scores
def generate_summaries(df, category, embed, top_n):
    """Summarize every article of ``category`` and score it against its reference.

    Args:
        df: File index DataFrame passed to ``read_articles_and_summary``.
        category: Category whose articles are processed.
        embed: Embedding callable forwarded to ``generate_single_summary``.
        top_n: Maximum number of sentences per generated summary.

    Returns:
        pandas.DataFrame: One row per article that has a reference summary,
        with ``actual_summary``, ``generated_summary`` and the ROUGE columns
        returned by ``compute_rouge_scores``. Articles without a reference
        summary are summarized but produce no row.
    """
    articles, summaries = read_articles_and_summary(df, category)
    print("Articles and Summaries are Loaded")

    rows = []
    for key, text in articles.items():
        category, file_name = key
        print(f"Category: {category}, Article: {file_name}")

        # Tokenize, spell-correct, then summarize the corrected text.
        corrected_text = spell_correction(sent_tokenizer(text))
        summarized_text = generate_single_summary(corrected_text, top_n, embed)

        if key not in summaries:
            continue
        reference_summary = summaries[key]
        rouge_scores = compute_rouge_scores(summarized_text, reference_summary)
        rows.append({
            'actual_summary': reference_summary,
            'generated_summary': summarized_text,
            **rouge_scores
        })

    return pd.DataFrame(rows)


In [None]:
# Summarize every "politics" article, score each summary against its reference,
# and save the resulting table as a CSV file in the Kaggle working directory.
result_df = generate_summaries(df, "politics", embed, top_n=5)
# DataFrame has no `to_pandas` method; `to_csv` is the call that writes a CSV file.
result_df.to_csv("/kaggle/working/result.csv", index=False)

The summary function combines the actions we have gone through in the steps above.
To be more precise, we collect the top N most relevant sentences to summarize each article.

Steps:

1. Reading Article and extracting Text from it.
2. Generate Sentence tokens.
3. Compute cosine similarity.
4. Using NetworkX to build a similarity graph whose nodes are the sentences.
5. Using Page Ranking method to rank the sentences.
6. Collect Top N Sentences and represent as summary of the Entire Article.

Note: The steps mentioned above are applicable to the extractive strategy for text summarization.
