# INITIALIZATION

In [1]:
import os
import pandas as pd

# Column buffers: one entry is appended to each list per discovered .txt file
path_, filename_, category_, article_or_summary_ = [], [], [], []

# Define the root folder (replace with your path)
root_folder = r"C:/Users/Lavanya Gurram/OneDrive/Desktop/BBC News Summary"

# Walk every folder below the root. The labels depend only on the folder,
# so they are derived once per directory rather than once per file.
for dirname, _, filenames in os.walk(root_folder):
    category_dir = os.path.normpath(dirname)
    # category = the folder holding the file
    category_name = os.path.basename(category_dir)
    # article_or_summary = the parent folder of the category folder
    group_name = os.path.basename(os.path.dirname(category_dir))

    for filename in filenames:
        if not filename.endswith('.txt'):  # Only .txt files are indexed
            continue
        path_.append(os.path.join(dirname, filename))
        filename_.append(filename)
        article_or_summary_.append(group_name)
        category_.append(category_name)

# Assemble the index frame with a fixed column order
column_order = ["path", "filename", "category", "article_or_summary"]
df = pd.DataFrame(
    dict(zip(column_order, (path_, filename_, category_, article_or_summary_))),
    columns=column_order,
)

# Display the DataFrame
print(df.head())

                                                path filename  category  \
0  C:/Users/Lavanya Gurram/OneDrive/Desktop/BBC N...  001.txt  business   
1  C:/Users/Lavanya Gurram/OneDrive/Desktop/BBC N...  002.txt  business   
2  C:/Users/Lavanya Gurram/OneDrive/Desktop/BBC N...  003.txt  business   
3  C:/Users/Lavanya Gurram/OneDrive/Desktop/BBC N...  004.txt  business   
4  C:/Users/Lavanya Gurram/OneDrive/Desktop/BBC N...  005.txt  business   

  article_or_summary  
0      News Articles  
1      News Articles  
2      News Articles  
3      News Articles  
4      News Articles  


In [2]:
# Rebuild the file-index frame from the four column lists and display it
frame_columns = ["path", "filename", "category", "article_or_summary"]
frame_data = dict(zip(frame_columns, (path_, filename_, category_, article_or_summary_)))
df = pd.DataFrame(frame_data, columns=frame_columns)
df

Unnamed: 0,path,filename,category,article_or_summary
0,C:/Users/Lavanya Gurram/OneDrive/Desktop/BBC N...,001.txt,business,News Articles
1,C:/Users/Lavanya Gurram/OneDrive/Desktop/BBC N...,002.txt,business,News Articles
2,C:/Users/Lavanya Gurram/OneDrive/Desktop/BBC N...,003.txt,business,News Articles
3,C:/Users/Lavanya Gurram/OneDrive/Desktop/BBC N...,004.txt,business,News Articles
4,C:/Users/Lavanya Gurram/OneDrive/Desktop/BBC N...,005.txt,business,News Articles
...,...,...,...,...
4445,C:/Users/Lavanya Gurram/OneDrive/Desktop/BBC N...,397.txt,tech,Summaries
4446,C:/Users/Lavanya Gurram/OneDrive/Desktop/BBC N...,398.txt,tech,Summaries
4447,C:/Users/Lavanya Gurram/OneDrive/Desktop/BBC N...,399.txt,tech,Summaries
4448,C:/Users/Lavanya Gurram/OneDrive/Desktop/BBC N...,400.txt,tech,Summaries


# EDA

In [None]:
# Install required libraries
!pip install plotly cufflinks

# Import necessary libraries
import plotly.express as px  # plotly.express is the correct import for plotly_express
import cufflinks as cf

# Enable offline mode for cufflinks
cf.go_offline()

# Optional: Set a theme for visualizations (if desired)
cf.set_config_file(theme="pearl")



## Distribution of Number of Articles in Each Category

In [None]:
from collections import Counter

# Count the rows tagged "News Articles" per category, then plot the counts as bars
is_article = df['article_or_summary'] == "News Articles"
ct = Counter(df.loc[is_article, "category"])
category_counts = pd.DataFrame({"category": ct.keys(), "value": ct.values()})
category_counts.iplot(kind='bar', x='category', y='value')

## Distribution of Category and its Values

In [None]:
# Box plot over the per-category counts held in `ct`
box_frame = pd.DataFrame({"category": ct.keys(), "value": ct.values()})
box_frame.iplot(kind='box')

## Distribution Size of Each Category

In [None]:
# Bubble chart: one bubble per category, sized by its count in `ct`
bubble_frame = pd.DataFrame({"category": ct.keys(), "value": ct.values()})
bubble_frame.iplot(kind='bubble', x='category', y='value', size='value')

## Coverage Ratio of Each Category

In [None]:
# Pie chart of each category's share of the counts held in `ct`
pie_frame = pd.DataFrame({"category": ct.keys(), "value": ct.values()})
pie_frame.iplot(kind='pie', labels="category", values='value')

# Sentence Tokenization

In [None]:
!pip install nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import numpy as np
import networkx as nx
import re

In [None]:
import re
from nltk.tokenize import sent_tokenize

def read_article(text: str) -> list:
    """
    Split raw article text into sentences.

    Args:
        text (str): Raw article text.

    Returns:
        list: The sentences in document order, exactly as produced by
        ``sent_tokenize`` (punctuation and casing are left untouched).
    """
    # A former per-sentence ``sentence.replace("[^a-zA-Z0-9]", " ")`` pass was
    # removed: ``str.replace`` matches its first argument literally (it is not
    # a regex) and its result was discarded, so it never changed the output.
    return sent_tokenize(text)

In [None]:
# Path of the first row tagged 'News Articles' in df
is_news_article = df['article_or_summary'] == 'News Articles'
file_path = df.loc[is_news_article, 'path'].iloc[0]

# NOTE(review): no explicit encoding is passed, so decoding follows the
# platform default - confirm this matches the dataset files.
with open(file_path, "r") as f:
    article = f.read()

In [None]:
# Split the loaded article into sentences and display the resulting list
sent_tok = read_article(article)
sent_tok

# Spell Correction

In [None]:
!pip install textblob
from textblob import TextBlob

# Initialize an empty list to store corrected sentences
mod_sent = []

# Iterate over each sentence in the tokenized sentences
for tok in sent_tok:
    blob_obj = TextBlob(tok)  # Create a TextBlob object for the sentence
    correct_sent = str(blob_obj.correct())  # Correct the sentence using TextBlob
    print(f"\033[94mOriginal Token : {tok}\033[0m")
    print(f"\033[92mCorrected Token: {correct_sent}\033[0m")
    mod_sent.append(correct_sent)  # Add the corrected sentence to the list

In [None]:
" ".join(mod_sent)

# Sentence Similarity

In [None]:
!pip install tensorflow_hub
import tensorflow_hub as hub
import numpy as np

# Load the Universal Sentence Encoder model from TensorFlow Hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def sentence_similarity(sent1: str, sent2: str, embed) -> float:
    """
    Compute the cosine similarity between two sentences.

    Args:
        sent1 (str): First sentence.
        sent2 (str): Second sentence.
        embed: Callable that maps a list of strings to one embedding vector
            per string (e.g. the loaded Universal Sentence Encoder model).

    Returns:
        float: Cosine similarity of the two sentence embeddings. Higher means
        more similar (1.0 = same direction, 0.0 = orthogonal). Returns 0.0 if
        either embedding has zero norm.
    """
    # Embed both sentences in a single model call; np.asarray accepts both
    # NumPy arrays and array-convertible tensors.
    vectors = np.asarray(embed([sent1, sent2])).astype(np.float64)
    embedding1, embedding2 = vectors[0], vectors[1]

    norm_product = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    if norm_product == 0:
        # The cosine is undefined for a zero vector; treat it as "no similarity"
        return 0.0

    # Return the similarity itself. The previous `1 - cosine` was a distance,
    # which inverted the meaning for every caller that ranks by or reports
    # this value as a similarity.
    return float(np.dot(embedding1, embedding2) / norm_product)

In [None]:
# Score the first two spell-corrected sentences against each other
sentence1, sentence2 = mod_sent[0], mod_sent[1]
similarity_score = sentence_similarity(sentence1, sentence2, embed)

# Print both sentences and the score in green, one value per line
print(
    f"\033[92mSentence 1        : {sentence1}\033[0m",
    f"\033[92mSentence 2        : {sentence2}\033[0m",
    f"\033[92mSimilarity Score  : {similarity_score:.4f}\033[0m",  # 4 decimal places for readability
    sep="\n",
)

In [None]:
import numpy as np

def build_similarity_matrix(sentences: list, embeds) -> np.ndarray:
    """
    Build a similarity matrix for a list of sentences.

    Args:
        sentences (list): A list of sentences.
        embeds: Callable that maps a list of strings to one embedding vector
            per string (e.g. the loaded Universal Sentence Encoder model).

    Returns:
        np.ndarray: A symmetric (n, n) float matrix where entry (i, j) is the
        cosine similarity between sentences i and j, clipped to [0, 1]. The
        diagonal is 0 so a sentence is never linked to itself. Fewer than two
        sentences yields an all-zero matrix without calling ``embeds``.
    """
    num_sentences = len(sentences)
    if num_sentences < 2:
        return np.zeros((num_sentences, num_sentences))

    # Embed every sentence once in a single batch instead of re-embedding
    # both sentences for each of the n*(n-1)/2 pairs.
    vectors = np.asarray(embeds(list(sentences))).astype(np.float64)

    # Normalise the rows so that a plain dot product is the cosine similarity.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # zero vectors stay zero -> similarity 0
    unit_vectors = vectors / norms

    # Store similarity (not `1 - cosine`, which is a distance) so that larger
    # entries mean "more related" when the matrix is used as graph weights.
    # Negative cosines are clipped to 0 to keep the weights non-negative.
    similarity_matrix = np.clip(unit_vectors @ unit_vectors.T, 0.0, 1.0)
    np.fill_diagonal(similarity_matrix, 0.0)

    return similarity_matrix

In [None]:
# Build the pairwise similarity matrix for the spell-corrected sentences (mod_sent)
# using the loaded sentence-embedding model (embed)
similarity_matrix = build_similarity_matrix(mod_sent, embed)

In [None]:
from bokeh.io import output_notebook, show
from bokeh.models import Range1d, Circle, ColumnDataSource, MultiLine
from bokeh.plotting import figure
from bokeh.plotting import from_networkx
import networkx as nx
import numpy as np

output_notebook()

# Minimum matrix score for two sentences to be joined by an edge
EDGE_THRESHOLD = 0.9

# Plot the matrix computed from the article's sentences (the
# `similarity_matrix` returned by build_similarity_matrix above). A random
# placeholder matrix used to overwrite it here, so the graph showed noise.
num_nodes = similarity_matrix.shape[0]

# Creating the graph: one node per sentence, so that sentences without any
# edge still appear in the plot
g = nx.Graph()
g.add_nodes_from(range(num_nodes))

# The graph is undirected, so visiting each pair once (i < j) is enough
for i in range(num_nodes):
    for j in range(i + 1, num_nodes):
        if similarity_matrix[i][j] >= EDGE_THRESHOLD:
            g.add_edge(i, j)

# Tooltip and plot setup
HOVER_TOOLTIPS = [("sent_tok", "@index")]
plot = figure(tooltips=HOVER_TOOLTIPS, tools="pan,wheel_zoom,save,reset",
              active_scroll='wheel_zoom',
              x_range=Range1d(-10.1, 10.1), y_range=Range1d(-10.1, 10.1))

# Creating the network graph
network_graph = from_networkx(g, nx.spring_layout, scale=7, center=(0, 0))
network_graph.node_renderer.glyph = Circle(radius=0.1, fill_color='green')  # Changed 'size' to 'radius'
network_graph.edge_renderer.glyph = MultiLine(line_alpha=0.5, line_width=1)

# Adding the network graph to the plot
plot.renderers.append(network_graph)

# Display the plot
show(plot)

# Summarization

In [None]:
# Path of the first row tagged 'Summaries' in df
file_path_summary = df.loc[df['article_or_summary'] == 'Summaries', 'path'].iloc[0]

# Read the reference summary. A failure is reported and then re-raised:
# swallowing it would leave `actual_summary` undefined and surface later as an
# unrelated NameError in the cells that use it.
try:
    with open(file_path_summary, "r") as file:
        actual_summary = file.read()
except FileNotFoundError:
    print(f"Error: File not found at path {file_path_summary}")
    raise
except Exception as e:
    print(f"An error occurred while reading the file: {e}")
    raise

In [None]:
def generate_summary(text: str, top_n: int, embeds) -> str:
    """
    Produce an extractive summary by ranking sentences with PageRank.

    Args:
        text (str): Text to summarize; it is split with ``read_article``.
        top_n (int): Maximum number of sentences to keep. It is lowered to the
            number of sentences when the text has fewer.
        embeds: Embedding model handed to ``build_similarity_matrix``.

    Returns:
        str: The kept sentences joined by single spaces, highest score first,
        or "No content to summarize." when the text yields no sentences.
    """
    sentences = read_article(text)
    if not sentences:
        return "No content to summarize."

    # Never ask for more sentences than the text contains
    top_n = min(top_n, len(sentences))

    # Sentence graph weighted by the pairwise matrix, scored with PageRank
    graph = nx.from_numpy_array(build_similarity_matrix(sentences, embeds))
    scores = nx.pagerank(graph)

    # Highest score first; equal scores fall back to comparing the sentence text
    ranked = sorted(
        ((scores[position], sentence) for position, sentence in enumerate(sentences)),
        reverse=True,
    )

    # max(..., 0) keeps a negative top_n selecting nothing
    return " ".join(sentence for _, sentence in ranked[:max(top_n, 0)])

In [None]:
# Join the spell-corrected sentences back into a single text
original_text = " ".join(mod_sent)

# Summarize that text, keeping at most 5 top-ranked sentences
summarized_text = generate_summary(text=original_text, top_n=5, embeds=embed)

In [None]:
# Display the spell-corrected article text that was passed to generate_summary
original_text

In [None]:
# Display the summary returned by generate_summary
summarized_text

In [None]:
# Display the reference summary read from the first 'Summaries' file
actual_summary

# Validation 
There are multiple ways to compare two texts in order to measure accuracy:

1. N-grams / BLEU score: mostly used in translation.
2. Similarity score between two sentences: mostly used for summary comparison or for similar word/sentence search.

In our case the second option fits best, but we will implement both and compare the scores.

# N-Grams/Bleu Score

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

hypothesis = summarized_text
reference = actual_summary

# sentence_bleu expects token sequences: a list of tokenized references and a
# tokenized hypothesis. Given raw strings it iterates over characters, so the
# score would be computed on character n-grams instead of word n-grams.
BLEUscore = sentence_bleu([reference.split()], hypothesis.split())
print(f"BLEUscore : {BLEUscore}")

# Similarity Score

In [None]:
def sentence_similarity(sent1, sent2, embed):
    """
    Compute the cosine similarity between two texts.

    Args:
        sent1 (str): First text.
        sent2 (str): Second text.
        embed: Callable that maps a list of strings to one embedding vector
            per string (e.g. the loaded Universal Sentence Encoder model).

    Returns:
        float: Cosine similarity of the two embeddings. Higher means more
        similar (1.0 = same direction, 0.0 = orthogonal). Returns 0.0 if
        either embedding has zero norm.
    """
    # One model call for both texts; np.asarray accepts NumPy arrays as well
    # as array-convertible tensors.
    vectors = np.asarray(embed([sent1, sent2])).astype(np.float64)
    A, B = vectors[0], vectors[1]

    norm_product = np.linalg.norm(A) * np.linalg.norm(B)
    if norm_product == 0:
        # The cosine is undefined for a zero vector; treat it as "no similarity"
        return 0.0

    # Return the similarity itself: the previous `1 - cosine` was a distance,
    # so a lower number meant a better match although it is reported as a
    # similarity score.
    return float(np.dot(A, B) / norm_product)

In [None]:
# Score the generated summary against the reference summary with sentence_similarity
print(f"Senetence Similarity Score : {sentence_similarity(summarized_text, actual_summary, embed)}")

# Summarization With Sumy

In [None]:
!pip install  sumy
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

In [None]:
# LexRank summarizer from sumy, used as a baseline
summarizer = LexRankSummarizer()

# Parse the spell-corrected article text with sumy's English tokenizer
parser = PlaintextParser.from_string(original_text, Tokenizer("english"))

# Summarize the document with 5 sentences
summary = summarizer(parser.document, 5)

# Print the selected sentences, one per line
for sentence in summary:
    print(sentence)

# CONCLUSION 
As we approach the end of the implementation, here is a summary of what we have done and achieved so far:

- We gathered BBC articles and their summaries for the purpose of reference and comparison.
- We gathered a variety of methodologies and strategies for text summarization, including extractive and abstractive methods.
- We delved further into extraction procedures.
- I picked up the Graph Implementation technique for Extractive Text Summarization. Six articles were converted to Sentence Tokens.
- A similarity matrix was computed for graph formation.
- The PageRank method was used to rank sentence tokens, and the top N were chosen to reflect the summary.
- For the validation portion, we utilized both BLEU score and Similarity Score and discovered that BLEU cannot be used in our scenario and that Similarity Score is considerably more dependable.
Summarizing using a Python library:

This section allows us to compare and contrast our solution with some of the most popular Python tools available.
Sumy is a simple library and command-line application that extracts summary information from HTML pages or plain text. The program also includes a simple assessment mechanism for text summaries. The implemented summarizing techniques are given in the documentation.