In [None]:
pip install pandas numpy nltk scikit-learn transformers textblob networkx rouge-score sacrebleu torch

In [None]:
pip install tensorflow

In [None]:
# Ensure required nltk resources are downloaded
import nltk
# 'punkt' is presumably the data package backing nltk.tokenize.sent_tokenize, which the
# summarization code in this notebook calls.
# NOTE(review): recent NLTK releases appear to look for 'punkt_tab' instead of 'punkt' —
# confirm against the installed NLTK version and download that resource too if needed.
nltk.download('punkt')

In [2]:
import os
import glob
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize
from transformers import BartForConditionalGeneration, BartTokenizer
from rouge_score import rouge_scorer
from textblob import TextBlob
import sacrebleu

def read_text_files(folder_path):
    """
    Loads every ``*.txt`` file that sits directly inside ``folder_path``.

    Progress is reported on stdout: the folder being scanned, the matched
    paths, and finally the names of the documents that were loaded.

    Args:
        folder_path: Directory to scan (not searched recursively).

    Returns:
        A dict mapping each file's base name (e.g. ``"01.txt"``) to its full
        content, decoded as UTF-8. Empty when no file matches.
    """
    print(f"Reading files from: {folder_path}")
    pattern = os.path.join(folder_path, "*.txt")
    matches = glob.glob(pattern)
    print(f"Found files: {matches}")

    def _load(path):
        # Read one file completely as UTF-8 text.
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read()

    contents = {os.path.basename(path): _load(path) for path in matches}
    print(f"Documents read from {folder_path}: {list(contents)}")
    return contents

def extractive_summarization(text, num_sentences=3):
    """
    Performs extractive summarization by selecting the top sentences based on cosine similarity scores.

    Sentences are turned into bag-of-words count vectors, a graph is built whose
    edge weights are the pairwise cosine similarities, and PageRank scores on
    that graph decide which sentences are kept.

    Args:
        text: The document to summarize.
        num_sentences: Maximum number of sentences to keep (default 3).

    Returns:
        - An empty string when ``num_sentences`` is zero or negative.
        - ``text`` unchanged when it has no more than ``num_sentences`` sentences.
        - Otherwise the ``num_sentences`` highest-scoring sentences joined by a
          single space, ordered from highest to lowest score (not in document
          order).
        - If no sentence contains a countable token, the first
          ``num_sentences`` sentences joined by a single space.
    """
    # Guard before any work: a negative count would otherwise be used as a slice
    # bound below and return "all but the last |n|" ranked sentences.
    if num_sentences <= 0:
        return ''

    sentences = sent_tokenize(text)
    if len(sentences) <= num_sentences:
        return text

    try:
        # Sparse sentence-by-term count matrix; cosine_similarity accepts it
        # directly, so it is not expanded into a dense array.
        term_counts = CountVectorizer().fit_transform(sentences)
    except ValueError:
        # CountVectorizer raises ValueError for an empty vocabulary (e.g. text
        # made only of punctuation or one-character tokens). There is nothing to
        # rank on, so fall back to the leading sentences instead of aborting.
        return ' '.join(sentences[:num_sentences])

    similarity = cosine_similarity(term_counts)
    graph = nx.from_numpy_array(similarity)
    scores = nx.pagerank(graph)

    # Sort by (score, sentence) descending; the sentence text breaks score ties.
    ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    return ' '.join(sentence for _, sentence in ranked[:num_sentences])

def abstractive_summarization(text, model, tokenizer):
    """
    Performs abstractive summarization using the BART model.

    Args:
        text: The document to summarize.
        model: Generation model exposing ``generate`` (main passes a
            ``BartForConditionalGeneration`` instance).
        tokenizer: Matching tokenizer exposing ``encode`` and ``decode``.

    Returns:
        The generated summary as a string, or an empty string when ``text`` is
        empty or contains only whitespace.
    """
    # With nothing to summarize, generate() would still be forced to emit at
    # least min_length tokens, i.e. invented content. Return nothing instead.
    if not text or not text.strip():
        return ''

    # The text is encoded as-is: a "summarize: " task prefix is a T5 convention,
    # and BART would simply treat it as part of the article. Input is truncated
    # to 1024 tokens.
    inputs = tokenizer.encode(text, return_tensors="pt", max_length=1024, truncation=True)
    # Beam search over 4 beams, producing between 40 and 150 tokens.
    summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

def evaluate_summaries(original, extractive, abstractive, reference):
    """
    Scores both generated summaries against the reference with ROUGE and BLEU.

    Args:
        original: The source document. Accepted for the caller's convenience but
            not used in any computation here.
        extractive: The extractive summary text.
        abstractive: The abstractive summary text.
        reference: The reference summary both candidates are compared to.

    Returns:
        A 4-tuple ``(extractive_rouge, abstractive_rouge, extractive_bleu,
        abstractive_bleu)``: the two ROUGE results are the dicts returned by
        ``RougeScorer.score`` for rouge1/rouge2/rougeL, and the two BLEU results
        are the objects returned by ``sacrebleu.corpus_bleu``.
    """
    candidates = (extractive, abstractive)

    # ROUGE-1 / ROUGE-2 / ROUGE-L with stemming, reference given as the target.
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    extractive_rouge, abstractive_rouge = [
        rouge.score(reference, candidate) for candidate in candidates
    ]

    # BLEU: each candidate is a one-segment corpus with a single reference stream.
    reference_streams = [[reference]]
    extractive_bleu, abstractive_bleu = [
        sacrebleu.corpus_bleu([candidate], reference_streams) for candidate in candidates
    ]

    return extractive_rouge, abstractive_rouge, extractive_bleu, abstractive_bleu

def sentiment_analysis(text):
    """
    Computes TextBlob sentiment figures for the given text.

    Args:
        text: The text to analyse.

    Returns:
        A ``(polarity, subjectivity)`` tuple taken from TextBlob's ``sentiment``
        attribute.
    """
    sentiment = TextBlob(text).sentiment
    polarity = sentiment.polarity
    subjectivity = sentiment.subjectivity
    return polarity, subjectivity

def save_summary(summary, output_folder, filename):
    """
    Saves the given summary text to a specified folder with the given filename.

    The folder (including any missing parent folders) is created when it does
    not exist yet, so a fresh output location does not abort the run. An
    existing file with the same name is overwritten.

    Args:
        summary: The text to write.
        output_folder: Destination directory. An empty string means the current
            working directory.
        filename: Name of the file to create inside ``output_folder``.

    Raises:
        OSError: If the folder cannot be created or the file cannot be written.
    """
    # os.makedirs('') raises, so only create a folder when one was actually given.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    output_path = os.path.join(output_folder, filename)
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(summary)

def main(reports_folder_path, summaries_folder_path, extractive_output_folder_path, abstractive_output_folder_path):
    """
    Runs the full pipeline: load documents, summarize, evaluate, and export.

    Every report that has a reference summary with the same file name is
    summarized extractively and abstractively. Both summaries are printed,
    written to their output folders, scored against the reference with ROUGE
    and BLEU, and run through sentiment analysis. Reports without a matching
    reference summary are skipped. All per-document results are printed and
    written to ``summarized_reports_evaluation.csv`` in the current working
    directory.

    Args:
        reports_folder_path: Folder holding the report ``.txt`` files.
        summaries_folder_path: Folder holding the reference summary ``.txt`` files.
        extractive_output_folder_path: Folder receiving ``extractive_<name>`` files.
        abstractive_output_folder_path: Folder receiving ``abstractive_<name>`` files.
    """
    # Both folders are keyed by file name, which is how reports are paired
    # with their reference summaries.
    reports = read_text_files(reports_folder_path)
    summaries = read_text_files(summaries_folder_path)

    # Pre-trained BART checkpoint used for the abstractive summaries
    # (alternatives noted by the author: facebook/bart-large, facebook/bart-base).
    bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
    bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

    results = []

    for doc_name, doc in reports.items():
        # Without a reference summary there is nothing to evaluate against.
        if doc_name not in summaries:
            continue
        reference_summary = summaries[doc_name]

        # Generate both summaries and echo them.
        extractive_summary = extractive_summarization(doc)
        print(f"Extractive summary for {doc_name}: {extractive_summary}")

        abstractive_summary = abstractive_summarization(doc, bart_model, bart_tokenizer)
        print(f"Abstractive summary for {doc_name}: {abstractive_summary}")

        # Persist each summary in its own output folder.
        save_summary(extractive_summary, extractive_output_folder_path, f"extractive_{doc_name}")
        save_summary(abstractive_summary, abstractive_output_folder_path, f"abstractive_{doc_name}")

        # ROUGE / BLEU against the reference summary.
        ext_rouge, abs_rouge, ext_bleu, abs_bleu = evaluate_summaries(
            doc, extractive_summary, abstractive_summary, reference_summary
        )

        # Sentiment of each generated summary as (polarity, subjectivity).
        ext_polarity, ext_subjectivity = sentiment_analysis(extractive_summary)
        abs_polarity, abs_subjectivity = sentiment_analysis(abstractive_summary)

        # One row per document; key order defines the CSV column order.
        row = {
            'Document Name': doc_name,
            'Original Text': doc,
            'Reference Summary': reference_summary,
            'Extractive ROUGE-1': ext_rouge['rouge1'].fmeasure,
            'Extractive ROUGE-2': ext_rouge['rouge2'].fmeasure,
            'Extractive ROUGE-L': ext_rouge['rougeL'].fmeasure,
            'Abstractive ROUGE-1': abs_rouge['rouge1'].fmeasure,
            'Abstractive ROUGE-2': abs_rouge['rouge2'].fmeasure,
            'Abstractive ROUGE-L': abs_rouge['rougeL'].fmeasure,
            'Extractive BLEU': ext_bleu.score,
            'Abstractive BLEU': abs_bleu.score,
            'Extractive Sentiment Polarity': ext_polarity,
            'Extractive Sentiment Subjectivity': ext_subjectivity,
            'Abstractive Sentiment Polarity': abs_polarity,
            'Abstractive Sentiment Subjectivity': abs_subjectivity,
        }
        results.append(row)

    print(f"Results collected: {results}")

    # Export everything as a single CSV in the current working directory.
    pd.DataFrame(results).to_csv('summarized_reports_evaluation.csv', index=False)
    print("Results saved to summarized_reports_evaluation.csv")

if __name__ == "__main__":
    # All four folders live under the same project DATA directory.
    data_root = r'C:\Users\Sujan Tumbaraguddi\Desktop\Data Science\Assignments\3_7150 - Project\DATA'

    # Inputs: match report text files and their reference summaries.
    reports_folder_path = data_root + r'\reports'
    summaries_folder_path = data_root + r'\summaries'

    # Outputs: where the generated extractive / abstractive summaries are saved.
    extractive_output_folder_path = data_root + r'\Gen_extractive'
    abstractive_output_folder_path = data_root + r'\Gen_abstractive'

    main(
        reports_folder_path,
        summaries_folder_path,
        extractive_output_folder_path,
        abstractive_output_folder_path,
    )


Reading files from: C:\Users\Sujan Tumbaraguddi\Desktop\Data Science\Assignments\3_7150 - Project\DATA\reports
Found files: ['C:\\Users\\Sujan Tumbaraguddi\\Desktop\\Data Science\\Assignments\\3_7150 - Project\\DATA\\reports\\01.txt', 'C:\\Users\\Sujan Tumbaraguddi\\Desktop\\Data Science\\Assignments\\3_7150 - Project\\DATA\\reports\\02.txt', 'C:\\Users\\Sujan Tumbaraguddi\\Desktop\\Data Science\\Assignments\\3_7150 - Project\\DATA\\reports\\03.txt']
Documents read from C:\Users\Sujan Tumbaraguddi\Desktop\Data Science\Assignments\3_7150 - Project\DATA\reports: ['01.txt', '02.txt', '03.txt']
Reading files from: C:\Users\Sujan Tumbaraguddi\Desktop\Data Science\Assignments\3_7150 - Project\DATA\summaries
Found files: ['C:\\Users\\Sujan Tumbaraguddi\\Desktop\\Data Science\\Assignments\\3_7150 - Project\\DATA\\summaries\\01.txt', 'C:\\Users\\Sujan Tumbaraguddi\\Desktop\\Data Science\\Assignments\\3_7150 - Project\\DATA\\summaries\\02.txt', 'C:\\Users\\Sujan Tumbaraguddi\\Desktop\\Data Scien