In [1]:
import networkx as nx # for graph representation, here it is used for rank awarding for textrank. 
import numpy as np    # for array/matrix creation
from nltk.corpus import stopwords # loading stopwords from nltk.corpus toolkit 
from nltk.stem import WordNetLemmatizer # for stemming WordNetLemmatizer is used
from nltk.tokenize import sent_tokenize, word_tokenize # for tokenizing sentences and words using nltk.tokenize 
from sklearn.metrics.pairwise import cosine_similarity # calculating cosiner similarity
from sklearn.cluster import KMeans #for kmean clustering purpose. 
from scipy.spatial import distance # for finding the euclidean distance for clustering in kmeans
from evaluate import load # evaluation using metrics
# Load the ROUGE metric
import evaluate 
import pandas as pd # for dataframe creation
import re # regular expression usage
import os # file/ folder handling
import matplotlib.pyplot as plt # plotting graphs
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns # for heatmap creation
import numpy as np

wl = WordNetLemmatizer()

def reading_text(filepath) -> str:
    #print("Reading text")
    with open(filepath, 'rt',encoding='utf-8') as file1:
        temp = file1.read()
        #temp = re.sub('[^a-zA-Z0-9£₹]', " ", temp)
        raw_report = temp.lower()
    return raw_report

def extract_word_vectors() -> dict:
    """
    Extracting word embeddings. These are the n vector representation of words.
    """
    #print('Extracting word vectors')

    word_embeddings = {}
    # Here we use glove word embeddings of 100 dimension
    f = open('glove.6B.100d.txt', encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

    f.close()
    return word_embeddings


def text_preprocessing(sentences: list) -> list:
    """
    Pre processing text to remove unnecessary words.
    """
    #print('Preprocessing text')

    stop_words = set(stopwords.words('english'))
    for i in range(len(sentences)):
        sen = re.sub('[^a-zA-Z0-9£₹]', " ", sentences[i])  
        #sen = sen.lower()  
        sen=sentences[i]
        sen=sen.split()                         
        sen = ' '.join([i for i in sen if i not in stopwords.words('english')])
        sentences.append(sen)


    clean_words = None
    for sent in sentences:
        words = word_tokenize(sent)
        #words = [wl.lemmatize(word.lower()) for word in words if word.isalnum()]
        clean_words = [word for word in words if word not in stop_words]

    return clean_words


def sentence_vector_representation(sentences: list, word_embeddings: dict) -> list:
    """
    Creating sentence vectors from word embeddings.
    """
    #print('Sentence embedding vector representations')

    sentence_vectors = []
    for sent in sentences:
        clean_words = text_preprocessing([sent])
        # Averaging the sum of word embeddings of the sentence to get sentence embedding vector
        v = sum([word_embeddings.get(word, np.zeros(100, )) for word in clean_words]) / (len(clean_words) + 0.001)
        sentence_vectors.append(v)

    return sentence_vectors


def create_similarity_matrix(sentences: list, sentence_vectors: list) -> np.ndarray:
    """
    Using cosine similarity, generate similarity matrix.
    """
    #print('Creating similarity matrix')
     # Vectorize sentences
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(sentences)
    
    # Calculate cosine similarity between sentence vectors
    sim_mat = cosine_similarity(X, X)
    # Defining a zero matrix of dimension n * n
    #sim_mat = np.zeros([len(sentences), len(sentences)])
    #for i in range(len(sentences)):
    #    for j in range(len(sentences)):
    #        if i != j:
    #            # Replacing array value with similarity value.
    #            # Not replacing the diagonal values because it represents similarity with its own sentence.
    #            sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1, 100), sentence_vectors[j].reshape(1, 100))[0, 0]

    return sim_mat


def determine_sentence_rank(sentences: list, sim_mat: np.ndarray):
    """
    Determining sentence rank using Page Rank algorithm.
    """
    #print('Determining sentence ranks')
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted([(scores[i], s[:15]) for i, s in enumerate(sentences)], reverse=True)
    return ranked_sentences


def generate_summary_textrank(sentences: list, ranked_sentences: list):
    """
    Generate a sentence for sentence score greater than average.
    """
    #print('Generating summary')

    # Get top 1/3 th ranked sentences
    top_ranked_sentences = ranked_sentences[:int(len(sentences) / 3)] if len(sentences) >= 3 else ranked_sentences

    sentence_count = 0
    summary = ''

    for i in sentences:
        for j in top_ranked_sentences:
            if i[:15] == j[1]:
                summary += i + ' '
                sentence_count += 1
                break
    #summary = ''.join(summary.split())
    return summary

def generate_summary_kmeans(sentences_textrank: list, sen_vectors_textrank: list)->str:
    # Calculate WCSS for different values of n_clusters
    #wcss = []
    #for i in range(1, 11):
    #    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    #    kmeans.fit(sen_vectors_textrank)
    #    wcss.append(kmeans.inertia_)

    # Calculate the differences between consecutive WCSS values
    #differences = np.diff(wcss)

    # Calculate the percentage change in WCSS
    #percent_change = (differences / wcss[:-1]) * 100

    # Find the index corresponding to the maximum percentage change
    #optimal_index = np.argmax(percent_change)

    # Plot the elbow curve
    #plt.plot(range(1, 11), wcss, marker='o', linestyle='--')
    #plt.title('Elbow Method')
    #plt.xlabel('Number of clusters (n_clusters)')
    #plt.ylabel('Within-cluster sum of squares (WCSS)')
    #plt.show()

    # The optimal value for n_clusters is one more than the index with maximum percentage change
    optimal_n_clusters = 8 #optimal_index + 1
    #print("Optimal number of clusters (n_clusters):", optimal_n_clusters)
    n_clusters = optimal_n_clusters # int(input("Number of clusters: "))
    kmeans = KMeans(n_clusters, init = 'k-means++', random_state = 42)
    y_kmeans = kmeans.fit_predict(sen_vectors_textrank)

    #finding and printing the nearest sentence vector from cluster centroid

    my_list=[]
    for i in range(n_clusters):
        my_dict={}
    
        for j in range(len(y_kmeans)):
            if y_kmeans[j]==i:
                my_dict[j] =  distance.euclidean(kmeans.cluster_centers_[i],sen_vectors_textrank[j])
        #min_distance = min(my_dict.values())
        #my_list.append(min(my_dict, key=my_dict.get))
        min_distances = sorted(my_dict.values())[:4]  # Select only 4 sentences from each cluster
        selected_indices = [index for index, distance_value in my_dict.items() if distance_value in min_distances]
        my_list.extend(selected_indices)
    print(f'No. of sentences in the kmeans:{len(my_list)}\n')
    summary_kmeans = ''.join(sentences_textrank[i] for i in sorted(my_list))
    #summary_kmeans = ''.join(summary_kmeans.split())
    return summary_kmeans


def main():
    directory_path_read = "C:/Users/user/Desktop/Text books/fns2020_dataset/validation/validation/annual_reports"
    directory_path_write= "C:/Users/user/Desktop/Text books/fns2020_dataset/validation/validation/G-summaries/"
    for filename in os.listdir(directory_path_read):
        if filename.endswith(".txt"):
            filepath = os.path.join(directory_path_read, filename)
            text = reading_text(filepath)
            #print('Original Text::-----------------\n')

            sentences = sent_tokenize(text.strip())
            #print('Sentences:---------------------\n',sentences)

            word_embeddings = extract_word_vectors()
            #print('Word embeddings:---------------------\n',len(word_embeddings))

            sentence_vectors = sentence_vector_representation(sentences, word_embeddings)
            #print('Sentence vectors:--------------------\n', len(sentence_vectors), sentence_vectors)

            sim_mat = create_similarity_matrix(sentences, sentence_vectors)
            #print('Similarity matrix:-------------------\n', sim_mat.shape, sim_mat)

            ranked_sentences = determine_sentence_rank(sentences, sim_mat)
            #print('Length of Ranked Sentences-------------\n', len(ranked_sentences))
            #print('Ranked sentences:--------------------\n', ranked_sentences)

            summary = generate_summary_textrank(sentences, ranked_sentences)
            #print('Length of summary after TestRank-------------\n', len(summary))
            #print('Summary generated by TextRank--------------------\n',summary)
    
            sentences_textrank = sent_tokenize(summary.strip())
            #print("Length of the text rank: \n", len(sentences_textrank))
            #print('Sentences:--------------------------------------\n',sentences_textrank)

            word_embeddings_textrank = extract_word_vectors()
            #print('Word embeddings:-----------------------------------\n',len(word_embeddings_textrank))

            sentence_vectors_textrank = sentence_vector_representation(sentences_textrank, word_embeddings)
            #print('Sentence vectors of summary generated by textrank ---------------------------\n', len(sentence_vectors_textrank), sentence_vectors_textrank)
    
            similarity_matrix = cosine_similarity(sentence_vectors_textrank, sentence_vectors)
            max_similarity_indices = similarity_matrix.argmax(axis=1)
            #print(f'Length of Indices is: {len(max_similarity_indices)}')
            #print(f'Indices of comparision:{max_similarity_indices}')
            sentence_vectors_textrank=sorted(sentence_vectors_textrank, key=lambda x: x[0])
            summary_final = generate_summary_kmeans(sentences_textrank, sentence_vectors_textrank)
            #print('Summary generated by combining TextRank and K-means--------------------\n',summary_final)
            summary_filename = os.path.join(directory_path_write, filename.replace(".txt", "_1.txt"))
            with open(summary_filename, 'w', encoding='utf-8') as summary_file:
                summary_file.write(summary_final)
        print(f"Summary for '{filename}' written to '{summary_filename}'.")
    
            #plot_similarity_matrix(sim_mat,sentences)
    
            #rouge = evaluate.load('rouge')
            #candidates = [summary_final]
    
    #with open('17_1.txt', 'rt',encoding='utf-8') as file2:
    #    temp = file2.read()
    #    summary_one = temp
        
        
    #with open('17_2.txt', 'rt',encoding='utf-8') as file3:
    #   temp = file3.read()
    #    summary_two = temp
        
    #with open('17_3.txt', 'rt',encoding='utf-8') as file4:
    #    temp = file4.read()
    #    summary_three = temp
    
    #with open('17_4.txt', 'rt',encoding='utf-8') as file5:
    #    temp = file5.read()
    #    summary_four = temp

    #references1 = [summary_one]
    #references2 = [summary_two]
    #references3 = [summary_three]
    #references4 = [summary_four]

    #results1 = rouge.compute(predictions=candidates, references=references1)
    #results2 = rouge.compute(predictions=candidates, references=references2)
    #results3 = rouge.compute(predictions=candidates, references=references3)
    #results4 = rouge.compute(predictions=candidates, references=references4)
    
    #results=[results1,results2,results3,results4]
    #data_frame= pd.DataFrame(results,index=['Summary1_Scores','Summary2_Scores','Summary3_Scores','Summary4_Scores'])
    #print("The Results of checking with combination of TextRank and K-means:")
    #print(data_frame,'\n')
    
    #candidates1=[summary]
    
    #references1 = [summary_one]
    #references2 = [summary_two]
    #references3 = [summary_three]
    #references4 = [summary_four]

    #results1 = rouge.compute(predictions=candidates1, references=references1)
    #results2 = rouge.compute(predictions=candidates1, references=references2)
    #results3 = rouge.compute(predictions=candidates1, references=references3)
    #results4 = rouge.compute(predictions=candidates1, references=references4)
    
    #results=[results1,results2,results3,results4]
    #data_frame= pd.DataFrame(results,index=['Summary1_Scores','Summary2_Scores','Summary3_Scores','Summary4_Scores'])
    #print("The Results of checking with only TextRank:")
    #print(data_frame,'\n')
        
#def plot_similarity_matrix(similarity_matrix, labels, max_sentences=10):
#    #create labels for sentences
#    labels= [f"Sentences {i}" for i in range(1,max_sentences+1)]
    #Select the top 'max_sentences' sentences
#    top_similarity_matrix = similarity_matrix[:max_sentences,:max_sentences]
    
    # Create a heatmap
#    sns.set(font_scale=1)
#    plt.figure(figsize=(10,10))
#    ax = sns.heatmap(top_similarity_matrix, annot=True, cmap="PRGn", xticklabels=labels, yticklabels=labels)
    #Customize the plot
#    plt.title("Similarity Matrix of 10 sentences")
#    plt.xlabel("Sentences")
#    plt.ylabel("Sentences")
    #print('\nOriginal document-------------------------------------------------\n',text,end='\n'*2)
    #print('Summary------------------------------------------------------------------\n',len(summary_final))
    
    
if __name__ == "__main__":
    main()


No. of sentences in the kmeans:32

Summary for '30855.txt' written to 'C:/Users/user/Desktop/Text books/fns2020_dataset/validation/validation/G-summaries/30855_1.txt'.
No. of sentences in the kmeans:34

Summary for '30856.txt' written to 'C:/Users/user/Desktop/Text books/fns2020_dataset/validation/validation/G-summaries/30856_1.txt'.
No. of sentences in the kmeans:33

Summary for '30858.txt' written to 'C:/Users/user/Desktop/Text books/fns2020_dataset/validation/validation/G-summaries/30858_1.txt'.
No. of sentences in the kmeans:32

Summary for '30866.txt' written to 'C:/Users/user/Desktop/Text books/fns2020_dataset/validation/validation/G-summaries/30866_1.txt'.
No. of sentences in the kmeans:33

Summary for '30874.txt' written to 'C:/Users/user/Desktop/Text books/fns2020_dataset/validation/validation/G-summaries/30874_1.txt'.
No. of sentences in the kmeans:32

Summary for '30886.txt' written to 'C:/Users/user/Desktop/Text books/fns2020_dataset/validation/validation/G-summaries/30886

In [None]:
##### Renaming Filename with taskname############ 

In [2]:
import os
import re

def move_and_rename_files(source_folder,destination_folder):
    # Ensure destination folder exists, create if not
    
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)
      

    # Loop through files in the source folder
    for filename in os.listdir(source_folder):
        #print(source_folder)
        source_path = os.path.join(source_folder, filename)
        print(source_path)
        # Ensure it is a file (not a folder)
        if os.path.isfile(source_path):
            # Extract file name and extension
            fname, fext = os.path.splitext(filename)
            fname1 = fname.split('_')[0]
            # Construct the new name
            new_name = f'fns{fname1}_{fname1}{fext}'

            # Construct the destination path
            destination_path = os.path.join(destination_folder, new_name)

            # Move the file by renaming it
            os.rename(source_path, destination_path)
            print(f'Moved and renamed: {filename} to {new_name}')

# Replace these paths with your actual source and destination paths
source_path = r'C:/Users/user/Desktop/Text books/fns2020_dataset/validation/validation/annual_reports/'
destination_path = r'C:/Users/user/Desktop/Text books/fns2020_dataset/'

move_and_rename_files(source_path,destination_path)

In [None]:
##### Renaming Gold summaries Filename with taskname############ 

In [None]:
import os

def rename_files_with_prefix(folder_path, prefix):
    # Ensure the folder exists
    if not os.path.exists(folder_path):
        print(f"Error: Folder '{folder_path}' does not exist.")
        return

    # Iterate through files in the folder
    for filename in os.listdir(folder_path):
        old_path = os.path.join(folder_path, filename)

        # Check if it's a file
        if os.path.isfile(old_path):
            # Extract the numeric part from the filename
            numeric_part = ''.join(c for c in filename.split('_')[0] if c.isdigit())

            # Construct the new filename with the specified prefix and extracted numeric part
            new_filename = f'{prefix}{numeric_part}_{filename}'

            # Construct the new path
            new_path = os.path.join(folder_path, new_filename)

            # Rename the file
            os.rename(old_path, new_path)
            print(f'Renamed: {filename} to {new_filename}')

# Replace these values with your actual folder path and desired prefix
folder_path = r'C:\Users\user\Desktop\Text books\Topics in NLP\fns2020_dataset\validation\validation\gold_summaries'
prefix = 'fns'

rename_files_with_prefix(folder_path, prefix)

In [None]:
###### Running JAR file using command prompt###############

In [3]:
# to execute an external command
import subprocess

# Path to the Java executable
java_executable = "java"

# Path to the .jar file you want to run
jar_file = "C:/Users/user/Desktop/Text books/Topics in NLP/rouge2_v1.2.2_runnable/v1.2.2/rouge2-1.2.2.jar"

# Command to execute the .jar file
command = [java_executable, "-jar", jar_file]

# Run the command
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Wait for the process to finish and get the output
stdout, stderr = process.communicate()

# Print the output
print("Output:", stdout.decode())
print("Errors:", stderr.decode())

Output: 2024-02-10 10:56:24 INFO  SettingsUtil:33 - Using rouge.properties='C:\Users\user\PHD Work\rouge.properties'
2024-02-10 10:56:24 INFO  ROUGECalculator:203 - Working on fns32134 ngram=L
2024-02-10 10:56:25 INFO  ROUGECalculator:203 - Working on fns32134 ngram=1
2024-02-10 10:56:25 INFO  ROUGECalculator:203 - Working on fns32134 ngram=2
2024-02-10 10:56:25 INFO  ROUGECalculator:203 - Working on fns32134 ngram=SU4
2024-02-10 10:56:25 INFO  ROUGECalculator:203 - Working on fns32376 ngram=L
2024-02-10 10:56:25 INFO  ROUGECalculator:203 - Working on fns32376 ngram=1
2024-02-10 10:56:25 INFO  ROUGECalculator:203 - Working on fns32376 ngram=2
2024-02-10 10:56:25 INFO  ROUGECalculator:203 - Working on fns32376 ngram=SU4
2024-02-10 10:56:25 INFO  ROUGECalculator:203 - Working on fns30991 ngram=L
2024-02-10 10:56:25 INFO  ROUGECalculator:203 - Working on fns30991 ngram=1
2024-02-10 10:56:25 INFO  ROUGECalculator:203 - Working on fns30991 ngram=2
2024-02-10 10:56:25 INFO  ROUGE

In [None]:
###### Calculating the Average Rouge Score for the generated ############

In [4]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv("results.csv")

# Group the DataFrame by "ROUGE-Type" and calculate the averages for each group
averages_by_rouge_type = df.groupby("ROUGE-Type")[["Avg_Precision", "Avg_Recall", "Avg_F-Score"]].mean()

# Display the calculated averages
print("Averages for each ROUGE-Type:")
print(averages_by_rouge_type)


Averages for each ROUGE-Type:
                           Avg_Precision  Avg_Recall  Avg_F-Score
ROUGE-Type                                                       
ROUGE-1+StopWordRemoval         0.229219    0.346082     0.262690
ROUGE-2+StopWordRemoval         0.080570    0.149427     0.098990
ROUGE-L+StopWordRemoval         0.196099    0.298000     0.230466
ROUGE-SU4+StopWordRemoval       0.114745    0.223018     0.143899


In [14]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\KBS
[nltk_data]     Iyer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True