<a href="https://colab.research.google.com/github/Shahi77/Graph-based-Multi-Document-Summarization/blob/main/gcn_mds.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

# Check if the files exist
print(os.listdir('/content/drive/My Drive/Colab Notebooks/datasets/final/'))

['train_final_Vishwas.csv', 'SGM3.ipynb', 'test_final_Vishwas_corrected.json', 'train_final_Vishwas_corrected.json', 'bekaar.ipynb', 'test_final_Vishwas.csv', 'Summaries.zip', 'T5.ipynb', 'minilm_bal_exsum.pth', 'T5_model.pth', 'bekaar_SGM.ipynb', 'SBERT_Model_Ready.pth', 'Summary_Data_new', 'Whole_text_data', 'Summaries']


In [2]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


MDS On Whole_text_data

In [6]:
import os
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load spaCy model for English
nlp = spacy.load('en_core_web_md')

# Path to the dataset
summaries_dir = '/content/drive/My Drive/Colab Notebooks/datasets/final/Whole_text_data'


In [13]:
# Function to read document clusters from the dataset directory
def read_document_cluster(summaries_dir):
    documents = []
    summaries_files = os.listdir(summaries_dir)

    for file_name in summaries_files:
        file_path = os.path.join(summaries_dir, file_name)
        try:
            with open(file_path, 'r') as file:
                documents.append(file.read())
        except Exception as e:
            print(f"Error reading {file_name}: {e}")
    return documents

In [19]:
import os
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Define device for computation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Function to read document clusters from the dataset directory
def read_document_cluster(summaries_dir):
    documents = []
    summaries_files = os.listdir(summaries_dir)

    for file_name in summaries_files:
        file_path = os.path.join(summaries_dir, file_name)
        try:
            with open(file_path, 'r') as file:
                documents.append(file.read())
            print(f"Loaded {file_name}")
        except Exception as e:
            print(f"Error reading {file_name}: {e}")
    return documents

# Tokenization using spaCy
def tokenize_with_spacy(text, nlp):
    doc = nlp(text)
    return [token.text for token in doc]

# Get sentence embeddings using spaCy vectors
def get_embeddings_with_spacy(text, nlp):
    doc = nlp(text)
    return doc.vector  # Returns a 1D vector of the document

# Function to split document into sentences
def split_into_sentences(text):
    sentences = text.split('.')
    return [sentence.strip() for sentence in sentences if sentence]

# Custom function to calculate cosine similarity with a threshold
def cosine_similarity_threshold(embeddings, threshold=0.2):
    sim_matrix = cosine_similarity(embeddings)
    adj_matrix = (sim_matrix > threshold).astype(float)  # Apply threshold to create edges
    return adj_matrix

# Extract sentence features
def extract_sentence_features(sentences, nlp):
    features = []
    for i, sentence in enumerate(sentences):
        length = len(sentence.split())
        features.append({
            'position': i,
            'length': length,
            'proper_nouns': sum([1 for token in tokenize_with_spacy(sentence, nlp) if token.isupper()]),
            'is_first_three': 1 if i < 3 else 0
        })
    return features

# Calculate sentence personalization score
def calculate_personalization_scores(features):
    weights = {
        'position': 0.1,
        'length': 0.2,
        'proper_nouns': 0.4,
        'is_first_three': 0.3
    }
    personalization_scores = []
    for feature in features:
        score = sum([feature[key] * weights[key] for key in weights])
        personalization_scores.append(score)
    return personalization_scores

# Build the sentence relation graph using cosine similarity and personalization
def build_sentence_relation_graph(sentences, nlp, threshold=0.2):
    print(f"Building sentence relation graph for {len(sentences)} sentences.")
    sentence_embeddings = [get_embeddings_with_spacy(sentence, nlp) for sentence in sentences]
    sentence_embeddings = np.vstack(sentence_embeddings)
    print(f"Shape of sentence embeddings: {sentence_embeddings.shape}")

    adj_matrix = cosine_similarity_threshold(sentence_embeddings, threshold)
    print(f"Adjacency matrix shape: {adj_matrix.shape}")

    sentence_features = extract_sentence_features(sentences, nlp)
    personalization_scores = calculate_personalization_scores(sentence_features)
    print(f"Personalization scores: {personalization_scores}")

    for i in range(adj_matrix.shape[0]):
        for j in range(adj_matrix.shape[1]):
            if adj_matrix[i, j] > 0:
                adj_matrix[i, j] *= personalization_scores[i]

    adj_matrix = adj_matrix / adj_matrix.max()
    print("Adjacency matrix normalized.")
    return adj_matrix

# GRU-based Sentence Encoder
class GRUSentenceEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(GRUSentenceEncoder, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)

    def forward(self, x):
        _, h_n = self.gru(x)
        return h_n.squeeze(0)

# Processing document cluster through the GRU
def process_document_cluster_through_gru(documents, nlp, hidden_size=128):
    input_size = 300  # SpaCy vectors are 300-dimensional
    encoder = GRUSentenceEncoder(input_size, hidden_size).to(device)

    sentence_encodings = []
    for doc in documents:
        sentences = split_into_sentences(doc)
        print(f"Processing document with {len(sentences)} sentences.")
        sentence_embeddings = [torch.tensor(get_embeddings_with_spacy(sentence, nlp)).to(device) for sentence in sentences]
        embeddings_tensor = torch.stack(sentence_embeddings).unsqueeze(0).to(device)

        # Ensure that the input tensor shape is correct for GRU
        sentence_encoding = encoder(embeddings_tensor)
        sentence_encodings.append(sentence_encoding)

    return torch.stack(sentence_encodings)

# GCN Layer Definition
# class GCNLayer(nn.Module):
#     def __init__(self, in_channels, out_channels):
#         super(GCNLayer, self).__init__()
#         self.fc = nn.Linear(in_channels, out_channels)

#    def forward(self, x, adj):
#     # Normalize the adjacency matrix
#       adj = self.normalize_adj(adj)

#     # Check shapes before multiplication
#     `print(f"Shape of adj: {adj.shape}, Shape of x: {x.shape}")

#     # Ensure dimensions match
#     `if adj.shape[0] != x.shape[0]:
#         raise ValueError(f"Dimension mismatch: adj has {adj.shape[0]} nodes, but x has {x.shape[0]} features.")

#     # Perform the multiplication
#        x = torch.matmul(adj, x)
#        return F.relu(x)

#     def normalize_adj(self, adj):
#         num_nodes = adj.size(0)
#         I = torch.eye(num_nodes).to(device)
#         adj = adj + I
#         D = torch.sum(adj, dim=1)
#         D_inv_sqrt = torch.pow(D, -0.5)
#         D_inv_sqrt[torch.isinf(D_inv_sqrt)] = 0
#         D_inv_sqrt = torch.diag(D_inv_sqrt)
#         return torch.matmul(D_inv_sqrt, torch.matmul(adj, D_inv_sqrt))

class GCNLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GCNLayer, self).__init__()
        self.fc = nn.Linear(in_channels, out_channels)

    def forward(self, x, adj):
        adj = self.normalize_adj(adj)
        assert adj.shape[0] == x.shape[0], "Dimension mismatch: adj and x must have the same number of nodes."

        x = torch.matmul(adj, x)
        return F.relu(x)

    def normalize_adj(self, adj):
        num_nodes = adj.size(0)
        I = torch.eye(num_nodes).to(device)
        adj = adj + I
        D = torch.sum(adj, dim=1)
        D_inv_sqrt = torch.pow(D, -0.5)
        D_inv_sqrt[torch.isinf(D_inv_sqrt)] = 0
        D_inv_sqrt = torch.diag(D_inv_sqrt)
        return torch.matmul(D_inv_sqrt, torch.matmul(adj, D_inv_sqrt))


# Read and process documents
summaries_dir = '/content/drive/My Drive/Colab Notebooks/datasets/final/Whole_text_data'  # Replace with actual path
documents = read_document_cluster(summaries_dir)
print(f"Loaded {len(documents)} documents.")

# Initialize spaCy
nlp = spacy.load('en_core_web_md')

# Split documents into sentences
sentences_per_document = [split_into_sentences(doc) for doc in documents]
print("Done splitting documents into sentences.")

# Build sentence relation graphs for each document
adj_matrices = [build_sentence_relation_graph(sentences, nlp) for sentences in sentences_per_document]
print("Done building adjacency matrices.")

# Process documents through GRU
sentence_encodings = process_document_cluster_through_gru(documents, nlp, hidden_size=128)
print(f"Processed document clusters through GRU. Shape: {sentence_encodings.shape}")

# Example usage of GCN layer
gcn = GCNLayer(128, 64).to(device)
# Convert adj_matrices to tensor and move to device
adj_tensor = torch.tensor(adj_matrices[0], dtype=torch.float32).to(device)

# Check the shape of the sentence encodings
print(f"Shape of sentence_encodings before: {sentence_encodings.shape}")

# Ensure the encodings have the correct dimensions [608, 128]
if len(sentence_encodings.shape) == 3 and sentence_encodings.shape[1] == 1:
    sentence_encodings = sentence_encodings.squeeze(1)  # Remove the second dimension

# Verify the final shape of sentence_encodings
print(f"Shape of sentence_encodings after: {sentence_encodings.shape}")

# Ensure adj_tensor has the right dimensions
print(f"Shape of adj_tensor: {adj_tensor.shape}")

# Apply the GCN layer to the sentence encodings and adjacency matrix
gcn_output = gcn(sentence_encodings, adj_tensor)

# Print the shape of the output
print("GCN output shape: ", gcn_output.shape)



Using device: cpu
Loaded P02-1022.txt
Loaded P02-1053.txt
Loaded P95-1026.txt
Loaded J90-1003.txt
Loaded P06-1004.txt
Loaded H05-1079.txt
Loaded W04-3206.txt
Loaded W03-0428.txt
Loaded P93-1022.txt
Loaded W06-2933.txt
Loaded A94-1006.txt
Loaded D08-1027.txt
Loaded P05-1073.txt
Loaded P04-1066.txt
Loaded P04-1077.txt
Loaded P95-1021.txt
Loaded W02-0817.txt
Loaded P03-1023.txt
Loaded P02-1017.txt
Loaded P08-1012.txt
Loaded D09-1127.txt
Loaded H05-1059.txt
Loaded P06-2005.txt
Loaded P06-2066.txt
Loaded P07-1005.txt
Loaded P01-1005.txt
Loaded D11-1125.txt
Loaded P97-1013.txt
Loaded W06-0301.txt
Loaded W07-2009.txt
Loaded P11-1061.txt
Loaded W97-0119.txt
Loaded P02-1038.txt
Loaded P98-2204.txt
Loaded A00-1043.txt
Loaded W02-0109.txt
Loaded P03-1029.txt
Loaded N06-1056.txt
Loaded N03-1016.txt
Loaded D09-1005.txt
Loaded W03-0424.txt
Loaded P06-1005.txt
Loaded N06-1011.txt
Loaded W06-1615.txt
Loaded N04-1033.txt
Loaded I05-2038.txt
Loaded P04-3022.txt
Loaded P99-1065.txt
Loaded E03-1076.txt
Lo

AssertionError: Dimension mismatch: adj and x must have the same number of nodes.

In [None]:
# Tokenization using spaCy
def tokenize_with_spacy(text):
    doc = nlp(text)
    return [token.text for token in doc]

# Get sentence embeddings using spaCy vectors
def get_embeddings_with_spacy(text):
    doc = nlp(text)
    return doc.vector  # Returns a 1D vector of the document

# Function to split document into sentences
def split_into_sentences(text):
    sentences = text.split('.')
    return [sentence.strip() for sentence in sentences if sentence]


In [None]:
# Custom function to calculate cosine similarity with a threshold
from sklearn.preprocessing import normalize

def cosine_similarity_threshold(embeddings, threshold=0.1):
    # Normalize the embeddings
    embeddings = normalize(embeddings)
    sim_matrix = cosine_similarity(embeddings)
    adj_matrix = (sim_matrix > threshold).astype(float)  # Apply threshold to create edges
    return adj_matrix


# Extract sentence features
def extract_sentence_features(sentences):
    features = []
    for i, sentence in enumerate(sentences):
        length = len(sentence.split())
        features.append({
            'position': i,
            'length': length,
            'proper_nouns': sum([1 for token in tokenize_with_spacy(sentence) if token.isupper()]),
            'is_first_three': 1 if i < 3 else 0
        })
    return features

# Calculate sentence personalization score
def calculate_personalization_scores(features):
    weights = {
        'position': 0.1,
        'length': 0.2,
        'proper_nouns': 0.4,
        'is_first_three': 0.3
    }
    personalization_scores = []
    for feature in features:
        score = sum([feature[key] * weights[key] for key in weights])
        personalization_scores.append(score)
    return personalization_scores

# Build the sentence relation graph using cosine similarity and personalization
def build_sentence_relation_graph(sentences, threshold=0.2):
    sentence_embeddings = [get_embeddings_with_spacy(sentence) for sentence in sentences]
    sentence_embeddings = np.vstack(sentence_embeddings)

    adj_matrix = cosine_similarity_threshold(sentence_embeddings, threshold)
    sentence_features = extract_sentence_features(sentences)
    personalization_scores = calculate_personalization_scores(sentence_features)

    for i in range(adj_matrix.shape[0]):
        for j in range(adj_matrix.shape[1]):
            if adj_matrix[i, j] > 0:
                adj_matrix[i, j] *= personalization_scores[i]

    adj_matrix = adj_matrix / adj_matrix.max()
    return adj_matrix

In [None]:
class GRUSentenceEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(GRUSentenceEncoder, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)

    def forward(self, x):
        _, h_n = self.gru(x)
        return h_n.squeeze(0)

# Processing document cluster through the GRU
from torch.nn.utils.rnn import pad_sequence

def process_document_cluster_through_gru(documents, hidden_size=128):
    input_size = 300  # SpaCy vectors are 300-dimensional
    encoder = GRUSentenceEncoder(input_size, hidden_size).to(device)

    sentence_encodings = []
    for doc in documents:
        sentences = split_into_sentences(doc)
        sentence_embeddings = [torch.tensor(get_embeddings_with_spacy(sentence)).to(device) for sentence in sentences]

        # Padding sentence embeddings to ensure uniform shape
        embeddings_tensor = pad_sequence(sentence_embeddings, batch_first=True).unsqueeze(0).to(device)

        # Ensure that the input tensor shape is correct for GRU
        sentence_encoding = encoder(embeddings_tensor)
        sentence_encodings.append(sentence_encoding)

    return torch.stack(sentence_encodings)




In [None]:
class GCNLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GCNLayer, self).__init__()
        self.fc = nn.Linear(in_channels, out_channels)

    def forward(self, x, adj):
        x = self.fc(x)
        adj = self.normalize_adj(adj)
        x = torch.matmul(adj, x)
        return F.relu(x)

    def normalize_adj(self, adj):
        num_nodes = adj.size(0)
        I = torch.eye(num_nodes).to(device)
        adj = adj + I
        D = torch.sum(adj, dim=1)
        D_inv_sqrt = torch.pow(D, -0.5)
        D_inv_sqrt[torch.isinf(D_inv_sqrt)] = 0
        D_inv_sqrt = torch.diag(D_inv_sqrt)
        return torch.matmul(D_inv_sqrt, torch.matmul(adj, D_inv_sqrt))



In [None]:
class DocumentClusterEncoder(nn.Module):
    def __init__(self, sentence_input_size, sentence_hidden_size, doc_input_size, doc_hidden_size):
        super(DocumentClusterEncoder, self).__init__()
        self.sentence_encoder = GRUSentenceEncoder(sentence_input_size, sentence_hidden_size)
        self.doc_encoder = GRUSentenceEncoder(sentence_hidden_size, doc_hidden_size)

    def forward(self, document_clusters):
        document_embeddings = []

        for doc_cluster in document_clusters:
            sentence_embeddings = self.sentence_encoder(doc_cluster)
            sentence_embeddings = sentence_embeddings.unsqueeze(0)
            doc_embedding = self.doc_encoder(sentence_embeddings)
            document_embeddings.append(doc_embedding)

        document_embeddings = torch.stack(document_embeddings)
        cluster_embedding = torch.mean(document_embeddings, dim=0)
        return cluster_embedding


In [None]:
class SalienceEstimator(nn.Module):
    def __init__(self, hidden_size):
        super(SalienceEstimator, self).__init__()
        self.W1 = nn.Linear(hidden_size, hidden_size)
        self.W2 = nn.Linear(hidden_size, hidden_size)
        self.v = nn.Parameter(torch.randn(hidden_size))

    def forward(self, sentence_embeddings, cluster_embedding):
        cluster_embedding = cluster_embedding.unsqueeze(0)
        scores = torch.matmul(
            F.tanh(self.W1(cluster_embedding) + self.W2(sentence_embeddings)),
            self.v
        )
        salience_scores = F.softmax(scores.squeeze(0), dim=0)
        return salience_scores


In [None]:
import torch

# Define the device to use: GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read and process documents
documents = read_document_cluster(summaries_dir)
sentences_per_document = [split_into_sentences(doc) for doc in documents]
# first_document = documents[0]
# sentences_per_document = split_into_sentences(first_document)

print("done with sentences")

# Build sentence relation graphs for each document
adj_matrices = [build_sentence_relation_graph(sentences) for sentences in sentences_per_document]
print("done with adj_matrices")

# Process documents through GRU
sentence_encodings = process_document_cluster_through_gru(documents, hidden_size=128)
#sentence_encodings = process_document_cluster_through_gru([first_document], hidden_size=128)
print("done with encodings")

# Example usage of GCN layer
gcn = GCNLayer(128, 64).to(device)
adj_tensor = torch.tensor(adj_matrices[0], dtype=torch.float32).to(device)
sentence_encodings = sentence_encodings.to(device)

# Check the shape of the sentence encodings
# Check the shape of the sentence encodings
print(f"Shape of sentence_encodings before: {sentence_encodings.shape}")

# Ensure that the encodings have the correct dimensions [num_sentences, embedding_size]
if len(sentence_encodings.shape) == 3 and sentence_encodings.shape[1] == 1:
    sentence_encodings = sentence_encodings.squeeze(1)  # Remove extra dimension if needed

print(f"Shape of sentence_encodings after: {sentence_encodings.shape}")

# Apply the GCN layer to the sentence encodings and adjacency matrix
gcn_output = gcn(sentence_encodings, adj_tensor)

print("GCN output: ", gcn_output)



done with sentences
done with adj_matrices
done with encodings
Shape of sentence_encodings before: torch.Size([608, 1, 128])
Shape of sentence_encodings after: torch.Size([608, 1, 128])


RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [608, 331] but got: [608, 1].

In [None]:
import os
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer  # Ensure rouge_score is installed: pip install rouge-score

# Function to read document clusters from the dataset directory
def read_document_cluster(summaries_dir):
    documents = []
    summaries_files = os.listdir(summaries_dir)

    for file_name in summaries_files:
        file_path = os.path.join(summaries_dir, file_name)
        try:
            with open(file_path, 'r') as file:
                documents.append(file.read())
        except Exception as e:
            print(f"Error reading {file_name}: {e}")
    return documents

# Function to read reference summaries
def read_reference_summaries(reference_dir):
    references = {}
    summaries_files = os.listdir(reference_dir)

    for file_name in summaries_files:
        file_path = os.path.join(reference_dir, file_name)
        try:
            with open(file_path, 'r') as file:
                references[file_name] = file.read()
        except Exception as e:
            print(f"Error reading {file_name}: {e}")
    return references

# Sentence selection function
def select_sentences(salience_scores, sentences, length_limit=55, redundancy_threshold=0.5):
    if salience_scores.dim() > 1:
        salience_scores = salience_scores.squeeze(0)

    scores, indices = torch.sort(salience_scores, descending=True)
    selected_sentences = []
    current_length = 0
    tfidf_vectorizer = TfidfVectorizer()

    for idx in indices:
        sentence = sentences[idx]
        sentence_length = len(sentence.split())

        if current_length + sentence_length > length_limit:
            break

        if selected_sentences:
            all_sentences = selected_sentences + [sentence]
            tfidf_matrix = tfidf_vectorizer.fit_transform(all_sentences)
            cosine_similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])

            if cosine_similarities.max() > redundancy_threshold:
                continue

        selected_sentences.append(sentence)
        current_length += sentence_length

    return selected_sentences

# Function to calculate ROUGE scores
def calculate_rouge(reference_summary, generated_summary):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_summary, generated_summary)
    return scores

# Path to the dataset and reference summaries
summaries_dir = '/content/drive/My Drive/Colab Notebooks/datasets/final/Whole_text_data'
reference_summaries_dir = '/content/drive/My Drive/Colab Notebooks/datasets/final/Summaries'

# Read the document cluster from the dataset
documents = read_document_cluster(summaries_dir)
reference_summaries = read_reference_summaries(reference_summaries_dir)

# Example function to get salience scores for sentences (replace with your actual model's output)
def get_salience_scores(num_sentences):
    return torch.rand(num_sentences)  # Random salience scores for now (replace with model output)

# Iterate over each document, generate a summary, and compute ROUGE scores
for doc_name, doc in zip(reference_summaries.keys(), documents):
    # Split document into sentences (you can replace this with a more sophisticated tokenizer)
    sentences = doc.split('. ')  # Simple sentence splitting by '. ' (adjust based on your data)

    # Get salience scores for each sentence (replace with your actual method)
    salience_scores = get_salience_scores(len(sentences))

    # Generate summary using sentence selection
    generated_summary_sentences = select_sentences(salience_scores, sentences, length_limit=100)
    generated_summary = ' '.join(generated_summary_sentences)

    # Get reference summary
    reference_summary = reference_summaries.get(doc_name, "Reference summary not found")  # Fallback text

    # Calculate ROUGE scores
    rouge_scores = calculate_rouge(reference_summary, generated_summary)

    # Print the results
    print(f"Document: {doc_name}")
    print("Generated Summary:")
    print(generated_summary)
    print("\nROUGE Scores:", rouge_scores)
    print("-" * 80)


ModuleNotFoundError: No module named 'rouge_score'

Complete Code
