<a href="https://colab.research.google.com/github/Shahi77/Graph-based-Multi-Document-Summarization/blob/main/gcn_mds1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

# Check if the files exist
print(os.listdir('/content/drive/My Drive/Colab Notebooks/datasets/final/'))

['train_final_Vishwas.csv', 'SGM3.ipynb', 'test_final_Vishwas_corrected.json', 'train_final_Vishwas_corrected.json', 'bekaar.ipynb', 'test_final_Vishwas.csv', 'Summaries.zip', 'T5.ipynb', 'minilm_bal_exsum.pth', 'T5_model.pth', 'bekaar_SGM.ipynb', 'SBERT_Model_Ready.pth', 'Summary_Data_new', 'Whole_text_data', 'Summaries']


In [None]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


MDS On Whole_text_data

In [None]:
import os
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load spaCy model for English
nlp = spacy.load('en_core_web_md')

# Path to the dataset
summaries_dir = '/content/drive/My Drive/Colab Notebooks/datasets/final/Whole_text_data'


In [None]:
# Function to read document clusters from the dataset directory
def read_document_cluster(summaries_dir):
    documents = []
    summaries_files = os.listdir(summaries_dir)

    for file_name in summaries_files:
        file_path = os.path.join(summaries_dir, file_name)
        try:
            with open(file_path, 'r') as file:
                documents.append(file.read())
        except Exception as e:
            print(f"Error reading {file_name}: {e}")
    return documents

In [None]:
# Import required libraries
import os
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Check if CUDA is available and set device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Function to read document clusters from the dataset directory
def read_document_cluster(summaries_dir):
    documents = []
    summaries_files = os.listdir(summaries_dir)
    for file_name in summaries_files:
        file_path = os.path.join(summaries_dir, file_name)
        try:
            with open(file_path, 'r') as file:
                documents.append(file.read())
        except Exception as e:
            print(f"Error reading {file_name}: {e}")
    return documents

# Tokenization using spaCy
def tokenize_with_spacy(text, nlp):
    doc = nlp(text)
    return [token.text for token in doc]

# Get sentence embeddings using spaCy vectors
def get_embeddings_with_spacy(text, nlp):
    doc = nlp(text)
    return doc.vector  # Returns a 1D vector of the document

# Function to split document into sentences
def split_into_sentences(text):
    sentences = text.split('.')
    return [sentence.strip() for sentence in sentences if sentence]

# Custom function to calculate cosine similarity with a threshold
def cosine_similarity_threshold(embeddings, threshold=0.2):
    sim_matrix = cosine_similarity(embeddings)
    adj_matrix = (sim_matrix > threshold).astype(float)  # Apply threshold to create edges
    return adj_matrix

# Extract sentence features
def extract_sentence_features(sentences, nlp):
    features = []
    for i, sentence in enumerate(sentences):
        length = len(sentence.split())
        features.append({
            'position': i,
            'length': length,
            'proper_nouns': sum([1 for token in tokenize_with_spacy(sentence, nlp) if token.isupper()]),
            'is_first_three': 1 if i < 3 else 0
        })
    return features

# Calculate sentence personalization score
def calculate_personalization_scores(features):
    weights = {
        'position': 0.1,
        'length': 0.2,
        'proper_nouns': 0.4,
        'is_first_three': 0.3
    }
    personalization_scores = []
    for feature in features:
        score = sum([feature[key] * weights[key] for key in weights])
        personalization_scores.append(score)
    return personalization_scores

# Build the sentence relation graph using cosine similarity and personalization
def build_sentence_relation_graph(sentences, nlp, threshold=0.2):
    print(f"Building sentence relation graph for {len(sentences)} sentences.")
    sentence_embeddings = [get_embeddings_with_spacy(sentence, nlp) for sentence in sentences]
    sentence_embeddings = np.vstack(sentence_embeddings)
    print(f"Shape of sentence embeddings: {sentence_embeddings.shape}")

    adj_matrix = cosine_similarity_threshold(sentence_embeddings, threshold)
    print(f"Adjacency matrix shape: {adj_matrix.shape}")

    sentence_features = extract_sentence_features(sentences, nlp)
    personalization_scores = calculate_personalization_scores(sentence_features)
    print(f"Personalization scores: {personalization_scores}")

    for i in range(adj_matrix.shape[0]):
        for j in range(adj_matrix.shape[1]):
            if adj_matrix[i, j] > 0:
                adj_matrix[i, j] *= personalization_scores[i]

    adj_matrix = adj_matrix / adj_matrix.max()
    print("Adjacency matrix normalized.")
    return adj_matrix

# GRU-based Sentence Encoder
class GRUSentenceEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(GRUSentenceEncoder, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)

    def forward(self, x):
        _, h_n = self.gru(x)
        return h_n.squeeze(0)

# Processing document cluster through the GRU
def process_document_cluster_through_gru(documents, nlp, hidden_size=128):
    input_size = 300  # SpaCy vectors are 300-dimensional
    encoder = GRUSentenceEncoder(input_size, hidden_size).to(device)

    sentence_encodings = []
    for doc in documents:
        sentences = split_into_sentences(doc)
        sentence_embeddings = [torch.tensor(get_embeddings_with_spacy(sentence, nlp)).to(device) for sentence in sentences]
        embeddings_tensor = torch.stack(sentence_embeddings).unsqueeze(0).to(device)

        # Ensure that the input tensor shape is correct for GRU
        sentence_encoding = encoder(embeddings_tensor)
        sentence_encodings.append(sentence_encoding)

    return torch.stack(sentence_encodings)


# Define GCN Layer
class GraphConvolution(nn.Module):
    def __init__(self, in_features, out_features):
        super(GraphConvolution, self).__init__()
        self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features))
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.xavier_uniform_(self.weight)

    def forward(self, input, adj):
        support = torch.mm(input, self.weight)
        output = torch.spmm(adj, support)
        return output

# Define GCN Model
class GCN(nn.Module):
    def __init__(self, nfeat, nhid, nout):
        super(GCN, self).__init__()
        self.gc1 = GraphConvolution(nfeat, nhid)
        self.gc2 = GraphConvolution(nhid, nout)
        self.relu = nn.ReLU()

    def forward(self, x, adj):
        x = self.gc1(x, adj)
        x = self.relu(x)
        x = self.gc2(x, adj)
        return x

# Define GRU-based Encoder
class GRUEncoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(GRUEncoder, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        output, hidden = self.gru(x)
        return hidden

# Load dataset (update path if necessary)
summaries_dir = '/content/drive/My Drive/Colab Notebooks/datasets/final/Whole_text_data'

# Function to encode a document's sentences
def encode_sentences(doc):
    vectors = [nlp(sentence.text).vector for sentence in doc.sents]
    return torch.tensor(np.array(vectors), dtype=torch.float32)

# List to hold sentence encodings and adjacency matrices
sentence_encodings_list = []
adj_matrix_list = []

# Iterate through files in summaries directory
for file_name in os.listdir(summaries_dir):
    file_path = os.path.join(summaries_dir, file_name)

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
        doc = nlp(text)

        # Encode sentences
        sentence_encodings = encode_sentences(doc)
        sentence_encodings_list.append(sentence_encodings)

        # Create a random adjacency matrix (replace with actual logic as needed)
        num_sentences = sentence_encodings.shape[0]
        adj_matrix = torch.eye(num_sentences)  # Identity matrix (can replace with a different adjacency matrix)
        adj_matrix_list.append(adj_matrix)

# Ensure data is on the correct device (GPU/CPU)
sentence_encodings_list = [enc.to(device) for enc in sentence_encodings_list]
adj_matrix_list = [adj.to(device) for adj in adj_matrix_list]

# Initialize the GCN and GRU models
gcn = GCN(nfeat=300, nhid=128, nout=128).to(device)  # Assuming 300-dimensional sentence embeddings
gru_encoder = GRUEncoder(input_size=300, hidden_size=128, num_layers=1).to(device)

# Example of processing the first document
sentence_encodings = sentence_encodings_list[0]
adj_matrix = adj_matrix_list[0]

# Forward pass through GRU and GCN
with torch.no_grad():  # Disable gradients to save memory during inference
    # GRU encoder
    gru_output = gru_encoder(sentence_encodings.unsqueeze(0))  # Add batch dimension
    print(f"GRU output shape: {gru_output.shape}")

    # GCN layer
    gcn_output = gcn(sentence_encodings, adj_matrix)
    print(f"GCN output shape: {gcn_output.shape}")

# Optimizer and loss function
optimizer = optim.Adam(list(gru_encoder.parameters()) + list(gcn.parameters()), lr=0.001)
criterion = nn.MSELoss()

# Example training loop (replace with actual training process as needed)
epochs = 10
for epoch in range(epochs):
    optimizer.zero_grad()

    # Forward pass for training (using first document as an example)
    gru_output = gru_encoder(sentence_encodings.unsqueeze(0))
    gcn_output = gcn(sentence_encodings, adj_matrix)

    # Aggregating the GCN output
    gcn_output_aggregated = torch.sum(gcn_output, dim=0).unsqueeze(0)  # Shape [1, 128]

    # Squeeze GRU output
    gru_output_squeezed = gru_output.squeeze(0)  # Shape [1, 128]

    # Loss calculation
    loss = criterion(gcn_output_aggregated, gru_output_squeezed)
    loss.backward()

    optimizer.step()

    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item()}")


Using device: cpu


KeyboardInterrupt: 

In [None]:
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
!pip install torch-geometric

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
Collecting torch-geometric
  Downloading torch_geometric-2.6.0-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.0-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.0


In [2]:
import os
import numpy as np
import torch
import torch.nn as nn
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import torch_geometric
from torch_geometric.nn import GCNConv

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Read Document Cluster
def read_document_cluster(summaries_dir):
    documents = []
    summaries_files = os.listdir(summaries_dir)
    for file_name in summaries_files:
        file_path = os.path.join(summaries_dir, file_name)
        try:
            with open(file_path, 'r') as file:
                documents.append(file.read())
        except Exception as e:
            print(f"Error reading {file_name}: {e}")
    return documents

# 2. Tokenization using spaCy
def tokenize_with_spacy(text, nlp):
    doc = nlp(text)
    return [token.text for token in doc]

# 3. Get Sentence Embeddings
def get_embeddings_with_spacy(text, nlp):
    doc = nlp(text)
    return doc.vector  # Returns a 1D vector of the document

# 4. Function to Split Document into Sentences
def split_into_sentences(text):
    sentences = text.split('.')
    return [sentence.strip() for sentence in sentences if sentence]

# 5. Custom Function to Calculate Cosine Similarity with a Threshold
def cosine_similarity_threshold(embeddings, threshold=0.2):
    sim_matrix = cosine_similarity(embeddings)
    adj_matrix = (sim_matrix > threshold).astype(float)  # Apply threshold to create edges
    return adj_matrix

# 6. Extract Sentence Features
def extract_sentence_features(sentences, nlp):
    features = []
    for i, sentence in enumerate(sentences):
        length = len(sentence.split())
        features.append({
            'position': i,
            'length': length,
            'proper_nouns': sum([1 for token in tokenize_with_spacy(sentence, nlp) if token.isupper()]),
            'is_first_three': 1 if i < 3 else 0
        })
    return features

# 7. Calculate Personalization Scores
def calculate_personalization_scores(features):
    weights = {
        'position': 0.1,
        'length': 0.2,
        'proper_nouns': 0.4,
        'is_first_three': 0.3
    }
    personalization_scores = []
    for feature in features:
        score = sum([feature[key] * weights[key] for key in weights])
        personalization_scores.append(score)
    return personalization_scores

# 8. Build Sentence Relation Graph
def build_sentence_relation_graph(sentences, nlp, threshold=0.2):
    print(f"Building sentence relation graph for {len(sentences)} sentences.")
    sentence_embeddings = [get_embeddings_with_spacy(sentence, nlp) for sentence in sentences]
    sentence_embeddings = np.vstack(sentence_embeddings)
    print(f"Shape of sentence embeddings: {sentence_embeddings.shape}")

    adj_matrix = cosine_similarity_threshold(sentence_embeddings, threshold)
    print(f"Adjacency matrix shape: {adj_matrix.shape}")

    sentence_features = extract_sentence_features(sentences, nlp)
    personalization_scores = calculate_personalization_scores(sentence_features)
    print(f"Personalization scores: {personalization_scores}")

    for i in range(adj_matrix.shape[0]):
        for j in range(adj_matrix.shape[1]):
            if adj_matrix[i, j] > 0:
                adj_matrix[i, j] *= personalization_scores[i]

    adj_matrix = adj_matrix / adj_matrix.max()
    print("Adjacency matrix normalized.")
    return adj_matrix

# 9. GRU-based Sentence Encoder
class GRUSentenceEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(GRUSentenceEncoder, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)

    def forward(self, x):
        _, h_n = self.gru(x)
        return h_n.squeeze(0)

# 10. Process Document Cluster through GRU
def process_document_cluster_through_gru(documents, nlp, hidden_size=128):
    input_size = 300  # SpaCy vectors are 300-dimensional
    encoder = GRUSentenceEncoder(input_size, hidden_size).to(device)

    sentence_encodings = []
    for doc in documents:
        sentences = split_into_sentences(doc)
        sentence_embeddings = [torch.tensor(get_embeddings_with_spacy(sentence, nlp)).to(device) for sentence in sentences]
        embeddings_tensor = torch.stack(sentence_embeddings).unsqueeze(0).to(device)

        # Ensure that the input tensor shape is correct for GRU
        sentence_encoding = encoder(embeddings_tensor)
        sentence_encodings.append(sentence_encoding)

    return torch.stack(sentence_encodings)

# 11. Main Function to Process Documents and Build Graphs
def process_documents_and_build_graph(summaries_dir, nlp, hidden_size=128, threshold=0.2):
    documents = read_document_cluster(summaries_dir)

    # Process each document through GRU
    sentence_encodings = process_document_cluster_through_gru(documents, nlp, hidden_size)

    for doc_idx, doc in enumerate(documents):  # Changed variable name from 'i' to 'doc_idx'
        sentences = split_into_sentences(doc)

        # Get sentence embeddings using GRU outputs
        sentence_embeddings = sentence_encodings[doc_idx].cpu().detach().numpy()

        # Build adjacency matrix using cosine similarity and threshold
        adj_matrix = cosine_similarity_threshold(sentence_embeddings, threshold)

        # Extract sentence features for personalization scores
        sentence_features = extract_sentence_features(sentences, nlp)
        personalization_scores = calculate_personalization_scores(sentence_features)

        # Modify adjacency matrix using personalization scores
        for i in range(adj_matrix.shape[0]):
            for j in range(adj_matrix.shape[1]):
                if adj_matrix[i, j] > 0:
                    adj_matrix[i, j] *= personalization_scores[i]

        # Normalize adjacency matrix
        adj_matrix = adj_matrix / adj_matrix.max()

        print(f"Processed document {doc_idx + 1}/{len(documents)}")
        print(f"Adjacency Matrix for Document {doc_idx + 1}:")
        print(adj_matrix)

    return sentence_encodings

# 12. GCN Layer Definition
class GCNLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GCNLayer, self).__init__()
        self.conv = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv(x, edge_index)

# 13. Integrate GCN with Existing Code
def process_with_gcn(sentence_encodings, adj_matrix):
    num_sentences = adj_matrix.shape[0]

    # Prepare edge index for PyTorch Geometric
    edge_index = torch.nonzero(torch.tensor(adj_matrix), as_tuple=False).T
    edge_index = edge_index.to(device)

    # Convert sentence encodings to tensor
    x = torch.tensor(sentence_encodings, dtype=torch.float32).to(device)

    # Define GCN model
    gcn_model = GCNLayer(in_channels=128, out_channels=64).to(device)  # Adjust hidden size as needed

    # Forward pass through GCN
    gcn_output = gcn_model(x, edge_index)
    return gcn_output

# 14. Update Main Processing Function
def process_documents_and_build_graph_with_gcn(summaries_dir, nlp, hidden_size=128, threshold=0.2):
    documents = read_document_cluster(summaries_dir)

    # Process each document through GRU
    sentence_encodings = process_document_cluster_through_gru(documents, nlp, hidden_size)

    gcn_outputs = []
    for i, doc in enumerate(documents):
        sentences = split_into_sentences(doc)
        sentence_embeddings = sentence_encodings[i].cpu().detach().numpy()

        # Build adjacency matrix using cosine similarity and threshold
        adj_matrix = cosine_similarity_threshold(sentence_embeddings, threshold)

        # Extract sentence features for personalization scores
        sentence_features = extract_sentence_features(sentences, nlp)
        personalization_scores = calculate_personalization_scores(sentence_features)

        # Modify adjacency matrix using personalization scores
        for j in range(adj_matrix.shape[0]):
            for k in range(adj_matrix.shape[1]):
                if adj_matrix[j, k] > 0:
                    adj_matrix[j, k] *= personalization_scores[j]

        # Normalize adjacency matrix
        adj_matrix = adj_matrix / adj_matrix.max()

        # Process with GCN
        gcn_output = process_with_gcn(sentence_embeddings, adj_matrix)
        gcn_outputs.append(gcn_output)

        print(f"Processed document {i+1}/{len(documents)} with GCN output shape: {gcn_output.shape}")

    return sentence_encodings, gcn_outputs

# Example usage
summaries_dir = '/content/drive/My Drive/Colab Notebooks/datasets/final/Whole_text_data'
nlp = spacy.load("en_core_web_md")  # Load SpaCy model
sentence_encodings, gcn_outputs = process_documents_and_build_graph_with_gcn(summaries_dir, nlp)



Processed document 1/608 with GCN output shape: torch.Size([1, 64])
Processed document 2/608 with GCN output shape: torch.Size([1, 64])
Processed document 3/608 with GCN output shape: torch.Size([1, 64])
Processed document 4/608 with GCN output shape: torch.Size([1, 64])
Processed document 5/608 with GCN output shape: torch.Size([1, 64])
Processed document 6/608 with GCN output shape: torch.Size([1, 64])
Processed document 7/608 with GCN output shape: torch.Size([1, 64])
Processed document 8/608 with GCN output shape: torch.Size([1, 64])
Processed document 9/608 with GCN output shape: torch.Size([1, 64])
Processed document 10/608 with GCN output shape: torch.Size([1, 64])
Processed document 11/608 with GCN output shape: torch.Size([1, 64])
Processed document 12/608 with GCN output shape: torch.Size([1, 64])
Processed document 13/608 with GCN output shape: torch.Size([1, 64])
Processed document 14/608 with GCN output shape: torch.Size([1, 64])
Processed document 15/608 with GCN output s

In [None]:
#using np.array
import os
import numpy as np
import torch
import torch.nn as nn
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from torch_geometric.nn import GCNConv

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Read Document Cluster
def read_document_cluster(summaries_dir):
    documents = []
    summaries_files = os.listdir(summaries_dir)
    for file_name in summaries_files:
        file_path = os.path.join(summaries_dir, file_name)
        try:
            with open(file_path, 'r') as file:
                documents.append(file.read())
        except Exception as e:
            print(f"Error reading {file_name}: {e}")
    return documents

# 2. Tokenization using spaCy
def tokenize_with_spacy(text, nlp):
    doc = nlp(text)
    return [token.text for token in doc]

# 3. Get Sentence Embeddings
def get_embeddings_with_spacy(text, nlp):
    doc = nlp(text)
    return doc.vector  # Returns a 1D vector of the document

# 4. Function to Split Document into Sentences
def split_into_sentences(text):
    return [sentence.strip() for sentence in text.split('.') if sentence]

# 5. Custom Function to Calculate Cosine Similarity with a Threshold
def cosine_similarity_threshold(embeddings, threshold=0.2):
    sim_matrix = cosine_similarity(embeddings)
    return (sim_matrix > threshold).astype(float)  # Apply threshold to create edges

# 6. Extract Sentence Features
def extract_sentence_features(sentences, nlp):
    features = []
    for i, sentence in enumerate(sentences):
        length = len(sentence.split())
        features.append({
            'position': i,
            'length': length,
            'proper_nouns': sum([1 for token in tokenize_with_spacy(sentence, nlp) if token.isupper()]),
            'is_first_three': 1 if i < 3 else 0
        })
    return np.array(features)

# 7. Calculate Personalization Scores
def calculate_personalization_scores(features):
    weights = {
        'position': 0.1,
        'length': 0.2,
        'proper_nouns': 0.4,
        'is_first_three': 0.3
    }
    scores = []
    for feature in features:
        score = sum(feature[key] * weights[key] for key in weights)
        scores.append(score)
    return np.array(scores)

# 8. Build Sentence Relation Graph
def build_sentence_relation_graph(sentences, nlp, threshold=0.2):
    print(f"Building sentence relation graph for {len(sentences)} sentences.")
    sentence_embeddings = np.array([get_embeddings_with_spacy(sentence, nlp) for sentence in sentences])
    adj_matrix = cosine_similarity_threshold(sentence_embeddings, threshold)

    sentence_features = extract_sentence_features(sentences, nlp)
    personalization_scores = calculate_personalization_scores(sentence_features)

    # Modify adjacency matrix using personalization scores
    for i in range(adj_matrix.shape[0]):
        for j in range(adj_matrix.shape[1]):
            if adj_matrix[i, j] > 0:
                adj_matrix[i, j] *= personalization_scores[i]

    # Normalize adjacency matrix
    adj_matrix = adj_matrix / np.max(adj_matrix) if np.max(adj_matrix) > 0 else adj_matrix
    return adj_matrix

# 9. GRU-based Sentence Encoder
class GRUSentenceEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(GRUSentenceEncoder, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)

    def forward(self, x):
        _, h_n = self.gru(x)
        return h_n.squeeze(0)

# 10. Process Document Cluster through GRU
def process_document_cluster_through_gru(documents, nlp, hidden_size=128):
    input_size = 300  # SpaCy vectors are 300-dimensional
    encoder = GRUSentenceEncoder(input_size, hidden_size).to(device)

    sentence_encodings = []
    for doc in documents:
        sentences = split_into_sentences(doc)
        sentence_embeddings = np.array([get_embeddings_with_spacy(sentence, nlp) for sentence in sentences])

        # Ensure that the input tensor shape is correct for GRU
        embeddings_tensor = torch.tensor(sentence_embeddings, dtype=torch.float32).unsqueeze(0).to(device)
        sentence_encoding = encoder(embeddings_tensor)

        sentence_encodings.append(sentence_encoding.cpu().detach().numpy())

    return np.array(sentence_encodings)

# 11. Main Function to Process Documents and Build Graphs
def process_documents_and_build_graph(summaries_dir, nlp, hidden_size=128, threshold=0.2):
    documents = read_document_cluster(summaries_dir)

    # Process each document through GRU
    sentence_encodings = process_document_cluster_through_gru(documents, nlp, hidden_size)

    for i, doc in enumerate(documents):
        sentences = split_into_sentences(doc)
        # Get sentence embeddings using GRU outputs
        sentence_embeddings = sentence_encodings[i]

        # Build adjacency matrix using cosine similarity and threshold
        adj_matrix = cosine_similarity_threshold(sentence_embeddings, threshold)

        # Extract sentence features for personalization scores
        sentence_features = extract_sentence_features(sentences, nlp)
        personalization_scores = calculate_personalization_scores(sentence_features)

        # Modify adjacency matrix using personalization scores
        for i in range(adj_matrix.shape[0]):
            for j in range(adj_matrix.shape[1]):
                if adj_matrix[i, j] > 0:
                    adj_matrix[i, j] *= personalization_scores[i]

        # Normalize adjacency matrix
        adj_matrix = adj_matrix / np.max(adj_matrix) if np.max(adj_matrix) > 0 else adj_matrix

       # print(f"Processed document {i+1}/{len(documents)}")
       # print(f"Adjacency Matrix for Document {i+1}:")
        print(adj_matrix)

    return sentence_encodings

# 12. GCN Layer Definition
class GCNLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GCNLayer, self).__init__()
        self.conv = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv(x, edge_index)

# 13. Integrate GCN with Existing Code
def process_with_gcn(sentence_encodings, adj_matrix):
    num_sentences = adj_matrix.shape[0]

    # Prepare edge index for PyTorch Geometric
    edge_index = torch.nonzero(torch.tensor(adj_matrix), as_tuple=False).T
    edge_index = edge_index.to(device)

    # Convert sentence encodings to tensor
    x = torch.tensor(sentence_encodings, dtype=torch.float32).to(device)

    # Define GCN model
    gcn_model = GCNLayer(in_channels=128, out_channels=64).to(device)

    # Forward pass through GCN
    gcn_output = gcn_model(x, edge_index)
    return gcn_output

# 14. Update Main Processing Function
def process_documents_and_build_graph_with_gcn(summaries_dir, nlp, hidden_size=128, threshold=0.2):
    documents = read_document_cluster(summaries_dir)

    # Process each document through GRU
    sentence_encodings = process_document_cluster_through_gru(documents, nlp, hidden_size)

    gcn_outputs = []
    for i, doc in enumerate(documents):
        sentences = split_into_sentences(doc)
        sentence_embeddings = sentence_encodings[i]

        # Build adjacency matrix using cosine similarity and threshold
        adj_matrix = cosine_similarity_threshold(sentence_embeddings, threshold)

        # Extract sentence features for personalization scores
        sentence_features = extract_sentence_features(sentences, nlp)
        personalization_scores = calculate_personalization_scores(sentence_features)

        # Modify adjacency matrix using personalization scores
        for i in range(adj_matrix.shape[0]):
            for j in range(adj_matrix.shape[1]):
                if adj_matrix[i, j] > 0:
                    adj_matrix[i, j] *= personalization_scores[i]

        # Normalize adjacency matrix
        adj_matrix = adj_matrix / np.max(adj_matrix) if np.max(adj_matrix) > 0 else adj_matrix

        # Process with GCN
        gcn_output = process_with_gcn(sentence_embeddings, adj_matrix)
        gcn_outputs.append(gcn_output.cpu().detach().numpy())

        print(f"Processed document {i+1}/{len(documents)} with GCN output shape: {gcn_output.shape}")

    return sentence_encodings, gcn_outputs

# Example usage
summaries_dir = '/content/drive/My Drive/Colab Notebooks/datasets/final/Whole_text_data'
nlp = spacy.load("en_core_web_md")  # Load SpaCy model
sentence_encodings, gcn_outputs = process_documents_and_build_graph_with_gcn(summaries_dir, nlp)






Processed document 1/608 with GCN output shape: torch.Size([1, 64])
Processed document 1/608 with GCN output shape: torch.Size([1, 64])
Processed document 1/608 with GCN output shape: torch.Size([1, 64])
Processed document 1/608 with GCN output shape: torch.Size([1, 64])
Processed document 1/608 with GCN output shape: torch.Size([1, 64])
Processed document 1/608 with GCN output shape: torch.Size([1, 64])
Processed document 1/608 with GCN output shape: torch.Size([1, 64])
Processed document 1/608 with GCN output shape: torch.Size([1, 64])
Processed document 1/608 with GCN output shape: torch.Size([1, 64])
Processed document 1/608 with GCN output shape: torch.Size([1, 64])
Processed document 1/608 with GCN output shape: torch.Size([1, 64])
Processed document 1/608 with GCN output shape: torch.Size([1, 64])
Processed document 1/608 with GCN output shape: torch.Size([1, 64])
Processed document 1/608 with GCN output shape: torch.Size([1, 64])
Processed document 1/608 with GCN output shape: 

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-3b886bf02917>", line 219, in <cell line: 219>
    sentence_encodings, gcn_outputs = process_documents_and_build_graph_with_gcn(summaries_dir, nlp)
  File "<ipython-input-7-3b886bf02917>", line 196, in process_documents_and_build_graph_with_gcn
    sentence_features = extract_sentence_features(sentences, nlp)
  File "<ipython-input-7-3b886bf02917>", line 53, in extract_sentence_features
    'proper_nouns': sum([1 for token in tokenize_with_spacy(sentence, nlp) if token.isupper()]),
  File "<ipython-input-7-3b886bf02917>", line 28, in tokenize_with_spacy
    doc = nlp(text)
  File "/usr/local/lib/python3.10/dist-packages/spacy/language.py", line 1049, in __call__
    doc = proc(doc, **component_cfg.get(name, {}))  # type: ignore[call-arg]
  File "spacy/pipeline/trainable_

TypeError: object of type 'NoneType' has no len()

In [None]:
#using torch

import os
import numpy as np
import torch
import torch.nn as nn
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import torch_geometric
from torch_geometric.nn import GCNConv

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Read Document Cluster
def read_document_cluster(summaries_dir):
    documents = []
    summaries_files = os.listdir(summaries_dir)
    for file_name in summaries_files:
        file_path = os.path.join(summaries_dir, file_name)
        try:
            with open(file_path, 'r') as file:
                documents.append(file.read())
        except Exception as e:
            print(f"Error reading {file_name}: {e}")
    return documents

# 2. Tokenization using spaCy
def tokenize_with_spacy(text, nlp):
    doc = nlp(text)
    return [token.text for token in doc]

# 3. Get Sentence Embeddings
def get_embeddings_with_spacy(text, nlp):
    doc = nlp(text)
    return doc.vector  # Returns a 1D vector of the document

# 4. Function to Split Document into Sentences
def split_into_sentences(text):
    sentences = text.split('.')
    return [sentence.strip() for sentence in sentences if sentence]

# 5. Custom Function to Calculate Cosine Similarity with a Threshold
def cosine_similarity_threshold(embeddings, threshold=0.2):
    sim_matrix = cosine_similarity(embeddings)
    adj_matrix = (sim_matrix > threshold).astype(float)  # Apply threshold to create edges
    return adj_matrix

# 6. Extract Sentence Features
def extract_sentence_features(sentences, nlp):
    features = []
    for i, sentence in enumerate(sentences):
        length = len(sentence.split())
        features.append({
            'position': i,
            'length': length,
            'proper_nouns': sum([1 for token in tokenize_with_spacy(sentence, nlp) if token.isupper()]),
            'is_first_three': 1 if i < 3 else 0
        })
    return features

# 7. Calculate Personalization Scores
def calculate_personalization_scores(features):
    weights = {
        'position': 0.1,
        'length': 0.2,
        'proper_nouns': 0.4,
        'is_first_three': 0.3
    }
    personalization_scores = []
    for feature in features:
        score = sum([feature[key] * weights[key] for key in weights])
        personalization_scores.append(score)
    return personalization_scores

# 8. Build Sentence Relation Graph
def build_sentence_relation_graph(sentences, nlp, threshold=0.2):
    print(f"Building sentence relation graph for {len(sentences)} sentences.")
    sentence_embeddings = [get_embeddings_with_spacy(sentence, nlp) for sentence in sentences]
    sentence_embeddings = np.vstack(sentence_embeddings)
    print(f"Shape of sentence embeddings: {sentence_embeddings.shape}")

    adj_matrix = cosine_similarity_threshold(sentence_embeddings, threshold)
    print(f"Adjacency matrix shape: {adj_matrix.shape}")

    sentence_features = extract_sentence_features(sentences, nlp)
    personalization_scores = calculate_personalization_scores(sentence_features)
    print(f"Personalization scores: {personalization_scores}")

    # Modify adjacency matrix using personalization scores
    for i in range(adj_matrix.shape[0]):
        for j in range(adj_matrix.shape[1]):
            if adj_matrix[i, j] > 0:
                adj_matrix[i, j] *= personalization_scores[i]

    adj_matrix = adj_matrix / adj_matrix.max() if adj_matrix.max() > 0 else adj_matrix
    print("Adjacency matrix normalized.")
    return adj_matrix

# 9. GRU-based Sentence Encoder
class GRUSentenceEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(GRUSentenceEncoder, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)

    def forward(self, x):
        _, h_n = self.gru(x)
        return h_n.squeeze(0)

# 10. Process Document Cluster through GRU
def process_document_cluster_through_gru(documents, nlp, hidden_size=128):
    input_size = 300  # SpaCy vectors are 300-dimensional
    encoder = GRUSentenceEncoder(input_size, hidden_size).to(device)

    sentence_encodings = []
    for doc in documents:
        sentences = split_into_sentences(doc)
        sentence_embeddings = [torch.tensor(get_embeddings_with_spacy(sentence, nlp)).to(device).unsqueeze(0) for sentence in sentences]
        embeddings_tensor = torch.cat(sentence_embeddings, dim=0).unsqueeze(0).to(device)

        # Ensure that the input tensor shape is correct for GRU
        sentence_encoding = encoder(embeddings_tensor)
        sentence_encodings.append(sentence_encoding)

    return torch.stack(sentence_encodings)

# 11. Main Function to Process Documents and Build Graphs
def process_documents_and_build_graph(summaries_dir, nlp, hidden_size=128, threshold=0.2):
    documents = read_document_cluster(summaries_dir)

    # Process each document through GRU
    sentence_encodings = process_document_cluster_through_gru(documents, nlp, hidden_size)

    for i, doc in enumerate(documents):
        sentences = split_into_sentences(doc)
        # Get sentence embeddings using GRU outputs
        sentence_embeddings = sentence_encodings[i].cpu().detach().numpy()

        # Build adjacency matrix using cosine similarity and threshold
        adj_matrix = cosine_similarity_threshold(sentence_embeddings, threshold)

        # Extract sentence features for personalization scores
        sentence_features = extract_sentence_features(sentences, nlp)
        personalization_scores = calculate_personalization_scores(sentence_features)

        # Modify adjacency matrix using personalization scores
        for i in range(adj_matrix.shape[0]):
            for j in range(adj_matrix.shape[1]):
                if adj_matrix[i, j] > 0:
                    adj_matrix[i, j] *= personalization_scores[i]

        # Normalize adjacency matrix
        adj_matrix = adj_matrix / adj_matrix.max() if adj_matrix.max() > 0 else adj_matrix

       # print(f"Processed document {i + 1}/{len(documents)}")
        #print(f"Adjacency Matrix for Document {i + 1}:")
        print(adj_matrix)

    return sentence_encodings

# 12. GCN Layer Definition
class GCNLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GCNLayer, self).__init__()
        self.conv = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv(x, edge_index)

# 13. Integrate GCN with Existing Code
def process_with_gcn(sentence_encodings, adj_matrix):
    num_sentences = adj_matrix.shape[0]

    # Prepare edge index for PyTorch Geometric
    edge_index = torch.nonzero(torch.tensor(adj_matrix), as_tuple=False).T
    edge_index = edge_index.to(device)

    # Convert sentence encodings to tensor
    x = torch.tensor(sentence_encodings, dtype=torch.float32).to(device)

    # Define GCN model
    gcn_model = GCNLayer(in_channels=128, out_channels=64).to(device)  # Adjust hidden size as needed

    # Forward pass through GCN
    gcn_output = gcn_model(x, edge_index)
    return gcn_output

# 14. Update Main Processing Function
def process_documents_and_build_graph_with_gcn(summaries_dir, nlp, hidden_size=128, threshold=0.2):
    documents = read_document_cluster(summaries_dir)

    # Process each document through GRU
    sentence_encodings = process_document_cluster_through_gru(documents, nlp, hidden_size)

    gcn_outputs = []
    for i, doc in enumerate(documents):
        sentences = split_into_sentences(doc)
        sentence_embeddings = sentence_encodings[i].cpu().detach().numpy()

        # Build adjacency matrix using cosine similarity and threshold
        adj_matrix = cosine_similarity_threshold(sentence_embeddings, threshold)

        # Extract sentence features for personalization scores
        sentence_features = extract_sentence_features(sentences, nlp)
        personalization_scores = calculate_personalization_scores(sentence_features)

        # Modify adjacency matrix using personalization scores
        for i in range(adj_matrix.shape[0]):
            for j in range(adj_matrix.shape[1]):
                if adj_matrix[i, j] > 0:
                    adj_matrix[i, j] *= personalization_scores[i]

        # Normalize adjacency matrix
        adj_matrix = adj_matrix / adj_matrix.max() if adj_matrix.max() > 0 else adj_matrix

        # Process with GCN
        gcn_output = process_with_gcn(sentence_embeddings, adj_matrix)
        gcn_outputs.append(gcn_output)

        print(f"Processed document {i + 1}/{len(documents)} with GCN output shape: {gcn_output.shape}")

    return sentence_encodings, gcn_outputs

# Example usage
summaries_dir = '/content/drive/My Drive/Colab Notebooks/datasets/final/Whole_text_data'
nlp = spacy.load("en_core_web_md")  # Load SpaCy model
sentence_encodings, gcn_outputs = process_documents_and_build_graph_with_gcn(summaries_dir, nlp)


ModuleNotFoundError: No module named 'torch_geometric'

In [None]:
import os
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Define device for computation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Function to read document clusters from the dataset directory
def read_document_cluster(summaries_dir):
    documents = []
    summaries_files = os.listdir(summaries_dir)

    for file_name in summaries_files:
        file_path = os.path.join(summaries_dir, file_name)
        try:
            with open(file_path, 'r') as file:
                documents.append(file.read())
           # print(f"Loaded {file_name}")
        except Exception as e:
            print(f"Error reading {file_name}: {e}")
    return documents

# Tokenization using spaCy
def tokenize_with_spacy(text, nlp):
    doc = nlp(text)
    return [token.text for token in doc]

# Get sentence embeddings using spaCy vectors
def get_embeddings_with_spacy(text, nlp):
    doc = nlp(text)
    return doc.vector  # Returns a 1D vector of the document

# Function to split document into sentences
def split_into_sentences(text):
    sentences = text.split('.')
    return [sentence.strip() for sentence in sentences if sentence]

# Custom function to calculate cosine similarity with a threshold
def cosine_similarity_threshold(embeddings, threshold=0.2):
    sim_matrix = cosine_similarity(embeddings)
    adj_matrix = (sim_matrix > threshold).astype(float)  # Apply threshold to create edges
    return adj_matrix

# Extract sentence features
def extract_sentence_features(sentences, nlp):
    features = []
    for i, sentence in enumerate(sentences):
        length = len(sentence.split())
        features.append({
            'position': i,
            'length': length,
            'proper_nouns': sum([1 for token in tokenize_with_spacy(sentence, nlp) if token.isupper()]),
            'is_first_three': 1 if i < 3 else 0
        })
    return features

# Calculate sentence personalization score
def calculate_personalization_scores(features):
    weights = {
        'position': 0.1,
        'length': 0.2,
        'proper_nouns': 0.4,
        'is_first_three': 0.3
    }
    personalization_scores = []
    for feature in features:
        score = sum([feature[key] * weights[key] for key in weights])
        personalization_scores.append(score)
    return personalization_scores

# Build the sentence relation graph using cosine similarity and personalization
def build_sentence_relation_graph(sentences, nlp, threshold=0.2):
    print(f"Building sentence relation graph for {len(sentences)} sentences.")
    sentence_embeddings = [get_embeddings_with_spacy(sentence, nlp) for sentence in sentences]
    sentence_embeddings = np.vstack(sentence_embeddings)
    print(f"Shape of sentence embeddings: {sentence_embeddings.shape}")

    adj_matrix = cosine_similarity_threshold(sentence_embeddings, threshold)
    print(f"Adjacency matrix shape: {adj_matrix.shape}")

    sentence_features = extract_sentence_features(sentences, nlp)
    personalization_scores = calculate_personalization_scores(sentence_features)
    print(f"Personalization scores: {personalization_scores}")

    for i in range(adj_matrix.shape[0]):
        for j in range(adj_matrix.shape[1]):
            if adj_matrix[i, j] > 0:
                adj_matrix[i, j] *= personalization_scores[i]

    adj_matrix = adj_matrix / adj_matrix.max()
    print("Adjacency matrix normalized.")
    return adj_matrix

# GRU-based Sentence Encoder
class GRUSentenceEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(GRUSentenceEncoder, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)

    def forward(self, x):
        _, h_n = self.gru(x)
        return h_n.squeeze(0)

# Processing document cluster through the GRU
def process_document_cluster_through_gru(documents, nlp, hidden_size=128):
    input_size = 300  # SpaCy vectors are 300-dimensional
    encoder = GRUSentenceEncoder(input_size, hidden_size).to(device)

    sentence_encodings = []
    for doc in documents:
        sentences = split_into_sentences(doc)
       # print(f"Processing document with {len(sentences)} sentences.")
        sentence_embeddings = [torch.tensor(get_embeddings_with_spacy(sentence, nlp)).to(device) for sentence in sentences]
        embeddings_tensor = torch.stack(sentence_embeddings).unsqueeze(0).to(device)

        # Ensure that the input tensor shape is correct for GRU
        sentence_encoding = encoder(embeddings_tensor)
        sentence_encodings.append(sentence_encoding)

    return torch.stack(sentence_encodings)

# GCN Layer Definition
# class GCNLayer(nn.Module):
#     def __init__(self, in_channels, out_channels):
#         super(GCNLayer, self).__init__()
#         self.fc = nn.Linear(in_channels, out_channels)

#    def forward(self, x, adj):
#     # Normalize the adjacency matrix
#       adj = self.normalize_adj(adj)

#     # Check shapes before multiplication
#     print(f"Shape of adj: {adj.shape}, Shape of x: {x.shape}")

#     # Ensure dimensions match
#     if adj.shape[0] != x.shape[0]:
#         raise ValueError(f"Dimension mismatch: adj has {adj.shape[0]} nodes, but x has {x.shape[0]} features.")

#     # Perform the multiplication
#        x = torch.matmul(adj, x)
#        return F.relu(x)

#     def normalize_adj(self, adj):
#         num_nodes = adj.size(0)
#         I = torch.eye(num_nodes).to(device)
#         adj = adj + I
#         D = torch.sum(adj, dim=1)
#         D_inv_sqrt = torch.pow(D, -0.5)
#         D_inv_sqrt[torch.isinf(D_inv_sqrt)] = 0
#         D_inv_sqrt = torch.diag(D_inv_sqrt)
#         return torch.matmul(D_inv_sqrt, torch.matmul(adj, D_inv_sqrt))

class GCNLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GCNLayer, self).__init__()
        self.fc = nn.Linear(in_channels, out_channels)

    def forward(self, x, adj):
        adj = self.normalize_adj(adj)
        assert adj.shape[0] == x.shape[0], "Dimension mismatch: adj and x must have the same number of nodes."

        x = torch.matmul(adj, x)
        return F.relu(x)

    def normalize_adj(self, adj):
        num_nodes = adj.size(0)
        I = torch.eye(num_nodes).to(device)
        adj = adj + I
        D = torch.sum(adj, dim=1)
        D_inv_sqrt = torch.pow(D, -0.5)
        D_inv_sqrt[torch.isinf(D_inv_sqrt)] = 0
        D_inv_sqrt = torch.diag(D_inv_sqrt)
        return torch.matmul(D_inv_sqrt, torch.matmul(adj, D_inv_sqrt))


# Read and process documents
summaries_dir = '/content/drive/My Drive/Colab Notebooks/datasets/final/Whole_text_data'
documents = read_document_cluster(summaries_dir)
print(f"Loaded {len(documents)} documents.")

# Initialize spaCy
nlp = spacy.load('en_core_web_md')

# Split documents into sentences
sentences_per_document = [split_into_sentences(doc) for doc in documents]
print("Done splitting documents into sentences.")

# Build sentence relation graphs for each document
adj_matrices = [build_sentence_relation_graph(sentences, nlp) for sentences in sentences_per_document]
print("Done building adjacency matrices.")

# Process documents through GRU
sentence_encodings = process_document_cluster_through_gru(documents, nlp, hidden_size=128)
print(f"Processed document clusters through GRU. Shape: {sentence_encodings.shape}")

# Example usage of GCN layer
gcn = GCNLayer(128, 64).to(device)
# Convert adj_matrices to tensor and move to device
adj_tensor = torch.tensor(adj_matrices[0], dtype=torch.float32).to(device)

# Check the shape of the sentence encodings
print(f"Shape of sentence_encodings before: {sentence_encodings.shape}")

# Ensure the encodings have the correct dimensions [608, 128]
if len(sentence_encodings.shape) == 3 and sentence_encodings.shape[1] == 1:
    sentence_encodings = sentence_encodings.squeeze(1)  # Remove the second dimension

# Verify the final shape of sentence_encodings
print(f"Shape of sentence_encodings after: {sentence_encodings.shape}")

# Ensure adj_tensor has the right dimensions
print(f"Shape of adj_tensor: {adj_tensor.shape}")

# Apply the GCN layer to the sentence encodings and adjacency matrix
gcn_output = gcn(sentence_encodings, adj_tensor)

# Print the shape of the output
print("GCN output shape: ", gcn_output.shape)

In [None]:
# Tokenization using spaCy
def tokenize_with_spacy(text):
    doc = nlp(text)
    return [token.text for token in doc]

# Get sentence embeddings using spaCy vectors
def get_embeddings_with_spacy(text):
    doc = nlp(text)
    return doc.vector  # Returns a 1D vector of the document

# Function to split document into sentences
def split_into_sentences(text):
    sentences = text.split('.')
    return [sentence.strip() for sentence in sentences if sentence]


In [None]:
# Custom function to calculate cosine similarity with a threshold
from sklearn.preprocessing import normalize

def cosine_similarity_threshold(embeddings, threshold=0.1):
    # Normalize the embeddings
    embeddings = normalize(embeddings)
    sim_matrix = cosine_similarity(embeddings)
    adj_matrix = (sim_matrix > threshold).astype(float)  # Apply threshold to create edges
    return adj_matrix


# Extract sentence features
def extract_sentence_features(sentences):
    features = []
    for i, sentence in enumerate(sentences):
        length = len(sentence.split())
        features.append({
            'position': i,
            'length': length,
            'proper_nouns': sum([1 for token in tokenize_with_spacy(sentence) if token.isupper()]),
            'is_first_three': 1 if i < 3 else 0
        })
    return features

# Calculate sentence personalization score
def calculate_personalization_scores(features):
    weights = {
        'position': 0.1,
        'length': 0.2,
        'proper_nouns': 0.4,
        'is_first_three': 0.3
    }
    personalization_scores = []
    for feature in features:
        score = sum([feature[key] * weights[key] for key in weights])
        personalization_scores.append(score)
    return personalization_scores

# Build the sentence relation graph using cosine similarity and personalization
def build_sentence_relation_graph(sentences, threshold=0.2):
    sentence_embeddings = [get_embeddings_with_spacy(sentence) for sentence in sentences]
    sentence_embeddings = np.vstack(sentence_embeddings)

    adj_matrix = cosine_similarity_threshold(sentence_embeddings, threshold)
    sentence_features = extract_sentence_features(sentences)
    personalization_scores = calculate_personalization_scores(sentence_features)

    for i in range(adj_matrix.shape[0]):
        for j in range(adj_matrix.shape[1]):
            if adj_matrix[i, j] > 0:
                adj_matrix[i, j] *= personalization_scores[i]

    adj_matrix = adj_matrix / adj_matrix.max()
    return adj_matrix

In [None]:
class GRUSentenceEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(GRUSentenceEncoder, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)

    def forward(self, x):
        _, h_n = self.gru(x)
        return h_n.squeeze(0)

# Processing document cluster through the GRU
from torch.nn.utils.rnn import pad_sequence

def process_document_cluster_through_gru(documents, hidden_size=128):
    input_size = 300  # SpaCy vectors are 300-dimensional
    encoder = GRUSentenceEncoder(input_size, hidden_size).to(device)

    sentence_encodings = []
    for doc in documents:
        sentences = split_into_sentences(doc)
        sentence_embeddings = [torch.tensor(get_embeddings_with_spacy(sentence)).to(device) for sentence in sentences]

        # Padding sentence embeddings to ensure uniform shape
        embeddings_tensor = pad_sequence(sentence_embeddings, batch_first=True).unsqueeze(0).to(device)

        # Ensure that the input tensor shape is correct for GRU
        sentence_encoding = encoder(embeddings_tensor)
        sentence_encodings.append(sentence_encoding)

    return torch.stack(sentence_encodings)




In [None]:
class GCNLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GCNLayer, self).__init__()
        self.fc = nn.Linear(in_channels, out_channels)

    def forward(self, x, adj):
        x = self.fc(x)
        adj = self.normalize_adj(adj)
        x = torch.matmul(adj, x)
        return F.relu(x)

    def normalize_adj(self, adj):
        num_nodes = adj.size(0)
        I = torch.eye(num_nodes).to(device)
        adj = adj + I
        D = torch.sum(adj, dim=1)
        D_inv_sqrt = torch.pow(D, -0.5)
        D_inv_sqrt[torch.isinf(D_inv_sqrt)] = 0
        D_inv_sqrt = torch.diag(D_inv_sqrt)
        return torch.matmul(D_inv_sqrt, torch.matmul(adj, D_inv_sqrt))



In [None]:
class DocumentClusterEncoder(nn.Module):
    def __init__(self, sentence_input_size, sentence_hidden_size, doc_input_size, doc_hidden_size):
        super(DocumentClusterEncoder, self).__init__()
        self.sentence_encoder = GRUSentenceEncoder(sentence_input_size, sentence_hidden_size)
        self.doc_encoder = GRUSentenceEncoder(sentence_hidden_size, doc_hidden_size)

    def forward(self, document_clusters):
        document_embeddings = []

        for doc_cluster in document_clusters:
            sentence_embeddings = self.sentence_encoder(doc_cluster)
            sentence_embeddings = sentence_embeddings.unsqueeze(0)
            doc_embedding = self.doc_encoder(sentence_embeddings)
            document_embeddings.append(doc_embedding)

        document_embeddings = torch.stack(document_embeddings)
        cluster_embedding = torch.mean(document_embeddings, dim=0)
        return cluster_embedding


In [None]:
class SalienceEstimator(nn.Module):
    def __init__(self, hidden_size):
        super(SalienceEstimator, self).__init__()
        self.W1 = nn.Linear(hidden_size, hidden_size)
        self.W2 = nn.Linear(hidden_size, hidden_size)
        self.v = nn.Parameter(torch.randn(hidden_size))

    def forward(self, sentence_embeddings, cluster_embedding):
        cluster_embedding = cluster_embedding.unsqueeze(0)
        scores = torch.matmul(
            F.tanh(self.W1(cluster_embedding) + self.W2(sentence_embeddings)),
            self.v
        )
        salience_scores = F.softmax(scores.squeeze(0), dim=0)
        return salience_scores


In [None]:
import torch

# Define the device to use: GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read and process documents
documents = read_document_cluster(summaries_dir)
sentences_per_document = [split_into_sentences(doc) for doc in documents]
# first_document = documents[0]
# sentences_per_document = split_into_sentences(first_document)

print("done with sentences")

# Build sentence relation graphs for each document
adj_matrices = [build_sentence_relation_graph(sentences) for sentences in sentences_per_document]
print("done with adj_matrices")

# Process documents through GRU
sentence_encodings = process_document_cluster_through_gru(documents, hidden_size=128)
#sentence_encodings = process_document_cluster_through_gru([first_document], hidden_size=128)
print("done with encodings")

# Example usage of GCN layer
gcn = GCNLayer(128, 64).to(device)
adj_tensor = torch.tensor(adj_matrices[0], dtype=torch.float32).to(device)
sentence_encodings = sentence_encodings.to(device)

# Check the shape of the sentence encodings
# Check the shape of the sentence encodings
print(f"Shape of sentence_encodings before: {sentence_encodings.shape}")

# Ensure that the encodings have the correct dimensions [num_sentences, embedding_size]
if len(sentence_encodings.shape) == 3 and sentence_encodings.shape[1] == 1:
    sentence_encodings = sentence_encodings.squeeze(1)  # Remove extra dimension if needed

print(f"Shape of sentence_encodings after: {sentence_encodings.shape}")

# Apply the GCN layer to the sentence encodings and adjacency matrix
gcn_output = gcn(sentence_encodings, adj_tensor)

print("GCN output: ", gcn_output)



done with sentences
done with adj_matrices
done with encodings
Shape of sentence_encodings before: torch.Size([608, 1, 128])
Shape of sentence_encodings after: torch.Size([608, 1, 128])


RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [608, 331] but got: [608, 1].

In [None]:
import os
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer  # Ensure rouge_score is installed: pip install rouge-score

# Function to read document clusters from the dataset directory
def read_document_cluster(summaries_dir):
    documents = []
    summaries_files = os.listdir(summaries_dir)

    for file_name in summaries_files:
        file_path = os.path.join(summaries_dir, file_name)
        try:
            with open(file_path, 'r') as file:
                documents.append(file.read())
        except Exception as e:
            print(f"Error reading {file_name}: {e}")
    return documents

# Function to read reference summaries
def read_reference_summaries(reference_dir):
    references = {}
    summaries_files = os.listdir(reference_dir)

    for file_name in summaries_files:
        file_path = os.path.join(reference_dir, file_name)
        try:
            with open(file_path, 'r') as file:
                references[file_name] = file.read()
        except Exception as e:
            print(f"Error reading {file_name}: {e}")
    return references

# Sentence selection function
def select_sentences(salience_scores, sentences, length_limit=55, redundancy_threshold=0.5):
    if salience_scores.dim() > 1:
        salience_scores = salience_scores.squeeze(0)

    scores, indices = torch.sort(salience_scores, descending=True)
    selected_sentences = []
    current_length = 0
    tfidf_vectorizer = TfidfVectorizer()

    for idx in indices:
        sentence = sentences[idx]
        sentence_length = len(sentence.split())

        if current_length + sentence_length > length_limit:
            break

        if selected_sentences:
            all_sentences = selected_sentences + [sentence]
            tfidf_matrix = tfidf_vectorizer.fit_transform(all_sentences)
            cosine_similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])

            if cosine_similarities.max() > redundancy_threshold:
                continue

        selected_sentences.append(sentence)
        current_length += sentence_length

    return selected_sentences

# Function to calculate ROUGE scores
def calculate_rouge(reference_summary, generated_summary):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_summary, generated_summary)
    return scores

# Path to the dataset and reference summaries
summaries_dir = '/content/drive/My Drive/Colab Notebooks/datasets/final/Whole_text_data'
reference_summaries_dir = '/content/drive/My Drive/Colab Notebooks/datasets/final/Summaries'

# Read the document cluster from the dataset
documents = read_document_cluster(summaries_dir)
reference_summaries = read_reference_summaries(reference_summaries_dir)

# Example function to get salience scores for sentences (replace with your actual model's output)
def get_salience_scores(num_sentences):
    return torch.rand(num_sentences)  # Random salience scores for now (replace with model output)

# Iterate over each document, generate a summary, and compute ROUGE scores
for doc_name, doc in zip(reference_summaries.keys(), documents):
    # Split document into sentences (you can replace this with a more sophisticated tokenizer)
    sentences = doc.split('. ')  # Simple sentence splitting by '. ' (adjust based on your data)

    # Get salience scores for each sentence (replace with your actual method)
    salience_scores = get_salience_scores(len(sentences))

    # Generate summary using sentence selection
    generated_summary_sentences = select_sentences(salience_scores, sentences, length_limit=100)
    generated_summary = ' '.join(generated_summary_sentences)

    # Get reference summary
    reference_summary = reference_summaries.get(doc_name, "Reference summary not found")  # Fallback text

    # Calculate ROUGE scores
    rouge_scores = calculate_rouge(reference_summary, generated_summary)

    # Print the results
    print(f"Document: {doc_name}")
    print("Generated Summary:")
    print(generated_summary)
    print("\nROUGE Scores:", rouge_scores)
    print("-" * 80)


ModuleNotFoundError: No module named 'rouge_score'

Complete Code
