# **Project: Relation Classification**


## ----- **Text Mining and Sentiment Analysis Course** ----



## **Steps**:
   ### Step 1: Data Loading and Data Exploration
   ### Step 2: Data Preprocessing
   ### Step 3: Models' Architecture
   ### Step 4: Important Functions
   ### Step 5: Training Models
   ### Step 6: Knowledge Graph


**Dataset:** relation-extraction-corpus ([link](https://code.google.com/archive/p/relation-extraction-corpus/downloads))




### **Shojaat Joodi Bigdilo**

July 2024


In [None]:
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

In [None]:
pip install transformers

In [None]:
pip install torchtext

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import TensorDataset, Subset

import torchtext
torchtext.disable_torchtext_deprecation_warning()
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from transformers import BertModel
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW
from transformers import BertTokenizer

from collections import Counter
import random
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## **Step1: Data Loading and Exploration**

In [None]:
import json

# Example file paths
date_of_birth_file = "/content/gdrive/My Drive/Text_Mining/dob_augment-200526.json"
education_file = "/content/gdrive/My Drive/Text_Mining/education_augment-200526.json"
place_of_death_file = "/content/gdrive/My Drive/Text_Mining/pod_augment-200526.json"
place_of_birth_file = "/content/gdrive/My Drive/Text_Mining/pob_augment-200526.json"
institution_file =  "/content/gdrive/My Drive/Text_Mining/institution_augment-200526.json"


# Load the datasets
def load_dataset(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is opened explicitly as UTF-8 so that decoding does not depend
    on the platform's default text encoding.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's content.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Load data
date_of_birth_data = load_dataset(date_of_birth_file)
education_data = load_dataset(education_file)
place_of_death_data = load_dataset(place_of_death_file)
place_of_birth_data = load_dataset(place_of_birth_file)
institution_data = load_dataset(institution_file)

In [None]:
# Combine datasets into one list
data = date_of_birth_data + education_data + place_of_death_data + place_of_birth_data + institution_data

In [None]:
data[0]

In [None]:
# Define the mapping dictionary
pred_mapping = {
    '/people/person/education./education/education/institution': 'institution',
    '/people/person/date_of_birth': 'date_of_birth',
    '/people/person/education./education/education/degree': 'degree',
    '/people/deceased_person/place_of_death': 'place_of_death',
    '/people/person/place_of_birth': 'place_of_birth'
}

# Keep only the entries whose predicate has a short name in pred_mapping,
# and rewrite each kept entry with that short name.
extracted_data = [
    {
        'pred': pred_mapping[record['pred']],
        'sub': record['sub'],
        'obj': record['obj'],
        'evidences': record['evidences'],
    }
    for record in data
    if record['pred'] in pred_mapping
]

# From here on `data` refers to the simplified entries
data = extracted_data

In [None]:
data[0]

## **Data Exploration**

In [None]:
df = pd.DataFrame(data)
df.head()

In [None]:
# Check for missing values
print(df.isnull().sum())

In [None]:
from collections import Counter

# Extract relations
relations = [entry['pred'] for entry in data]
relation_counts = Counter(relations)
print(relation_counts)

In [None]:
# Explore the distribution of relation types
relation_counter = df['pred'].value_counts()
relation_counter

## **Step 2: Data Preprocessing**

In [None]:
relations = [entry['pred'] for entry in data]
relation_counts = Counter(relations)

# Group the entries by relation in a single pass over the data
# (instead of re-scanning the full list once per relation).
entries_by_relation = {}
for entry in data:
    entries_by_relation.setdefault(entry['pred'], []).append(entry)

# Define the target size for each class (the size of the smallest class)
target_size = min(relation_counts.values())
print(f'Target size for under-sampling: {target_size}')

# Under-sample each relation down to target_size.
# A fixed random_state keeps the selected subset reproducible between runs,
# in line with the seeded train/test split used later.
under_sampled_data = []
for relation, class_data in entries_by_relation.items():
    if len(class_data) > target_size:
        class_data = shuffle(class_data, random_state=42)[:target_size]
    under_sampled_data.extend(class_data)

# Shuffle the final under-sampled dataset so classes are interleaved
under_sampled_data = shuffle(under_sampled_data, random_state=42)

In [None]:
len(under_sampled_data)

In [None]:
# Extract relations and sentences in new under-sampled dataset
# For every under-sampled entry keep the snippet of its first evidence as the
# input sentence and its predicate as the label (both lists stay aligned).
sentences, labels = [], []
for sample in under_sampled_data:
    sentences.append(sample['evidences'][0]['snippet'])
    labels.append(sample['pred'])

In [None]:
sentences[:3]

In [None]:
labels[:3]

In [None]:
# Convert labels to numerical format
# Sort the label names so the label <-> id mapping is deterministic.
# Iteration order of a set of strings can change between interpreter sessions,
# which would silently change which id belongs to which relation.
unique_labels = sorted(set(labels))
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}
numerical_labels = [label2id[label] for label in labels]

In [None]:
unique_labels

In [None]:
label2id

In [None]:
id2label

### **Split Data**

In [None]:
# Stratify on the labels so the train and test sets keep the same (balanced)
# class proportions as the under-sampled dataset.
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    numerical_labels,
    test_size=0.2,
    random_state=42,
    stratify=numerical_labels,
)

In [None]:
# Checking the size of the splits
print(f'Training samples: {len(X_train)}, Testing samples: {len(X_test)}')
print(f"Training data class distribution: {Counter(y_train)}")
print(f"Testing data class distribution: {Counter(y_test)}")

# **Step3: All Models Architecture**

### **LSTM**

In [None]:
class LSTMClassifier(nn.Module):
    """Unidirectional LSTM sentence classifier.

    Token ids are embedded, run through a (stacked) LSTM as a packed sequence,
    and the last layer's final hidden state is passed through dropout and a
    linear layer to produce one logit per class.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output classes (logits).
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability (between LSTM layers and before the
            final linear layer).
        pad_idx: Index of the padding token. When omitted, the notebook-level
            ``vocab['<pad>']`` is used, which keeps existing calls working but
            requires ``vocab`` to exist before the model is instantiated.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers=2, dropout=0.3, pad_idx=None):
        super().__init__()
        if pad_idx is None:
            # Backward-compatible fallback to the global vocabulary
            pad_idx = vocab['<pad>']
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # nn.LSTM applies dropout only between stacked layers; passing a
        # non-zero value with a single layer just triggers a warning.
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=lstm_dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, lengths):
        """Return class logits for a batch of padded token-id sequences.

        Args:
            x: Batch-first tensor of token ids.
            lengths: Number of real (non-padding) tokens of each sequence.

        Returns:
            Tensor with one row of ``output_dim`` logits per sequence.
        """
        x = self.embedding(x)
        # Packing lets the LSTM stop at each sequence's true length;
        # lengths must live on the CPU for pack_padded_sequence.
        packed_x = nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.lstm(packed_x)
        # hidden[-1] is the final hidden state of the last LSTM layer
        output = self.dropout(hidden[-1])
        return self.fc(output)

### **Bidirectional LSTM (BiLSTM)**

In [None]:
import torch
import torch.nn as nn

class BiLSTMClassifier(nn.Module):
    """Bidirectional LSTM sentence classifier.

    Same pipeline as the unidirectional model, but the LSTM reads the sequence
    in both directions and the two final hidden states of the last layer are
    concatenated, so the linear layer receives ``hidden_dim * 2`` features.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state (per direction).
        output_dim: Number of output classes (logits).
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability (between LSTM layers and before the
            final linear layer).
        pad_idx: Index of the padding token. When omitted, the notebook-level
            ``vocab['<pad>']`` is used, which keeps existing calls working but
            requires ``vocab`` to exist before the model is instantiated.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers=2, dropout=0.3, pad_idx=None):
        super().__init__()
        if pad_idx is None:
            # Backward-compatible fallback to the global vocabulary
            pad_idx = vocab['<pad>']
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # nn.LSTM applies dropout only between stacked layers; passing a
        # non-zero value with a single layer just triggers a warning.
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=lstm_dropout, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        # Forward and backward states are concatenated -> hidden_dim * 2
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x, lengths):
        """Return class logits for a batch of padded token-id sequences.

        Args:
            x: Batch-first tensor of token ids.
            lengths: Number of real (non-padding) tokens of each sequence.

        Returns:
            Tensor with one row of ``output_dim`` logits per sequence.
        """
        x = self.embedding(x)
        packed_x = nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.lstm(packed_x)
        # The last two entries of `hidden` are the forward and backward final
        # states of the top layer.
        hidden_cat = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        output = self.dropout(hidden_cat)
        return self.fc(output)


 ### **BERT**

In [None]:
class BERTClassifier(nn.Module):
    """Pre-trained ``bert-base-uncased`` encoder with one linear layer on top.

    Args:
        num_labels: Number of output classes (logits).
    """

    def __init__(self, num_labels):
        super().__init__()
        encoder = BertModel.from_pretrained('bert-base-uncased')
        self.bert = encoder
        # The linear head maps the encoder's hidden size to the class logits
        self.classifier = nn.Linear(encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from BERT's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(encoded.pooler_output)

 ### **BERT with a Larger Classification Head (`BERTClassifier_Larg`)**

In [None]:
class BERTClassifier_Larg(nn.Module):
    """Pre-trained ``bert-base-uncased`` encoder with a two-layer head.

    The pooled BERT output goes through dropout, a 64-unit linear layer with
    ReLU, and a final linear layer that produces the class logits.

    Args:
        num_labels: Number of output classes (logits). Previously this
            argument was ignored and the output size was fixed to 5.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(p=0.2)
        # Take the input width from the encoder config instead of assuming 768
        self.linear1 = nn.Linear(self.bert.config.hidden_size, 64)
        self.ReLu = nn.ReLU()
        self.classifier = nn.Linear(64, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from BERT's pooled output."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.pooler_output
        out = self.dropout(cls_output)
        out = self.linear1(out)
        out = self.ReLu(out)
        logits = self.classifier(out)
        return logits

# **Step 4: Important Functions**

### **Training and Test Function**

In [None]:
# Notebook-wide training history: every call to train_model appends one entry
# per epoch to both lists (they are never reset between runs).
Average_loss = []
all_epoch = []


# Training the model
def train_model(model, train_dataloader, optimizer, loss_fn, device, epochs=5, model_type='LSTM'):
    """Train ``model`` for ``epochs`` epochs and record the average loss.

    Each batch must unpack into three tensors: the model's two inputs
    (token ids + lengths for the LSTM models, input ids + attention mask for
    the BERT models) followed by the labels.

    Args:
        model: Module called as ``model(first_input, second_input)``.
        train_dataloader: Iterable of 3-tensor batches.
        optimizer: Optimizer updating ``model``'s parameters.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning the loss.
        device: Device the model and the batches are moved to.
        epochs: Number of passes over ``train_dataloader``.
        model_type: Kept for backward compatibility; both model families use
            the same batch layout, so it does not change the training step.

    Returns:
        List with the average loss of each epoch of this call. The same
        values are also appended to the global ``Average_loss`` list, and the
        1-based epoch numbers to ``all_epoch``.

    Raises:
        ValueError: If ``train_dataloader`` yields no batches.
    """
    model.to(device)
    history = []
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0  # Sum of the batch losses of this epoch
        n_batches = 0

        for batch in train_dataloader:
            first_input, second_input, labels = [b.to(device) for b in batch]
            optimizer.zero_grad()
            outputs = model(first_input, second_input)

            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            n_batches += 1

        if n_batches == 0:
            # Avoid an opaque ZeroDivisionError when there is nothing to train on
            raise ValueError("train_dataloader yielded no batches")

        # Average loss for the epoch
        avg_loss = total_loss / n_batches
        history.append(avg_loss)
        Average_loss.append(avg_loss)
        all_epoch.append(epoch + 1)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

    return history


# Evaluation of the model
def evaluate_model(model, test_dataloader, device, model_type = 'LSTM'):
    """Run ``model`` over ``test_dataloader`` and collect its predictions.

    Args:
        model: Module called as ``model(first_input, second_input)``.
        test_dataloader: Iterable of batches of three tensors: the two model
            inputs followed by the labels.
        device: Device the model and the batches are moved to.
        model_type: 'LSTM' or anything else for BERT; both kinds of batch are
            unpacked and fed to the model in the same way.

    Returns:
        Tuple ``(true_labels, prediction)`` of two lists holding, for every
        evaluated sample, the gold label and the index of the largest logit.
    """
    model.to(device)
    model.eval()

    gold, guessed = [], []
    with torch.no_grad():
        for batch in test_dataloader:
            # LSTM batches: (ids, lengths, labels); BERT batches: (ids, mask, labels)
            first_input, second_input, labels = (t.to(device) for t in batch)
            logits = model(first_input, second_input)
            # Index of the largest logit along the class dimension
            best = torch.max(logits, dim=1).indices
            guessed.extend(best.cpu().numpy())
            gold.extend(labels.cpu().numpy())
    return gold, guessed

#### **Plotting Result Functions**

In [None]:
# Function for Ploting Average Loss vs. Epoch

import matplotlib.pyplot as plt

def plot_loss_vs_epoch(all_epoch, Average_loss, start=0, end=20):
    """Plot the average training loss for the epochs in ``[start:end]``.

    Args:
        all_epoch: Sequence of epoch numbers (one per recorded epoch).
        Average_loss: Sequence of average losses aligned with ``all_epoch``.
        start: First index of the slice to plot.
        end: Index one past the last entry to plot.
    """
    # Epoch numbers are converted to strings so they are drawn as categories
    epoch_labels = [str(num) for num in all_epoch[start:end]]
    losses = Average_loss[start:end]

    fig, ax = plt.subplots(figsize=(7, 3))
    ax.plot(epoch_labels, losses, marker='o', label='Average Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Average Loss')
    ax.set_title('Average Loss vs. Epoch')
    ax.legend()
    ax.grid(True)  # Gridlines for better readability

    fig.tight_layout()
    plt.show()

In [None]:
# Function for classification report and Ploting confusion matrix:

def plot_classification_report_and_confusion_matrix(true_labels, preds, id2label, model_name):
    """Print a classification report and draw the confusion matrix.

    Args:
        true_labels: Iterable of gold class ids.
        preds: Iterable of predicted class ids.
        id2label: Mapping from class id to class name.
        model_name: Name shown in the confusion-matrix title.
    """
    # One explicit class order (by id) shared by the report, the matrix and
    # the axis ticks. Without it scikit-learn orders the string labels
    # alphabetically, which does not have to match the order of
    # id2label.values(), so names could be attached to the wrong rows/columns.
    class_names = [id2label[idx] for idx in sorted(id2label)]

    # Convert numerical predictions and true labels back to string labels
    true_labels_str = [id2label[label] for label in true_labels]
    preds_str = [id2label[label] for label in preds]

    # Print classification report
    print(classification_report(true_labels_str, preds_str, labels=class_names))

    print('........................................................ \n')

    # Confusion matrix with rows/columns in the same order as class_names
    conf_matrix = confusion_matrix(true_labels_str, preds_str, labels=class_names)
    # Display the confusion matrix using a heatmap
    plt.figure(figsize=(5, 5))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
    plt.title(f"Confusion Matrix for {model_name}")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()

# **Step 5: Applying Models**

## ........  5 - 1: **Prepare Data for LSTM Models** ............

In [None]:
tokenizer = get_tokenizer('basic_english')

In [None]:
# Tokenization
def tokenize(text):
    """Split ``text`` into tokens with the notebook-level ``tokenizer``.

    NOTE: ``tokenizer`` is looked up when the function is called. Later in the
    notebook the same name is rebound to a BERT tokenizer, so calling this
    function after that cell no longer uses the basic-English tokenizer.
    """
    tokens = tokenizer(text)
    return tokens

# Create the vocabulary
def yield_tokens(data):
    """Lazily yield the token list of each text in ``data``."""
    yield from (tokenize(text) for text in data)

vocab = build_vocab_from_iterator(yield_tokens(X_train), specials=['<unk>', '<pad>', '<bos>', '<eos>'])
vocab.set_default_index(vocab['<unk>'])

In [None]:
# Prepare data for LSTM
def prepare_data_for_lstm(sentences, labels, tokenizer, vocab):
    """Turn raw sentences into padded id tensors for the LSTM models.

    Args:
        sentences: Iterable of raw text strings.
        labels: Sequence of integer class ids, aligned with ``sentences``.
        tokenizer: Callable mapping a string to a list of tokens. It is now
            actually used (the global ``tokenize`` helper was used before,
            which made this argument a no-op).
        vocab: Mapping from token to integer id; must contain ``'<pad>'``.

    Returns:
        Tuple ``(token_ids, lengths, labels)`` of tensors: the id matrix
        padded to the longest sentence, the unpadded length of every
        sentence, and the labels.
    """
    tokenized_sentences = [tokenizer(sentence) for sentence in sentences]
    lengths = [len(tokens) for tokens in tokenized_sentences]

    # Compute the padded width once instead of once per sentence;
    # default=0 keeps an empty input from raising.
    max_len = max(lengths, default=0)
    pad_id = vocab['<pad>']
    numerical_sentences = [
        [vocab[token] for token in tokens] + [pad_id] * (max_len - len(tokens))
        for tokens in tokenized_sentences
    ]

    return torch.tensor(numerical_sentences), torch.tensor(lengths), torch.tensor(labels)

# Prepare training and testing data for LSTM
lstm_train_data = prepare_data_for_lstm(X_train, y_train, tokenizer, vocab)
lstm_test_data = prepare_data_for_lstm(X_test, y_test, tokenizer, vocab)

In [None]:
lstm_train_data

In [None]:
# Create TensorDatasets
# Each prepared tuple is (token ids, lengths, labels); unpack it straight
# into the dataset constructor.
lstm_train_dataset = TensorDataset(*lstm_train_data[:3])
lstm_test_dataset = TensorDataset(*lstm_test_data[:3])

In [None]:
lstm_train_dataset[0]  # first text (sentence)

In [None]:
# Create DataLoaders
batch_size = 16
lstm_train_dataloader = DataLoader(lstm_train_dataset, batch_size=batch_size, shuffle=True)
lstm_test_dataloader = DataLoader(lstm_test_dataset, batch_size=batch_size, shuffle=False)

## ........  5 - 2: **LSTM: Training  and Evaluation** ............

In [None]:
# Hyper-parameter grid for the LSTM model.
# NOTE(review): 0.01 appears twice in learning_rate, so that configuration is
# trained twice per hidden size - confirm whether the last value is intended.
learning_rate = [0.001, 0.005, 0.01, 0.05, 0.01]
hidden_dim_list = [128, 256]

# Settings shared by every run of the grid
embedding_dim = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
loss_fn = nn.CrossEntropyLoss()

for hidden_dim in hidden_dim_list:
    for lr in learning_rate:
        # A fresh model and optimizer for every (hidden_dim, lr) combination
        lstm_model = LSTMClassifier(
            vocab_size=len(vocab),
            embedding_dim=embedding_dim,
            hidden_dim=hidden_dim,
            output_dim=len(unique_labels),
        )
        optimizer = torch.optim.Adam(lstm_model.parameters(), lr=lr)

        print(f'---------- Result for the learing Rate: {lr} and hidden_dim: {hidden_dim} Started ----------')

        # Train, then evaluate on the held-out test set
        train_model(lstm_model, lstm_train_dataloader, optimizer, loss_fn, device, epochs=20, model_type='LSTM')
        lstm_true_labels, lstm_preds = evaluate_model(lstm_model, lstm_test_dataloader, device, model_type='LSTM')

        # Classification report and confusion matrix for this configuration
        plot_classification_report_and_confusion_matrix(lstm_true_labels, lstm_preds, id2label, "LSTM")

        print(f' ----------Result for the learing Rate: {lr} and hidden_dim: {hidden_dim} Finished ----------')
        print('\n\n')


print('\n\n')
print('---------------------- All process finished. ----------------------')

In [None]:
# Create the plot for Loss and Epoch
plot_loss_vs_epoch(all_epoch, Average_loss,  start=0, end=20)

## ........  5 - 3: **Bidirectional LSTM: Training  and Evaluation** ............

In [None]:
# Hyper-parameter grid for the bidirectional LSTM model.
# NOTE(review): 0.01 appears twice in learning_rate, so that configuration is
# trained twice per hidden size - confirm whether the last value is intended.
learning_rate = [0.001, 0.005, 0.01, 0.05, 0.01]
hidden_dim_list = [128, 256]

# Settings shared by every run of the grid
embedding_dim = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
loss_fn = nn.CrossEntropyLoss()

for hidden_dim in hidden_dim_list:
    for lr in learning_rate:
        # A fresh model and optimizer for every (hidden_dim, lr) combination
        bilstm_model = BiLSTMClassifier(
            vocab_size=len(vocab),
            embedding_dim=embedding_dim,
            hidden_dim=hidden_dim,
            output_dim=len(unique_labels),
        )
        optimizer = torch.optim.Adam(bilstm_model.parameters(), lr=lr)

        print(f'---------- Result for the learing Rate: {lr} and hidden_dim: {hidden_dim} Started ----------')

        # The BiLSTM consumes the same (ids, lengths, labels) batches as the LSTM
        train_model(bilstm_model, lstm_train_dataloader, optimizer, loss_fn, device, epochs=20, model_type='LSTM')
        bi_lstm_true_labels, bi_lstm_preds = evaluate_model(bilstm_model, lstm_test_dataloader, device, model_type='LSTM')

        # Classification report and confusion matrix for this configuration
        plot_classification_report_and_confusion_matrix(bi_lstm_true_labels, bi_lstm_preds, id2label, " Bidirectional LSTM")
        print('\n')
        print(f' ----------Result for the learing Rate: {lr} and hidden_dim: {hidden_dim} Finished ----------')
        print('\n\n')


print('\n\n')
print('---------------------- All process finished. ----------------------')

In [None]:
plot_loss_vs_epoch(all_epoch, Average_loss,  start=20, end=40)

## ........  5 - 4: **Data Preparation for BERT Models** ............

In [None]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Testing tokenizer
tokenizer("Hello World") 

# sentence example : result creates format of: [CLS] + sentence + [SEP]

In [None]:
tokenizer("Hello World , How are you") # sentence example

In [None]:
def prepare_data_for_bert(sentences, labels, tokenizer, max_length=128):
    """Tokenize ``sentences`` for BERT and convert ``labels`` to a tensor.

    Args:
        sentences: Iterable of raw text strings. It is materialized as a list
            before tokenization so other sequence types can be passed too.
        labels: Sequence of integer class ids, aligned with ``sentences``.
        tokenizer: Callable invoked as ``tokenizer(sentences, padding=True,
            truncation=True, return_tensors='pt', max_length=...)``.
        max_length: Maximum sequence length forwarded to the tokenizer
            (defaults to the previously hard-coded 128).

    Returns:
        Tuple ``(encodings, labels)``: whatever the tokenizer returned and the
        labels as a tensor.
    """
    encodings = tokenizer(
        list(sentences),
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=max_length,
    )
    labels = torch.tensor(labels)

    return encodings, labels

# Prepare training and testing data for BERT
# Tokenize both splits; each call returns (encodings, label tensor)
train_encodings, train_labels = prepare_data_for_bert(X_train, y_train, tokenizer)
test_encodings, test_labels = prepare_data_for_bert(X_test, y_test, tokenizer)

# Pull the tensors the models consume out of the encodings
train_input_ids, train_attention_mask = train_encodings['input_ids'], train_encodings['attention_mask']
test_input_ids, test_attention_mask = test_encodings['input_ids'], test_encodings['attention_mask']

# Bundle (input ids, attention mask, labels) per split
train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels)

In [None]:
len(train_encodings)

In [None]:
train_encodings

In [None]:
train_encodings['input_ids']

In [None]:
train_encodings['attention_mask']

In [None]:
train_dataset[0]

In [None]:
# Create DataLoaders
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## ........  5 - 5: **BERT: Training  and Evaluation** ............

In [None]:
# Hyper-parameter grid for the plain BERT classifier
learning_rate = [0.00001, 0.0001, 0.001]
epoch_numers = [4, 5]

# Settings shared by every run of the grid
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
loss_fn = nn.CrossEntropyLoss()

for epoch_num in epoch_numers:
    for lr in learning_rate:
        # Start every configuration from freshly loaded pre-trained weights
        model = BERTClassifier(num_labels=len(unique_labels))
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

        print(f'---------- Result for the   learing Rate: {lr} and   Epoch number: {epoch_num} Started ----------')

        # Fine-tune, then evaluate on the held-out test set
        train_model(model, train_dataloader, optimizer, loss_fn, device, epochs=epoch_num, model_type='BERT')
        bert_true_labels, bert_predict = evaluate_model(model, test_dataloader, device, model_type='BERT')

        # Classification report and confusion matrix for this configuration
        plot_classification_report_and_confusion_matrix(bert_true_labels, bert_predict, id2label, "BERT")

        print(f' ----------Result for the   learing Rate: {lr} and   Epoch number: {epoch_num} Finished ----------')
        print('\n\n')

print('\n\n')
print('---------------------- All process finished. ----------------------')

In [None]:
plot_loss_vs_epoch(all_epoch, Average_loss,  start= 40, end= 45)

## ........  5 - 6: **BERT_Larg: Training  and Evaluation** ............

In [None]:
# Hyper-parameter grid for the BERT classifier with the larger head
learning_rate = [0.00001, 0.0001, 0.001]
epoch_numers = [4, 5]

# Settings shared by every run of the grid
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
loss_fn = nn.CrossEntropyLoss()

for epoch_num in epoch_numers:
    for lr in learning_rate:
        # Start every configuration from freshly loaded pre-trained weights
        model_B_L = BERTClassifier_Larg(num_labels=len(unique_labels))
        optimizer = torch.optim.AdamW(model_B_L.parameters(), lr=lr)

        print(f'---------- Result for the   learing Rate: {lr} and   Epoch number: {epoch_num} Started ----------')

        # Fine-tune, then evaluate on the held-out test set
        train_model(model_B_L, train_dataloader, optimizer, loss_fn, device, epochs=epoch_num, model_type='BERT')
        bert_true_labels, bert_predict = evaluate_model(model_B_L, test_dataloader, device, model_type='BERT')

        # Classification report and confusion matrix for this configuration
        plot_classification_report_and_confusion_matrix(bert_true_labels, bert_predict, id2label, "BERTClassifier_Larg Model")

        print(f' ----------Result for the   learing Rate: {lr} and   Epoch number: {epoch_num} Finished ----------')
        print('\n\n')

print('\n\n')
print('---------------------- All process finished. ----------------------')

## Sample outputs

In [None]:
# Check the results
# `true_labels` and `prediction` only exist as locals inside evaluate_model;
# bind them here to the results of the most recently evaluated BERT model so
# this cell (and the following ones) do not fail with a NameError.
true_labels, prediction = bert_true_labels, bert_predict
print(f"True labels sample: {true_labels[:10]}")
print(f"Predictions sample: {prediction[:10]}")

In [None]:
# Sample outputs for visualization
# Use ONE slice for sentences, gold labels and predictions so that every
# printed sentence is paired with its own labels (previously sentences 5..14
# were zipped with the labels of samples 0..9).
# The predictions come from the last evaluated BERT model; the test
# DataLoader is created with shuffle=False, so they are in X_test order.
sample_slice = slice(5, 15)
sample_sentences = X_test[sample_slice]
sample_true_labels = [id2label[label] for label in y_test[sample_slice]]
sample_pred_labels = [id2label[label] for label in bert_predict[sample_slice]]

In [None]:
# Print some sample test data outputs
for sentence, true_label, pred_label in zip(sample_sentences, sample_true_labels, sample_pred_labels):
    print(f"Sentence: {sentence}\nTrue Label: {true_label}\nPredicted Label: {pred_label}\n")

# **Step 6: Knowledge Graph**



## ........  **6 - 1: Data Preparation for KG** ............

In [None]:
df = pd.DataFrame(under_sampled_data)

df_KG = df[['sub', 'obj', 'pred']]

df_KG['sub'].head(3)

In [None]:
pip install networkx

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Initialize the graph
graph = nx.Graph()

# One edge per (subject, object) row, labelled with its predicate.
# add_edge creates any endpoint that is not in the graph yet, so the nodes
# do not have to be added explicitly beforehand.
for index, row in df_KG.iterrows():
    sub, obj, pred = row['sub'], row['obj'], row['pred']
    graph.add_edge(sub, obj, relation=pred)

# Example of accessing nodes and edges
print("Nodes:", graph.nodes)
print("Edges:", graph.edges(data=True))

In [None]:
print("Number of nodes:", len(graph.nodes))
print("Number of edges:", len(graph.edges))
print("Number of dataset:", len(df_KG))
print("Number of unique subjects:", len(df_KG['sub'].unique()))
print("Number of unique objects:", len(df_KG['obj'].unique()))
print("Number of unique predicates:", len(df_KG['pred'].unique()))


## ........  **6 - 2: Static Graph Visualization** ............

In [None]:
plt.figure(figsize=(12, 8))

# Layout algorithm for visualization
pos = nx.spring_layout(graph, seed=42)

# Draw nodes and edges
nx.draw(graph, pos, with_labels=True, node_size=2000, node_color='skyblue', edge_color='gray', font_size=12, font_weight='bold')

# Draw edge labels with adjusted positioning and style
edge_labels = nx.get_edge_attributes(graph, 'relation')
nx.draw_networkx_edge_labels(graph, pos, edge_labels=edge_labels, font_color='red', font_size=10, label_pos=0.5)

plt.title("Knowledge Graph", fontsize=16, fontweight='bold')
plt.axis('off')  # Hide the axis

# Display the graph
plt.tight_layout()
plt.show()


## ........  **6 - 3: Dynamic Graph Visualization** ............

In [None]:
import networkx as nx
import plotly.graph_objects as go
import matplotlib.pyplot as plt

# Ploting Dynamic Graph
def visualize_dynamic_graph(graph, title='Knowledge Graph', seed=42):
    """Show an interactive Plotly rendering of ``graph``.

    Nodes are placed with networkx's spring layout; edges are drawn as gray
    line segments whose hover text is the edge's ``relation`` attribute, and
    nodes are drawn as labelled markers.

    Args:
        graph: networkx graph; edges may carry a ``relation`` attribute.
        title: Text used for the figure title and the corner annotation.
        seed: Seed passed to the layout so the picture is reproducible.
    """
    # Layout algorithm for visualization
    pos = nx.spring_layout(graph, seed=seed)

    # Every edge contributes three points to the line trace: start, end and a
    # None separator that breaks the line before the next edge.
    edge_x, edge_y, edge_text = [], [], []
    for u, v, attrs in graph.edges(data=True):
        x0, y0 = pos[u]
        x1, y1 = pos[v]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
        # Hover text is matched to points by position, so it needs one entry
        # per point (three per edge); one entry per edge shifts the labels
        # onto the wrong edges. Edges without a relation get an empty label.
        relation = attrs.get('relation', '')
        edge_text.extend([relation] * 3)

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=1, color='gray'),
        hoverinfo='text',
        text=edge_text,
        mode='lines')

    # Node coordinates and labels, in graph iteration order
    node_x, node_y, node_text = [], [], []
    for node in graph.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        node_text.append(node)

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        text=node_text,  # Shown next to the marker and on hover
        textposition="top center",
        hoverinfo='text',
        marker=dict(
            showscale=False,
            color='skyblue',
            size=20,
            line_width=2))

    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        # Title font is set through title.font; the legacy
                        # `titlefont_size` shortcut is deprecated.
                        title=dict(text=title, font=dict(size=16)),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        annotations=[dict(
                            text=title,
                            showarrow=False,
                            xref="paper", yref="paper",
                            x=0.005, y=-0.002)],
                        xaxis=dict(showgrid=False, zeroline=False),
                        yaxis=dict(showgrid=False, zeroline=False))
                    )

    fig.show()

In [None]:
visualize_dynamic_graph(graph = graph, title='Knowledge Graph', seed=42)

### **KG for a Specific Predicate of Interest (e.g., `institution`)**

In [None]:
import networkx as nx
import plotly.graph_objects as go
import matplotlib.pyplot as plt

# Define the predicate of interest (relation between sub and obj)
predicate_of_interest = 'institution'

# Initialize the graph (this rebinds `graph`, replacing the full graph)
graph = nx.Graph()

# `df_KG_limited` is never defined in this notebook, so the loop failed with
# a NameError; the triples come from `df_KG`, built in section 6-1.
rows_of_interest = df_KG[df_KG['pred'] == predicate_of_interest]

# Add one edge per matching triple; add_edge also creates missing endpoints
for index, row in rows_of_interest.iterrows():
    graph.add_edge(row['sub'], row['obj'], relation=row['pred'])

In [None]:
visualize_dynamic_graph(graph = graph, title='Knowledge Graph', seed=42)