In [1]:
from convert_AMR import create_dataloader, convert_AMR
from litgpt.model import GPT
import numpy as np
import torch
import networkx as nx
from litgpt.config import Config
import matplotlib.pyplot as plt
import random
import itertools
from io import StringIO

2024-08-16 16:33:32 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


In [2]:
def magnetic_laplacian(g, q = 0.25, tolerance = 1e-10):
    def exp_theta_i(A, q=0.25):
        return np.exp(2 * np.pi * q * 1j*(A - A.T))
    A_symmetric = nx.adjacency_matrix(g.to_undirected()).toarray()
    A_directed = nx.adjacency_matrix(g).toarray()
    D_s = np.diag(np.sum(A_symmetric, axis=1))
    asymmetric_element = exp_theta_i(A_directed, q)
    laplacian = D_s - np.multiply(asymmetric_element, A_symmetric)
    
    # Check if the real part is very close to zero and set it to zero
    laplacian_real = np.real(laplacian)
    laplacian_real[np.abs(laplacian_real) < tolerance] = 0
    # Reconstruct the complex array with the updated real part and original imaginary part
    laplacian = laplacian_real + 1j * laplacian.imag
    return laplacian

def convert_complex_to_real(eig_vecs):
    """
    Convert complex eigenvectors to real by concatenating the real and imaginary parts.

    Parameters:
    eig_vecs (numpy.ndarray): Complex eigenvectors.

    Returns:
    numpy.ndarray: Real-valued vectors.
    """
    real_part = np.real(eig_vecs)
    imag_part = np.imag(eig_vecs)
    real_valued_vecs = np.concatenate((real_part, imag_part), axis=-1)
    return real_valued_vecs

def magL_eigenvalues(MagL):
    eigvals, eig_vecs = np.linalg.eig(MagL)
    return convert_complex_to_real(eig_vecs)


In [3]:
import torch

def pad_1d_tensor(tensor, target_length, pad_value=0):
    """
    Pads the input 1D tensor to the target length with the specified pad value.
    
    Args:
    tensor (torch.Tensor): The input 1D tensor to pad.
    target_length (int): The target length.
    pad_value (float): The value to use for padding.
    
    Returns:
    torch.Tensor: The padded 1D tensor.
    """
    pad_length = target_length - tensor.size(0)
    return torch.nn.functional.pad(tensor, (0, pad_length), 'constant', pad_value)

def list_of_lists_to_3d_tensor(list_of_lists, pad_value=0):
    """
    Converts a list of list of 1D tensors into a 3D tensor by padding the smaller tensors.
    
    Args:
    list_of_lists (list of list of torch.Tensor): The input list of lists of 1D tensors.
    pad_value (float): The value to use for padding.
    
    Returns:
    torch.Tensor: The resulting 3D tensor.
    """
    # Flatten the list of lists and determine the max length
    all_tensors = [tensor for sublist in list_of_lists for tensor in sublist]
    max_length = max(tensor.size(0) for tensor in all_tensors)
    
    # Pad each 1D tensor and gather them in a list
    padded_tensors = [[pad_1d_tensor(tensor, max_length, pad_value) for tensor in sublist] for sublist in list_of_lists]
    
    # Stack the padded tensors to form a 3D tensor
    stacked_tensor = [torch.stack(sublist) if len(sublist)>0 else torch.tensor([]) for sublist in padded_tensors]
    stacked_tensor = torch.stack(stacked_tensor)
    
    return stacked_tensor
import torch

def pad_1d_tensor(tensor, target_length, pad_value=0):
    """
    Pads the input 1D tensor to the target length with the specified pad value.
    
    Args:
    tensor (torch.Tensor): The input 1D tensor to pad.
    target_length (int): The target length.
    pad_value (float): The value to use for padding.
    
    Returns:
    torch.Tensor: The padded 1D tensor.
    """
    pad_length = target_length - tensor.size(0)
    return torch.nn.functional.pad(tensor, (0, pad_length), 'constant', pad_value)

def pad_list_of_tensors(lists, target_length, max_length, pad_value):
    """
    Pads the list of tensors to the target length with tensors filled with the pad value.
    
    Args:
    lists (list of torch.Tensor): The input list to pad.
    target_length (int): The target length.
    max_length (int): The maximum length of the 1D tensors.
    pad_value (float): The value to use for padding.
    
    Returns:
    list of torch.Tensor: The padded list.
    """
    padded_tensor = torch.tensor([pad_value] * max_length)
    padded_list = lists + [padded_tensor] * (target_length - len(lists))
    return padded_list

def list_of_lists_to_4d_tensor(list_of_lists, pad_value=0):
    """
    Converts a list of list of list of 1D tensors into a 4D tensor by padding the smaller tensors
    and lists.
    
    Args:
    list_of_lists (list of list of list of torch.Tensor): The input list of lists of lists of 1D tensors.
    pad_value (float): The value to use for padding.
    
    Returns:
    torch.Tensor: The resulting 4D tensor.
    """
    # Flatten the 3-layer list and determine the max length and sizes
    all_tensors = [tensor for lvl1 in list_of_lists for lvl2 in lvl1 for tensor in lvl2]
    max_length = max(tensor.size(0) for tensor in all_tensors)
    max_lvl2_length = max(len(lvl2) for lvl1 in list_of_lists for lvl2 in lvl1)
    max_lvl1_length = max(len(lvl1) for lvl1 in list_of_lists)
    max_lvl0_length = len(list_of_lists)
    
    # Pad each 1D tensor and reconstruct the 3-layer structure
    padded_tensors = [[[pad_1d_tensor(tensor, max_length, pad_value) for tensor in lvl2] for lvl2 in lvl1] for lvl1 in list_of_lists]

    # Pad the inner lists to the maximum level 2 length
    padded_tensors = [[pad_list_of_tensors(lvl2, max_lvl2_length, max_length, pad_value) for lvl2 in lvl1] for lvl1 in padded_tensors]

    # Pad the level 1 lists to the maximum level 1 length
    padded_tensors = [pad_list_of_tensors(lvl1, max_lvl1_length, max_length, pad_value) for lvl1 in padded_tensors]

    # Pad the outermost list to the maximum level 0 length
    padded_tensors = pad_list_of_tensors(padded_tensors, max_lvl0_length, max_length, pad_value)

    # Separate the stacking operations
    stacked_lvl3 = [[torch.stack(lvl3) for lvl3 in lvl2] for lvl2 in padded_tensors]
    stacked_lvl2 = [torch.stack(lvl2) for lvl2 in stacked_lvl3]
    stacked_lvl1 = torch.stack(stacked_lvl2)

    return stacked_lvl1


def tensor_of_tensors_to_list_of_lists(tensor):
    return tensor.tolist()


In [4]:
import json
path = "./data/amr3/processed_amr"
dl, tokenizer = create_dataloader(path, batch_size=128)

# Initialize the dictionary to store batches
batches_dict = {
    "node_ids": [],
    "eig_vecs": [],
    "node_labels": [],
    "graphs": [],
    "target_labels": [],
    "graph_text": [],   
}
c = 0
for batch in dl:
    node_ids_batch = []
    edge_index_batch = []
    eig_vecs_batch = []
    node_labels_batch = []
    graphs_batch = []
    labels_batch = []
    graph_text_batch = []
    if c==2:
        #break
        pass

    for i in range(len(batch["input_ids"])):
        tgt_labels = batch["labels"][i]
        edges, edge_type = batch["graphs"][i]
        input_ids = batch["input_ids"][i]
        target_text = batch["tgt_text"][i]
        source_text = batch["src_text"][i]
        AMR = convert_AMR(input_ids=input_ids, edges=edges, edge_types=edge_type, target_labels=tgt_labels, tokenizer=tokenizer, target_text=target_text)
        # Prepare graph input
        G = AMR["graph"]
        
        if G.number_of_nodes() == 0:
            continue
        node_labels = AMR["node_labels"]
        # MagL = magnetic_laplacian(G, q=0.25)
        # eig_vecs = magL_eigenvalues(MagL)

        # Get the edge list
        edge_list = nx.generate_edgelist(G, data=False)

        # Convert edge list to a single string
        edge_list_str = "\n".join(edge_list)

        
        node_ids_batch.append(torch.tensor(list(AMR["node_labels"].keys())))
        # eig_vecs_batch.append(torch.tensor(eig_vecs))
        node_labels_batch.append(torch.tensor(AMR["input_ids"]))
        graphs_batch.append(G)
        labels_batch.append(AMR["target_text"])
        graph_text_batch.append(edge_list_str)
        sentence = tokenizer.decode(input_ids)
        if(len(sentence.split()) != len(G.nodes())):
            if source_text != len(G.nodes()):
                print("ERROR")
                print(sentence)
                print(source_text)
                for n in sorted(G.nodes()):

                    print(n, AMR["node_labels"][n])
    
    batch_dict = {}    
    for i in range(len(node_labels_batch)):
        batch_dict[i] = {
            "node_ids": node_ids_batch[i].tolist(),
            # "eig_vecs": list(eig_vecs_batch[i]),
            "node_labels": tokenizer.decode(node_labels_batch[i]),
            "target_labels": labels_batch[i],
            "graph_text": graph_text_batch[i],
        }
    # Specify the file path to save the JSON file
    file_path = f"./custom_datasets_batches/{c}.json"

    # Convert batch_dict to JSON format
    batch_dict_json = json.dumps(batch_dict)

    # Write the JSON data to the file
    with open(file_path, 'w') as file:
        file.write(batch_dict_json)
        
    # Append the processed batch to the dictionary
    batches_dict["node_ids"].append(node_ids_batch)
    # batches_dict["eig_vecs"].append(eig_vecs_batch)
    batches_dict["node_labels"].append(node_labels_batch)
    batches_dict["graphs"].append(graphs_batch)
    batches_dict["target_labels"].append(labels_batch)
    batches_dict["graph_text"].append(graph_text_batch)
    c += 1

# batches_dict["node_labels"] = list_of_lists_to_3d_tensor(batches_dict["node_labels"], pad_value=tokenizer.pad_token_id)
# batches_dict["eig_vecs"] = list_of_lists_to_4d_tensor(batches_dict["eig_vecs"], pad_value=0)



Added 165 tokens
ERROR
and :op1 believe :ARG0 person :ARG0-of have-org-role :ARG1 company :wiki - :name name :op1 IM :mod country :wiki United_States :name name :op1 United :op2 States :ARG2 officer :mod executive :mod chief :ARG1 capable :ARG1 person :ARG1-of employ :ARG0 company :mod each :ARG2 innovate :ARG0 person :op2 formulate :ARG0 officer :mod executive :mod chief :ARG1 countermeasure :mod strategy :purpose innovate :topic industry :time after :op1 invent :ARG0 company :ARG0-of compete :ARG1 company :ARG1 machine :ARG0-of wash :ARG1-of load :mod front
and :op1 believe :ARG0 person :ARG0-of have-org-role :ARG1 company :wiki - :name name :op1 IM :mod country :wiki United_States :name name :op1 United :op2 States :ARG2 officer :mod executive :mod chief :ARG1 capable :ARG1 person :ARG1-of employ :ARG0 company :mod each :ARG2 innovate :ARG0 person :op2 formulate :ARG0 officer :mod executive :mod chief :ARG1 countermeasure :mod strategy :purpose innovate :topic industry :time after :

FileNotFoundError: [Errno 2] No such file or directory: './custom_datasets_batches/0.json'

In [None]:
# # Initialize and run the model
# config = Config.from_name('stablelm-base-alpha-3b')  # Adjust according to your config
# model = GPT(config,batches_dict["eig_vecs"].size(-1))
# output = model(batches_dict)
# print(output)

In [None]:
# Generate positions for a tree layout
pos = nx.nx_agraph.graphviz_layout(G, prog='dot')
#edge_labels = nx.get_edge_attributes(G, 'label')
plt.figure(figsize=(120*2, 80*2))  # You can adjust the size as needed

# Draw the graph
#pos = nx.spring_layout(G, center=[0.5, 0.5])  
#nx.draw(G, pos, with_labels=True, labels = labels_dict, node_color='skyblue', node_size=10000, edge_color='k', linewidths=1, font_size=100)
nx.draw_networkx_nodes(G, pos, node_color='skyblue', node_size=1000)

# Draw edges with arrows
nx.draw_networkx_edges(G, pos, edge_color='gray', arrowstyle='-|>', arrowsize=150)

# Draw labels
nx.draw_networkx_labels(G, pos, labels=node_labels, font_size=150)

plt.show()

In [None]:
tokenizer.decode(AMR["target_labels"]) 

In [None]:
eigv = batches_dict["eig_vecs"][0][0]

In [None]:
eigv.shape

In [None]:
eigv[1]

In [None]:
batches_dict.keys()

In [None]:
for key in batches_dict.keys():
    print(key, batches_dict[key][0][0])

In [None]:
batches_dict

In [None]:
# Get the edge list
edge_list = nx.generate_edgelist(G, data=False)

# Convert edge list to a single string
edge_list_str = "\n".join(edge_list)

print(edge_list_str)

In [None]:
G

In [None]:
import numpy as np

def pad_and_concat_eigenvectors(eigenvectors, max_seq_len):
    """
    Pad and concatenate the real and imaginary parts of eigenvectors.

    Parameters:
    eigenvectors (list of list of complex): A square matrix (m by m) of complex-valued eigenvectors.
    max_seq_len (int): The maximum sequence length.

    Returns:
    np.ndarray: A vector with real parts from 0 to max_seq_len - 1 and imaginary parts from max_seq_len to max_seq_len * 2 - 1.
    """
    m = len(eigenvectors)
    
    # Initialize the padded vector with complex zeros
    padded_vector = np.zeros((max_seq_len,max_seq_len * 2,), dtype=complex)
    
    for i in range(m):
        real_part = [e.real for e in eigenvectors[i]]
        imag_part = [e.imag for e in eigenvectors[i]]
        
        # Place real part in the first half
        padded_vector[i][:m] = real_part
        # Place imaginary part in the second half
        padded_vector[i][max_seq_len:max_seq_len + m]  = imag_part
    
    return padded_vector.astype(np.float32)

# Example usage
eigenvectors = [
    [1+2j, 3+4j],
    [5+6j, 7+8j]
]
max_seq_len = 3

result = pad_and_concat_eigenvectors(eigenvectors, max_seq_len)
print(result)


[[1. 3. 0. 2. 4. 0.]
 [5. 7. 0. 6. 8. 0.]
 [0. 0. 0. 0. 0. 0.]]


  return padded_vector.astype(np.float32)


In [5]:
from transformers import T5Tokenizer
from init_tokenizer import CustomT5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-base")
tokenizer = CustomT5Tokenizer(tokenizer)

In [8]:
ids = tokenizer.encode("and :op1 believe :ARG0 person :ARG0-of have-org-role :ARG1 company :wiki - :name name :op1 IM :mod country :wiki United_States :name name :op1 United :op2 States :ARG2 officer :mod executive :mod chief :ARG1 capable :ARG1 person :ARG1-of employ :ARG0 company :mod each :ARG2 innovate :ARG0 person :op2 formulate :ARG0 officer :mod executive :mod chief :ARG1 countermeasure :mod strategy :purpose innovate :topic industry :time after :op1 invent :ARG0 company :ARG0-of compete :ARG1 company :ARG1 machine :ARG0-of wash :ARG1-of load :mod front")
for id in ids:
    print(tokenizer.decode([id]))

and
:op1
believe
:ARG0
person
:ARG0-of
have-org-role
:ARG1
company
:wiki
-
:name
name
:op1
IM
:mod
country
:wiki
United_States
:name
name
:op1
United
:op2
States
:ARG2
officer
:mod
executive
:mod
chief
:ARG1
capable
:ARG1
person
:ARG1-of
employ
:ARG0
company
:mod
each
:ARG2
innovate
:ARG0
person
:op2
formulate
:ARG0
officer
:mod
executive
:mod
chief
:ARG1
countermeasure
:mod
strategy
:purpose
innovate
:topic
industry
:time
after
:op1
invent
:ARG0
company
:ARG0-of
compete
:ARG1
company
:ARG1
machine
:ARG0-of
wash
:ARG1-of
load
:mod
front


In [7]:
ids

[32100,
 32101,
 32102,
 32103,
 32104,
 32105,
 32106,
 32107,
 32108,
 32109,
 32110,
 32111,
 32112,
 32101,
 32113,
 32114,
 32115,
 32109,
 32116,
 32111,
 32112,
 32101,
 32117,
 32118,
 32119,
 32120,
 32121,
 32114,
 32122,
 32114,
 32123,
 32107,
 32124,
 32107,
 32104,
 32125,
 32126,
 32103,
 32108,
 32114,
 32127,
 32120,
 32128,
 32103,
 32104,
 32118,
 32129,
 32103,
 32121,
 32114,
 32122,
 32114,
 32123,
 32107,
 32130,
 32114,
 32131,
 32132,
 32128,
 32133,
 32134,
 32135,
 32136,
 32101,
 32137,
 32103,
 32108,
 32105,
 32138,
 32107,
 32108,
 32107,
 32139,
 32105,
 32140,
 32125,
 32141,
 32114,
 32142]