In [None]:
import json
import pickle 
import torch 
from torch_geometric.data import Data
import numpy as np


In [None]:
# Load the raw inputs from the current working directory.
# The JSON files are opened as UTF-8 explicitly so that parsing does not depend
# on the platform's default text encoding.
with open('edges.json', 'r', encoding='utf-8') as f:
    edges = json.load(f)

with open('labels.json', 'r', encoding='utf-8') as f:
    labels = json.load(f)

# SECURITY NOTE: pickle.load can execute arbitrary code stored in the file.
# Only load these .pkl files if they come from a trusted source.
with open('keyword_embeddings.pkl', 'rb') as f:
    keyword_embeddings = pickle.load(f)

with open('tweet_embeddings.pkl', 'rb') as f:
    tweet_embeddings = pickle.load(f)

with open('user_embeddings.pkl', 'rb') as f:
    user_embeddings = pickle.load(f)

In [None]:
# Preview only the first few edges; pretty-printing the whole edge list
# floods the cell output and bloats the saved notebook.
print(json.dumps(edges[:5], indent = 4))

In [None]:
# Preview only the first ten key/label pairs (in the mapping's iteration order)
# instead of pretty-printing the whole labels mapping.
print(json.dumps(dict(list(labels.items())[:10]), indent = 4))

In [None]:
# Show the unpickled keyword embeddings object as-is to inspect its structure.
# NOTE(review): this prints the whole object; output may be large.
print(keyword_embeddings)

In [None]:
# Show the unpickled tweet embeddings object as-is to inspect its structure.
# NOTE(review): this prints the whole object; output may be large.
print(tweet_embeddings)

In [None]:
# Show the unpickled user embeddings object as-is to inspect its structure.
# NOTE(review): this prints the whole object; output may be large.
print(user_embeddings)

In [None]:

def check_missing_values(data, name):
    '''Recursively report every ``None`` found inside nested containers.

    Walks ``data`` depth-first. Dicts, lists and numpy arrays are descended
    into; any element that ``is None`` is reported with ``print``. Values of
    any other type are ignored, and NaN is *not* treated as missing here.

    Args:
        data: A dict, list or ``np.ndarray`` (possibly nested). Any other
            type is silently skipped.
        name: Label of the dataset, included verbatim in every message.

    Returns:
        None. Findings are printed, one line per missing value. The index
        (and key, for dicts) in a message is relative to the innermost
        container holding the ``None``.
    '''
    nested_types = (dict, list, np.ndarray)

    if isinstance(data, dict):
        # Dict entries are reported with both their key and positional index.
        located = (
            (f'{key} and index {index}', value)
            for index, (key, value) in enumerate(data.items())
        )
    elif isinstance(data, (list, np.ndarray)):
        # Only object-dtype arrays can hold None (or nested containers);
        # skipping other dtypes avoids a slow per-element Python loop over
        # large numeric embedding matrices without changing what is reported.
        if isinstance(data, np.ndarray) and data.dtype.kind != 'O':
            return
        located = ((f'{index}', value) for index, value in enumerate(data))
    else:
        return

    for location, value in located:
        if isinstance(value, nested_types):
            check_missing_values(value, name)
        elif value is None:
            print(f'Missing value in {location} in {name}')

# Run the missing-value scan over every loaded dataset, in load order.
for dataset_name, dataset in (
    ('edges', edges),
    ('labels', labels),
    ('keyword_embeddings', keyword_embeddings),
    ('tweet_embeddings', tweet_embeddings),
    ('user_embeddings', user_embeddings),
):
    check_missing_values(dataset, dataset_name)



In [None]:
# Detect edges that occur more than once. An edge is identified by its
# (source_id, relation, target_id) triple.
found = set()
duplicates = []
for edge in edges:
    edge_index = (edge['source_id'], edge['relation'], edge['target_id'])
    if edge_index in found:
        duplicates.append(edge_index)
    else:
        found.add(edge_index)

# Print each repeated triple once per extra occurrence (the previous version
# printed the entire `duplicates` list on every iteration).
for duplicate in duplicates:
    print(duplicate)


In [None]:
# Detect users that appear more than once among the label keys.
# NOTE(review): `labels` comes from json.load and is iterated via .keys(), so
# it is presumably a dict whose keys are already unique; json.load keeps only
# the last value for a repeated key, so duplicates in the raw file cannot be
# detected here. Confirm whether this check is still needed.
found = set()
duplicates = []
for user in labels.keys():
    # Membership must be tested against the ids seen so far; the earlier
    # `user in set()` compared against a fresh empty set and was always False.
    if user in found:
        duplicates.append(user)
    else:
        found.add(user)

# Report any duplicates, mirroring the duplicate-edge check.
for duplicate in duplicates:
    print(duplicate)
        

In [None]:
def check_embeddings(data, name):
    '''Validate an embeddings collection: NaN values and consistent length.

    Args:
        data: Mapping with an ``'embeddings'`` entry holding a sequence of
            vectors (rows of a 2-D array or a list of sequences).
        name: Label used in the printed messages.

    Returns:
        None. Problems are printed:
        * one line if the collection is empty,
        * one line if any value is NaN,
        * one line per vector whose length differs from the first vector's.
    '''
    embeddings = data['embeddings']

    # Guard against empty input, which would otherwise fail on embeddings[0].
    if len(embeddings) == 0:
        print(f'No embeddings found in {name}')
        return

    expected_len = len(embeddings[0])
    mismatched = [
        index for index, embedding in enumerate(embeddings)
        if len(embedding) != expected_len
    ]

    if mismatched:
        # Ragged input cannot be turned into one rectangular array, so
        # np.isnan on the whole collection would raise before the size
        # problem could be reported. Check each vector separately instead.
        has_nan = any(
            np.isnan(np.asarray(embedding, dtype=float)).any()
            for embedding in embeddings
        )
    else:
        has_nan = np.isnan(embeddings).any()

    if has_nan:
        print(f'There is a nan value in {name}')

    for _ in mismatched:
        print(f'Not smae size arrays in {name}')
            
# Validate each embeddings collection in turn (NaN values, vector sizes).
for embeddings_label, embeddings_data in (
    ('keyword', keyword_embeddings),
    ('tweet', tweet_embeddings),
    ('user', user_embeddings),
):
    check_embeddings(embeddings_data, embeddings_label)


In [None]:
def print_embeddings(data, break_num = 5):
    '''Print the first ``break_num`` embeddings as ``id : vector`` lines.

    Args:
        data: Mapping with parallel ``'ids'`` and ``'embeddings'`` entries;
            ``ids[i]`` is printed next to ``embeddings[i]``.
        break_num: Maximum number of rows to print. ``None`` or a negative
            value prints every row.

    Returns:
        None. Output goes to stdout.
    '''
    embeddings = data['embeddings']
    ids = data['ids']

    unlimited = break_num is None or break_num < 0

    for i, embedding in enumerate(embeddings):
        # Test the limit before printing so exactly `break_num` rows appear
        # (testing after the print showed one row too many).
        if not unlimited and i >= break_num:
            break
        print(f'{ids[i]} : {embedding}')
        

In [None]:
# Preview the first few keyword embeddings as `id : vector` lines
# (print_embeddings' default row limit applies).
print_embeddings(keyword_embeddings)

In [None]:
# Preview the first few tweet embeddings as `id : vector` lines
# (print_embeddings' default row limit applies).
print_embeddings(tweet_embeddings)


In [None]:
# Preview the first few user embeddings as `id : vector` lines
# (print_embeddings' default row limit applies).
print_embeddings(user_embeddings)