In [29]:
import pickle
import os
import os.path as osp
import json
import numpy as np

In [4]:
def load_json(file_path):
    """Parse the JSON document stored at ``file_path`` and return the result.

    The file is opened in binary mode and handed to ``json.load``.
    """
    with open(file_path, "rb") as handle:
        return json.load(handle)


def load_pickle(file_path):
    """Unpickle and return the object stored at ``file_path``.

    The file is opened in binary mode and handed to ``pickle.load``.
    """
    # NOTE: unpickling can execute arbitrary code; only use on trusted files.
    with open(file_path, "rb") as handle:
        return pickle.load(handle)
def load_file(file_path):
    """Load a data file, picking the loader from the file extension.

    Args:
        file_path: Path to the file, as a ``str`` or any ``os.PathLike``
            object (for example ``pathlib.Path``).

    Returns:
        The object produced by ``load_json`` for ``.json`` files or by
        ``load_pickle`` for ``.pkl`` files.

    Raises:
        ValueError: If the path ends with neither ``.json`` nor ``.pkl``.
    """
    # os.fspath returns str unchanged and unwraps PathLike objects, so the
    # suffix checks below also work for pathlib.Path arguments.
    path_str = os.fspath(file_path)
    if path_str.endswith(".json"):
        return load_json(path_str)
    if path_str.endswith(".pkl"):
        return load_pickle(path_str)
    raise ValueError(f"Unsupported file format: {file_path}")

In [5]:
# Directory containing the raw data files. Set the GPL_TWITTER_RAW_DATA
# environment variable to run on another machine; when it is unset the
# original local path is used, so existing behavior is unchanged.
raw_data_path = os.environ.get(
    "GPL_TWITTER_RAW_DATA",
    "/media/mtybilly/My Passport1/Program/GPL-Twitter/Data/TwitterData/raw",
)

# Edge records and user labels come from JSON files.
raw_edges = load_file(osp.join(raw_data_path, "edges.json"))
user_labels = load_file(osp.join(raw_data_path, "labels.json"))

# Pickled embedding tables (presumably keyed by node id -- TODO confirm).
keyword_dict = load_file(osp.join(raw_data_path, "keyword_embeddings.pkl"))
tweet_dict = load_file(osp.join(raw_data_path, "tweet_embeddings.pkl"))
user_dict = load_file(osp.join(raw_data_path, "user_embeddings.pkl"))

In [6]:
# Inspect the loaded edge records; each one is a dict with the keys
# 'source_id', 'relation' and 'target_id'.
raw_edges

[{'source_id': 'user_477',
  'relation': 'user-follower-user',
  'target_id': 'user_2824'},
 {'source_id': 'user_22398',
  'relation': 'user-follower-user',
  'target_id': 'user_2824'},
 {'source_id': 'user_15158',
  'relation': 'user-follower-user',
  'target_id': 'user_2824'},
 {'source_id': 'user_19855',
  'relation': 'user-follower-user',
  'target_id': 'user_2824'},
 {'source_id': 'user_2089',
  'relation': 'user-follower-user',
  'target_id': 'user_2824'},
 {'source_id': 'user_9468',
  'relation': 'user-follower-user',
  'target_id': 'user_2824'},
 {'source_id': 'user_2244',
  'relation': 'user-follower-user',
  'target_id': 'user_2824'},
 {'source_id': 'user_22686',
  'relation': 'user-follower-user',
  'target_id': 'user_2824'},
 {'source_id': 'user_9151',
  'relation': 'user-follower-user',
  'target_id': 'user_2824'},
 {'source_id': 'user_16095',
  'relation': 'user-follower-user',
  'target_id': 'user_2824'},
 {'source_id': 'user_3862',
  'relation': 'user-follower-user',
  

In [8]:
from easydict import EasyDict
from collections import defaultdict

In [12]:
# Collect the distinct relation names that occur in the edge list.
relations_types = set(record["relation"] for record in raw_edges)
relations_types

{'tweet-include-keyword',
 'tweet-tag-keyword',
 'user-engage-tweet',
 'user-follower-user',
 'user-following-user',
 'user-post-tweet',
 'user-profile-keyword'}

In [19]:
# One zero-initialised relation counter per labelled user.
user_stat = {uid: defaultdict(int) for uid in user_labels}

In [21]:
# Rebuild the counters in this cell so that re-running it does not add the
# same edges on top of counts left over from a previous execution.
user_stat = {user_id: defaultdict(int) for user_id in user_labels}

for edge in raw_edges:
    source_id, relation, target_id = edge["source_id"], edge["relation"], edge["target_id"]
    # Credit the relation to every endpoint that is a user node, so a
    # user-to-user edge is counted once for each of its two users.
    for node_id in (source_id, target_id):
        if node_id.startswith("user_"):
            user_stat[node_id][relation] += 1

In [27]:
# Group user ids by label; keys appear in order of first occurrence.
class_members = {}
for user_id, label in user_labels.items():
    class_members.setdefault(label, []).append(user_id)

In [28]:
# Report how many users fall under each label.
members_per_class = {name: len(ids) for name, ids in class_members.items()}
print(members_per_class)

{'Buyer': 2142, 'Related': 668, 'Seller': 627, 'Negative': 25402}


In [36]:
# Fixed seed so the down-sampled "Negative" subset is identical on every run.
SAMPLING_SEED = 42
# Keep one out of every NEGATIVE_DOWNSAMPLE_FACTOR "Negative" users.
NEGATIVE_DOWNSAMPLE_FACTOR = 3

sampling_rng = np.random.default_rng(SAMPLING_SEED)

selected_users = {}
class_size = {}
for label, members in class_members.items():
    if label != "Negative":
        # Every other class is kept in full.
        selected_users[label] = members
    else:
        # Sample without replacement and convert back to a plain list so
        # every entry of selected_users has the same type.
        selected_users[label] = sampling_rng.choice(
            members, len(members) // NEGATIVE_DOWNSAMPLE_FACTOR, replace=False
        ).tolist()

In [37]:
# Report the per-label user counts after down-sampling.
selected_counts = dict(zip(selected_users, map(len, selected_users.values())))
print(selected_counts)

{'Buyer': 2142, 'Related': 668, 'Seller': 627, 'Negative': 8467}
