Data information


In [9]:
import pandas as pd
import json
import numpy as np

edge_path = "/kaggle/input/deezer-data/deezer_europe/deezer_europe_edges.csv"
json_path = "/kaggle/input/deezer-data/deezer_europe/deezer_europe_features.json"

# ---------- Edge-list info ----------
# Basic size statistics of the edge list: number of rows and number of
# distinct node ids appearing in either endpoint column.
edges_df = pd.read_csv(edge_path)
n_edges = len(edges_df)
nodes_from_edges = set(edges_df['node_1']).union(edges_df['node_2'])
n_nodes_edges = len(nodes_from_edges)

print("EDGELIST:")
print(f"Number of edges: {n_edges}")
print(f"Number of nodes: {n_nodes_edges}")

self_loops = edges_df[edges_df['node_1'] == edges_df['node_2']]

# The rest of this notebook treats the graph as undirected, so (u, v) and
# (v, u) describe the same edge. Sort the two endpoints of every row before
# looking for duplicates; a plain column-wise `duplicated` would miss
# repeats that are stored in the reverse orientation.
canonical_pairs = pd.DataFrame(
    np.sort(edges_df[['node_1', 'node_2']].to_numpy(), axis=1),
    columns=['node_1', 'node_2'],
)
duplicate_edges = canonical_pairs.duplicated()
print(f"- Self-loops: {len(self_loops)}")
print(f"- Duplicate edges: {duplicate_edges.sum()}")

# --- JSON FEATURES info ---
# The JSON maps a node id (string key) to a list of feature ids.
with open(json_path, "r") as f:
    features_dict = json.load(f)

n_json_nodes = len(features_dict)
feature_ids = set()
for feats in features_dict.values():
    feature_ids.update(feats)
n_features = len(feature_ids)

# Nodes whose feature list is empty (keys are kept as the JSON strings).
empty_feature_nodes = [uid for uid, feats in features_dict.items() if len(feats) == 0]
print("\nFEATURES JSON:")
print(f"- Nodes in JSON: {n_json_nodes}")
print(f"- Total distinct feature ids: {n_features}")
print(f"- Nodes with no features: {len(empty_feature_nodes)}")
if len(empty_feature_nodes) > 0:
    print(f"  (Sample: {empty_feature_nodes[:10]})")

# --- Overlap between nodes ---
# JSON keys are strings while the edge list holds integers: convert once and
# reuse the resulting set for all three comparisons.
json_nodes = set(map(int, features_dict.keys()))
shared_nodes = json_nodes & nodes_from_edges
print("\nOVERLAP Edgelist <-> Features JSON:")
print(f"- Common nodes: {len(shared_nodes)}")
print(f"- Nodes only in Edgelist: {len(nodes_from_edges - json_nodes)}")
print(f"- Nodes only in Features JSON: {len(json_nodes - nodes_from_edges)}")


EDGELIST:
Number of edges: 92752
Number of nodes: 28281
- Self-loops: 0
- Duplicate edges: 0

FEATURES JSON:
- Nodes in JSON: 28281
- Total distinct feature ids: 30978
- Nodes with no features: 6159
  (Sample: ['4', '6', '7', '8', '9', '11', '12', '35', '36', '38'])

OVERLAP Edgelist <-> Features JSON:
- Common nodes: 28281
- Nodes only in Edgelist: 0
- Nodes only in Features JSON: 0



**Data splitting for embedding generation**

In [1]:
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
import random

# 1. Read the edge list and represent every row as a (node_1, node_2) tuple
edges_df = pd.read_csv("/kaggle/input/deezer-data/deezer_europe/deezer_europe_edges.csv")
edges = [(u, v) for u, v in zip(edges_df['node_1'], edges_df['node_2'])]

# 2. Hold out 20% of the positive edges for testing (fixed seed for the split)
train_edges, test_edges = train_test_split(
    edges,
    test_size=0.2,
    random_state=42,
)

# --- Persist both positive splits as two-column CSV files ---
edge_columns = ['node_1', 'node_2']
train_pos_df = pd.DataFrame(train_edges, columns=edge_columns)
test_pos_df = pd.DataFrame(test_edges, columns=edge_columns)
train_pos_df.to_csv('train_positives.csv', index=False)
test_pos_df.to_csv('test_positives.csv', index=False)

# Report how many distinct nodes are touched by at least one training edge
unique_nodes_train = set(train_pos_df['node_1']) | set(train_pos_df['node_2'])
print(f"Number of unique nodes in train CSV: {len(unique_nodes_train)}")

def check_overlap_pairs(train_edges, test_edges):
    """
    Report node pairs that appear in both splits, ignoring orientation.

    Each edge is reduced to a sorted tuple so that (u, v) and (v, u) compare
    equal. Returns a tuple `(found, shared)` where `shared` is the set of
    sorted pairs present in both inputs and `found` is True when that set is
    non-empty.
    """
    def _canonical(pairs):
        # Orientation-free form of every pair in the iterable.
        return {tuple(sorted(pair)) for pair in pairs}

    shared = _canonical(train_edges) & _canonical(test_edges)
    return bool(shared), shared

# Verify that no undirected pair ended up on both sides of the split
overlap_exists, overlap_pairs = check_overlap_pairs(train_edges, test_edges)
if not overlap_exists:
    print("No duplicate node pairs between train and test sets.")
else:
    print(f"Overlap found! Duplicated pairs: {list(overlap_pairs)[:5]} ...")

# 3. Training graph: every node of the full edge list is added up front, but
#    only the training edges are inserted, so nodes whose edges all landed in
#    the test split remain in the graph without neighbours.
all_nodes = list({node for edge in edges for node in edge})
train_graph = nx.Graph()
train_graph.add_nodes_from(all_nodes)
train_graph.add_edges_from(train_edges)

# 4. Generate negative samples for both splits (ensuring exclusivity)
def generate_negative_edges(graph, num_samples, excluded_edges, seed=None):
    """
    Sample `num_samples` distinct node pairs that are not edges.

    A candidate (u, v) is rejected when u == v, when `graph` already has the
    edge, when the pair is in `excluded_edges` in either orientation, or when
    the pair was already drawn in either orientation. The last rule keeps the
    result free of (u, v)/(v, u) duplicates, which are the same negative for
    an undirected graph.

    Args:
        graph: object exposing `nodes()` and `has_edge(u, v)`.
        num_samples: number of negative pairs to return.
        excluded_edges: container of (u, v) tuples that must never be
            returned (membership is tested for both orientations).
        seed: optional seed. When given, a private `random.Random(seed)` is
            used so the call is reproducible on its own; when None, the
            module-level `random` state is used.

    Returns:
        A list of `num_samples` (u, v) tuples; empty if `num_samples <= 0`.

    Raises:
        ValueError: if `num_samples` exceeds the number of distinct unordered
            node pairs, in which case the sampling loop could never finish.
            Note this is only an upper bound: existing and excluded edges
            reduce the number of pairs that are actually available.
    """
    if num_samples <= 0:
        return []

    nodes = list(graph.nodes())
    max_pairs = len(nodes) * (len(nodes) - 1) // 2
    if num_samples > max_pairs:
        raise ValueError(
            f"Requested {num_samples} negative edges but only {max_pairs} "
            f"distinct node pairs exist for {len(nodes)} nodes."
        )

    rng = random if seed is None else random.Random(seed)
    neg_edges = set()
    while len(neg_edges) < num_samples:
        u = rng.choice(nodes)
        v = rng.choice(nodes)
        if u == v:
            continue
        # Same undirected pair already sampled in the opposite orientation.
        if (v, u) in neg_edges:
            continue
        if graph.has_edge(u, v) or (u, v) in excluded_edges or (v, u) in excluded_edges:
            continue
        neg_edges.add((u, v))
    return list(neg_edges)

# Seed Python's RNG so the negative samples are reproducible; the positive
# split above already uses random_state=42, so the same value is used here.
random.seed(42)

# Train negatives: any non-edge of the training graph.
train_neg_edges = generate_negative_edges(train_graph, len(train_edges), set(train_edges))

# Test negatives: must avoid every known positive edge AND every pair already
# used as a train negative, otherwise the same pair could appear in both the
# training and the test set.
test_neg_edges = generate_negative_edges(
    train_graph,
    len(test_edges),
    set(train_edges).union(test_edges, train_neg_edges),
)

# 5. Prepare datasets for training and testing
# Label 1 marks a real edge, label 0 a sampled non-edge; samples and labels
# are concatenated in the same order (positives first, then negatives).
train_positive_labels = [1] * len(train_edges)
train_negative_labels = [0] * len(train_neg_edges)
test_positive_labels = [1] * len(test_edges)
test_negative_labels = [0] * len(test_neg_edges)

train_samples = train_edges + train_neg_edges
train_labels = train_positive_labels + train_negative_labels
test_samples = test_edges + test_neg_edges
test_labels = test_positive_labels + test_negative_labels

print(f"Train samples: {len(train_samples)} (positive: {len(train_edges)}, negative: {len(train_neg_edges)})")
print(f"Test samples: {len(test_samples)} (positive: {len(test_edges)}, negative: {len(test_neg_edges)})")
print(f"Total nodes in train graph: {train_graph.number_of_nodes()}")
print(f"Total edges in train graph: {train_graph.number_of_edges()}")

Number of unique nodes in train CSV: 26895
No duplicate node pairs between train and test sets.
Train samples: 148402 (positive: 74201, negative: 74201)
Test samples: 37102 (positive: 18551, negative: 18551)
Total nodes in train graph: 28281
Total edges in train graph: 74201


**JSON handling of nodes with empty feature lists**

In [25]:
import json
import numpy as np

# Read the {user id (string) -> list of feature ids} mapping from disk
with open('/kaggle/input/deezer-data/deezer_europe/deezer_europe_features.json', 'r') as f:
    user_features_dict = json.load(f)

# One matrix row per entry of the JSON
num_users = len(user_features_dict)

# Largest feature id over all non-empty lists (0 when there is none);
# columns then cover ids 0..max_feature_id
max_feature_id = max(
    [0] + [max(features) for features in user_features_dict.values() if features]
)
num_features = max_feature_id + 1

# Dense multi-hot matrix, all zeros to start with
X = np.zeros((num_users, num_features), dtype=np.float32)

# Switch on the columns listed for each user. The row index is the user id
# itself, so ids must lie in [0, num_users). Users with an empty list keep an
# all-zero row.
for user_str, features in user_features_dict.items():
    user_idx = int(user_str)
    if features:
        X[user_idx, features] = 1.0

print(f"Feature matrix shape: {X.shape}")
print(f"Feature vector of user 0: {X[0]}")


Feature matrix shape: (28281, 31241)
Feature vector of user 0: [0. 0. 0. ... 0. 0. 0.]


In [6]:
import json
import numpy as np
import torch
from sklearn.preprocessing import MultiLabelBinarizer

# 1. Read the raw {node id (string) -> feature id list} mapping
with open('/kaggle/input/deezer-data/deezer_europe/deezer_europe_features.json') as f:
    raw_data = json.load(f)

# 2. Integer node ids in ascending order, and the same mapping keyed by int
node_ids = sorted(map(int, raw_data))
node_features = {int(key): feats for key, feats in raw_data.items()}

# One label list per node, following node_ids order. Feature ids are turned
# into strings, and a node without any feature gets the single placeholder
# label '<NO_FEATURE>' so that its row is not all zeros.
all_features = [
    [str(feature) for feature in node_features[nid]] or ['<NO_FEATURE>']
    for nid in node_ids
]

# Multi-hot encode: one column per distinct label (placeholder included)
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(all_features)

x = torch.tensor(X, dtype=torch.float)
print("x.shape =", x.shape)

x.shape = torch.Size([28281, 30979])


In [4]:
# Fraction of entries of x that are zero: x holds only 0/1 values, so its sum
# is the number of non-zero entries.
active_entries = x.sum().item()
sparsity = 1.0 - (active_entries / x.numel())
print(f"Sparsity: {sparsity:.4f}")  # e.g., > 0.99


Sparsity: 0.9989


In [3]:
import pandas as pd
# Every node id that appears in the full edge list
edges_df = pd.read_csv('/kaggle/input/deezer-data/deezer_europe/deezer_europe_edges.csv')
all_nodes = set(edges_df['node_1']).union(edges_df['node_2'])

# Every node id that appears in at least one saved training edge
train_pos_df = pd.read_csv('/kaggle/working/train_positives.csv')
train_nodes = set(train_pos_df['node_1']).union(train_pos_df['node_2'])

# Nodes of the full graph that no training edge touches
missing_in_train = all_nodes.difference(train_nodes)

print(f"# nodes in full edgelist: {len(all_nodes)}")
print(f"# nodes in train_pos_df: {len(train_nodes)}")
print(f"# nodes missing from train_pos_df: {len(missing_in_train)}")
print("Sample missing nodes:", list(missing_in_train)[:10])


# nodes in full edgelist: 28281
# nodes in train_pos_df: 26895
# nodes missing from train_pos_df: 1386
Sample missing nodes: [16387, 24579, 16393, 15, 8211, 16405, 27, 16415, 16419, 41]
