# Coauthorship and cotitation

Cocitation: citeseer, cora, pubmed
Coauthorship: cora, deblp

Just one hypergraph per folder

In [28]:
"""Notebook to analyze the structure of coauthorship and cocitation data.

Mimicks Hypergraph_Encodings/scripts/compute_encodings/analyze_cc_ca_data.py

This script loads and examines the data structure for:
- coauthorship: cora, dblp
- cocitation: citeseer, cora, pubmed

It prints examples and statistics to understand the data format.

These files have:

Features: Sparse matrices (scipy.sparse.csr.csr_matrix)
Hypergraph: Dictionary with hyperedges
Labels: List of node labels
Splits: Dictionary with 'train' and 'test' splits (10 different splits)
"""

import os
import pickle
import numpy as np
from typing import Any
import warnings


warnings.simplefilter("ignore")


def load_pickle_file(file_path: str) -> Any:
    """Load a pickle file and return its contents.

    Args:
        file_path: Path to the pickle file

    Returns:
        Contents of the pickle file
    """
    try:
        with open(file_path, "rb") as handle:
            return pickle.load(handle)
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None
    

def analyze_dataset(dataset_path: str, dataset_name: str) -> None:
    """Analyze a single dataset and print its structure.

    Args:
        dataset_path: Path to the dataset directory
        dataset_name: Name of the dataset
    """
    print(f"\n{'='*60}")
    print(f"ANALYZING DATASET: {dataset_name.upper()}")
    print(f"{'='*60}")

    # Load features
    features_path = os.path.join(dataset_path, "features.pickle")
    features = load_pickle_file(features_path)
    if features is not None:
        print(f"Features type: {type(features)}")
        if isinstance(features, np.ndarray):
            print(f"Features shape: {features.shape}")
            print(f"Features dtype: {features.dtype}")
            print(f"Features sample (first 5 elements): {features[:5]}")
        else:
            print(f"Features content (first 100 chars): {str(features)[:100]}...")

    # Load hypergraph
    hypergraph_path = os.path.join(dataset_path, "hypergraph.pickle")
    hypergraph = load_pickle_file(hypergraph_path)
    if hypergraph is not None:
        print(f"\nHypergraph type: {type(hypergraph)}")
        if isinstance(hypergraph, dict):
            print(f"Number of hyperedges: {len(hypergraph)}")
            print("Sample hyperedges (first 3):")
            for i, (key, value) in enumerate(list(hypergraph.items())[:3]):
                print(f"  {key}: {value}")

            # Analyze hyperedge sizes
            edge_sizes = [len(edge) for edge in hypergraph.values()]
            print("Hyperedge size statistics:")
            print(f"  Min: {min(edge_sizes)}")
            print(f"  Max: {max(edge_sizes)}")
            print(f"  Mean: {np.mean(edge_sizes):.2f}")
            print(f"  Median: {np.median(edge_sizes):.2f}")

            # Get all unique nodes
            all_nodes = set()
            for edge in hypergraph.values():
                all_nodes.update(edge)
            print(f"Total unique nodes: {len(all_nodes)}")

            # dict_hypergraph = {
            #     "hypergraph": hypergraph,
            #     "n": len(all_nodes),
            #     "features": [[1.0] for _ in range(len(all_nodes))],
            #     "labels": [[1] for _ in range(len(all_nodes))],
            # }

            # # do a clique expansion and plot
            # expanded_graph = compute_clique_expansion(dict_hypergraph)
            # G = nx.Graph()
            # edge_index = expanded_graph.edge_index.t().tolist()
            # G.add_edges_from(edge_index)
            # pos = nx.spring_layout(G)
            # nx.draw(G, pos, with_labels=True, node_color="lightblue", node_size=500, font_size=12, font_weight="bold")
            # plt.title("Clique Expansion", fontsize=20)


    # Load labels
    labels_path = os.path.join(dataset_path, "labels.pickle")
    labels = load_pickle_file(labels_path)
    if labels is not None:
        print(f"\nLabels type: {type(labels)}")
        if isinstance(labels, np.ndarray):
            print(f"Labels shape: {labels.shape}")
            print(f"Labels dtype: {labels.dtype}")
            print(f"Labels sample (first 10): {labels[:10]}")
            print(f"Unique labels: {np.unique(labels)}")
            print(f"Number of classes: {len(np.unique(labels))}")
        else:
            print(f"Labels content (first 100 chars): {str(labels)[:100]}...")

    # Load splits
    splits_dir = os.path.join(dataset_path, "splits")
    if os.path.exists(splits_dir):
        print(f"\nSplits directory: {splits_dir}")
        split_files = [f for f in os.listdir(splits_dir) if f.endswith(".pickle")]
        print(f"Number of split files: {len(split_files)}")

        # Analyze first split
        if split_files:
            first_split_path = os.path.join(splits_dir, split_files[0])
            split_data = load_pickle_file(first_split_path)
            if split_data is not None:
                print(f"First split ({split_files[0]}) type: {type(split_data)}")
                if isinstance(split_data, dict):
                    print(f"Split keys: {list(split_data.keys())}")
                    for key, value in split_data.items():
                        if isinstance(value, (list, np.ndarray)):
                            print(f"  {key}: {len(value)} elements")
                        else:
                            print(f"  {key}: {type(value)}")
                elif isinstance(split_data, (list, tuple)):
                    print(f"Split contains {len(split_data)} elements")
                    if len(split_data) >= 3:
                        print(f"  Train: {len(split_data[0])} elements")
                        print(f"  Val: {len(split_data[1])} elements")
                        print(f"  Test: {len(split_data[2])} elements")



In [29]:
def analsse_cc_ca_data() -> None:
    """Main function to analyze all datasets."""
    print("ANALYZING COAUTHORSHIP AND COCITATION DATASETS")
    print("=" * 60)

    data_dir = os.path.join("../data")

    print(f"Data directory: {data_dir}")

    # Analyze coauthorship datasets
    coauthorship_dir = os.path.join(data_dir, "coauthorship")
    if os.path.exists(coauthorship_dir):
        print(f"\nFound coauthorship directory: {coauthorship_dir}")
        for dataset in ["cora", "dblp"]:
            dataset_path = os.path.join(coauthorship_dir, dataset)
            if os.path.exists(dataset_path):
                analyze_dataset(dataset_path, f"coauthorship_{dataset}")
            else:
                print(f"Dataset {dataset} not found in coauthorship")
    else:
        print("Coauthorship directory not found")

    # Analyze cocitation datasets
    cocitation_dir = os.path.join(data_dir, "cocitation")
    if os.path.exists(cocitation_dir):
        print(f"\nFound cocitation directory: {cocitation_dir}")
        for dataset in ["citeseer", "cora", "pubmed"]:
            dataset_path = os.path.join(cocitation_dir, dataset)
            if os.path.exists(dataset_path):
                analyze_dataset(dataset_path, f"cocitation_{dataset}")
            else:
                print(f"Dataset {dataset} not found in cocitation")
    else:
        print("Cocitation directory not found")

analsse_cc_ca_data()

ANALYZING COAUTHORSHIP AND COCITATION DATASETS
Data directory: ../data

Found coauthorship directory: ../data/coauthorship

ANALYZING DATASET: COAUTHORSHIP_CORA
Features type: <class 'scipy.sparse._csr.csr_matrix'>
Features content (first 100 chars): <Compressed Sparse Row sparse matrix of dtype 'float32'
	with 49216 stored elements and shape (2708,...

Hypergraph type: <class 'dict'>
Number of hyperedges: 1072
Sample hyperedges (first 3):
  V Gupta: [235, 355]
  A Srinivasan: [1133, 1666, 1888]
  J Zavrel: [783, 785]
Hyperedge size statistics:
  Min: 2
  Max: 43
  Mean: 4.28
  Median: 3.00
Total unique nodes: 2388

Labels type: <class 'list'>
Labels content (first 100 chars): [3, 2, 6, 6, 5, 5, 1, 3, 3, 1, 3, 0, 5, 4, 3, 3, 6, 3, 3, 3, 1, 6, 0, 5, 6, 1, 3, 5, 3, 2, 4, 3, 4, ...

Splits directory: ../data/coauthorship/cora/splits
Number of split files: 10
First split (9.pickle) type: <class 'dict'>
Split keys: ['train', 'test']
  train: 140 elements
  test: 2568 elements

ANALYZING DAT

# graph_classification_datasets & hypegraph_classification_datasets

List of graphs (presented as a list of dicts, with graph, features, etc as key-pairs in each dict).

In [34]:
"""Notebook to analyze the structure of graph and hypergraph classification datasets.

This script loads and examines the data structure for:
- Graph classification datasets: peptidesstruct
- Hypergraph classification datasets: collab, imdb, mutag, enzymes, proteins, reddit

It prints examples and statistics to understand the data format.

These files contain lists of (hypergraph, features, labels) tuples.
"""


def analyze_classification_dataset(file_path: str, dataset_name: str) -> None:
    """Analyze a single classification dataset file and print its structure.

    Args:
        file_path: Path to the pickle file
        dataset_name: Name of the dataset
    """
    print(f"\n{'='*80}")
    print(f"ANALYZING DATASET: {dataset_name.upper()}")
    print(f"{'='*80}")
    
    # Load the dataset
    data = load_pickle_file(file_path)
    if data is None:
        print("Failed to load dataset")
        return
    
    print(f"Dataset type: {type(data)}")
    
    if isinstance(data, list):
        print(f"Number of samples: {len(data)}")
        
        if len(data) == 0:
            print("Dataset is empty")
            return
        
        # Analyze first sample
        first_sample = data[0]
        print(f"\nFirst sample type: {type(first_sample)}")
        print(f"First sample: {first_sample.keys()}")
    
    else:
        print(f"Unexpected data format: {type(data)}")


def main() -> None:
    """Main function to analyze all classification datasets."""
    print("ANALYZING GRAPH AND HYPERGRAPH CLASSIFICATION DATASETS")
    print("=" * 80)

    # Define the data directories
    graph_classification_dir = "/Users/pellegrinraphael/Desktop/Repos_GNN/Hypergraph_Encodings/data/graph_classification_datasets"
    hypergraph_classification_dir = "/Users/pellegrinraphael/Desktop/Repos_GNN/Hypergraph_Encodings/data/hypergraph_classification_datasets"

    print(f"Graph classification directory: {graph_classification_dir}")
    print(f"Hypergraph classification directory: {hypergraph_classification_dir}")

    # Analyze graph classification datasets
    if os.path.exists(graph_classification_dir):
        print(f"\n{'='*60}")
        print("ANALYZING GRAPH CLASSIFICATION DATASETS")
        print(f"{'='*60}")
        
        # Look for peptidesstruct files
        for filename in os.listdir(graph_classification_dir):
            if filename.startswith("peptidesstruct") and filename.endswith(".pickle") and "with_encodings" not in filename:
                file_path = os.path.join(graph_classification_dir, filename)
                dataset_name = filename.replace(".pickle", "")
                analyze_classification_dataset(file_path, dataset_name)
    else:
        print("Graph classification directory not found")

    # Analyze hypergraph classification datasets
    if os.path.exists(hypergraph_classification_dir):
        print(f"\n{'='*60}")
        print("ANALYZING HYPERGRAPH CLASSIFICATION DATASETS")
        print(f"{'='*60}")
        
        # Look for dataset files
        datasets = ["collab", "imdb", "mutag", "enzymes", "proteins", "reddit"]
        
        for dataset in datasets:
            # First try to find the base dataset (without encodings)
            base_file = f"{dataset}_hypergraphs.pickle"
            base_path = os.path.join(hypergraph_classification_dir, base_file)
            
            if os.path.exists(base_path):
                analyze_classification_dataset(base_path, f"{dataset}_base")
            
    else:
        print("Hypergraph classification directory not found")


if __name__ == "__main__":
    main()

ANALYZING GRAPH AND HYPERGRAPH CLASSIFICATION DATASETS
Graph classification directory: /Users/pellegrinraphael/Desktop/Repos_GNN/Hypergraph_Encodings/data/graph_classification_datasets
Hypergraph classification directory: /Users/pellegrinraphael/Desktop/Repos_GNN/Hypergraph_Encodings/data/hypergraph_classification_datasets

ANALYZING GRAPH CLASSIFICATION DATASETS

ANALYZING DATASET: PEPTIDESSTRUCT_HYPERGRAPHS_TEST
Dataset type: <class 'list'>
Number of samples: 2331

First sample type: <class 'dict'>
First sample: dict_keys(['hypergraph', 'features', 'labels', 'n'])

ANALYZING DATASET: PEPTIDESSTRUCT_HYPERGRAPHS
Dataset type: <class 'list'>
Number of samples: 15535

First sample type: <class 'dict'>
First sample: dict_keys(['hypergraph', 'features', 'labels', 'n'])

ANALYZING DATASET: PEPTIDESSTRUCT_HYPERGRAPHS_VAL
Dataset type: <class 'list'>
Number of samples: 2331

First sample type: <class 'dict'>
First sample: dict_keys(['hypergraph', 'features', 'labels', 'n'])

ANALYZING DATASET