In [1]:
import pandas as pd
import networkx as nx
from datetime import datetime, timedelta
import os
from graphprocessor import GraphProcessor
import numpy as np
from IMutil import IMutil

dataset:  "https://snap.stanford.edu/data/email-Eu-core-temporal.html" - directed graphs
Nodes	309
Temporal Edges	61046
Edges in static graph	3031
Time span	803 days

In [13]:
dir_path = 'C:\\Priyanka\\dynamic-infmax-gnn-lstm\\data'
dataset = "\\email-Eu-core-temporal-Dept1"
filename= "\\email-Eu-core-temporal-Dept1.txt"

In [14]:
file_path = dir_path + dataset + filename
# Assume the dataset is in a CSV file with columns: 'source', 'target', 'time'
df = pd.read_csv(file_path,  sep=" ", header=None, names=["SRC", "DST", "timestamp"])

In [15]:
# Find the minimum node number
min_node_number = min(df["SRC"].min(), df["DST"].min())
min_node_number

0

In [16]:
# Find the minimum node number
max_node_number = max(df["SRC"].max(), df["DST"].max())
max_node_number

319

In [None]:
# Get the complete set of nodes
all_nodes = range(max_node_number + 1)
all_nodes


range(0, 320)

In [24]:
# Convert timestamp to datetime for easy manipulation
df['time'] = pd.to_datetime(df['timestamp'], unit='s')

In [25]:
# Add a 'month' column to group data by year and month
df['month'] = df['time'].dt.to_period('M')

In [26]:
# Group edges by month
monthly_snapshots = df.groupby('month')

# Print the size (number of edges) for each group
group_sizes = monthly_snapshots.size()
print(group_sizes)


month
1970-01    3115
1970-02    2639
1970-03    1732
1970-04    3360
1970-05    3440
1970-06    3782
1970-07    3574
1970-08    5180
1970-09    2957
1970-10    1955
1970-11    3115
1970-12    4019
1971-01    5032
1971-02    1438
1971-03    4297
1971-04    4819
1971-05    4552
1971-06    1651
1972-03     389
Freq: M, dtype: int64


In [27]:
# Create output directory if it doesn't exist
output_dir = dir_path + dataset + "\\graphs\\"
os.makedirs(output_dir, exist_ok=True)

# Create and save graphs with sequential names
for i, (month, group) in enumerate(monthly_snapshots):
    # Create a directed graph for the current month
    G = nx.DiGraph()
    
    # Add all nodes to ensure consistency across snapshots
    G.add_nodes_from(all_nodes)
    
    # Add edges for the current month's snapshot
    G.add_edges_from(zip(group['SRC'], group['DST']))

    # Save the graph with a sequential name
    filename = os.path.join(output_dir, f'snapshot_{i}.gpickle')
    nx.write_gpickle(G, filename)

    print(f"Graph for {month} saved as {filename}. Nodes: {G.number_of_nodes()}, Edges: {G.number_of_edges()}")


Graph for 1970-01 saved as C:\Priyanka\dynamic-infmax-gnn-lstm\data\email-Eu-core-temporal-Dept1\graphs\snapshot_0.gpickle. Nodes: 320, Edges: 820
Graph for 1970-02 saved as C:\Priyanka\dynamic-infmax-gnn-lstm\data\email-Eu-core-temporal-Dept1\graphs\snapshot_1.gpickle. Nodes: 320, Edges: 773
Graph for 1970-03 saved as C:\Priyanka\dynamic-infmax-gnn-lstm\data\email-Eu-core-temporal-Dept1\graphs\snapshot_2.gpickle. Nodes: 320, Edges: 595
Graph for 1970-04 saved as C:\Priyanka\dynamic-infmax-gnn-lstm\data\email-Eu-core-temporal-Dept1\graphs\snapshot_3.gpickle. Nodes: 320, Edges: 873
Graph for 1970-05 saved as C:\Priyanka\dynamic-infmax-gnn-lstm\data\email-Eu-core-temporal-Dept1\graphs\snapshot_4.gpickle. Nodes: 320, Edges: 830
Graph for 1970-06 saved as C:\Priyanka\dynamic-infmax-gnn-lstm\data\email-Eu-core-temporal-Dept1\graphs\snapshot_5.gpickle. Nodes: 320, Edges: 831
Graph for 1970-07 saved as C:\Priyanka\dynamic-infmax-gnn-lstm\data\email-Eu-core-temporal-Dept1\graphs\snapshot_6.gpi

In [34]:
self_loops = [(u, v) for u, v in zip(group['SRC'], group['DST']) if u == v]
print(f"Self-loops: {len(self_loops)}")

Self-loops: 0


In [None]:
duplicates = len(group) - len(set(zip(group['SRC'], group['DST'])))
print(f"Number of duplicate edges: {duplicates}")

Number of duplicate edges: 141


: 

In [28]:
def add_labels_and_features(G, top_percentile=30, uniinfweight=1.0):
    """
    Add IFC scores, labels, and additional features to each node in the graph.

    Parameters:
        G (networkx.Graph): Input graph.
        top_percentile (float): Top percentile for labeling nodes as `1`.
        uniinfweight (float): Weight factor for IFC score calculation.

    Returns:
        networkx.Graph: Graph with added features and labels.
    """
    # Calculate IFC scores
    ifc_scores = IMutil.calculate_ifc_score(G, uniinfweight)

    # Sort nodes by IFC score
    sorted_nodes = sorted(ifc_scores.items(), key=lambda x: x[1], reverse=True)

    # Determine the cutoff for top x percentile
    num_top_nodes = int(len(sorted_nodes) * top_percentile / 100)
    top_nodes = {node for node, _ in sorted_nodes[:num_top_nodes]}

    # Add features and labels to nodes
    for node in G.nodes:
        G.nodes[node]["ifc_score"] = ifc_scores[node]  # IFC score
        G.nodes[node]["label"] = 1 if node in top_nodes else 0  # Label
        G.nodes[node]["features"] = [
            G.degree(node),  # Degree
            nx.average_neighbor_degree(G)[node],  # Average neighbor degree
            G.nodes[node].get("existing_feature", 0),  # Placeholder for other features
        ]

    return G

In [29]:
def process_and_save_graphs(input_dir, output_dir, top_percentile=30, uniinfweight=1.0):
    """
    Process all graphs in the input directory:
    - Calculate IFC scores
    - Add labels based on top percentile
    - Add features

    Parameters:
        input_dir (str): Directory containing input graphs.
        output_dir (str): Directory to save processed graphs.
        top_percentile (float): Top percentile for labeling nodes as `1`.
        uniinfweight (float): Weight factor for IFC score calculation.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for file_name in os.listdir(input_dir):
        if file_name.endswith(".gpickle"):
            # Load graph
            graph_path = os.path.join(input_dir, file_name)
            G = nx.read_gpickle(graph_path)

            # Add features and labels
            G = add_labels_and_features(G, top_percentile, uniinfweight)

            # Save the modified graph
            output_path = os.path.join(output_dir, file_name)
            nx.write_gpickle(G, output_path)
            print(f"Processed and saved graph: {output_path}")


In [30]:
# Example usage
input_dir = "C:\\Priyanka\\dynamic-infmax-gnn-lstm\\data\\email-Eu-core-temporal-Dept1\\graphs\\"
output_dir = "C:\\Priyanka\\dynamic-infmax-gnn-lstm\\data\\email-Eu-core-temporal-Dept1\\graphs-lables\\"


In [31]:
# Call the function to add features and labels to graphs and save them
process_and_save_graphs(input_dir, output_dir, top_percentile=30, uniinfweight=1.0)

Processed and saved graph: C:\Priyanka\dynamic-infmax-gnn-lstm\data\email-Eu-core-temporal-Dept1\graphs-lables\snapshot_0.gpickle
Processed and saved graph: C:\Priyanka\dynamic-infmax-gnn-lstm\data\email-Eu-core-temporal-Dept1\graphs-lables\snapshot_1.gpickle
Processed and saved graph: C:\Priyanka\dynamic-infmax-gnn-lstm\data\email-Eu-core-temporal-Dept1\graphs-lables\snapshot_10.gpickle
Processed and saved graph: C:\Priyanka\dynamic-infmax-gnn-lstm\data\email-Eu-core-temporal-Dept1\graphs-lables\snapshot_11.gpickle
Processed and saved graph: C:\Priyanka\dynamic-infmax-gnn-lstm\data\email-Eu-core-temporal-Dept1\graphs-lables\snapshot_12.gpickle
Processed and saved graph: C:\Priyanka\dynamic-infmax-gnn-lstm\data\email-Eu-core-temporal-Dept1\graphs-lables\snapshot_13.gpickle
Processed and saved graph: C:\Priyanka\dynamic-infmax-gnn-lstm\data\email-Eu-core-temporal-Dept1\graphs-lables\snapshot_14.gpickle
Processed and saved graph: C:\Priyanka\dynamic-infmax-gnn-lstm\data\email-Eu-core-tem