# Convert a .txt.gz file into a .csv file

In [8]:
import pandas as pd
import gzip

filepath = '../data/facebook_combined.txt.gz'
# Base name of the archive without its 7-character '.txt.gz' suffix, plus '.csv'.
filename = filepath.split('/')[-1][:-7] + '.csv'

# One (node1, node2) integer pair per line; lines starting with '#' are comments.
edges = []
with gzip.open(filepath, 'rt') as file:
    for line in file:
        if not line.startswith('#'):
            node1, node2 = (int(token) for token in line.strip().split())
            edges.append((node1, node2))

# Write the edge list next to the other datasets, without the pandas index column.
df = pd.DataFrame(edges, columns=['node1', 'node2'])
df.to_csv(f'../data/{filename}', index=False)

## Remove first column of CSV file

In [5]:
def remove_first_column(input_file, output_file):
    """
    Copies the second column of a delimited text file into a new file.

    Each non-empty line is split on commas when it contains one, otherwise on
    whitespace. Lines with fewer than two fields are skipped; for lines with
    more than two fields only the second one is kept.

    Parameters:
        input_file (str): Path to the input file (comma-separated or space-separated).
        output_file (str): Path to the output file; receives one second-column value per line.
    """
    with open(input_file, 'r') as source, open(output_file, 'w') as target:
        for raw_line in source:
            record = raw_line.strip()
            if record:
                # A comma anywhere in the line selects comma splitting.
                fields = record.split(',') if ',' in record else record.split()
                if len(fields) > 1:
                    target.write(f"{fields[1]}\n")

# Keep only the second column of the spam node-labels file and store the
# result directly under ../data.
spam_labels_path = "../data/spam/web-spam-detection.node_labels"
out = "../data/web-spam-detection.node_labels"
remove_first_column(input_file=spam_labels_path, output_file=out)

## Convert from csv to json

In [5]:
import csv
import json

def csv_to_json(input_file, output_file, key_column='node'):
    """
    Converts a CSV file to a JSON file.

    The first CSV row is used as the header. Every following row becomes a JSON
    object mapping column names to (string) values, stored under the value of
    its `key_column` field. When several rows share the same key, the last one wins.

    Parameters:
        input_file (str): Path to the input CSV file.
        output_file (str): Path to the output JSON file.
        key_column (str): Name of the column whose value is used as the JSON key
            (default: 'node').

    Raises:
        KeyError: If a row has no `key_column` field. The message lists the columns
            found in the header. The output file is not written in that case.
    """
    data = {}
    # newline='' lets the csv module do its own line-ending handling.
    with open(input_file, 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if key_column not in row:
                raise KeyError(
                    f"column '{key_column}' not found in {input_file}; "
                    f"available columns: {reader.fieldnames}"
                )
            data[row[key_column]] = row

    # Written only after the whole input was read successfully.
    with open(output_file, 'w') as jsonfile:
        jsonfile.write(json.dumps(data, indent=4))
    

# NOTE(review): the recorded output of this cell is "KeyError: 'node'", i.e. this
# CSV apparently has no 'node' column -- confirm the key column before re-running.
input_file = "../data/proteins/PROTEINS-full.csv"
output_file = "../data/proteins/PROTEINS-full.json"
csv_to_json(input_file=input_file, output_file=output_file)

KeyError: 'node'

### Convert Attention Walk embeddings from csv to npy
NOTE: the first column must be removed, because it holds the node name (a large integer).

In [19]:
import numpy as np
import os

def remove_first_column_in_place(file_path):
    """
    Strips the header row and the first column from a .csv file, in place.

    This prepares the .csv produced by Attention Walk so that it can be loaded as
    a plain numeric array: the first row is treated as a header whose first field
    must be "id"; it is dropped, and the first field of every following line is removed.
    Empty lines and lines with a single field are skipped. Fields are split on commas
    when the line contains one, otherwise on whitespace, and are always written back
    comma-separated.

    If the first line does not start with an "id" field (or the file has no usable
    first line), the file is considered already preprocessed: a message is printed
    and nothing is modified or created.

    The result is written to `file_path + ".tmp"` first and then moved over the
    original file; the temporary file is removed if writing fails.

    Parameters:
        file_path (str): Path to the input file with multiple columns (comma-separated or space-separated).
    """
    temp_file_path = file_path + ".tmp"  # Temporary file path

    with open(file_path, 'r') as infile:
        # Inspect the header BEFORE creating the temporary file, so that the
        # "already preprocessed" path leaves no stray .tmp file behind.
        header = infile.readline()
        header_fields = header.split(',') if ',' in header else header.split()
        if not header_fields or header_fields[0] != "id":
            print("The .csv file appears to be already preprocessed, no modifications done")
            return

        try:
            with open(temp_file_path, 'w') as outfile:
                for line in infile:
                    line = line.strip()
                    if not line:  # Skip empty lines
                        continue

                    # Split the line and keep all columns except the first one
                    columns = line.split(',') if ',' in line else line.split()
                    if len(columns) > 1:
                        outfile.write(','.join(columns[1:]) + '\n')
        except BaseException:
            # Do not leave a partially written temporary file on disk.
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)
            raise

    # Replace the original file with the temporary file (both are closed here)
    os.replace(temp_file_path, file_path)

### CHOOSE the embeddings file to convert
csv_file = "../result/embeddings_facebook_AW_128.csv"

# Drop the header row and id column (no-op if that was already done) ...
remove_first_column_in_place(csv_file)

# ... then load the remaining numeric table and store it as .npy next to the .csv
# (csv_file[:-4] removes the 4-character '.csv' suffix).
vec = np.loadtxt(csv_file, delimiter=',')
np.save(csv_file[:-4] + ".npy", vec)

The .csv file appears to be already preprocessed, no modifications done


### Copy of .edges without self-loops (spam graph)

In [6]:
def remove_self_loops(input_file, output_file):
    """
    Removes self-loops (edges of the form x,x) from an edges file and saves the result to a new file.

    Only lines made of exactly two comma-separated fields are treated as edges;
    empty lines and lines with any other number of fields are not copied.
    Whitespace around each field is ignored when comparing the two endpoints, so
    "3,3" and "3, 3" are both recognised as self-loops. Kept lines are written
    back unchanged (apart from stripped leading/trailing whitespace).

    Parameters:
        input_file (str): Path to the input edges file (e.g., .edges).
        output_file (str): Path to the output edges file without self-loops.
    """
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            line = line.strip()
            if not line:  # Skip empty lines
                continue

            # Endpoints are compared without surrounding whitespace, otherwise
            # "3, 3" would look like an edge between "3" and " 3".
            nodes = [field.strip() for field in line.split(',')]
            if len(nodes) == 2 and nodes[0] != nodes[1]:
                outfile.write(line + '\n')

# Write a self-loop-free copy of the spam graph next to the original .edges file.
input_path = "../data/spam/web-spam-detection.edges"  # original .edges file
output_path = "../data/spam/web-spam-detection-no-self-loops.edges"  # filtered copy
remove_self_loops(input_file=input_path, output_file=output_path)

### create .csv from NX graph loaded from .edges
Attention Walk (AW) needs a .csv file as input. To make sure AW works on the same graph as LINE and node2vec, the .csv file is created from the NetworkX graph.

In [17]:
import networkx as nx
import gzip
import re

def load_graph(path):
    """
    Builds an undirected networkx graph from a plain-text .edges file.

    Lines starting with '%' are treated as comments. Every other line is split
    on commas and/or whitespace; its first two fields are read as integer node
    ids and added as an edge (further fields are ignored, lines with fewer than
    two fields are skipped).

    Nodes are finally renamed as integers starting from 0, following the order
    in which they were added to the graph.
    """
    G = nx.Graph()
    with open(path, 'rt') as f:
        for line in f:
            if line.startswith('%'):
                # comment line
                continue
            fields = re.split(r'[,\s]+', line.strip())
            if len(fields) >= 2:
                # only the first two columns identify the edge
                G.add_edge(int(fields[0]), int(fields[1]))
    # original node id -> consecutive integer label
    mapping = dict((node, index) for index, node in enumerate(G.nodes))
    return nx.relabel_nodes(G, mapping)

def load_graph_with_gz(path):
    """
    Builds an undirected networkx graph from a gzip-compressed edge list (.txt.gz).

    Each data line must hold exactly two whitespace-separated integer node ids.
    Blank lines and comment lines (starting with '#' or '%') are skipped instead
    of aborting the load.

    Nodes are renamed as integers, starting from 0, following the order in which
    they were added to the graph.

    Raises:
        ValueError: If a data line does not consist of exactly two integers.
    """
    G = nx.Graph()
    with gzip.open(path, 'rt') as f:
        for line in f:
            line = line.strip()
            # Blank or comment lines carry no edge; unpacking them would fail.
            if not line or line.startswith(('#', '%')):
                continue
            node1, node2 = map(int, line.split())
            G.add_edge(node1, node2)
    mapping = {node: i for i, node in enumerate(G.nodes)}  # mapping original : relabeled
    G = nx.relabel_nodes(G, mapping)
    return G

def edges_to_csv(edges_file, output_csv, delimiter=",", is_txt_gz = False):
    """
    Loads an edge list as a networkx graph and saves the graph's edges as a .csv file.

    The graph is built with `load_graph_with_gz` when `is_txt_gz` is True and with
    `load_graph` otherwise, so the written node ids are the ones assigned by the
    loader, not the ones found in the input file. The output starts with a
    "node1<delimiter>node2" header ("node1,node2" with the default delimiter),
    followed by one edge per line.

    Parameters:
        edges_file (str): Path to the input edge list (.edges, or .txt.gz when `is_txt_gz` is True).
        output_csv (str): Path to the output .csv file.
        delimiter (str): Delimiter written between the two columns of the output
            file, header included (default: ',').
        is_txt_gz (bool): If True, `edges_file` is read as a gzip-compressed .txt.gz file.

    Returns:
        nx.Graph: The graph object created from the input file.
    """
    # Pick the loader that matches the input format
    G = load_graph_with_gz(edges_file) if is_txt_gz else load_graph(edges_file)

    # The header uses the same delimiter as the rows, so the file stays parseable
    # for any delimiter choice.
    header = f"node1{delimiter}node2"
    with open(output_csv, "w") as f:
        f.write(header + "\n")  # Write the header
        for u, v in G.edges():
            f.write(f"{u}{delimiter}{v}\n")
    print(f"Graph saved to {output_csv} with header '{header}'.")
    return G
# (input edge list, output .csv) for each available dataset
facebook_paths = ("../data/facebook/facebook_combined.txt.gz", "../data/facebook/facebook_combined.csv")
citation_paths = ("../data/citation/cit-HepTh.edges", "../data/citation/cit-HepTh.csv")
biological_paths = ("../data/biological/bio-CE-CX.edges", "../data/biological/bio-CE-CX.csv")
proteins_paths = ("../data/proteins/PROTEINS-full.edges", "../data/proteins/PROTEINS-full.csv")
spam_paths = ("../data/spam/web-spam-detection.edges", "../data/spam/web-spam-detection.csv")

#### MODIFY ONLY THIS TO CHOOSE GRAPH #####
curr_paths = spam_paths
#### MODIFY ONLY THIS TO CHOOSE GRAPH #####

# Only the Facebook dataset is stored as a .txt.gz archive
is_txt_gz = curr_paths == facebook_paths
print(f"is_txt_gz = {is_txt_gz}")

# Input edge list and output .csv of the selected dataset
edges_file, output_csv = curr_paths

# Build the graph and write the .csv that Attention Walk takes as input
graph = edges_to_csv(edges_file, output_csv, delimiter=",", is_txt_gz=is_txt_gz)

# The returned `graph` object can be reused for LINE or Node2Vec
print(f"Graph has {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges.")

is_txt_gz = False
Graph saved to ../data/spam/web-spam-detection.csv with header 'node1,node2'.
Graph has 9072 nodes and 473854 edges.
