# Convert a .txt.gz file into a .csv file

In [8]:
import pandas as pd
import gzip

# Read a gzip-compressed, whitespace-separated edge list and write it as a
# two-column CSV (node1,node2) next to the other data files.
edges = []

filepath = '../data/facebook_combined.txt.gz'
# Drop the 7-character '.txt.gz' suffix from the base name and use '.csv' instead.
filename = filepath.split('/')[-1][:-7] + '.csv'

with gzip.open(filepath, 'rt') as file:
    for line in file:
        if line.startswith('#'):  # Skip comment lines
            continue
        fields = line.split()
        # A blank (or otherwise too short) line used to abort the whole
        # conversion with a ValueError on unpacking; skip it instead.
        if len(fields) < 2:
            continue
        # Only the first two columns are node ids; extra columns are ignored.
        edges.append((int(fields[0]), int(fields[1])))

df = pd.DataFrame(edges, columns=['node1', 'node2'])
df.to_csv('../data/' + filename, index=False)

# Convert a .edges file into a .csv file

In [1]:
import pandas as pd
import re

# Parse a '.edges' file (fields separated by commas and/or whitespace,
# '%' starting a comment line) and dump its first two columns to a CSV.
filepath_input = '../data/proteins/PROTEINS-full.edges'
filepath_output = '../data/proteins/PROTEINS-full.csv'

# Compiled once: one or more commas / whitespace characters act as a separator.
delimiter = re.compile(r'[,\s]+')

edges = []
with open(filepath_input, 'rt') as f:
    for line in f:
        if not line.startswith('%'):  # '%' lines are comments
            data = delimiter.split(line.strip())
            # Keep only lines that carry at least two columns
            if len(data) >= 2:
                node1 = int(data[0])
                node2 = int(data[1])
                edges.append((node1, node2))

df = pd.DataFrame(edges, columns=['node1', 'node2'])
df.to_csv(filepath_output, index=False)

## Remove first column of CSV file

In [5]:
def remove_first_column(input_file, output_file):
    """
    Copy the second column of a two-column text file into a new file.

    Each non-empty line is split on commas when it contains one, otherwise on
    whitespace. Lines that yield fewer than two fields are dropped; for all
    other lines only the second field is written, one per output line.

    Parameters:
        input_file (str): Path to the input file (comma- or space-separated).
        output_file (str): Path of the file that receives the second column.
    """
    def second_fields(handle):
        # Lazily yield the output line for every usable input line.
        for raw in handle:
            record = raw.strip()
            if record:
                fields = record.split(',') if ',' in record else record.split()
                if len(fields) >= 2:
                    yield fields[1] + '\n'

    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        outfile.writelines(second_fields(infile))

# Example usage:
# remove_first_column("input.txt", "output.txt")
# Keep only the second column of the web-spam node-label file; the result is
# written one directory up, under ../data/.
spam_labels_path = "../data/spam/web-spam-detection.node_labels"
out = "../data/web-spam-detection.node_labels"
remove_first_column(input_file=spam_labels_path, output_file=out)

## Convert from csv to json

In [5]:
import csv
import json

def csv_to_json(input_file, output_file, key_column='node'):
    """
    Convert a CSV file with a header row into a JSON object keyed by one column.

    Every data row becomes a ``{column: value}`` mapping stored under the
    value that row has in ``key_column``. Rows sharing the same key overwrite
    each other, so only the last one is kept. The output file is written only
    after the whole input has been read successfully.

    Parameters:
        input_file (str): Path to the input CSV file.
        output_file (str): Path to the output JSON file.
        key_column (str): Name of the header column whose values become the
            JSON keys. Defaults to 'node'.

    Raises:
        KeyError: If a data row is read and ``key_column`` is not one of the
            CSV header columns.
    """
    data = {}
    # newline='' lets the csv module interpret line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(input_file, 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if key_column not in row:
                # Same exception type as a plain lookup, but say what is available.
                raise KeyError(
                    f"column {key_column!r} not found in {input_file!r}; "
                    f"available columns: {reader.fieldnames}"
                )
            data[row[key_column]] = row

    with open(output_file, 'w') as jsonfile:
        json.dump(data, jsonfile, indent=4)
    

# NOTE(review): PROTEINS-full.csv as written by the '.edges' conversion above
# has the columns 'node1' and 'node2', so this call raises a KeyError because
# there is no 'node' column to key the JSON on.
input_file = "../data/proteins/PROTEINS-full.csv"
output_file = "../data/proteins/PROTEINS-full.json"
csv_to_json(input_file=input_file, output_file=output_file)

KeyError: 'node'

### Convert Attention Walk embeddings from csv to npy
NOTE: the first column must be removed first, since it holds the node name (a large integer) rather than an embedding value.

In [15]:
import numpy as np
import os

def remove_first_column_in_place(file_path):
    """
    Strip the header row and the first column from a delimited file, in place.

    The first line must be a header whose first field is "id"; it is dropped.
    Every following non-empty line is split on commas when it contains one,
    otherwise on whitespace, and rewritten as its remaining fields joined by
    commas. Lines with a single field are dropped. The result is written to
    ``file_path + ".tmp"`` and then moved over the original file, so the
    original is only replaced once the whole file has been processed.

    Parameters:
        file_path (str): Path to the input file with multiple columns
            (comma-separated or space-separated).

    Raises:
        ValueError: If the first line does not start with an "id" field, which
            is taken to mean the file was already processed. The original file
            is left untouched and no temporary file is left behind.
    """
    temp_file_path = file_path + ".tmp"  # Temporary file path

    try:
        with open(file_path, 'r') as infile, open(temp_file_path, 'w') as outfile:
            for i, line in enumerate(infile):
                if i == 0:  # first row is the header
                    header = line.split(',') if ',' in line else line.split()
                    # An empty header list (blank first line) is rejected the
                    # same way instead of failing with an IndexError.
                    if not header or header[0] != "id":
                        raise ValueError("The .csv file was already preprocessed, aborting operation to avoid delete meaningful data")
                    continue
                line = line.strip()
                if not line:  # Skip empty lines
                    continue

                # Split the line and keep all columns except the first one
                columns = line.split(',') if ',' in line else line.split()
                if len(columns) > 1:
                    outfile.write(','.join(columns[1:]) + '\n')
    except BaseException:
        # Do not leave a partial temporary file next to the data on failure.
        try:
            os.remove(temp_file_path)
        except FileNotFoundError:
            pass
        raise

    # Replace the original file with the temporary file
    os.replace(temp_file_path, file_path)

# Drop the header row and the leading id column of the embeddings CSV, then
# load what is left as a numeric array and store it in NumPy's .npy format.
csv_file = "../result/embeddings_proteins_AW_128.csv"
remove_first_column_in_place(file_path=csv_file)

vec = np.loadtxt(csv_file, delimiter=',')
# NOTE(review): `folder` is not defined in the visible part of this file and
# `csv_file` already contains its directory - confirm where `folder` comes
# from, otherwise this line raises NameError.
npy_path = "{}.npy".format(folder + csv_file[:-4])
np.save(npy_path, vec)