In [None]:
import os
import glob
import pandas as pd

def aggregate_experiment_results(dataset_name: str, name_filter: str = "scar"):
    """
    Aggregates experiment results from CSV files in the dataset_name's folder.
    If name_filter is provided, only files whose names contain that substring are processed.
    """
    folder_path = f"C:/Users/romai/Desktop/gnn/gnn_pu/NNIF-GNN/{dataset_name}_experimentations"
    csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
    
    # Filter files based on substring in their filename (if name_filter is provided)
    if name_filter:
        csv_files = [f for f in csv_files if name_filter in os.path.basename(f)]
    
    aggregated_results = []
    
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            
            # Check if the dataframe has exactly 5 rows
            if len(df) == 5:
                aggregated = {}
                for col in df.columns:
                    if pd.api.types.is_numeric_dtype(df[col]):
                        aggregated[col] = df[col].mean()
                    else:
                        mode_val = df[col].mode()
                        aggregated[col] = mode_val.iloc[0] if not mode_val.empty else None
                
                # Specifically add the std of the 'f1' column if it exists
                if 'f1' in df.columns:
                    aggregated['f1_std'] = df['f1'].std()
                else:
                    aggregated['f1_std'] = None
                    
                aggregated_results.append(aggregated)
        except Exception:
            # Ignore CSVs that cause an error
            pass
    
    aggregated_df = pd.DataFrame(aggregated_results)
    
    # Sort by mean F1 in descending order, if it exists
    if 'f1' in aggregated_df.columns:
        aggregated_df = aggregated_df.sort_values(by='f1', ascending=False)
    
    # Save and return the aggregated results
    output_filename = f"{dataset_name}_{name_filter}_aggregated_results.csv"
    aggregated_df.to_csv(output_filename, index=False)
    print(f"Aggregated results saved to {output_filename}")
    return aggregated_df


In [98]:
df=aggregate_experiment_results("citeseer", "scar")
df['batch_size'].head(30)

Aggregated results saved to citeseer_scar_aggregated_results.csv


119      10.0
93       10.0
125       5.0
110      10.0
146    1024.0
207    1024.0
112      10.0
111      10.0
92       10.0
97       10.0
215    1024.0
118      10.0
172    1024.0
102      10.0
103      10.0
127       5.0
189    1024.0
113      10.0
115      10.0
214    1024.0
158    1024.0
192    1024.0
186    1024.0
123      10.0
89       10.0
201    1024.0
174    1024.0
151    1024.0
121      10.0
106      10.0
Name: batch_size, dtype: float64

In [85]:
df=aggregate_experiment_results("citeseer", "sar")['rate_pairs']
df.head(10)

Aggregated results saved to citeseer_sar_aggregated_results.csv


169    10.0
193    10.0
191    10.0
187    10.0
194    10.0
167    10.0
164    10.0
111     9.0
170    10.0
189    10.0
Name: rate_pairs, dtype: float64

In [None]:
df=aggregate_experiment_results("cora", "scar")
df.head(10)

In [87]:
df=aggregate_experiment_results("cora", "sar")['rate_pairs']
df.head(10)

Aggregated results saved to cora_sar_aggregated_results.csv


25    12.0
21    12.0
17    12.0
22    14.0
23    14.0
12    10.0
18    14.0
11     4.0
8     11.0
16    12.0
Name: rate_pairs, dtype: float64

In [88]:
df=aggregate_experiment_results("pubmed","scar")['rate_pairs']
df.head(10)

Aggregated results saved to pubmed_scar_aggregated_results.csv


80     3.0
86     3.0
88     3.0
118    2.0
148    2.0
155    2.0
94     2.0
87     3.0
126    2.0
107    2.0
Name: rate_pairs, dtype: float64

In [96]:
df=aggregate_experiment_results("pubmed","sar")
df.head(10)

Aggregated results saved to pubmed_sar_aggregated_results.csv


Unnamed: 0,K,layers,hidden_channels,out_channels,norm,lr,treatment,dropout,ratio,seed,...,batch_size,rate_pairs,reliable_mini_batch,clusters,accuracy,f1,recall,precision,losses,f1_std
82,7.0,2.0,128.0,256.0,,0.00374,removal,0.0,0.256688,366.6,...,10.0,7.0,1.0,500.0,0.898798,0.869767,0.846121,0.89478,"[53.74618089199066, 51.19909358024597, 47.5940...",0.001644
54,8.0,2.0,128.0,256.0,,0.003917,removal,0.0,0.255021,366.6,...,10.0,7.0,1.0,500.0,0.89895,0.869744,0.844698,0.896323,"[55.939280450344086, 53.593082785606384, 49.92...",0.003548
102,8.0,2.0,128.0,256.0,,0.00426,removal,0.0,0.254904,366.6,...,10.0,6.0,1.0,500.0,0.898727,0.869737,0.846502,0.894288,"[55.828895926475525, 53.136748909950256, 49.03...",0.002404
61,8.0,2.0,128.0,256.0,,0.004228,removal,0.0,0.256191,366.6,...,10.0,7.0,1.0,500.0,0.898504,0.8695,0.846603,0.893676,"[55.83849036693573, 53.16435515880585, 49.1003...",0.002747
62,8.0,2.0,128.0,256.0,,0.004255,removal,0.0,0.256799,366.6,...,10.0,7.0,1.0,500.0,0.898311,0.869355,0.847111,0.8928,"[55.85938745737076, 53.17556309700012, 49.0730...",0.001523
110,8.0,2.0,128.0,256.0,,0.004104,removal,0.0,0.26652,366.6,...,10.0,3.0,1.0,500.0,0.897926,0.869337,0.850184,0.889374,"[55.81945329904556, 53.267014026641846, 49.430...",0.001262
93,8.0,2.0,128.0,256.0,,0.00332,removal,0.0,0.263145,366.6,...,10.0,7.0,1.0,500.0,0.898037,0.869148,0.847848,0.891551,"[55.9970378279686, 54.226133704185486, 51.3558...",0.001033
48,8.0,2.0,128.0,256.0,,0.004071,removal,0.0,0.252877,366.6,...,10.0,8.0,1.0,500.0,0.898565,0.869119,0.843251,0.896627,"[55.93415814638138, 53.39403164386749, 49.5923...",0.00216
81,7.0,2.0,128.0,256.0,,0.003767,removal,0.0,0.251707,366.6,...,10.0,7.0,1.0,500.0,0.898615,0.869113,0.842794,0.897132,"[53.818075299263, 51.23932945728302, 47.596707...",0.002664
111,8.0,2.0,128.0,256.0,,0.004122,removal,0.0,0.266988,366.6,...,10.0,3.0,1.0,500.0,0.897642,0.869088,0.850692,0.888303,"[55.79989117383957, 53.24219506978989, 49.3806...",0.001262


In [93]:
df=aggregate_experiment_results("wiki-cs","scar")['rate_pairs']
df.head(10)

Aggregated results saved to wiki-cs_scar_aggregated_results.csv


118    8.0
122    8.0
121    8.0
141    8.0
147    8.0
143    8.0
134    8.0
106    8.0
144    8.0
126    8.0
Name: rate_pairs, dtype: float64

In [46]:
df=aggregate_experiment_results("elliptic-bitcoin")
df.head(10)

In [16]:
import numpy as np
import pandas as pd
path=r"C:\Users\romai\Desktop\elliptic_bitcoin_dataset"
# Define path to CSV file
csv_path = path + r"\elliptic_txs_classes.csv"

# Load data with pandas for better handling
df = pd.read_csv(csv_path)

# Check for missing or malformed values explicitly
missing_rows = df[df.isnull().any(axis=1)]

if not missing_rows.empty:
    print("Found missing or malformed rows at indices:")
    print(missing_rows.index.tolist())
else:
    print("No missing rows detected with pandas.")

# Load data using numpy for original processing
y_str = np.loadtxt(csv_path,
                   delimiter=",", skiprows=1, usecols=(0,1), dtype=str)

print("Loaded numpy data shape:", y_str.shape)
print("Pandas DataFrame shape:", df.shape)

# Identify missing rows by comparing pandas and numpy lengths
expected_rows = df.shape[0]
loaded_rows = y_str.shape[0]

if loaded_rows < expected_rows:
    print(f"Missing {expected_rows - y_str.shape[0]} rows in numpy load.")
    missing_indices = set(df.index) - set(range(y_str.shape[0]))
    print("Missing row indices:", missing_rows)

# Verify exact missing rows if needed
if len(df) != len(y_str):
    missing_rows = df[~df.index.isin(range(len(y_str)))]
    print("Detailed missing rows:")
    print(missing_rows)


No missing rows detected with pandas.
Loaded numpy data shape: (203765, 2)
Pandas DataFrame shape: (203769, 2)
Missing 4 rows in numpy load.
Missing row indices: Empty DataFrame
Columns: [txId, class]
Index: []
Detailed missing rows:
             txId    class
203765  158577750  unknown
203766  158375402        1
203767  158654197  unknown
203768  157597225  unknown
