In [1]:
import os
import glob
import pandas as pd

def aggregate_experiment_results(dataset_name: str, name_filter: str = "scar"):
    """
    Aggregates experiment results from CSV files in the dataset_name's folder.
    If name_filter is provided, only files whose names contain that substring are processed.
    """
    folder_path = f"C:/Users/romai/Desktop/gnn/gnn_pu/NNIF-GNN/{dataset_name}_experimentations"
    csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
    
    # Filter files based on substring in their filename (if name_filter is provided)
    if name_filter:
        csv_files = [f for f in csv_files if name_filter in os.path.basename(f)]
    
    aggregated_results = []
    
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            
            # Check if the dataframe has exactly 5 rows
            if len(df) == 5:
                aggregated = {}
                for col in df.columns:
                    if pd.api.types.is_numeric_dtype(df[col]):
                        aggregated[col] = df[col].mean()
                    else:
                        mode_val = df[col].mode()
                        aggregated[col] = mode_val.iloc[0] if not mode_val.empty else None
                
                # Specifically add the std of the 'f1' column if it exists
                if 'f1' in df.columns:
                    aggregated['f1_std'] = df['f1'].std()
                else:
                    aggregated['f1_std'] = None
                    
                aggregated_results.append(aggregated)
        except Exception:
            # Ignore CSVs that cause an error
            pass
    
    aggregated_df = pd.DataFrame(aggregated_results)
    
    # Sort by mean F1 in descending order, if it exists
    if 'f1' in aggregated_df.columns:
        aggregated_df = aggregated_df.sort_values(by='f1', ascending=False)
    
    # Save and return the aggregated results
    output_filename = f"{dataset_name}_{name_filter}_aggregated_results.csv"
    aggregated_df.to_csv(output_filename, index=False)
    print(f"Aggregated results saved to {output_filename}")
    return aggregated_df


In [2]:
df=aggregate_experiment_results("citeseer", "scar")
best_rows = df.loc[df.groupby('sampling')['f1'].idxmax()]

print(best_rows)

Aggregated results saved to citeseer_scar_aggregated_results.csv
       K  layers  hidden_channels  out_channels  norm        lr treatment  \
345  3.0     1.0            128.0         256.0   NaN  0.008911   removal   
321  3.0     1.0            256.0         256.0   NaN  0.006088   removal   
402  3.0     1.0            128.0         256.0   NaN  0.006079   removal   

     dropout     ratio   seed  ...  accuracy        f1    recall  precision  \
345      0.0  0.169687  366.6  ...  0.943613  0.864422  0.853352   0.875953   
321      0.0  0.137797  366.6  ...  0.941930  0.860790  0.851926   0.869903   
402      0.0  0.145731  366.6  ...  0.946017  0.872242  0.874465   0.870099   

                                                losses    f1_std  \
345  [3.159997582435608, 2.977846145629883, 2.97546...  0.004226   
321  [7.2857988476753235, 6.849560081958771, 6.6970...  0.007961   
402  [2.9329919815063477, 2.6585928797721863, 2.535...  0.004125   

              sampling  sampling_k  

In [None]:
from plots_tables_results import experiment_varying_ratio_of_positives
for idx, row in best_rows.iterrows():
    print(f"Best result for sampling '{row['sampling']}':")
    params = {**row.to_dict(), "dataset_name": "citeseer", "mechanism": "SCAR"}
    experiment_varying_ratio_of_positives([0.5,0.4,0.3,0.2,0.1],**params)

In [103]:
df=aggregate_experiment_results("citeseer", "scar")
df.head(30)

Aggregated results saved to citeseer_scar_aggregated_results.csv


Unnamed: 0,K,layers,hidden_channels,out_channels,norm,lr,treatment,dropout,ratio,seed,...,accuracy,f1,recall,precision,losses,f1_std,sampling,sampling_k,num_epochs,anomaly_detector
402,3.0,1.0,128.0,256.0,,0.006079,removal,0.0,0.145731,366.6,...,0.946017,0.872242,0.874465,0.870099,"[2.9329919815063477, 2.6585928797721863, 2.535...",0.004125,neighbor,75.0,100.0,nearest_neighbors
429,3.0,1.0,64.0,256.0,,0.007108,removal,0.0,0.116648,366.6,...,0.947039,0.871161,0.849929,0.893551,"[2.948562264442444, 2.671817421913147, 2.53761...",0.006762,neighbor,75.0,100.0,nearest_neighbors
410,3.0,1.0,64.0,256.0,,0.006376,removal,0.0,0.137938,366.6,...,0.945537,0.870434,0.868188,0.872774,"[2.937455415725708, 2.6690739393234253, 2.5415...",0.002763,neighbor,75.0,100.0,nearest_neighbors
433,3.0,1.0,64.0,256.0,,0.006971,removal,0.0,0.115692,366.6,...,0.946799,0.870433,0.847932,0.894258,"[2.9519330263137817, 2.6777883768081665, 2.542...",0.005031,neighbor,75.0,100.0,nearest_neighbors
405,3.0,1.0,64.0,256.0,,0.006075,removal,0.0,0.140749,366.6,...,0.945296,0.870415,0.871897,0.869071,"[2.936182141304016, 2.669817864894867, 2.54163...",0.004382,neighbor,75.0,100.0,nearest_neighbors
369,3.0,1.0,128.0,256.0,,0.007381,removal,0.0,0.151693,366.6,...,0.945056,0.870379,0.875606,0.86527,"[2.9276105165481567, 2.6588831543922424, 2.525...",0.005006,neighbor,25.0,100.0,nearest_neighbors
372,3.0,1.0,128.0,256.0,,0.006566,removal,0.0,0.151197,366.6,...,0.944935,0.870336,0.877033,0.863773,"[2.936996102333069, 2.662279188632965, 2.53457...",0.007009,neighbor,75.0,100.0,nearest_neighbors
376,3.0,1.0,128.0,256.0,,0.007422,removal,0.0,0.15149,366.6,...,0.944995,0.870285,0.875606,0.865186,"[2.932759165763855, 2.659614086151123, 2.52585...",0.003965,neighbor,75.0,100.0,nearest_neighbors
441,3.0,1.0,64.0,256.0,,0.007405,removal,0.0,0.112661,366.6,...,0.946979,0.870246,0.843652,0.89867,"[2.9465993642807007, 2.6738736629486084, 2.535...",0.006581,neighbor,75.0,100.0,nearest_neighbors
401,3.0,1.0,128.0,256.0,,0.006189,removal,0.0,0.151391,366.6,...,0.944995,0.870199,0.875036,0.865502,"[2.9322893023490906, 2.6603822708129883, 2.534...",0.002369,neighbor,75.0,100.0,nearest_neighbors


In [85]:
df=aggregate_experiment_results("citeseer", "sar")['rate_pairs']
df.head(10)

Aggregated results saved to citeseer_sar_aggregated_results.csv


169    10.0
193    10.0
191    10.0
187    10.0
194    10.0
167    10.0
164    10.0
111     9.0
170    10.0
189    10.0
Name: rate_pairs, dtype: float64

In [100]:
df=aggregate_experiment_results("cora", "scar")
df.head(10)

Aggregated results saved to cora_scar_aggregated_results.csv


Unnamed: 0,K,layers,hidden_channels,out_channels,norm,lr,treatment,dropout,ratio,seed,...,accuracy,f1,recall,precision,losses,f1_std,sampling,sampling_k,num_epochs,anomaly_detector
125,9.0,2.0,128.0,128.0,,0.004753,removal,0.0,0.219535,366.6,...,0.949335,0.915245,0.905623,0.925115,"[8.46787703037262, 9.080339789390564, 9.225038...",0.001502,,,,
146,9.0,2.0,128.0,128.0,,0.005469,removal,0.0,0.227257,366.6,...,0.949261,0.915142,0.905868,0.924721,"[8.331558585166931, 8.92667829990387, 9.123639...",0.00215,,,,
74,9.0,2.0,128.0,128.0,,0.005562,removal,0.0,0.229005,366.6,...,0.948449,0.914357,0.910513,0.918378,"[8.304111063480377, 8.944101214408875, 9.10925...",0.00575,,,,
128,9.0,2.0,128.0,128.0,,0.005598,removal,0.0,0.226938,366.6,...,0.948597,0.914282,0.907335,0.921397,"[8.389947235584259, 8.973759293556213, 9.15877...",0.006249,,,,
28,9.0,2.0,128.0,128.0,,0.004421,removal,0.0,0.213466,366.6,...,0.948818,0.914177,0.9022,0.926583,"[8.500597298145294, 9.130502104759216, 9.30837...",0.004233,,,,
92,9.0,2.0,128.0,128.0,,0.006688,removal,0.0,0.215347,366.6,...,0.948744,0.914028,0.901956,0.926432,"[8.432208478450775, 9.014741659164429, 9.14462...",0.004531,,,,
130,9.0,2.0,128.0,128.0,,0.005826,removal,0.0,0.22955,366.6,...,0.947784,0.913942,0.917848,0.910076,"[22.801353216171265, 22.64835500717163, 22.110...",0.006615,,,,
57,9.0,2.0,128.0,128.0,,0.006561,removal,0.0,0.220093,366.6,...,0.948227,0.913617,0.906357,0.920997,"[8.374062180519104, 8.959164381027222, 9.09638...",0.002002,,,,
83,9.0,2.0,128.0,128.0,,0.00903,removal,0.0,0.228497,366.6,...,0.947637,0.913522,0.915648,0.911419,"[26.48839569091797, 25.419988572597504, 24.341...",0.001824,,,,
97,9.0,2.0,128.0,128.0,,0.009933,removal,0.0,0.218288,366.6,...,0.948301,0.913424,0.902689,0.924472,"[8.428346157073975, 8.934252500534058, 8.96555...",0.003406,,,,


In [87]:
df=aggregate_experiment_results("cora", "sar")['rate_pairs']
df.head(10)

Aggregated results saved to cora_sar_aggregated_results.csv


25    12.0
21    12.0
17    12.0
22    14.0
23    14.0
12    10.0
18    14.0
11     4.0
8     11.0
16    12.0
Name: rate_pairs, dtype: float64

In [88]:
df=aggregate_experiment_results("pubmed","scar")['rate_pairs']
df.head(10)

Aggregated results saved to pubmed_scar_aggregated_results.csv


80     3.0
86     3.0
88     3.0
118    2.0
148    2.0
155    2.0
94     2.0
87     3.0
126    2.0
107    2.0
Name: rate_pairs, dtype: float64

In [96]:
df=aggregate_experiment_results("pubmed","sar")
df.head(10)

Aggregated results saved to pubmed_sar_aggregated_results.csv


Unnamed: 0,K,layers,hidden_channels,out_channels,norm,lr,treatment,dropout,ratio,seed,...,batch_size,rate_pairs,reliable_mini_batch,clusters,accuracy,f1,recall,precision,losses,f1_std
82,7.0,2.0,128.0,256.0,,0.00374,removal,0.0,0.256688,366.6,...,10.0,7.0,1.0,500.0,0.898798,0.869767,0.846121,0.89478,"[53.74618089199066, 51.19909358024597, 47.5940...",0.001644
54,8.0,2.0,128.0,256.0,,0.003917,removal,0.0,0.255021,366.6,...,10.0,7.0,1.0,500.0,0.89895,0.869744,0.844698,0.896323,"[55.939280450344086, 53.593082785606384, 49.92...",0.003548
102,8.0,2.0,128.0,256.0,,0.00426,removal,0.0,0.254904,366.6,...,10.0,6.0,1.0,500.0,0.898727,0.869737,0.846502,0.894288,"[55.828895926475525, 53.136748909950256, 49.03...",0.002404
61,8.0,2.0,128.0,256.0,,0.004228,removal,0.0,0.256191,366.6,...,10.0,7.0,1.0,500.0,0.898504,0.8695,0.846603,0.893676,"[55.83849036693573, 53.16435515880585, 49.1003...",0.002747
62,8.0,2.0,128.0,256.0,,0.004255,removal,0.0,0.256799,366.6,...,10.0,7.0,1.0,500.0,0.898311,0.869355,0.847111,0.8928,"[55.85938745737076, 53.17556309700012, 49.0730...",0.001523
110,8.0,2.0,128.0,256.0,,0.004104,removal,0.0,0.26652,366.6,...,10.0,3.0,1.0,500.0,0.897926,0.869337,0.850184,0.889374,"[55.81945329904556, 53.267014026641846, 49.430...",0.001262
93,8.0,2.0,128.0,256.0,,0.00332,removal,0.0,0.263145,366.6,...,10.0,7.0,1.0,500.0,0.898037,0.869148,0.847848,0.891551,"[55.9970378279686, 54.226133704185486, 51.3558...",0.001033
48,8.0,2.0,128.0,256.0,,0.004071,removal,0.0,0.252877,366.6,...,10.0,8.0,1.0,500.0,0.898565,0.869119,0.843251,0.896627,"[55.93415814638138, 53.39403164386749, 49.5923...",0.00216
81,7.0,2.0,128.0,256.0,,0.003767,removal,0.0,0.251707,366.6,...,10.0,7.0,1.0,500.0,0.898615,0.869113,0.842794,0.897132,"[53.818075299263, 51.23932945728302, 47.596707...",0.002664
111,8.0,2.0,128.0,256.0,,0.004122,removal,0.0,0.266988,366.6,...,10.0,3.0,1.0,500.0,0.897642,0.869088,0.850692,0.888303,"[55.79989117383957, 53.24219506978989, 49.3806...",0.001262


In [93]:
df=aggregate_experiment_results("wiki-cs","scar")['rate_pairs']
df.head(10)

Aggregated results saved to wiki-cs_scar_aggregated_results.csv


118    8.0
122    8.0
121    8.0
141    8.0
147    8.0
143    8.0
134    8.0
106    8.0
144    8.0
126    8.0
Name: rate_pairs, dtype: float64

In [46]:
df=aggregate_experiment_results("elliptic-bitcoin")
df.head(10)

In [16]:
import numpy as np
import pandas as pd
path=r"C:\Users\romai\Desktop\elliptic_bitcoin_dataset"
# Define path to CSV file
csv_path = path + r"\elliptic_txs_classes.csv"

# Load data with pandas for better handling
df = pd.read_csv(csv_path)

# Check for missing or malformed values explicitly
missing_rows = df[df.isnull().any(axis=1)]

if not missing_rows.empty:
    print("Found missing or malformed rows at indices:")
    print(missing_rows.index.tolist())
else:
    print("No missing rows detected with pandas.")

# Load data using numpy for original processing
y_str = np.loadtxt(csv_path,
                   delimiter=",", skiprows=1, usecols=(0,1), dtype=str)

print("Loaded numpy data shape:", y_str.shape)
print("Pandas DataFrame shape:", df.shape)

# Identify missing rows by comparing pandas and numpy lengths
expected_rows = df.shape[0]
loaded_rows = y_str.shape[0]

if loaded_rows < expected_rows:
    print(f"Missing {expected_rows - y_str.shape[0]} rows in numpy load.")
    missing_indices = set(df.index) - set(range(y_str.shape[0]))
    print("Missing row indices:", missing_rows)

# Verify exact missing rows if needed
if len(df) != len(y_str):
    missing_rows = df[~df.index.isin(range(len(y_str)))]
    print("Detailed missing rows:")
    print(missing_rows)


No missing rows detected with pandas.
Loaded numpy data shape: (203765, 2)
Pandas DataFrame shape: (203769, 2)
Missing 4 rows in numpy load.
Missing row indices: Empty DataFrame
Columns: [txId, class]
Index: []
Detailed missing rows:
             txId    class
203765  158577750  unknown
203766  158375402        1
203767  158654197  unknown
203768  157597225  unknown
