In [5]:
import os
import glob
import pandas as pd

def aggregate_experiment_results(dataset_name: str, name_filter: str = "scar", val: str = "f1") -> pd.DataFrame:
    """
    Aggregates experiment results from CSV files in the dataset_name's folder.
    If name_filter is provided, only files whose names contain that substring are processed.
    """
    folder_path = f"C:/Users/romai/Desktop/gnn/gnn_pu/NNIF-GNN/{dataset_name}_experimentations"
    csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
    
    # Filter files based on substring in their filename (if name_filter is provided)
    if name_filter:
        csv_files = [f for f in csv_files if name_filter in os.path.basename(f)]
    
    aggregated_results = []
    
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            
            # Check if the dataframe has exactly 5 rows
            if len(df) >= 1:
                aggregated = {}
                for col in df.columns:
                    if pd.api.types.is_numeric_dtype(df[col]):
                        aggregated[col] = df[col].mean()
                    else:
                        mode_val = df[col].mode()
                        aggregated[col] = mode_val.iloc[0] if not mode_val.empty else None
                
                # Specifically add the std of the 'f1' column if it exists
                if 'f1' in df.columns:
                    aggregated['f1_std'] = df['f1'].std()
                else:
                    aggregated['f1_std'] = None
                    
                aggregated_results.append(aggregated)
        except Exception:
            # Ignore CSVs that cause an error
            pass
    
    aggregated_df = pd.DataFrame(aggregated_results)
    
    # Sort by mean F1 in descending order, if it exists
    if 'f1' in aggregated_df.columns:
        aggregated_df = aggregated_df.sort_values(by=val, ascending=False)
    
    # Save and return the aggregated results
    output_filename = f"{dataset_name}_{name_filter}_aggregated_results.csv"
    aggregated_df.to_csv(output_filename, index=False)
    print(f"Aggregated results saved to {output_filename}")
    return aggregated_df


In [2]:
df=aggregate_experiment_results("citeseer", "scar")
best_rows = df.loc[df.groupby('sampling')['f1'].idxmax()]

print(best_rows)

Aggregated results saved to citeseer_scar_aggregated_results.csv
       K  layers  hidden_channels  out_channels  norm        lr treatment  \
345  3.0     1.0            128.0         256.0   NaN  0.008911   removal   
321  3.0     1.0            256.0         256.0   NaN  0.006088   removal   
402  3.0     1.0            128.0         256.0   NaN  0.006079   removal   

     dropout     ratio   seed  ...  accuracy        f1    recall  precision  \
345      0.0  0.169687  366.6  ...  0.943613  0.864422  0.853352   0.875953   
321      0.0  0.137797  366.6  ...  0.941930  0.860790  0.851926   0.869903   
402      0.0  0.145731  366.6  ...  0.946017  0.872242  0.874465   0.870099   

                                                losses    f1_std  \
345  [3.159997582435608, 2.977846145629883, 2.97546...  0.004226   
321  [7.2857988476753235, 6.849560081958771, 6.6970...  0.007961   
402  [2.9329919815063477, 2.6585928797721863, 2.535...  0.004125   

              sampling  sampling_k  

In [None]:
from plots_tables_results import experiment_varying_ratio_of_positives
for idx, row in best_rows.iterrows():
    print(f"Best result for sampling '{row['sampling']}':")
    params = {**row.to_dict(), "dataset_name": "citeseer", "mechanism": "SCAR", "min":0.7,"seeds":3,"output_csv":"citeseer_test_train_pct.csv"}
    params['K']=int(params['K'])
    params['layers']=int(params['layers'])
    params['hidden_channels']=int(params['hidden_channels'])
    params['out_channels']=int(params['out_channels'])
    params['dropout']=float(params['dropout'])
    params['lr']=float(params['lr'])
    params['num_epochs']=int(params['num_epochs'])
    params['batch_size']=int(params['batch_size'])
    params['clusters']=int(params['clusters'])
    params['sampling_k']=int(params['sampling_k'])
    params['rate_pairs']=int(params['rate_pairs'])
    experiment_varying_ratio_of_positives([0.5,0.4,0.3,0.2,0.1],**params)

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'torch_sparse'

In [153]:
df=aggregate_experiment_results("wiki-cs", "scar_val")[['hidden_channels','out_channels','layers','K','ratio','lr','model_type','rate_pairs','batch_size','f1','test_f1','sampling']]
df.head(40)

Aggregated results saved to wiki-cs_scar_val_aggregated_results.csv


Unnamed: 0,hidden_channels,out_channels,layers,K,ratio,lr,model_type,rate_pairs,batch_size,f1,test_f1,sampling
41,64.0,256.0,1.0,31.0,0.34452,0.005,GCNConv,10.0,2048.0,0.976439,0.838533,sage
43,128.0,128.0,1.0,31.0,0.349697,0.005,GCNConv,10.0,2048.0,0.974276,0.837367,sage
49,128.0,128.0,1.0,30.0,0.345471,0.005,GCNConv,10.0,2048.0,0.973961,0.839597,sage
48,128.0,128.0,1.0,32.0,0.349719,0.005,GCNConv,10.0,2048.0,0.973006,0.835236,sage
42,128.0,128.0,1.0,31.0,0.343885,0.005,GCNConv,10.0,2048.0,0.972696,0.837236,sage
51,128.0,128.0,1.0,30.0,0.343787,0.005,GCNConv,10.0,2048.0,0.972691,0.837841,sage
47,128.0,128.0,1.0,30.0,0.344168,0.005,GCNConv,10.0,2048.0,0.972368,0.841392,sage
40,64.0,256.0,1.0,31.0,0.341874,0.005,GCNConv,10.0,2048.0,0.972367,0.848517,sage
39,64.0,256.0,1.0,33.0,0.346313,0.005,GCNConv,10.0,2048.0,0.971746,0.841777,sage
53,128.0,128.0,1.0,29.0,0.326389,0.005,GCNConv,10.0,2048.0,0.971733,0.84885,sage


In [154]:
df=aggregate_experiment_results("wiki-cs", "scar_val","test_f1")[['hidden_channels','out_channels','layers','K','ratio','lr','model_type','rate_pairs','batch_size','f1','test_f1','sampling']]
df.head(40)

Aggregated results saved to wiki-cs_scar_val_aggregated_results.csv


Unnamed: 0,hidden_channels,out_channels,layers,K,ratio,lr,model_type,rate_pairs,batch_size,f1,test_f1,sampling
2,256.0,64.0,2.0,16.0,0.228858,0.005,GCNConv,10.0,2048.0,0.893659,0.89879,sage
9,128.0,64.0,2.0,22.0,0.266843,0.005,GCNConv,10.0,2048.0,0.913538,0.897022,sage
8,256.0,128.0,2.0,5.0,0.322299,0.005,GCNConv,10.0,2048.0,0.925552,0.894726,sage
1,256.0,64.0,2.0,6.0,0.296059,0.005,GCNConv,10.0,2048.0,0.916141,0.893929,sage
3,256.0,64.0,2.0,3.0,0.291999,0.005,GCNConv,10.0,2048.0,0.935779,0.893204,sage
54,128.0,128.0,2.0,27.0,0.344833,0.005,GCNConv,10.0,2048.0,0.928611,0.890846,sage
7,256.0,64.0,2.0,16.0,0.347021,0.005,GCNConv,10.0,2048.0,0.928235,0.886162,sage
6,256.0,64.0,2.0,5.0,0.347614,0.005,GCNConv,10.0,2048.0,0.929298,0.881055,sage
5,256.0,64.0,2.0,2.0,0.291298,0.005,GCNConv,10.0,2048.0,0.90771,0.876239,sage
4,256.0,64.0,2.0,2.0,0.303057,0.005,GCNConv,10.0,2048.0,0.912283,0.87492,sage


In [1]:
df=aggregate_experiment_results("citeseer", "scar")
df.head(10)

NameError: name 'aggregate_experiment_results' is not defined

In [55]:
df=aggregate_experiment_results("cora", "scar_val")
df.head(20)

Aggregated results saved to cora_scar_val_aggregated_results.csv


Unnamed: 0,K,layers,hidden_channels,out_channels,norm,lr,treatment,dropout,ratio,seed,...,accuracy,f1,recall,precision,losses,test_accuracy,test_f1,test_recall,test_precision,f1_std
126,3.0,1.0,256.0,128.0,,0.005016,removal,0.0,0.36567,366.6,...,0.862745,0.926171,0.862745,1.0,"[2.5214850902557373, 2.310188412666321, 2.2793...",0.926071,0.8807,0.902934,0.859734,0.013993
105,3.0,1.0,256.0,128.0,,0.003956,removal,0.0,0.376133,366.6,...,0.856863,0.922738,0.856863,1.0,"[2.5509077310562134, 2.3250324726104736, 2.291...",0.923043,0.875904,0.899022,0.854011,0.015445
121,3.0,1.0,256.0,128.0,,0.004755,removal,0.0,0.376355,366.6,...,0.856863,0.922655,0.856863,1.0,"[2.5361611247062683, 2.3236334919929504, 2.289...",0.923486,0.876902,0.901711,0.853639,0.018679
123,3.0,1.0,256.0,128.0,,0.005079,removal,0.0,0.364097,366.6,...,0.854902,0.92165,0.854902,1.0,"[2.5241012573242188, 2.307712733745575, 2.2728...",0.926736,0.881138,0.899022,0.864019,0.013037
101,3.0,1.0,256.0,128.0,,0.003329,removal,0.0,0.37451,366.6,...,0.854902,0.921325,0.854902,1.0,"[2.578955590724945, 2.2824825644493103, 2.2432...",0.922674,0.874974,0.895844,0.855108,0.024674
120,3.0,1.0,256.0,128.0,,0.004623,removal,0.0,0.376325,366.6,...,0.854902,0.921315,0.854902,1.0,"[2.536220669746399, 2.3197379112243652, 2.2842...",0.923191,0.876601,0.902934,0.851829,0.02519
110,3.0,1.0,256.0,128.0,,0.003898,removal,0.0,0.369049,366.6,...,0.854902,0.921249,0.854902,1.0,"[2.5520825386047363, 2.3292168378829956, 2.296...",0.923929,0.876587,0.894132,0.859808,0.02674
119,3.0,1.0,256.0,128.0,,0.004842,removal,0.0,0.365419,366.6,...,0.852941,0.920327,0.852941,1.0,"[2.529228150844574, 2.3203044533729553, 2.2857...",0.924815,0.87827,0.897555,0.859927,0.020593
111,3.0,1.0,256.0,128.0,,0.003931,removal,0.0,0.37056,366.6,...,0.852941,0.920232,0.852941,1.0,"[2.5481361150741577, 2.323235869407654, 2.2881...",0.923338,0.875816,0.894866,0.857669,0.023463
115,3.0,1.0,256.0,128.0,,0.004262,removal,0.0,0.369561,366.6,...,0.85098,0.919304,0.85098,1.0,"[2.5369288325309753, 2.320575535297394, 2.2858...",0.924003,0.87737,0.899756,0.856187,0.015953


In [87]:
df=aggregate_experiment_results("cora", "sar")['rate_pairs']
df.head(10)

Aggregated results saved to cora_sar_aggregated_results.csv


25    12.0
21    12.0
17    12.0
22    14.0
23    14.0
12    10.0
18    14.0
11     4.0
8     11.0
16    12.0
Name: rate_pairs, dtype: float64

In [14]:
df=aggregate_experiment_results("pubmed","scar")[['sampling','f1','f1_std','rate_pairs','batch_size','lr']]
df.head(30)

Aggregated results saved to pubmed_scar_aggregated_results.csv


Unnamed: 0,sampling,f1,f1_std,rate_pairs,batch_size,lr
142,,0.879974,0.001326,3.0,5.0,0.008693
148,,0.879321,0.000871,3.0,5.0,0.008078
150,,0.878943,0.001037,3.0,5.0,0.008035
180,,0.878911,0.002103,2.0,10.0,0.007438
210,,0.878861,0.001381,2.0,10.0,0.007712
217,,0.878822,0.000976,2.0,10.0,0.007348
156,,0.878754,0.00179,2.0,10.0,0.00744
149,,0.878747,0.001518,3.0,5.0,0.008045
188,,0.878637,0.001616,2.0,10.0,0.006811
169,,0.878535,0.001419,2.0,10.0,0.00738


In [96]:
df=aggregate_experiment_results("pubmed","sar")
df.head(10)

Aggregated results saved to pubmed_sar_aggregated_results.csv


Unnamed: 0,K,layers,hidden_channels,out_channels,norm,lr,treatment,dropout,ratio,seed,...,batch_size,rate_pairs,reliable_mini_batch,clusters,accuracy,f1,recall,precision,losses,f1_std
82,7.0,2.0,128.0,256.0,,0.00374,removal,0.0,0.256688,366.6,...,10.0,7.0,1.0,500.0,0.898798,0.869767,0.846121,0.89478,"[53.74618089199066, 51.19909358024597, 47.5940...",0.001644
54,8.0,2.0,128.0,256.0,,0.003917,removal,0.0,0.255021,366.6,...,10.0,7.0,1.0,500.0,0.89895,0.869744,0.844698,0.896323,"[55.939280450344086, 53.593082785606384, 49.92...",0.003548
102,8.0,2.0,128.0,256.0,,0.00426,removal,0.0,0.254904,366.6,...,10.0,6.0,1.0,500.0,0.898727,0.869737,0.846502,0.894288,"[55.828895926475525, 53.136748909950256, 49.03...",0.002404
61,8.0,2.0,128.0,256.0,,0.004228,removal,0.0,0.256191,366.6,...,10.0,7.0,1.0,500.0,0.898504,0.8695,0.846603,0.893676,"[55.83849036693573, 53.16435515880585, 49.1003...",0.002747
62,8.0,2.0,128.0,256.0,,0.004255,removal,0.0,0.256799,366.6,...,10.0,7.0,1.0,500.0,0.898311,0.869355,0.847111,0.8928,"[55.85938745737076, 53.17556309700012, 49.0730...",0.001523
110,8.0,2.0,128.0,256.0,,0.004104,removal,0.0,0.26652,366.6,...,10.0,3.0,1.0,500.0,0.897926,0.869337,0.850184,0.889374,"[55.81945329904556, 53.267014026641846, 49.430...",0.001262
93,8.0,2.0,128.0,256.0,,0.00332,removal,0.0,0.263145,366.6,...,10.0,7.0,1.0,500.0,0.898037,0.869148,0.847848,0.891551,"[55.9970378279686, 54.226133704185486, 51.3558...",0.001033
48,8.0,2.0,128.0,256.0,,0.004071,removal,0.0,0.252877,366.6,...,10.0,8.0,1.0,500.0,0.898565,0.869119,0.843251,0.896627,"[55.93415814638138, 53.39403164386749, 49.5923...",0.00216
81,7.0,2.0,128.0,256.0,,0.003767,removal,0.0,0.251707,366.6,...,10.0,7.0,1.0,500.0,0.898615,0.869113,0.842794,0.897132,"[53.818075299263, 51.23932945728302, 47.596707...",0.002664
111,8.0,2.0,128.0,256.0,,0.004122,removal,0.0,0.266988,366.6,...,10.0,3.0,1.0,500.0,0.897642,0.869088,0.850692,0.888303,"[55.79989117383957, 53.24219506978989, 49.3806...",0.001262


In [None]:
df=aggregate_experiment_results("elliptic-bitcoin","cluster")
df.head(50)

In [46]:
df=aggregate_experiment_results("elliptic-bitcoin")
df.head(10)

In [16]:
import numpy as np
import pandas as pd
path=r"C:\Users\romai\Desktop\elliptic_bitcoin_dataset"
# Define path to CSV file
csv_path = path + r"\elliptic_txs_classes.csv"

# Load data with pandas for better handling
df = pd.read_csv(csv_path)

# Check for missing or malformed values explicitly
missing_rows = df[df.isnull().any(axis=1)]

if not missing_rows.empty:
    print("Found missing or malformed rows at indices:")
    print(missing_rows.index.tolist())
else:
    print("No missing rows detected with pandas.")

# Load data using numpy for original processing
y_str = np.loadtxt(csv_path,
                   delimiter=",", skiprows=1, usecols=(0,1), dtype=str)

print("Loaded numpy data shape:", y_str.shape)
print("Pandas DataFrame shape:", df.shape)

# Identify missing rows by comparing pandas and numpy lengths
expected_rows = df.shape[0]
loaded_rows = y_str.shape[0]

if loaded_rows < expected_rows:
    print(f"Missing {expected_rows - y_str.shape[0]} rows in numpy load.")
    missing_indices = set(df.index) - set(range(y_str.shape[0]))
    print("Missing row indices:", missing_rows)

# Verify exact missing rows if needed
if len(df) != len(y_str):
    missing_rows = df[~df.index.isin(range(len(y_str)))]
    print("Detailed missing rows:")
    print(missing_rows)


No missing rows detected with pandas.
Loaded numpy data shape: (203765, 2)
Pandas DataFrame shape: (203769, 2)
Missing 4 rows in numpy load.
Missing row indices: Empty DataFrame
Columns: [txId, class]
Index: []
Detailed missing rows:
             txId    class
203765  158577750  unknown
203766  158375402        1
203767  158654197  unknown
203768  157597225  unknown
