In [34]:
import os
import glob
import pandas as pd

def aggregate_experiment_results(dataset_name: str):
    folder_path = f"C:/Users/romai/Desktop/gnn/gnn_pu/NNIF-GNN/{dataset_name}_experimentations"
    csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
    
    aggregated_results = []
    
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            
            # Check if the dataframe has exactly 5 rows
            if len(df) == 5:
                aggregated = {}
                for col in df.columns:
                    if pd.api.types.is_numeric_dtype(df[col]):
                        aggregated[col] = df[col].mean()
                    else:
                        mode_val = df[col].mode()
                        aggregated[col] = mode_val.iloc[0] if not mode_val.empty else None
                
                # Specifically add the std of the 'f1' column if it exists
                if 'f1' in df.columns:
                    aggregated['f1_std'] = df['f1'].std()
                else:
                    aggregated['f1_std'] = None
                    
                aggregated_results.append(aggregated)
        except Exception:
            # Ignore CSVs that cause an error
            pass
    
    aggregated_df = pd.DataFrame(aggregated_results)
    
    # Sort by mean F1 in descending order, if it exists
    if 'f1' in aggregated_df.columns:
        aggregated_df = aggregated_df.sort_values(by='f1', ascending=False)
    
    # Save and return the aggregated results
    aggregated_df.to_csv(f"{dataset_name}_aggregated_results.csv", index=False)
    return aggregated_df


In [35]:
df=aggregate_experiment_results("citeseer")
df.head(10)

Unnamed: 0,K,layers,hidden_channels,out_channels,norm,lr,treatment,dropout,ratio,seed,...,rate_pairs,reliable_mini_batch,clusters,pos_weight,accuracy,f1,recall,precision,losses,f1_std
327,5.0,1.0,128.0,256.0,,0.001273,removal,0.0,0.155378,366.6,...,1.0,1.0,100.0,,0.944334,0.86705,0.861341,0.872867,"[8.676393866539001, 7.749640345573425, 7.93696...",0.00613
301,5.0,1.0,128.0,256.0,,0.002146,removal,0.0,0.145865,366.6,...,1.0,1.0,100.0,,0.945056,0.866872,0.849073,0.885475,"[8.539441049098969, 7.715812385082245, 7.89558...",0.005939
333,5.0,1.0,128.0,256.0,,0.00125,removal,0.0,0.153321,366.6,...,2.0,1.0,100.0,,0.944454,0.86677,0.857632,0.876131,"[17.130761742591858, 15.982258141040802, 16.69...",0.003947
318,5.0,1.0,128.0,256.0,,0.001291,removal,0.0,0.150807,366.6,...,1.0,1.0,100.0,,0.944454,0.866553,0.85592,0.877485,"[8.703179717063904, 7.766626954078674, 7.96671...",0.003269
320,5.0,1.0,128.0,256.0,,0.001193,removal,0.0,0.150954,366.6,...,1.0,1.0,100.0,,0.943793,0.865723,0.859914,0.871617,"[8.76364678144455, 7.768976449966431, 7.956275...",0.006523
129,5.0,1.0,256.0,64.0,,0.007393,removal,0.0,0.126253,366.6,...,9.0,1.0,200.0,0.233462,0.945236,0.865679,0.837946,0.89543,"[3.7778583616018295, 3.1326910853385925, 3.113...",0.008124
319,5.0,1.0,128.0,256.0,,0.001162,removal,0.0,0.151524,366.6,...,1.0,1.0,100.0,,0.944034,0.865487,0.854494,0.876789,"[8.755184948444366, 7.76377409696579, 7.970774...",0.003802
300,5.0,1.0,128.0,256.0,,0.001916,removal,0.0,0.143588,366.6,...,1.0,1.0,100.0,,0.944274,0.865292,0.849358,0.881966,"[8.54531705379486, 7.682579517364502, 7.882440...",0.002563
305,5.0,1.0,128.0,256.0,,0.001957,removal,0.0,0.149794,366.6,...,1.0,1.0,100.0,,0.943974,0.865271,0.853638,0.877308,"[8.508025646209717, 7.693136394023895, 7.87796...",0.005619
326,5.0,1.0,128.0,256.0,,0.001311,removal,0.0,0.152822,366.6,...,1.0,1.0,100.0,,0.943733,0.865037,0.85592,0.874394,"[8.699468851089478, 7.737050175666809, 7.93732...",0.00475


In [36]:
df=aggregate_experiment_results("cora")
df.head(10)

Unnamed: 0,K,layers,hidden_channels,out_channels,norm,lr,treatment,dropout,ratio,seed,...,rate_pairs,reliable_mini_batch,clusters,pos_weight,accuracy,f1,recall,precision,losses,f1_std
146,9.0,2.0,128.0,128.0,,0.004753,removal,0.0,0.219535,366.6,...,6.0,1.0,100.0,,0.949335,0.915245,0.905623,0.925115,"[8.46787703037262, 9.080339789390564, 9.225038...",0.001502
167,9.0,2.0,128.0,128.0,,0.005469,removal,0.0,0.227257,366.6,...,4.0,1.0,100.0,,0.949261,0.915142,0.905868,0.924721,"[8.331558585166931, 8.92667829990387, 9.123639...",0.00215
95,9.0,2.0,128.0,128.0,,0.005562,removal,0.0,0.229005,366.6,...,5.0,1.0,100.0,,0.948449,0.914357,0.910513,0.918378,"[8.304111063480377, 8.944101214408875, 9.10925...",0.00575
149,9.0,2.0,128.0,128.0,,0.005598,removal,0.0,0.226938,366.6,...,5.0,1.0,100.0,,0.948597,0.914282,0.907335,0.921397,"[8.389947235584259, 8.973759293556213, 9.15877...",0.006249
49,9.0,2.0,128.0,128.0,,0.004421,removal,0.0,0.213466,366.6,...,8.0,1.0,100.0,,0.948818,0.914177,0.9022,0.926583,"[8.500597298145294, 9.130502104759216, 9.30837...",0.004233
113,9.0,2.0,128.0,128.0,,0.006688,removal,0.0,0.215347,366.6,...,3.0,1.0,100.0,,0.948744,0.914028,0.901956,0.926432,"[8.432208478450775, 9.014741659164429, 9.14462...",0.004531
151,9.0,2.0,128.0,128.0,,0.005826,removal,0.0,0.22955,366.6,...,6.0,1.0,200.0,,0.947784,0.913942,0.917848,0.910076,"[22.801353216171265, 22.64835500717163, 22.110...",0.006615
78,9.0,2.0,128.0,128.0,,0.006561,removal,0.0,0.220093,366.6,...,7.0,1.0,100.0,,0.948227,0.913617,0.906357,0.920997,"[8.374062180519104, 8.959164381027222, 9.09638...",0.002002
104,9.0,2.0,128.0,128.0,,0.00903,removal,0.0,0.228497,366.6,...,8.0,1.0,100.0,,0.947637,0.913522,0.915648,0.911419,"[26.48839569091797, 25.419988572597504, 24.341...",0.001824
118,9.0,2.0,128.0,128.0,,0.009933,removal,0.0,0.218288,366.6,...,4.0,1.0,100.0,,0.948301,0.913424,0.902689,0.924472,"[8.428346157073975, 8.934252500534058, 8.96555...",0.003406


In [37]:
df=aggregate_experiment_results("pubmed")
df.head(10)

Unnamed: 0,K,layers,hidden_channels,out_channels,norm,lr,treatment,dropout,ratio,seed,...,batch_size,rate_pairs,reliable_mini_batch,clusters,accuracy,f1,recall,precision,losses,f1_std
80,12.0,2.0,256.0,256.0,,0.008693,removal,0.0,0.244014,366.6,...,5.0,3.0,1.0,100.0,0.90671,0.879974,0.856229,0.905075,"[25.59938246011734, 26.38034325838089, 25.3141...",0.001326
86,14.0,2.0,256.0,256.0,,0.008078,removal,0.0,0.234383,366.6,...,5.0,3.0,1.0,100.0,0.906974,0.879321,0.848559,0.912398,"[26.13716834783554, 27.11927592754364, 26.2919...",0.000871
88,14.0,2.0,256.0,256.0,,0.008035,removal,0.0,0.235017,366.6,...,5.0,3.0,1.0,100.0,0.906578,0.878943,0.849143,0.910918,"[26.091298282146454, 27.085623562335968, 26.26...",0.001037
118,14.0,2.0,256.0,256.0,,0.007438,removal,0.0,0.225443,366.6,...,10.0,2.0,1.0,100.0,0.907217,0.878911,0.843073,0.917932,"[12.540437936782837, 12.772928714752197, 12.85...",0.002103
148,14.0,2.0,256.0,256.0,,0.007712,removal,0.0,0.223146,366.6,...,10.0,2.0,1.0,100.0,0.907369,0.878861,0.841295,0.919939,"[12.533140420913696, 12.756486296653748, 12.83...",0.001381
155,14.0,2.0,256.0,256.0,,0.007348,removal,0.0,0.222739,366.6,...,10.0,2.0,1.0,100.0,0.907349,0.878822,0.841168,0.920009,"[12.53984785079956, 12.770195245742798, 12.857...",0.000976
94,14.0,2.0,256.0,256.0,,0.00744,removal,0.0,0.225676,366.6,...,10.0,2.0,1.0,100.0,0.906953,0.878754,0.844241,0.91621,"[12.544865489006042, 12.782307982444763, 12.85...",0.00179
87,14.0,2.0,256.0,256.0,,0.008045,removal,0.0,0.236688,366.6,...,5.0,3.0,1.0,100.0,0.906294,0.878747,0.850159,0.909325,"[26.148466765880585, 27.161507189273834, 26.33...",0.001518
126,14.0,2.0,256.0,256.0,,0.006811,removal,0.0,0.217749,366.6,...,10.0,2.0,1.0,100.0,0.907562,0.878637,0.83779,0.923672,"[12.55463707447052, 12.791762948036194, 12.885...",0.001616
107,14.0,2.0,256.0,256.0,,0.00738,removal,0.0,0.224265,366.6,...,10.0,2.0,1.0,100.0,0.906923,0.878535,0.842768,0.917474,"[12.53557276725769, 12.76647663116455, 12.8463...",0.001419


In [38]:
df=aggregate_experiment_results("wiki-cs")
df.head(10)

Unnamed: 0,K,layers,hidden_channels,out_channels,norm,lr,treatment,dropout,ratio,seed,...,batch_size,rate_pairs,reliable_mini_batch,clusters,accuracy,f1,recall,precision,losses,f1_std
118,34.0,2.0,256.0,256.0,,0.001899,removal,0.0,0.173316,366.6,...,15.0,8.0,1.0,100.0,0.965268,0.924417,0.92751,0.921366,"[20.818426847457886, 19.3007470369339, 19.6022...",0.003821
122,32.0,2.0,256.0,256.0,,0.001814,removal,0.0,0.176828,366.6,...,15.0,8.0,1.0,100.0,0.9652,0.924367,0.928854,0.919934,"[20.597373962402344, 19.06372606754303, 19.350...",0.000609
121,32.0,2.0,256.0,256.0,,0.00184,removal,0.0,0.175225,366.6,...,15.0,8.0,1.0,100.0,0.965131,0.92403,0.926166,0.921911,"[20.619243383407593, 19.10641849040985, 19.399...",0.001207
141,32.0,2.0,256.0,256.0,,0.00214,removal,0.0,0.169342,366.6,...,15.0,8.0,1.0,100.0,0.965029,0.923822,0.926166,0.921489,"[20.738324642181396, 19.19844114780426, 19.488...",0.000757
147,35.0,2.0,256.0,256.0,,0.002407,removal,0.0,0.172208,366.6,...,15.0,8.0,1.0,100.0,0.964738,0.923505,0.929675,0.917416,"[20.83068037033081, 19.272576689720154, 19.599...",0.000516
143,33.0,2.0,256.0,256.0,,0.002042,removal,0.0,0.171827,366.6,...,15.0,8.0,1.0,100.0,0.964806,0.923375,0.926092,0.920693,"[20.789230585098267, 19.237472534179688, 19.56...",0.002292
134,34.0,2.0,256.0,256.0,,0.001461,removal,0.0,0.176494,366.6,...,15.0,8.0,1.0,100.0,0.964721,0.923365,0.928182,0.918616,"[20.769208669662476, 19.239400029182434, 19.53...",0.003075
106,34.0,2.0,256.0,256.0,,0.001533,removal,0.0,0.164841,366.6,...,15.0,8.0,1.0,100.0,0.964875,0.923218,0.922284,0.924165,"[20.90528655052185, 19.365336656570435, 19.672...",0.001873
144,33.0,2.0,256.0,256.0,,0.002066,removal,0.0,0.1799,366.6,...,15.0,8.0,1.0,100.0,0.964447,0.923109,0.932064,0.914331,"[20.641557455062866, 19.076834440231323, 19.40...",0.002528
126,34.0,2.0,256.0,256.0,,0.001992,removal,0.0,0.178291,366.6,...,15.0,8.0,1.0,100.0,0.964567,0.923038,0.927958,0.918204,"[20.737881898880005, 19.19690454006195, 19.501...",0.001908


In [16]:
import numpy as np
import pandas as pd
path=r"C:\Users\romai\Desktop\elliptic_bitcoin_dataset"
# Define path to CSV file
csv_path = path + r"\elliptic_txs_classes.csv"

# Load data with pandas for better handling
df = pd.read_csv(csv_path)

# Check for missing or malformed values explicitly
missing_rows = df[df.isnull().any(axis=1)]

if not missing_rows.empty:
    print("Found missing or malformed rows at indices:")
    print(missing_rows.index.tolist())
else:
    print("No missing rows detected with pandas.")

# Load data using numpy for original processing
y_str = np.loadtxt(csv_path,
                   delimiter=",", skiprows=1, usecols=(0,1), dtype=str)

print("Loaded numpy data shape:", y_str.shape)
print("Pandas DataFrame shape:", df.shape)

# Identify missing rows by comparing pandas and numpy lengths
expected_rows = df.shape[0]
loaded_rows = y_str.shape[0]

if loaded_rows < expected_rows:
    print(f"Missing {expected_rows - y_str.shape[0]} rows in numpy load.")
    missing_indices = set(df.index) - set(range(y_str.shape[0]))
    print("Missing row indices:", missing_rows)

# Verify exact missing rows if needed
if len(df) != len(y_str):
    missing_rows = df[~df.index.isin(range(len(y_str)))]
    print("Detailed missing rows:")
    print(missing_rows)


No missing rows detected with pandas.
Loaded numpy data shape: (203765, 2)
Pandas DataFrame shape: (203769, 2)
Missing 4 rows in numpy load.
Missing row indices: Empty DataFrame
Columns: [txId, class]
Index: []
Detailed missing rows:
             txId    class
203765  158577750  unknown
203766  158375402        1
203767  158654197  unknown
203768  157597225  unknown
