In [1]:
import os

import numpy as np
import pandas as pd

### 1. Get all file names of the formatted labeled data

In [28]:
# Directory holding the formatted, labeled CSV files.
labeled_dir = "../../data/user_labeled/labeled_formatted"

# Collect bare file names (not full paths) of the labeled CSVs.
# Only the top level of the directory is listed and only regular ``.csv`` files
# are kept: a recursive walk would also pick up hidden/system files and files
# in nested folders, none of which can be read back from ``labeled_dir`` by
# name alone. Sorting makes the processing order independent of the
# filesystem's listing order.
csvs = sorted(
    entry
    for entry in os.listdir(labeled_dir)
    if entry.endswith(".csv") and os.path.isfile(os.path.join(labeled_dir, entry))
)

### 2. Merge labels with network info for labeled nodes

In [30]:
labeled_dir = "../../data/user_labeled/labeled_formatted"
unlabeled_dir = "../../data/user_labeled/unlabeled"

# One merged DataFrame per labeled file, in the order of ``csvs``.
dfs = []

for filename in csvs:
    # The unlabeled archive shares the labeled file's name, with spaces
    # replaced by underscores and a ``.zip`` extension. Only the extension is
    # swapped, so a ".csv" occurring elsewhere in the name is left alone.
    stem, _ext = os.path.splitext(filename)
    unlabeled = stem.replace(" ", "_") + ".zip"

    # Read the labeled data and rename its columns to match those in the
    # unlabeled data, so both frames can be joined on "node".
    labeled_df = pd.read_csv(os.path.join(labeled_dir, filename)).rename(
        columns={"Topic": "node", "Before/After": "label"}
    )
    # pandas infers zip compression from the file extension.
    unlabeled_df = pd.read_csv(os.path.join(unlabeled_dir, unlabeled))

    # Left join: every labeled row is kept; labeled nodes with no match in the
    # unlabeled data get NaN in the columns coming from ``unlabeled_df``.
    merged = labeled_df.merge(unlabeled_df, on="node", how="left")

    dfs.append(merged)
    


### 3. Combine all labeled data and drop NAs. Save to `final_labeled.csv`

In [31]:
# Fail with a clear message instead of an IndexError when nothing was merged.
if not dfs:
    raise ValueError(
        "No merged DataFrames to combine; check that labeled CSV files were found."
    )

# Stack all per-file frames in a single call. Concatenating inside a loop
# re-copies the accumulated frame on every iteration.
total_labeled = pd.concat(dfs)

# Drop every row that has at least one missing value, then save without the
# index column. ``total_labeled`` itself still contains the rows with NAs.
total_labeled.dropna().to_csv("../../data/user_labeled/final_labeled.csv", index=False)
        

### 4. Confirm `final_labeled.csv`

In [33]:
# Read the saved file back from disk and show it, to confirm what was written.
final_labeled_path = "../../data/user_labeled/final_labeled.csv"
final_labeled = pd.read_csv(final_labeled_path)
final_labeled

Unnamed: 0,node,label,degree,category_matches_with_source,in_edges,out_edges,shared_neighbors_with_entry_score,centrality,page_rank,adjusted_reciprocity,shortest_path_length_from_entry,shortest_path_length_to_entry,jaccard_similarity,primary_link,similarity_rank
0,Slow movement (music),after,0.001519,0.50,0.000719,0.007280,0.020524,0.001689,0.000829,0.003650,0.166667,0.25,0.008345,0.0,0.577697
1,Sonata cycle,after,0.009239,1.00,0.004511,0.040562,0.096802,0.005245,0.002313,0.118613,0.166667,0.25,0.096639,0.0,1.000000
2,Musical development,before,0.018415,0.50,0.014513,0.036401,0.195297,0.021138,0.020009,0.083942,0.166667,0.25,0.209302,1.0,0.873610
3,Tonality,before,0.040818,0.50,0.032163,0.080083,0.062095,0.026654,0.042905,0.107664,0.166667,0.25,0.053275,1.0,0.587683
4,Period (music),before,0.008860,0.50,0.004903,0.034321,0.064836,0.005490,0.015954,0.083942,0.166667,0.25,0.069106,0.0,0.515058
5,Musical form,before,0.055373,0.50,0.030006,0.216849,0.068211,0.034494,0.057159,0.206204,0.166667,0.25,0.075160,1.0,0.502052
6,Developing variation,before,0.009303,0.50,0.005622,0.032241,0.065454,0.005546,0.011473,0.082117,0.166667,0.25,0.069519,0.0,0.510226
7,Symphony No. 9 (Schubert),after,0.008670,0.50,0.004576,0.035361,0.032895,0.017270,0.003367,0.040146,0.166667,0.25,0.016861,0.0,0.499566
8,Binary form,before,0.011897,0.50,0.007910,0.035361,0.092651,0.011164,0.015488,0.085766,0.166667,0.25,0.088657,0.0,0.496613
9,Recapitulation (music),before,0.017276,0.00,0.014578,0.026521,0.194788,0.016950,0.017878,0.080292,0.166667,0.25,0.210594,1.0,0.489005
