In [1]:
import os

import numpy as np
import pandas as pd

### 1. Get all file names of the formatted labeled data

In [34]:
csvs = []

# Collect only the label files that sit directly in the labeled_formatted
# directory. os.walk also descends into subdirectories and reports every file
# it finds there; a later cell builds each path as <directory>/<filename> and
# parses it as CSV, so nested or non-CSV entries would break that step.
for root, dirs, files in os.walk("../../data/user_labeled/labeled_formatted//"):
    csvs += [name for name in files if name.endswith(".csv")]
    break  # the first tuple yielded is the top-level directory; stop there

csvs

['Counterpoint.csv',
 'Sonata form.csv',
 'Decision tree.csv',
 'Mental health.csv']

### 2. Merge labels with network info for labeled nodes

In [35]:
dfs = []

# Column names in the hand-labeled files -> names used for the merge below.
column_map = {"Topic": "node", "Before/After": "label"}

for csv_name in csvs:
    # The matching unlabeled file has the same name with spaces replaced by
    # underscores and a .zip extension instead of .csv.
    zip_name = csv_name.replace(" ", "_").replace(".csv", ".zip")

    # Load the labeled data and rename its columns to match the unlabeled data.
    labels = pd.read_csv(
        f"../../data/user_labeled/labeled_formatted//{csv_name}"
    ).rename(columns=column_map)
    network_info = pd.read_csv(f"../../data/user_labeled/unlabeled/{zip_name}")

    # Left join on "node": every labeled row is kept, and rows without a
    # match in the unlabeled data get NaN in the columns coming from it.
    dfs.append(pd.merge(labels, network_info, on="node", how="left"))
    


### 3. Combine all labeled data and drop NAs. Save to `final_labeled.csv`

In [36]:
# Stack the per-file frames with a single concat call instead of growing the
# result one frame at a time, which re-copies all accumulated rows on every
# iteration.
total_labeled = pd.concat(dfs)

# Rows with any missing value (e.g. labeled nodes that found no match in the
# left merge) are dropped only in the exported copy; total_labeled itself
# still holds them. index=False keeps the row index out of the file.
total_labeled.dropna().to_csv("../../data/user_labeled/final_labeled.csv", index=False)
        

### 4. Confirm `final_labeled.csv`

In [37]:
# Read the exported file back from disk and display it to confirm the save.
final_labeled_path = "../../data/user_labeled/final_labeled.csv"
final_labeled = pd.read_csv(final_labeled_path)
final_labeled

Unnamed: 0,node,label,degree,category_matches_with_source,in_edges,out_edges,shared_neighbors_with_entry_score,centrality,page_rank,adjusted_reciprocity,shortest_path_length_from_entry,shortest_path_length_to_entry,jaccard_similarity,primary_link,similarity_rank
0,Pitch contour,before,0.005948,0.50,0.003660,0.013499,0.003672,0.005395,0.005592,0.032787,0.111111,0.40,0.002206,1.0,1.000000
1,Contrapuntal motion,before,0.002721,0.50,0.001961,0.004846,0.016748,0.005917,0.003279,0.005855,0.111111,0.20,0.013647,0.0,0.777372
2,False relation,before,0.005758,0.50,0.003203,0.014884,0.017125,0.006395,0.004419,0.029274,0.111111,0.20,0.014211,0.0,0.739844
3,Consecutive fifths,before,0.007530,0.50,0.002092,0.030460,0.024269,0.008024,0.002905,0.008197,0.111111,0.20,0.009043,0.0,0.719724
4,Part (music),before,0.014238,0.00,0.008627,0.032537,0.038132,0.015269,0.018632,0.050351,0.111111,0.20,0.026391,1.0,0.661378
5,Common practice period,after,0.017212,0.00,0.013791,0.021461,0.063220,0.056626,0.017873,0.035129,0.111111,0.20,0.060056,1.0,0.650981
6,Monophony,before,0.016642,0.50,0.012614,0.024576,0.017769,0.008420,0.024175,0.028103,0.111111,0.20,0.013514,0.0,0.622047
7,The Well-Tempered Clavier,after,0.033854,0.50,0.018301,0.088612,0.036702,0.059787,0.015477,0.117096,0.111111,0.20,0.025856,0.0,0.519419
8,Post-tonal music theory,after,0.006518,0.50,0.002092,0.024922,0.022684,0.007004,0.002093,0.028103,0.222222,0.20,0.006767,0.0,0.508584
9,Atmosphères,after,0.009302,0.50,0.003137,0.034614,0.015248,0.004193,0.001571,0.048009,0.222222,0.20,0.004448,0.0,0.490293
