In [8]:
import os
import pdb
import numpy as np
import pandas as pd

### 1. Get all file names of the formatted labeled data

In [26]:
# Directory holding the formatted, hand-labeled CSV files.
labeled_dir = "../../data/user_labeled/labeled_formatted/"

# Only look at files that sit directly inside `labeled_dir`. A full recursive
# walk would also return bare file names from nested folders (for example
# Jupyter's `.ipynb_checkpoints`), and those names are meaningless once they
# are joined back onto the top-level directory. The default tuple keeps the
# result an empty list when the directory does not exist, as before.
top_level_files = next(os.walk(labeled_dir), (labeled_dir, [], []))[2]

# Keep CSV files only, so stray non-CSV files are never treated as labeled data.
csvs = [name for name in top_level_files if name.endswith(".csv")]

csvs

['Counterpoint.csv',
 'Sonata form.csv',
 'Decision tree.csv',
 'StarCraft II: Wings of Liberty.csv',
 '[SKIP]Stargate.csv',
 'Mental health.csv',
 'Prevention science.csv',
 'Digital recording.csv',
 'Exercise physiology.csv',
 'Atmospheric science.csv',
 'Evidence-based policy.csv']

### 2. Merge labels with network info for labeled nodes

In [27]:
# One merged frame (labels + network features) per labeled topic file.
dfs = []

labeled_path = "../../data/user_labeled/labeled_formatted/"
unlabeled_path = "../../data/user_labeled/unlabeled_normalized/"

for filename in csvs:
    # Files whose name contains "SKIP" are deliberately excluded.
    if "SKIP" in filename:
        continue

    labeled = filename
    # The unlabeled counterpart shares the file stem but has a .zip extension.
    # Swap only the extension: str.replace would rewrite every ".csv"
    # occurring anywhere in the name.
    stem, _extension = os.path.splitext(filename)
    unlabeled = f"{stem}.zip"

    # Load the labeled and unlabeled data for the merge.
    labeled_df = pd.read_csv(os.path.join(labeled_path, labeled))
    unlabeled_df = pd.read_csv(os.path.join(unlabeled_path, unlabeled))

    # Rename the labeled columns to match those in the unlabeled data.
    # Returns a new frame instead of mutating in place.
    labeled_df = labeled_df.rename(
        columns={"Topic": "node", "Before/After": "label"}
    )

    # Left join keeps every labeled row; rows without a matching "node" in the
    # unlabeled data get NaN in the columns coming from `unlabeled_df`.
    merged = labeled_df.merge(unlabeled_df, on="node", how="left")

    dfs.append(merged)
    


### 3. Combine all labeled data and drop NAs. Save to `final_labeled.csv`

In [28]:
# Stack all per-topic frames with a single concat call. Concatenating inside
# a loop copies the accumulated frame again on every iteration; one call
# yields the same rows, columns and index without the repeated copying.
total_labeled = pd.concat(dfs)

# Number of rows per label in the combined data.
total_labeled.label.value_counts()

before    141
after     137
Name: label, dtype: int64

In [29]:
# Discard every row that has a missing value in any column ...
labeled_without_na = total_labeled.dropna()

# ... and write the remaining rows to disk, leaving the index out of the file.
labeled_without_na.to_csv(
    "../../data/user_labeled/final_labeled.csv",
    index=False,
)

### 4. Confirm `final_labeled.csv`

In [30]:
# Read the saved file back in to confirm what was written.
final_labeled_file = "../../data/user_labeled/final_labeled.csv"
final_labeled = pd.read_csv(final_labeled_file)

# To spot-check a single topic, filter on the `node` column instead, e.g.
# final_labeled[final_labeled.node == "Stargate"]
final_labeled

Unnamed: 0,node,label,degree,category_matches_with_source,in_edges,out_edges,shared_neighbors_with_entry_score,centrality,page_rank,adjusted_reciprocity,shortest_path_length_from_entry,shortest_path_length_to_entry,jaccard_similarity,primary_link,similarity_rank
0,Pitch contour,before,0.811917,0.008546,0.478604,0.333313,3.565994e-05,9.155767e-06,4.538325e-07,0.000058,0.008546,0.017093,1.885257e-05,0.008546,1.091970e-02
1,Contrapuntal motion,before,0.799874,0.018602,0.539450,0.260424,3.073561e-04,2.314712e-05,5.486180e-07,0.000022,0.018602,0.018602,2.397493e-04,0.000000,1.833478e-02
2,False relation,before,0.815793,0.008867,0.434498,0.381294,1.562247e-04,1.179012e-05,3.516609e-07,0.000053,0.008867,0.008867,1.260127e-04,0.000000,8.475446e-03
3,Consecutive fifths,before,0.788316,0.006569,0.210218,0.578099,1.626513e-04,1.016740e-05,1.852400e-07,0.000011,0.006569,0.006569,5.940591e-05,0.000000,6.170700e-03
4,Part (music),before,0.812984,0.000000,0.471821,0.341163,1.377333e-04,1.176928e-05,6.551787e-07,0.000037,0.003629,0.003629,9.060537e-05,0.003629,3.244365e-03
5,Common practice period,after,0.778737,0.000000,0.601881,0.176856,1.816769e-04,3.517272e-05,4.421708e-07,0.000021,0.002853,0.002853,1.713103e-04,0.002853,2.543915e-03
6,Monophony,before,0.788887,0.002988,0.576724,0.212163,5.457056e-05,5.152674e-06,4.896373e-07,0.000017,0.002988,0.002988,4.038120e-05,0.000000,2.538365e-03
7,The Well-Tempered Clavier,after,0.816198,0.001520,0.427098,0.389100,5.648655e-05,2.005541e-05,2.275949e-07,0.000037,0.001520,0.001520,3.927449e-05,0.000000,1.148115e-03
8,Post-tonal music theory,after,0.783535,0.000000,0.195884,0.587651,4.425679e-04,1.749719e-05,1.436792e-07,0.000042,0.043530,0.021765,1.498728e-04,0.000000,3.285244e-04
9,Atmosphères,after,0.800124,0.005406,0.259500,0.540624,8.510854e-05,4.176538e-06,8.031867e-08,0.000053,0.010812,0.005406,2.404556e-05,0.000000,3.432770e-03
