In [12]:
import os

import numpy as np
import pandas as pd

### 1. Get all file names of the formatted labeled data

In [13]:
def list_csv_files(directory):
    """Return the names of the ``.csv`` files located directly inside ``directory``.

    Only the top level is inspected. The names returned here are later joined
    back onto ``directory`` itself, so a file found in a sub-folder (for
    example a Jupyter ``.ipynb_checkpoints`` copy) could not be opened under
    that path. Entries without a ``.csv`` suffix (``.DS_Store`` and the like)
    are skipped for the same reason.

    Parameters
    ----------
    directory : str
        Folder to list.

    Returns
    -------
    list of str
        Bare file names, in the order ``os.walk`` reports them. Empty when the
        folder does not exist, because ``os.walk`` ignores listing errors by
        default.
    """
    # The first tuple produced by os.walk describes `directory` itself;
    # the default covers the case where the walk yields nothing at all.
    _, _, top_level_files = next(os.walk(directory), (directory, [], []))
    return [name for name in top_level_files if name.endswith(".csv")]


csvs = list_csv_files("../../data/user_labeled/labeled_formatted//")

csvs

['Starcraft.csv',
 'Evidence-based policy.csv',
 'Mental health.csv',
 'Counterpoint.csv',
 'Atmospheric science.csv',
 'Stargate.csv',
 'Decision tree.csv',
 'Sonata form.csv']

### 2. Merge labels with network info for labeled nodes

In [14]:
# Build one merged frame per labeled file: the user labels joined with the
# columns of the matching file in the unlabeled folder.
dfs = []

for filename in csvs:
    # The unlabeled counterpart has underscores instead of spaces and a .zip suffix.
    unlabeled = filename.replace(" ", "_").replace(".csv", ".zip")
    labeled = filename

    # Load the labeled data and rename its columns to match the unlabeled
    # naming, so both frames share the "node" key.
    labeled_df = pd.read_csv(
        f"../../data/user_labeled/labeled_formatted//{labeled}"
    ).rename(columns={"Topic": "node", "Before/After": "label"})
    unlabeled_df = pd.read_csv(f"../../data/user_labeled/unlabeled/{unlabeled}")

    # Left join: every labeled row is kept, matched or not.
    merged = pd.merge(labeled_df, unlabeled_df, on="node", how="left")
    dfs.append(merged)
    


### 3. Combine all labeled data and drop NAs. Save to `final_labeled.csv`

In [15]:
# Stack all per-topic frames with a single concat call. Concatenating inside a
# loop copies the rows accumulated so far on every iteration.
total_labeled = pd.concat(dfs)

# Rows containing any missing value (for instance labeled nodes that found no
# match in the left merge) are left out of the saved file only;
# `total_labeled` itself still holds them.
total_labeled.dropna().to_csv("../../data/user_labeled/final_labeled.csv", index=False)
        

### 4. Reload `final_labeled.csv` to confirm it was saved correctly

In [16]:
# Read the saved file back from disk, so that what is displayed is what was
# actually written rather than the in-memory frame.
final_labeled = pd.read_csv(
    "../../data/user_labeled/final_labeled.csv"
)
final_labeled

Unnamed: 0,node,label,degree,category_matches_with_source,in_edges,out_edges,shared_neighbors_with_entry_score,centrality,page_rank,adjusted_reciprocity,shortest_path_length_from_entry,shortest_path_length_to_entry,jaccard_similarity,primary_link,similarity_rank
0,StarCraft II: Heart of the Swarm,after,0.010420,1.000000,0.005118,0.043292,0.320449,0.592755,0.002671,0.056061,0.166667,0.20,0.247350,0.0,1.000000
1,StarCraft (video game),before,0.017115,0.916667,0.005065,0.095554,0.196211,1.000000,0.003881,0.087879,0.166667,0.20,0.252669,1.0,0.905361
2,StarCraft: Brood War,before,0.011578,0.833333,0.006120,0.044852,0.202199,0.648033,0.002642,0.053030,0.166667,0.20,0.203883,0.0,0.816960
3,StarCraft II: Legacy of the Void,after,0.008859,0.750000,0.004221,0.037832,0.294207,0.542680,0.001999,0.048485,0.166667,0.20,0.208633,0.0,0.775480
4,StarCraft II: Wings of Liberty,after,0.021544,0.083333,0.013506,0.067473,1.000000,0.995219,0.007313,0.110606,0.000000,0.00,1.000000,0.0,0.548565
5,Jim Raynor,after,0.003977,0.000000,0.002163,0.015211,0.109752,0.088870,0.000506,0.034848,0.166667,0.20,0.133588,1.0,0.101773
6,List of StarCraft media,after,0.006393,0.083333,0.001741,0.037051,0.123632,0.131356,0.000439,0.033333,0.166667,0.20,0.128906,0.0,0.099830
7,Races of StarCraft,before,0.008557,0.083333,0.003218,0.042902,0.134990,0.144347,0.001264,0.036364,0.166667,0.20,0.182836,0.0,0.097667
8,Real-time strategy,before,0.080439,0.083333,0.068271,0.118955,0.042367,0.268845,0.040270,0.271212,0.166667,0.20,0.031271,1.0,0.084069
9,StarCraft,before,0.032468,0.083333,0.019363,0.108814,0.193881,0.316930,0.011237,0.121212,0.166667,0.20,0.198077,0.0,0.078426
