In [1]:
import os
logs_dirs = [
    "data/large_sample_20/log/", 
    "data/large_v2_sample_20/log/", 
    "data/large_sample_40/log/",
    "data/large_sample_80/log/"
]
users = {logs_dir:list({x.split("_")[0] for x in os.listdir(logs_dir)}) for logs_dir in logs_dirs}
users

{'data/large_sample_20/log/': ['rmealey2',
  'Jonathan Daberkow',
  'Xun',
  'Aash',
  'jason.kowalewski@snagajob.com',
  'johnmoon'],
 'data/large_sample_40/log/': ['rmealey',
  'ghost-rider',
  'Xun',
  'jason.kowalewski@snagajob.com',
  'johnmoon'],
 'data/large_sample_80/log/': ['jmexp1'],
 'data/large_v2_sample_20/log/': ['rmealey',
  'chrislin',
  'ghost-rider',
  'Aash',
  'johnmoon']}

In [2]:
def parse_logfile_name(logfile_name, logs_dir):
    out = dict()
    out["filename"] = logfile_name
    out["dir"] = logs_dir + "/"
    d = logfile_name[0:-4].split("_")
    out["user"] = d[0]
    if d[1] == "log":
        out["date"] = d[3]
        out["time"] = d[4]
        out["n"] = d[2]
        out["label_file"] = False
    else:
        out["date"] = d[1]
        out["time"] = d[2]
        out["label_file"] = True
        out["update"] = "update" in d
    return out


    
def get_last_update(logs_dir, user):
    user_files = [parse_logfile_name(x, logs_dir)
                  for x in os.listdir(logs_dir)
                  if x.find(user) >= 0]
    label_files = [f for f in user_files if f["label_file"]]
    label_files = sorted(label_files, key=lambda x: (x["date"], x["time"]))
    last_file = label_files[-1]
    return last_file
    
    
user_files = {logs_dir:{user:get_last_update(logs_dir, user) for user in users} for logs_dir, users in users.items()}
    
user_files.keys()

dict_keys(['data/large_sample_20/log/', 'data/large_v2_sample_20/log/', 'data/large_sample_40/log/', 'data/large_sample_80/log/'])

In [3]:
from collections import defaultdict

def parse_label_file(label_file):
    if label_file["update"]:
        header_n = -1
    else:
        header_n = 0 
    headers = ["TOP IDS:", "User labels:", "Classification results:", "All classification results:", ""]
    out = defaultdict(lambda: [])
    with open(label_file["dir"] + label_file["filename"]) as f:
        for i in range(8):
            out["header"].append(f.readline())
        for l in f:
            if l.strip() == headers[header_n+1]:
                header_n = header_n + 1
            if l.strip() not in headers:
                out[headers[header_n]].append(l)
    out["Classification results:"] = [parse_label(r) for r in out["Classification results:"]]
    out["User labels:"] = [parse_label(r) for r in out["User labels:"]]
    out["All classification results:"] = [parse_label(r) for r in out["All classification results:"]]
    out["IsFinal"] = label_file["update"] is False
    out = {k:v for k,v in out.items()}
    return out
                

def parse_label(r):
    out = dict()
    d = r.strip().split(",")
    out["posting_id"], out["label"] = d[0].split(":")
    if len(d) == 2:
        out["score"] = float(d[1])
    return out

    
user_labels = {l:{user:parse_label_file(user_files[l][user]) for user in user_files[l].keys()} for l in user_files.keys()}
user_labels.keys()

dict_keys(['data/large_sample_20/log/', 'data/large_v2_sample_20/log/', 'data/large_sample_40/log/', 'data/large_sample_80/log/'])

In [4]:
label_sets = {l:{user:sorted({r["label"] for r in user_labels[l][user]["User labels:"]})
                 for user in user_labels[l]} for l in user_labels.keys()}
label_sets

{'data/large_sample_20/log/': {'Aash': ['animal care',
   'business or financial services',
   'catering',
   'customer support',
   'emergency services',
   'engineering',
   'food services',
   'home services',
   'hospitality',
   'human care services',
   'it',
   'machine operator',
   'medical services',
   'music instruction',
   'sales',
   'transportation services'],
  'Jonathan Daberkow': ['accounting',
   'automotive',
   'business administration',
   'childcare',
   'construction',
   'customer service',
   'education',
   'engineering',
   'facility maintenance',
   'food and restaurant',
   'general contracting',
   'healthcare',
   'hospitality',
   'petcare',
   'police fire ems',
   'real estate',
   'retail',
   'software engineering',
   'truck driving',
   'tutoring',
   'warehouse and production'],
  'Xun': ['administratives',
   'business and management',
   'childcare',
   'customer service',
   'education and teaching',
   'finance and accounting',
   'foodservi

In [5]:
posting_labels = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: list())))
for d in user_labels.keys():
    ntopics = d.split("/")[1].split("_")[-1]
    for user in user_labels[d].keys():
        if user_labels[d][user]["IsFinal"]: 
            for label in user_labels[d][user]["All classification results:"]:
                posting_labels[label["posting_id"]][user][ntopics] += [label["label"],]
    else:
        for label in user_labels[d][user]["User labels:"]:
            posting_labels[label["posting_id"]][user][ntopics] += [label["label"],]

len(posting_labels.keys())
posting_labels = {pid:{user:{ntopics:l for ntopics,l in labels.items()} for user, labels in user_labels.items()} for pid, user_labels in posting_labels.items()}
posting_labels

{'37958256': {'jason.kowalewski@snagajob.com': {'40': ['tutoring']},
  'rmealey2': {'20': ['personal services and instruction']}},
 '37958013': {'Jonathan Daberkow': {'20': ['food and restaurant']},
  'jason.kowalewski@snagajob.com': {'40': ['catering']},
  'rmealey2': {'20': ['event services']}},
 '44099820': {'Jonathan Daberkow': {'20': ['business administration']},
  'rmealey2': {'20': ['professional']}},
 '30523041': {'Jonathan Daberkow': {'20': ['food and restaurant']},
  'rmealey2': {'20': ['food and beverage']}},
 '43735830': {'Jonathan Daberkow': {'20': ['childcare']},
  'jason.kowalewski@snagajob.com': {'40': ['childcare']},
  'rmealey2': {'20': ['childcare and early education']}},
 '34489524': {'Jonathan Daberkow': {'20': ['childcare']},
  'jason.kowalewski@snagajob.com': {'40': ['childcare']},
  'rmealey2': {'20': ['childcare and early education']}},
 '44201834': {'Jonathan Daberkow': {'20': ['healthcare']},
  'rmealey2': {'20': ['health and wellness professional']}},
 '2872