In [1]:
import os
logs_dirs = [x[0] for x in os.walk("data/") if x[0].find("/log") > 0]
users = {logs_dir:list({x.split("_")[0] for x in os.listdir(logs_dir)}) for logs_dir in logs_dirs}
users

{'data/large_sample/log': ['rmealey'],
 'data/large_sample_20/log': ['chrislin', 'ghost-rider', 'johnmoon', 'Aash'],
 'data/large_sample_80/log': ['jmexp1'],
 'data/small/log': ['rmealey'],
 'data/synthetic/log': ['rmealey']}

In [2]:
def parse_logfile_name(logfile_name, logs_dir):
    out = dict()
    out["filename"] = logfile_name
    out["dir"] = logs_dir + "/"
    d = logfile_name[0:-4].split("_")
    out["user"] = d[0]
    if d[1] == "log":
        out["date"] = d[3]
        out["time"] = d[4]
        out["n"] = d[2]
        out["label_file"] = False
    else:
        out["date"] = d[1]
        out["time"] = d[2]
        out["label_file"] = True
        out["update"] = "update" in d
    return out


    
def get_last_update(logs_dir, user):
    user_files = [parse_logfile_name(x, logs_dir)
                  for x in os.listdir(logs_dir)
                  if x.find(user) >= 0]
    label_files = [f for f in user_files if f["label_file"]]
    label_files = sorted(label_files, key=lambda x: (x["date"], x["time"]))
    last_file = label_files[-1]
    return last_file
    
    
user_files = {logs_dir:{user:get_last_update(logs_dir, user) for user in users} for logs_dir, users in users.items()}
    
user_files

{'data/large_sample/log': {'rmealey': {'date': '20171005',
   'dir': 'data/large_sample/log/',
   'filename': 'rmealey_20171005_144013_update.log',
   'label_file': True,
   'time': '144013',
   'update': True,
   'user': 'rmealey'}},
 'data/large_sample_20/log': {'Aash': {'date': '20171006',
   'dir': 'data/large_sample_20/log/',
   'filename': 'Aash_20171006_170734_update.log',
   'label_file': True,
   'time': '170734',
   'update': True,
   'user': 'Aash'},
  'chrislin': {'date': '20171006',
   'dir': 'data/large_sample_20/log/',
   'filename': 'chrislin_20171006_170653_update.log',
   'label_file': True,
   'time': '170653',
   'update': True,
   'user': 'chrislin'},
  'ghost-rider': {'date': '20171006',
   'dir': 'data/large_sample_20/log/',
   'filename': 'ghost-rider_20171006_170659_update.log',
   'label_file': True,
   'time': '170659',
   'update': True,
   'user': 'ghost-rider'},
  'johnmoon': {'date': '20171006',
   'dir': 'data/large_sample_20/log/',
   'filename': 'johnm

In [3]:
from collections import defaultdict

def parse_label_file(label_file):
    if label_file["update"]:
        header_n = -1
    else:
        header_n = 0 
    headers = ["TOP IDS:", "User labels:", "Classification results:", "All classification results:", ""]
    out = defaultdict(lambda: [])
    with open(label_file["dir"] + label_file["filename"]) as f:
        for i in range(8):
            out["header"].append(f.readline())
        for l in f:
            if l.strip() == headers[header_n+1]:
                header_n = header_n + 1
            if l.strip() not in headers:
                out[headers[header_n]].append(l)
    out["Classification results:"] = [parse_label(r) for r in out["Classification results:"]]
    out["User labels:"] = [parse_label(r) for r in out["User labels:"]]
    out["All classification results:"] = [parse_label(r) for r in out["All classification results:"]]
    out["IsFinal"] = label_file["update"] is False
    out = {k:v for k,v in out.items()}
    return out
                

def parse_label(r):
    out = dict()
    d = r.strip().split(",")
    out["posting_id"], out["label"] = d[0].split(":")
    if len(d) == 2:
        out["score"] = float(d[1])
    return out

    
user_labels = {l:{user:parse_label_file(user_files[l][user]) for user in user_files[l].keys()} for l in user_files.keys()}
user_labels

{'data/near_postings_20170501_20171201_sample_40/log': {'rmealey': {'All classification results:': [{'label': 'automotive services',
     'posting_id': '47720941',
     'score': 0.19029033039318252},
    {'label': 'childcare and early education',
     'posting_id': '42760142',
     'score': 0.28666508807025715},
    {'label': 'direct sales and customer support',
     'posting_id': '44862238',
     'score': 0.6532641536254645},
    {'label': 'industrial warehouse and manufacturing skilled/technical',
     'posting_id': '45526971',
     'score': 0.14706760259345433},
    {'label': 'direct sales and customer support',
     'posting_id': '42593858',
     'score': 0.613830144121779},
    {'label': 'industrial warehouse and manufacturing front line',
     'posting_id': '49130695',
     'score': 0.40023111228662916},
    {'label': 'childcare and early education',
     'posting_id': '44933999',
     'score': 0.8489704133001112},
    {'label': 'health and wellness technical',
     'posting_id':

In [5]:
label_sets = {l:{user:sorted({r["label"] for r in user_labels[l][user]["All classification results:"] + user_labels[l][user]["User labels:"]})
                 for user in user_labels[l]} for l in user_labels.keys()}
label_sets

{'data/near_postings_20170501_20171201_sample_40/log': {'rmealey': ['admin and paraprofessional',
   'animal services',
   'automotive services',
   'beauty and grooming services',
   'childcare and early education',
   'corporate internship',
   'corporate professional',
   'corporate recruiting sales marketing professional',
   'direct sales and customer support',
   'environmental services',
   'exclude spanish language',
   'food and beverage',
   'health and wellness direct care',
   'health and wellness home care',
   'health and wellness professional',
   'health and wellness technical',
   'hospitality',
   'industrial warehouse and manufacturing front line',
   'industrial warehouse and manufacturing professional',
   'industrial warehouse and manufacturing skilled/technical',
   'information technology professional',
   'military recruiting and contracting',
   'personal instruction and tutoring',
   'real estate sales and related services',
   'renovation contracting constru

In [5]:
posting_labels = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: list())))
for d in user_labels.keys():
    ntopics = d.split("/")[1].split("_")[-1]
    for user in user_labels[d].keys():
        if user_labels[d][user]["IsFinal"]: 
            for label in user_labels[d][user]["All classification results:"]:
                posting_labels[label["posting_id"]][user][ntopics] += [label["label"],]
    else:
        for label in user_labels[d][user]["User labels:"]:
            posting_labels[label["posting_id"]][user][ntopics] += [label["label"],]

len(posting_labels.keys())
posting_labels = {pid:{user:{ntopics:l for ntopics,l in labels.items()} for user, labels in user_labels.items()} for pid, user_labels in posting_labels.items()}
posting_labels
len(posting_labels)
len({k:v for k,v in posting_labels.items() if len(v)>1})

682