In [2]:
import os
# Directory that holds the log files examined in this notebook.
logs_dir = "data/large_sample_20/log/"
# Distinct user names: the text before the first underscore of every
# directory entry (a set removes duplicates, so the order is arbitrary).
users = list({entry.partition("_")[0] for entry in os.listdir(logs_dir)})
users

['Aash', 'ghost-rider', 'chrislin', 'johnmoon']

In [18]:
def parse_logfile_name(logfile_name):
    """Split a log file name into its underscore-separated fields.

    Two layouts are recognised, selected by the second field:

    * ``<user>_log_<n>_<date>_<time>.<ext>`` -- a plain log file
    * ``<user>_<date>_<time>[_<suffix>].<ext>`` -- a label file; the
      presence of a fourth field marks it as an update file

    Args:
        logfile_name: Bare file name (no directory part).

    Returns:
        dict with the keys ``filename``, ``user``, ``date``, ``time`` and
        ``label_file``. Plain log files additionally carry ``n``; label
        files additionally carry ``update``. All field values are strings
        except the two booleans.

    Raises:
        IndexError: if the name has fewer fields than its layout needs.
    """
    # Strip the actual extension rather than blindly cutting the last four
    # characters, so a name without an extension (or with a longer or
    # shorter one) still yields an intact final field.
    stem, _ext = os.path.splitext(logfile_name)
    fields = stem.split("_")

    out = {"filename": logfile_name, "user": fields[0]}
    if fields[1] == "log":
        out["date"] = fields[3]
        out["time"] = fields[4]
        out["n"] = fields[2]
        out["label_file"] = False
    else:
        out["date"] = fields[1]
        out["time"] = fields[2]
        out["label_file"] = True
        # Exactly four fields means a trailing marker such as "update".
        out["update"] = len(fields) == 4
    return out


    
def get_last_update(logs_dir, user):
    """Find a user's newest label file and newest non-update label file.

    Args:
        logs_dir: Directory containing the log files.
        user: User name, i.e. the part of a file name before the first "_".

    Returns:
        ``[last_file, last_non_update]`` -- two dicts as returned by
        ``parse_logfile_name``. The first is the most recent label file of
        any kind, the second the most recent one whose ``update`` flag is
        False.

    Raises:
        IndexError: if the user has no label file at all, or no label file
            that is not an update.
    """
    # Compare the exact name prefix. A substring test would also match files
    # of other users whose names merely contain this one.
    user_files = [parse_logfile_name(name)
                  for name in os.listdir(logs_dir)
                  if name.split("_")[0] == user]
    label_files = [f for f in user_files if f["label_file"]]
    # Order by date first and time second; sorting on time alone misorders
    # files from different days. Assumes zero-padded YYYYMMDD / HHMMSS
    # strings, as in the file names seen so far, so that string order is
    # chronological order.
    label_files.sort(key=lambda f: (f["date"], f["time"]))
    last_file = label_files[-1]
    last_non_update = [f for f in label_files if not f["update"]][-1]
    return [last_file, last_non_update]
    
    
# Per user: [newest label file, newest non-update label file].
user_files = dict((user, get_last_update(logs_dir, user)) for user in users)

user_files

{'Aash': [{'date': '20171006',
   'filename': 'Aash_20171006_170734_update.log',
   'label_file': True,
   'time': '170734',
   'update': True,
   'user': 'Aash'},
  {'date': '20171006',
   'filename': 'Aash_20171006_170208.log',
   'label_file': True,
   'time': '170208',
   'update': False,
   'user': 'Aash'}],
 'chrislin': [{'date': '20171006',
   'filename': 'chrislin_20171006_170653_update.log',
   'label_file': True,
   'time': '170653',
   'update': True,
   'user': 'chrislin'},
  {'date': '20171006',
   'filename': 'chrislin_20171006_170628.log',
   'label_file': True,
   'time': '170628',
   'update': False,
   'user': 'chrislin'}],
 'ghost-rider': [{'date': '20171006',
   'filename': 'ghost-rider_20171006_170659_update.log',
   'label_file': True,
   'time': '170659',
   'update': True,
   'user': 'ghost-rider'},
  {'date': '20171006',
   'filename': 'ghost-rider_20171006_170640.log',
   'label_file': True,
   'time': '170640',
   'update': False,
   'user': 'ghost-rider'}],


In [60]:
from collections import defaultdict

def parse_label_file(log_dir, filename, n_header_lines=8):
    """Read a label file and group its lines by section.

    The first ``n_header_lines`` lines are stored verbatim under the key
    ``"header"``. The remaining lines are scanned for the section titles
    listed in ``headers``, which must appear in that order; every line that
    is not itself a title (or blank) is appended, newline included, to the
    list of the section currently open.

    Args:
        log_dir: Directory containing the file.
        filename: Name of the label file inside ``log_dir``.
        n_header_lines: Number of leading lines to treat as the header.

    Returns:
        A plain dict mapping section name to a list. ``"User labels:"`` and
        ``"Classification results:"`` are always present and hold parsed
        dicts; the other sections hold the raw lines.
    """
    headers = ["TOP IDS:", "User labels:", "Classification results:", "All classification results:", ""]
    out = defaultdict(list)
    with open(os.path.join(log_dir, filename)) as f:
        # readline() returns "" at end of file, so a short file just yields
        # empty strings for the missing header lines.
        out["header"] = [f.readline() for _ in range(n_header_lines)]
        header_n = -1
        for line in f:
            text = line.strip()
            # Bounds guard: once the final (blank) title has been consumed
            # there is no next title to look for. Without the check any
            # further line raised IndexError.
            if header_n + 1 < len(headers) and text == headers[header_n + 1]:
                header_n += 1
            if text not in headers:
                # With header_n == -1 this indexes the trailing "" entry, so
                # lines before the first title are collected under "".
                out[headers[header_n]].append(line)
    out["Classification results:"] = [parse_classification_result(r)
                                      for r in out["Classification results:"]]
    out["User labels:"] = [parse_user_label(r)
                           for r in out["User labels:"]]
    # Hand back a regular dict so missing keys raise instead of being created.
    return dict(out)
                

def parse_classification_result(r):
    """Parse one ``<posting_id>:<label>,<score>`` line.

    Args:
        r: A single line; surrounding whitespace is ignored.

    Returns:
        dict with ``posting_id`` and ``label`` as strings and ``score`` as
        a float.

    Raises:
        ValueError: if there is no ":" in front of the score, or the score
            is not a valid float.
        IndexError: if the line contains no ",".
    """
    out = dict()
    # The score is the last comma-separated field and the id ends at the
    # first colon, so a label may itself contain "," or ":".
    parts = r.strip().rsplit(",", 1)
    out["posting_id"], out["label"] = parts[0].split(":", 1)
    out["score"] = float(parts[1])
    return out

    
def parse_user_label(r):
    """Parse one ``<posting_id>:<label>`` line.

    Args:
        r: A single line; surrounding whitespace is ignored.

    Returns:
        dict with the string keys ``posting_id`` and ``label``.

    Raises:
        ValueError: if the line contains no ":".
    """
    out = dict()
    # Split on the first colon only, so a label may contain ":" itself.
    out["posting_id"], out["label"] = r.strip().split(":", 1)
    return out
    

# For every user, parse the first entry of user_files[user], i.e. the most
# recent label file.
user_labels = dict(
    (user, parse_label_file(logs_dir, user_files[user][0]["filename"]))
    for user in users
)
user_labels

{'Aash': {'All classification results:': ['40833549:human care services,0.2565450826852724\n',
   '37958013:event services,0.9869487118879179\n',
   '37903780:technical services,0.29127736635346496\n',
   '29972102:human care services,0.29700523083494307\n',
   '42063689:human care services,0.7543314775465478\n',
   '39398891:human care services,0.3818699044058074\n',
   '43744307:hotel and hospitality,0.9982499111255706\n',
   '41734445:it or software services,0.21377643936777876\n',
   '22792419:it or software services,0.21646262356102156\n',
   '37903788:technical services,0.3227724316836302\n',
   '39263940:human care services,0.3026198072856141\n',
   '37901118:technical services,0.302191234998833\n',
   '35474928:human care services,0.41487047964346535\n',
   '37834682:food and beverage,0.269723621849499\n',
   '42635340:human care services,0.8687087867450027\n',
   '41811171:human care services,0.3226214451783049\n',
   '42406321:technical services,0.3174789987773842\n',
   '358