<a href="https://colab.research.google.com/github/Vrroom/notebooks/blob/main/PilotReview.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Setup 
!git clone https://github.com/openai/CLIP
!pip install -e ./CLIP
!git clone https://github.com/Vrroom/vectorrvnn
!pip install setuptools-rust
!apt install rustc
import os
os.chdir('vectorrvnn')
!python3 setup.py install --user
os.chdir('../')
!pip install gdown  
!gdown --fuzzy  https://drive.google.com/file/d/1C58pb-EDjcuA1yidG1cc4qBcFy2FQXBb/view?usp=sharing
!unzip pilot-2.zip
import sys
sys.path.append('./CLIP')
!pip install reverse_geocode

## Review Pilot Annotators

One row per session that submitted a survey code:

Session ID | Turk ID | Email ID | Survey Code | Location | Completion Time (seconds) | Trees Annotated | Avg cTED | Tutorial Score | Tutorial Attempts | Comments

In [None]:
import pandas as pd
from vectorrvnn.utils import *
from vectorrvnn.data import *
import os
import json
from datetime import time, datetime, date
import reverse_geocode

DATA_DIR = './data/'

def text (g) : 
    """Return the whitespace-stripped contents of the file at path `g`.

    When `g` is None (the file was not found for this session) the
    placeholder string 'none' is returned instead.
    """
    if g is not None :
        with open(g) as handle :
            contents = handle.read()
        return contents.strip()
    return 'none'

def location (f) : 
    """Describe where a session came from as a 'city,country' string.

    `f` is the path of a JSON file holding 'latitude' and 'longitude'
    entries; the first result of `reverse_geocode.search` for that
    coordinate pair supplies the city and country. Returns the
    placeholder 'none' when `f` is None.
    """
    if f is None : 
        return 'none'
    with open(f) as fp :
        coords = json.load(fp)
    query = [(coords['latitude'], coords['longitude'])]
    nearest = reverse_geocode.search(query)[0]
    return f'{nearest["city"]},{nearest["country"]}'

def json2time (data) :
    """Turn a dict with 'hrs', 'min' and 'sec' entries into a datetime.

    Only a time of day is available, so it is attached to today's date
    to make the result usable in datetime subtraction.
    """
    clock = time(data['hrs'], data['min'], data['sec'])
    today = date.today()
    return datetime.combine(today, clock)

def completionTime(st, en):
    """ Seconds elapsed between the start and end stamps of a session.

    `st` and `en` are paths of JSON files with 'hrs', 'min' and 'sec'
    entries. Only the time of day is read from them, so a session that
    runs past midnight has an end stamp that is numerically smaller than
    its start stamp. The difference is therefore taken modulo one day,
    which assumes a session lasts less than 24 hours.

    Returns the duration as a float, or the placeholder 'none' when
    either path is None. Raises ValueError if a stamp holds an
    out-of-range hour, minute or second.
    """
    if st is None or en is None: 
        return 'none'

    def _secondsSinceMidnight (path) :
        # Reads one stamp file and converts it to seconds since midnight.
        with open(path) as fp :
            data = json.load(fp)
        # Going through `time` keeps the range validation of the fields.
        t = time(hour=data['hrs'], minute=data['min'], second=data['sec'])
        return 3600 * t.hour + 60 * t.minute + t.second

    end = _secondsSinceMidnight(en)
    start = _secondsSinceMidnight(st)
    secondsPerDay = 24 * 60 * 60
    # Python's % is non-negative for a positive modulus, so a wrap past
    # midnight (end < start) comes out as the true elapsed time.
    return float((end - start) % secondsPerDay)

def avgCTED (f) : 
    """Score one annotated tree against the stored tree for the same SVG.

    `f` is a treeData JSON file with a 'filename' entry (path of the
    annotated SVG) and a 'graph' entry (the annotator's tree). The first
    two components of the recorded path are dropped and the rest is
    resolved relative to the working directory. The comparison tree is
    loaded from the first file ending in 'pkl' in the SVG's directory.

    Returns `norm_cted(annotated, stored)`.
    NOTE(review): presumably a normalized tree edit distance; confirm
    against vectorrvnn.
    """
    with open(f) as fp :
        annotation = json.load(fp)
    relativeParts = annotation['filename'].split('/')[2:]
    svgPath = './' + '/'.join(relativeParts)
    svgDir = osp.split(svgPath)[0]
    storedPkl = filterByName(listdir(svgDir), 'pkl')
    storedTree = SVGData(svgPath, storedPkl)
    annotatedTree = SVGData(svgPath, tree=appGraph2nxGraph(annotation['graph']))
    return norm_cted(annotatedTree, storedTree)

def surveyResponse (f) : 
    """Load a header-less survey CSV as a DataFrame with columns 'qid' and 'rating'.

    Returns the placeholder 'none' when `f` is None.
    """
    return 'none' if f is None else pd.read_csv(f, names=['qid', 'rating'])
        
def filterByName (files, name) :
    """Return the first entry of `files` that ends with `name`, or None if no entry does."""
    for candidate in files :
        if candidate.endswith(name) :
            return candidate
    return None
        
# Build one table row per session directory found under DATA_DIR.
# Every column, including the session id, is appended inside the same
# loop iteration so that all columns are guaranteed to line up row by row
# and to have the same length.
sid, tid, email, loc, scode, comp, tann, avgted, tut, tutAttempts, comm = [], [], [], [], [], [], [], [], [], [], []
survey = []
for f in listdir(DATA_DIR) :
    files = listdir(f) 
    # The session id is the name of the session directory itself;
    # normpath drops any trailing separator before basename is taken.
    sid.append(os.path.basename(os.path.normpath(f)))
    survey.append(surveyResponse(filterByName(files, 'survey.txt')))
    tid.append(text(filterByName(files, 'turkid.txt'))) 
    email.append(text(filterByName(files, 'email.txt')))
    loc.append(location(filterByName(files, 'ip.json')))
    scode.append(text(filterByName(files, 'cid.txt')))
    comp.append(completionTime(filterByName(files, 'startTime.json'), filterByName(files, 'endTime.json')))
    # One treeData* file is written per annotated tree.
    treeFiles = [g for g in files if getBaseName(g).startswith('treeData')]
    cnt = len(treeFiles)
    tann.append(cnt)
    if cnt == 0 :
        avgted.append('none')
    else :
        avgted.append(avg([avgCTED(g) for g in treeFiles]))
    # tutscores.txt has one line per tutorial attempt; the last line is
    # the score that is reported. A missing file yields the single
    # placeholder line 'none' (and therefore counts as one attempt).
    attempts = text(filterByName(files, 'tutscores.txt')).split('\n')
    tutAttempts.append(len(attempts))
    tut.append(attempts[-1])
    comm.append(text(filterByName(files, 'comments.txt')))
        
data = {
    'session-id': sid,
    'turker-id': tid,
    'email-id': email,
    'survey-code': scode,
    'location': loc,
    'completion-time': comp,
    'trees-annotated': tann,
    'avg-cted': avgted,
    'tutorial-score': tut,
    'tutorial-attempts': tutAttempts,
    'comments': comm
}
df = pd.DataFrame(data=data)
# Only sessions that reached the end of the study have a survey code.
participants = df[df['survey-code'] != 'none']
participants

Unnamed: 0,session-id,turker-id,email-id,survey-code,location,completion-time,trees-annotated,avg-cted,tutorial-score,tutorial-attempts,comments
11,3cd97536-e4c0-45dc-aae3-b5b7bff9fbef,A25PFSORDO3SWQ,furtive_fox_five@yahoo.com,b980ee,"Oil City,United States",708.0,5,0.111811,0.2272727272727272,3,The tutorial was the hardest part for me. I gu...
12,3e715b67-2239-477f-8999-11b159d55f53,A2K0L9M1ZZO5C9,spacht1978@gmail.com,385772,"Mountain Lake Park,United States",687.0,5,0.264137,0.2170542635658914,2,That definitely was something I could really p...
14,47271d5a-3410-41d4-bbf6-779455f8b974,AVTI7X2H3RKPE,rebeccastuarts2018@gmail.com,7223e7,"Salvador,Brazil",942.0,5,0.217903,0.15,20,
18,52af0056-1889-48b0-a1fd-2571cdde4cd8,A3S3WYVCVWW8IZ,ninemick@yahoo.com,194f62,none,383.0,5,0.279376,0.3313008130081301,5,
20,601ce705-aa94-4ecf-a72a-0b8f4e98259d,A30XBXE53J85WN,bear2bear829@hotmail.com,c7245d,"Matthews,United States",882.0,5,0.330975,0.2272727272727272,10,Thank you!
40,b50373d2-b724-40af-928b-6b58adade6cc,AVIEE6LDH0BT5,mturk12902ds@gmail.com,b86a8e,none,471.0,5,0.416524,0.2272727272727272,4,
44,c53905c8-6d8f-4267-afe5-7c046efbaf4f,A26BHQZCY7GRNP,mukesh.kumar.masterji@gmail.com,d639a5,"Pimpri,India",-85449.0,5,0.494128,0.2272727272727272,4,nice
49,cc96da5a-9f07-4f24-9f5c-2f21d7811a51,A1WPBIRI0HGTWG,c_tonce@outlook.com,0dd8ad,"Taubaté,Brazil",491.0,5,0.386113,0.1996124031007751,3,
50,d04d61a8-6a85-423b-98e5-5e7f6b045a66,ADQHGQF65JJ08,papanasamshanthi05@gmail.com,a1a9d4,"Whittingham,United States",1102.0,5,0.428846,0.2484848484848485,2,
56,de39fdda-91bf-4693-9a5c-207a4f226889,A3CGQOJC28OVGN,cubbie78@gmail.com,5e2527,"Cordell,United States",771.0,5,0.250396,0.1627906976744186,2,I had trouble with the groups of two arms and ...


## Question Ratings

Question Number | Avg | Std | Median

Ratings are on a 5-point Likert scale.

In [None]:
# Keep only the sessions whose survey was actually loaded; a missing
# survey file is represented by the placeholder string 'none'.
survey = [response for response in survey if not isinstance(response, str)]

def getratings (qid) : 
    """Collect the rating given to question `qid` in every loaded survey.

    Returns a list with one int per entry of the global `survey` list.
    The value is read with `.iloc[0]` because calling `int()` directly
    on a one-element Series is deprecated in pandas. Raises IndexError
    if a survey has no row for `qid`.
    """
    return [int(s.loc[s['qid'] == qid, 'rating'].iloc[0]) for s in survey]

# Per-question summary statistics of the survey ratings.
# The accumulators deliberately avoid the name `avg`: the per-session
# aggregation calls an `avg` helper, and rebinding that name to a list
# here would break any later call to it in the same kernel.
qids = list(range(4))
ratingMeans, ratingStds, ratingMedians = [], [], []
for q in qids :
    r = getratings(q)
    ratingMeans.append(np.mean(r))
    ratingStds.append(np.std(r))
    ratingMedians.append(np.median(r))
    
surveyRes = pd.DataFrame(data=dict(qid=qids, avg=ratingMeans, std=ratingStds, median=ratingMedians))
surveyRes


Unnamed: 0,qid,avg,std,median
0,0,3.818182,0.935966,4.0
1,1,3.636364,1.149919,3.0
2,2,4.545455,0.655555,5.0
3,3,3.909091,0.792527,4.0


## Time taken for slides

In [None]:
def getSlideTimes (f) : 
    """Seconds spent between consecutive slide time stamps.

    `f` is the path of a file with one JSON time stamp ('hrs', 'min',
    'sec') per line. Returns a list holding the elapsed seconds between
    each stamp and the next one, i.e. one entry fewer than there are
    stamps. Returns the placeholder 'none' when `f` is None.

    The stamps only hold a time of day, so an interval that crosses
    midnight would come out negative; intervals are taken modulo one
    day, which assumes no single slide took 24 hours or more.
    """
    if f is None : 
        return 'none'
    dump = text(f)
    # Blank lines (or an entirely empty file) carry no stamp and are skipped.
    stamps = [json2time(json.loads(line)) for line in dump.split('\n') if line.strip()]
    secondsPerDay = 24 * 60 * 60
    diffs = [(nxt - cur).total_seconds() % secondsPerDay for cur, nxt in zip(stamps, stamps[1:])]
    return diffs

# Gather the per-slide durations of every session that has a survey code.
times = []
for f in listdir(DATA_DIR) : 
    files = listdir(f)
    if text(filterByName(files, 'cid.txt')) == 'none' :
        continue
    slideTimes = getSlideTimes(filterByName(files, 'slideTime.json'))
    # getSlideTimes returns the string 'none' when the log is missing;
    # indexing that string below would silently produce characters
    # instead of durations, so such sessions (and empty logs) are skipped.
    if isinstance(slideTimes, str) or len(slideTimes) == 0 :
        continue
    times.append(slideTimes)

# The first interval is reported as the tutorial time; the intervals
# from index 2 up to, but excluding, the last one are reported as
# individual task times.
tuttimes = [t[0] for t in times]
tasktimes = list(flatten([t[2:-1] for t in times]))
print(len(tasktimes))
print('tutorial time')
print(np.mean(tuttimes), np.std(tuttimes), np.median(tuttimes))
print('individual task time')
print(np.mean(tasktimes), np.std(tasktimes), np.median(tasktimes))

50
tutorial time
328.9 145.1912187427325 337.0
individual task time
60.46 71.01639529010184 42.0


## Number of grouping decisions

In [None]:
def numberNonLeaves (f) : 
    """Count the non-leaf nodes of the tree stored in annotation file `f`.

    `f` is a treeData JSON file with a 'filename' entry (path of the
    annotated SVG, whose first two components are dropped before it is
    resolved relative to the working directory) and a 'graph' entry
    holding the annotator's tree.
    """
    with open(f) as fp :
        annotation = json.load(fp)
    svgPath = './' + '/'.join(annotation['filename'].split('/')[2:])
    annotatedTree = SVGData(svgPath, tree=appGraph2nxGraph(annotation['graph']))
    return len(nonLeaves(annotatedTree))

# Total number of non-leaf nodes over all trees annotated in sessions
# that have a survey code.
cnt = 0
for f in listdir(DATA_DIR) : 
    files = listdir(f)
    hasSurveyCode = text(filterByName(files, 'cid.txt')) != 'none'
    if hasSurveyCode :
        cnt += sum(numberNonLeaves(g) for g in files if getBaseName(g).startswith('treeData'))

print(f'Number of groups created ~ {cnt}')

Number of groups created ~ 456
