In [1]:
## remember to run conn_cocolab from the terminal before running cells in this notebook!

import os
import urllib, cStringIO

import pymongo as pm

import matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import seaborn as sns
sns.set_context('poster')
sns.set_style('white')

import numpy as np
from __future__ import division
import scipy.stats as stats
import pandas as pd
import json
import re

from PIL import Image
import base64
import datetime
import svg_render_helpers as rdh

### file hierarchy and database connection vars

In [2]:
# Connection settings for the `kiddraw` MongoDB database.
# auth.txt holds the password for the sketchloop user on its first line.
auth = pd.read_csv('../auth.txt', header = None) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'rxdhawkins.me' ## cocolab ip address (not used below: the client connects to 127.0.0.1)

# have to fix this to be able to analyze from local
# Credentials are percent-escaped before being placed in the URI so that a
# password containing reserved characters ('@', ':', '/', '%') still parses.
# str() guards against pandas parsing an all-digit password as a number.
mongo_uri = 'mongodb://{}:{}@127.0.0.1'.format(urllib.quote_plus(user),
                                               urllib.quote_plus(str(pswd)))
conn = pm.MongoClient(mongo_uri)
db = conn['kiddraw']
coll = db['cdm_run_v4']

### Render out images from museum station -- takes a *long* time.

#### Grab some subset of desired sketches to render

In [3]:
# Which collection (experiment iteration) this notebook run works on.
iteration_name = 'cdm_run_v4'

# Handles to both iterations; later cells compare `this_collection` against
# these to decide which timing fields to read.
cdm_run_v3 = db['cdm_run_v3']
cdm_run_v4 = db['cdm_run_v4']
this_collection = db[iteration_name]

## Alternative selection (every session with a final image):
# sessions_to_render =  this_collection.find({'$and': [{'dataType':'finalImage'}]}).distinct('sessionId')

# Distinct session ids that have at least one record in the 'this square' category.
square_query = {'category':'this square'}
square_records = this_collection.find(square_query)
sessions_to_render = square_records.distinct('sessionId')

n_sessions = len(sessions_to_render)
print('we have {} unique kids'.format(n_sessions))


we have 2775 unique kids


#### And where are we rendering these sketches?

In [4]:
## Output location: a 'tracing_<iteration_name>' folder inside the current
## working directory, created on first use.
analysis_dir = os.getcwd()
sketch_dir_name = 'tracing_{}'.format(iteration_name)
sketch_dir = os.path.join(analysis_dir, sketch_dir_name)

sketch_dir_exists = os.path.exists(sketch_dir)
if sketch_dir_exists is False:
    os.makedirs(sketch_dir)

#### Open up variables for csv file writing

In [5]:
# Accumulators filled by the render loop; each one is an independent, initially
# empty list.

# --- basic descriptors ---
session_id = []
trial_num = []
category = []
age = []
filename = []

# --- stroke times and duration ---
svg = []
svg_times = []
draw_duration_old = []
draw_duration_new = []

# --- drawing usage stats ---
num_strokes = []
mean_intensity = []
bounding_box = []

# --- trial time and duration ---
start_time = []
submit_time = []
trial_duration = []

# --- other timing variables ---
submit_date = []
submit_date_readable = []

def load_image_data(imgData,imsize):
    """Decode a base64-encoded image and return it as a resized pixel array.

    Parameters
    ----------
    imgData : str or bytes
        Base64-encoded image file contents.
    imsize : int
        Side length, in pixels, of the square the image is resized to.

    Returns
    -------
    numpy.ndarray
        Array built from the resized PIL image.

    Notes
    -----
    The payload is decoded once and opened from an in-memory buffer. Earlier
    versions decoded it twice and round-tripped through a scratch file named
    'sketch.png' in the working directory; that file is no longer written.
    """
    import io  # local import: only this function needs an in-memory buffer

    raw_bytes = base64.b64decode(imgData)
    im = Image.open(io.BytesIO(raw_bytes)).resize((imsize,imsize))
    return np.array(im)

def get_mean_intensity(img,imsize):
    """Return the share of an imsize x imsize canvas that counts as drawn on.

    A pixel counts when its value in channel index 3 is strictly greater than
    250 (presumably the alpha channel of an RGBA image -- confirm against the
    caller). `img` must therefore be indexable as img[:, :, 3].
    """
    thresh = 250
    fourth_channel = img[:,:,3]
    n_above = int(np.count_nonzero(fourth_channel > thresh))
    return n_above / imsize**2
    
def get_bounding_box(img):
    """Return the tight bounding box of the non-zero content of an image array.

    Parameters
    ----------
    img : array-like with at least two dimensions
        Axis 0 is treated as rows and axis 1 as columns.

    Returns
    -------
    tuple or None
        (rmin, rmax, cmin, cmax): inclusive indices of the first and last
        row / column containing any non-zero value. None when the image has
        no non-zero value at all, or when `img` cannot be reduced along
        axes 0 and 1 (in which case the error is printed).
    """
    try:
        pixels = np.asarray(img)
        row_idx = np.where(np.any(pixels, axis=1))[0]
        col_idx = np.where(np.any(pixels, axis=0))[0]
    except Exception as e:
        # Best-effort: report the problem and let the caller record "no box".
        print('Oops, something went wrong! Here is the error:')
        print(e)
        return None

    # A blank image has nothing to bound; handle it explicitly rather than
    # relying on an IndexError from indexing an empty array.
    if row_idx.size == 0 or col_idx.size == 0:
        return None

    return (row_idx[0], row_idx[-1], col_idx[0], col_idx[-1])

#### Render out sketches and save descriptives for subset

In [8]:
## Render every selected tracing to a PNG and accumulate its descriptives in the
## lists initialised earlier (session_id, trial_num, ...). Each rendered trial
## contributes exactly one entry to each list that the CSV export cell reads, so
## the rows built from them stay aligned.
reload(rdh)  # re-import svg_render_helpers so edits made to it take effect
really_run_this=1

# basic variables for counting throughout the loop
skipCount = 0
writeImageCount = 0  # NOTE(review): never incremented anywhere in this cell
interferenceCount = 0
timeSave = []  # trial time of every 100th skipped image
# tracing = ['square', 'shape', 'this circle']
tracing = ['this square']
canvas_size = 809
imsize = 400

if really_run_this:

    for s in sessions_to_render:
        if (this_collection == cdm_run_v4):
            image_recs = this_collection.find({'$and': [{'sessionId':s}, {'dataType':'finalImage'}, {'category':{"$in": tracing}}]}).sort('startTrialTime')

            ## get survey data for this kid (if it exists), use to exclude
            survey_session = this_collection.find({'$and': [{'dataType':'survey'},{'sessionId':s}]})
            if survey_session.count()>0:
                survey = survey_session[0]
                ## Each comparison is parenthesised on purpose: `|` binds tighter
                ## than `==`, so the unparenthesised form
                ## `other_drew==True | parent_drew==True` let parent_drew have no
                ## effect on the result.
                interference = (survey['other_drew']==True) or (survey['parent_drew']==True)
            else:
                interference = False

            if interference==True:
                interferenceCount = interferenceCount+1
                if np.mod(interferenceCount,10)==0:
                    print('excluded {} kids for reported inference...'.format(interferenceCount))

        elif (this_collection == cdm_run_v3):
            ## cdm_run_v3 is sorted on 'time' and no survey exclusion is applied
            image_recs = this_collection.find({'$and': [{'sessionId':s}, {'dataType':'finalImage'}, {'category':{"$in": tracing}}]}).sort('time')
            interference = False

        if interference==False: ## if they made it past the practice trials & no reported interference
            for imrec in image_recs:
                category_dir = os.path.join(sketch_dir,imrec['category'])
                if not os.path.exists(category_dir):
                    os.makedirs(category_dir)
                ## NOTE(review): this name pattern differs from `outpath` below
                ## (the file that is actually written), so this check does not
                ## detect images rendered by this cell -- confirm which is intended.
                fname = os.path.join(category_dir,'{}_sketch_{}_{}.png'.format(imrec['category'], imrec['age'],imrec['sessionId']))

                ## if this image exists already, skip it
                if os.path.isfile(fname):
                    skipCount = skipCount + 1
                    if np.mod(skipCount,100)==0:
                        print('Weve skipped {} images...'.format(skipCount))
                        if (this_collection == cdm_run_v4):
                            timeSave.append(imrec['startTrialTime'])
                        elif (this_collection == cdm_run_v3):
                            timeSave.append(imrec['time'])
                else:
                    ## timing info was different in different collections, switch here
                    if (this_collection == cdm_run_v4):
                        stroke_recs = this_collection.find({'$and': [{'sessionId':s},
                                          {'dataType':'stroke'},
                                          {'category': imrec['category']}]}).sort('startTrialTime')
                    elif (this_collection == cdm_run_v3):
                        stroke_recs = this_collection.find({'$and': [{'sessionId':s},
                                          {'dataType':'stroke'},
                                          {'category': imrec['category']}]}).sort('time')

                    ## count once: every .count() call is a separate database query
                    n_strokes = stroke_recs.count()

                    # don't do adults for now or blank images
                    if n_strokes>0 and n_strokes<50 and imrec['age']!='adult':

                        ## Append session ID, trial Num, category, age
                        session_id.append(imrec['sessionId'])
                        trial_num.append(imrec['trialNum'])
                        category.append(imrec['category'])
                        age.append(imrec['age'])

                        ## again, regularize based on timing info change
                        if (this_collection == cdm_run_v4):
                            start_time.append(imrec['startTrialTime'])
                            submit_time.append(imrec['endTrialTime'])
                            trial_duration.append((imrec['endTrialTime'] - imrec['startTrialTime'])/1000.00)
                            readable_date = datetime.datetime.fromtimestamp(imrec['endTrialTime']/1000.0).strftime('%Y-%m-%d %H:%M:%S.%f')

                        elif (this_collection == cdm_run_v3):
                            start_time.append('NaN')
                            submit_time.append(imrec['time'])
                            trial_duration.append('NaN')
                            readable_date = datetime.datetime.fromtimestamp(imrec['time']/1000.0).strftime('%Y-%m-%d %H:%M:%S.%f')

                        ## readable date (not just time, has other info for sanity checks)
                        submit_date_readable.append(readable_date)
                        submit_date.append(imrec['date'])

                        ## Count number of strokes and timing information as well on stroke basis
                        num_strokes.append(n_strokes)
                        _svg = [] # this keeps track of the strokes from THIS final image
                        _svg_end_times = []
                        _svg_start_times = []
                        _svg_times = []

                        for strec in stroke_recs:
                            _svg.append(strec['svg'])
                            if (this_collection == cdm_run_v3):
                                _svg_times.append(strec['time'])
                            elif (this_collection == cdm_run_v4):
                                _svg_end_times.append(strec['endStrokeTime'])
                                _svg_start_times.append(strec['startStrokeTime'])

                        ## draw duration
                        ## new: first stroke start -> last stroke end (v4 only has both);
                        ## old: first stroke end   -> last stroke end
                        svg.append(_svg)
                        if (this_collection == cdm_run_v3):
                            draw_duration_new.append((_svg_times[-1] - _svg_times[0])/1000) ## in seconds
                            draw_duration_old.append((_svg_times[-1] - _svg_times[0])/1000) ## in seconds
                        elif (this_collection == cdm_run_v4):
                            draw_duration_new.append((_svg_end_times[-1] - _svg_start_times[0])/1000) ## in seconds
                            draw_duration_old.append((_svg_end_times[-1] - _svg_end_times[0])/1000) ## in seconds

                        ## get bounding box and mean pixel intensity
                        this_image = load_image_data(imrec['imgData'],imsize)
                        this_bounding_box = get_bounding_box(this_image)
                        this_intensity = get_mean_intensity(this_image,imsize)
                        #
                        bounding_box.append(this_bounding_box)
                        mean_intensity.append(this_intensity)

                        # render out images
                        outpath = os.path.join(category_dir,'{}_{}_{}.png'.format(imrec['age'],imrec['sessionId'],imrec['category']))
                        rendered_path = None  # stays None if rendering fails
                        try:

                            ## now get me some verts and codes!
                            Verts, Codes = rdh.get_verts_and_codes(_svg)

                            ## now render out your cumulative sketches and save out as pngs!
                            rdh.render_and_save(Verts,
                                            Codes,
                                            outpath,
                                            line_width=5,
                                            imsize=imsize,
                                            canvas_size=canvas_size)
                            rendered_path = outpath

                        except Exception as e:
                            print('Oops, something went wrong! Here is the error:')
                            print(e)

                        ## Append exactly one entry per trial, even on failure.
                        ## Appending only on success left `filename` shorter than
                        ## the other lists and shifted filenames onto the wrong rows.
                        filename.append(rendered_path)

Oops, something went wrong! Here is the error:
index 0 is out of bounds for axis 1 with size 0
Oops, something went wrong! Here is the error:
index 0 is out of bounds for axis 1 with size 0
Oops, something went wrong! Here is the error:
index 0 is out of bounds for axis 1 with size 0
Oops, something went wrong! Here is the error:
index 0 is out of bounds for axis 1 with size 0
Oops, something went wrong! Here is the error:
index 0 is out of bounds for axis 1 with size 0
Oops, something went wrong! Here is the error:
index 0 is out of bounds for axis 1 with size 0
Oops, something went wrong! Here is the error:
index 0 is out of bounds for axis 1 with size 0
excluded 10 kids for reported inference...
Oops, something went wrong! Here is the error:
index 0 is out of bounds for axis 1 with size 0
Oops, something went wrong! Here is the error:
index 0 is out of bounds for axis 1 with size 0
Oops, something went wrong! Here is the error:
index 0 is out of bounds for axis 1 with size 0
Oops, s

AutoReconnect: connection closed

In [9]:
saveTime = '111'
if really_run_this:
    # (column name, accumulated values) pairs, in the order they appear in the CSV.
    descriptive_columns = [
        ('session_id', session_id),
        ('trial_num', trial_num),
        ('category', category),
        ('age', age),
        ('submit_time', submit_time),
        ('submit_date', submit_date),
        ('num_strokes', num_strokes),
        ('draw_duration_old', draw_duration_old),
        ('draw_duration_new', draw_duration_new),
        ('trial_duration', trial_duration),
        ('mean_intensity', mean_intensity),
        ('bounding_box', bounding_box),
        ('filename', filename),
    ]
    # Each list becomes one row, then the frame is flipped so each list is a column.
    rows_by_variable = pd.DataFrame([values for _, values in descriptive_columns])
    X_out = rows_by_variable.transpose()
    X_out.columns = [name for name, _ in descriptive_columns]
    X_out.to_csv('MuseumStation_Tracing_Descriptives_v4_this_square.csv')

In [None]:
### Notes: An image is not written out again if it already exists.
# Every time I get through a certain set of the images, I record the time threshold
# (since images are written out in chronological order).

## Time stamps every 100 images.
# [1525899407923.0,
#  1525980096385.0,
#  1526061840113.0,
#  1526072921014.0,
#  1526152365279.0,
#  1526160570739.0,
#  1526239856392.0,
#  1526251698552.0,
#  1526254954475.0,
#  1526418747635.0,
#  1526495927262.0,
#  1526503859627.0,
#  1526581190700.0,
#  1526593428928.0,
#  1526666014298.0,
#  1526680293892.0,
#  1526760285554.0,
#  1526766056162.0,
#  1527018277044.0,
#  1527793231321.0,
#  1527881543709.0,
#  1527960926822.0,
#  1527970125532.0,
#  1527977827733.0,
#  1527982709894.0,
#  1528059661642.0,
#  1528064507404.0,
#  1528068114827.0,
#  1528140581078.0,
#  1528225691857.0,
#  1528238270169.0,
#  1528321410290.0,
#  1528396097251.0,
#  1528409134891.0,
#  1528482501089.0,
#  1528492105365.0,
#  1528501366255.0,
#  1528579092501.0,
#  1528584985113.0,
#  1528661341497.0,
#  1528741749248.0,
#  1528747687974.0,
#  1528761045959.0,
#  1528835768499.0,
#  1528841240592.0,
#  1528916818096.0,
#  1528923498102.0,
#  1529005606993.0,
#  1529015740138.0,
#  1529092461367.0,
#  1529102005265.0,
#  1529178354515.0,
#  1529183429730.0,
#  1529190138056.0,
#  1529272790915.0,
#  1529351265769.0,
#  1529360098520.0,
#  1529435563121.0,
#  1529443877705.0,
#  1529518216924.0,
#  1529529413026.0,
#  1529690851878.0,
#  1529705056613.0,
#  1530311006680.0]


# time_threshold = 1530650649659.0 ## first ~7000 images! (cdm_run_v3)
# time_threshold = 1533252327962.0 ## all images up until morning of Aug 3 (cdm_run_v3)