In [1]:
import pandas as pd
import jsonpickle
import numpy as np
import matplotlib.pyplot as plt
import os, os.path
import dateutil.parser
from datetime import datetime
import statistics

# Constants

In [2]:
# Current working directory of the kernel process; both default paths below
# are resolved relative to it.
SELF_PATH = os.getcwd()

# Input (pickled tag state) and output (CSV export) both sit one directory
# above the working directory.
DEFAULT_PATH_TO_JSON, DEFAULT_PATH_TO_OUTPUT_CSV = (
    os.path.join(SELF_PATH, relative_name)
    for relative_name in ('../tag_states.json', '../tagging_data.csv')
)

# Depickle the tag-state JSON

In [3]:
# Stays None if the file cannot be opened or decoded.
depickled = None

# NOTE(review): jsonpickle.decode rebuilds Python objects described by the
# file's contents, so only point this at a tag-state file from a trusted source.
with open(DEFAULT_PATH_TO_JSON, mode='r') as tag_state_file:
    serialized_tag_state = tag_state_file.read()
    depickled = jsonpickle.decode(serialized_tag_state)

# Quick information on the state of tagging

In [4]:
# Images whose tagging has finished.
total_finished = len(depickled.finished_tagged_queue)

# Pending images that already have at least one tagger recorded.
total_incomplete_tagged = sum(
    1
    for pending_image in depickled.pending_images_queue
    if len(pending_image.get_taggers()) > 0
)

total_skipped = len(depickled.max_skipped_queue)
# presumably the images currently handed out to taggers — verify against the tagging app
total_currently_getting_tagged = len(depickled.current_image)

# How many images have circulated.
total_images_handled = (
    total_incomplete_tagged
    + total_finished
    + total_skipped
    + total_currently_getting_tagged
)

# NOTE(review): despite its name this is the *partially tagged* share
# (partial / handled), not the finished share — confirm that is intended.
# A state in which nothing has circulated yet yields NaN instead of raising
# ZeroDivisionError.
tagged_ratio = (
    total_incomplete_tagged / total_images_handled
    if total_images_handled
    else float('nan')
)

# One-row summary table; ending the cell on the frame renders it.
df_basic_tag_data = pd.DataFrame([{
        'partial': total_incomplete_tagged,
        'done': total_finished,
        'skipped': total_skipped,
        'current': total_currently_getting_tagged,
        'tagged_ratio': tagged_ratio
    }])

df_basic_tag_data

Unnamed: 0,partial,done,skipped,current,tagged_ratio
0,113,250,0,9,0.303763


# Generate CSV

In [5]:
# Empty frame that will hold one row per finished image.
df_of_image_tags = pd.DataFrame(data=None, dtype=int)

# Only images from the finished queue are exported.
finished_images = depickled.finished_tagged_queue

For each finished image, collect its list of taggers, its timing data, and the tags themselves.

In [6]:
def _values_of(per_tagger_stats):
    """Return the values of a per-tagger mapping as a list, in iteration order."""
    return [per_tagger_stats[tagger] for tagger in per_tagger_stats]


def _image_to_row(image):
    """Build the flat row dict that describes one finished image.

    The row holds the image file name, the list of taggers, every entry of
    ``image.final_tags`` and summary statistics of the per-tagger timing data.

    Raises:
        ValueError: if the image has no start/stop/assigned/session times at
            all (``min``/``max`` of an empty list) — a finished image without
            timing data is treated as corrupt state rather than papered over.
    """
    # This is the row we will populate and insert into the DF.
    row = {
        'image_id': image.get_rel_path().split("/")[-1],
        'list_of_taggers': list(image.get_taggers()),
    }

    # First put in the tag data.
    row.update(image.final_tags)

    # Per-tagger timings, from which min, max, mean, median etc. are derived.
    starting_times = _values_of(image.stats_tagging_start)
    ending_times = _values_of(image.stats_tagging_stop)
    assigned_times = _values_of(image.stats_tag_elapsed_assigned)
    session_times = _values_of(image.stats_tag_elapsed_session)

    earliest_start = min(starting_times)
    latest_stop = max(ending_times)

    # The sample standard deviation needs at least two data points; with a
    # single session it is undefined, so record NaN instead of raising.
    if len(session_times) > 1:
        session_stdev = statistics.stdev(session_times)
    else:
        session_stdev = float('nan')

    # Put the time data in.
    row.update({
        'time_start': earliest_start,
        'time_end': latest_stop,
        'time_assigned': min(assigned_times),
        'time_elapsed': latest_stop - earliest_start,
        'session_avg_time': statistics.mean(session_times),
        'session_max_time': max(session_times),
        'session_min_time': min(session_times),
        'session_median': statistics.median(session_times),
        'session_stdev': session_stdev,
    })
    return row


# Collect every row first and build the frame once: concatenating inside the
# loop re-copies the whole frame on each iteration (quadratic in image count).
image_rows = [_image_to_row(image) for image in finished_images]

# Columns are ordered alphabetically, matching what concat(..., sort=True) produced.
df_of_image_tags = pd.DataFrame(image_rows).sort_index(axis=1)

## Clean DF up

In [7]:
# Booleans become 0/1 so the CSV holds plain integers. Reassigning (instead of
# inplace=True) keeps the cell a simple "new value from old value" step.
df_of_image_tags = df_of_image_tags.replace([False, True], [0, 1])

# Tag columns that must always be present and integer-valued in the export.
columns_to_fill_na_wih_zero = [
        'washover',
        'impact',
        'development',
        'ocean',
        'terrain_inland','terrain_marsh','terrain_river','terrain_sandy_coastline','terrain_undefined'
    ]

# A tag that never occurred in any image has no column at all, and selecting it
# below would raise KeyError. Create such columns as all-zero so the CSV schema
# stays stable, then restore the alphabetical column order.
missing_tag_columns = [
    name for name in columns_to_fill_na_wih_zero
    if name not in df_of_image_tags.columns
]
for column in missing_tag_columns:
    df_of_image_tags[column] = 0
if missing_tag_columns:
    df_of_image_tags = df_of_image_tags.sort_index(axis=1)

# For these columns, replace NaN with 0's
df_of_image_tags[columns_to_fill_na_wih_zero] = (
    df_of_image_tags[columns_to_fill_na_wih_zero].fillna(0.0).astype(int)
)

In [8]:
# Preview the first five rows of the table before it is written out.
df_of_image_tags.head(n=5)

Unnamed: 0,development,image_id,impact,list_of_taggers,ocean,session_avg_time,session_max_time,session_median,session_min_time,session_stdev,terrain_inland,terrain_marsh,terrain_river,terrain_sandy_coastline,terrain_undefined,time_assigned,time_elapsed,time_end,time_start,washover
0,0,S26047238.jpg,0,"[google-oauth2|112971301179841927329, google-o...",0,12.3475,16.509,12.3475,8.186,5.88525,0,0,0,0,1,20.314602,75139.6879,1573503000.0,1573428000.0,0
1,1,P26057146.jpg,2,"[google-oauth2|112971301179841927329, google-o...",0,37.6255,47.805,37.6255,27.446,14.395987,0,0,0,1,0,31.980647,75222.597478,1573503000.0,1573428000.0,0
2,1,P26054308.jpg,2,"[google-oauth2|112971301179841927329, google-o...",0,27.146,39.721,27.146,14.571,17.783736,0,0,0,1,0,17.932065,75285.077463,1573503000.0,1573428000.0,0
3,1,P26050957.jpg,0,"[google-oauth2|112971301179841927329, google-o...",0,12.9825,20.097,12.9825,5.868,10.061422,0,0,0,0,1,11.001374,75319.602875,1573504000.0,1573428000.0,0
4,0,P26057962.jpg,3,"[google-oauth2|112971301179841927329, google-o...",0,32.914,60.004,32.914,5.824,38.311045,0,0,0,1,0,10.357999,75393.615183,1573504000.0,1573428000.0,1


In [9]:
# Write the table to the default CSV location; with to_csv's defaults the row
# index is written as the first, unnamed column.
df_of_image_tags.to_csv(path_or_buf=DEFAULT_PATH_TO_OUTPUT_CSV)
print("Done")

Done
