In [1]:
from IPython.display import HTML

In [2]:
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<a href="javascript:code_toggle()">TOGGLE CODE ON/OFF</a>.''')

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#List-of-questions" data-toc-modified-id="List-of-questions-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>List of questions</a></span></li><li><span><a href="#answer-to-question-1" data-toc-modified-id="answer-to-question-1-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>answer to question 1</a></span></li><li><span><a href="#answer-to-questions-2---5" data-toc-modified-id="answer-to-questions-2---5-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>answer to questions 2 - 5</a></span></li><li><span><a href="#answer-to-questions-6-7" data-toc-modified-id="answer-to-questions-6-7-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>answer to questions 6-7</a></span></li></ul></div>

Import and set constants

In [3]:
import os.path
import pprint
import json
import hashlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

In [4]:
timestamps = [
    '2019_06_20_12_54',  
    '2019_06_21_13_08',  
    '2019_06_24_12_38',  
    '2019_06_25_13_24',  
    '2019_07_03_15_03',  
    '2019_07_09_11_02',  
    '2019_07_10_17_40',  
    '2019_07_12_11_11',
    '2019_06_20_13_27',  
    '2019_06_21_13_34',  
    '2019_06_24_13_06',  
    '2019_06_25_14_06',  
    '2019_07_03_16_32',  
    '2019_07_10_12_18',  
    '2019_07_10_17_42',  
    '2019_07_17_17_17',
    '2019_06_20_13_45',  
    '2019_06_21_14_25',  
    '2019_06_24_13_31',  
    '2019_06_27_11_33',  
    '2019_07_08_17_13',  
    '2019_07_10_17_19', 
    '2019_07_11_11_21',
]
data_folder = '/home/adrian/SingleCP_DotsReversal/raw/'
folder_list = [data_folder + t + '/' for t in timestamps]
metadata_file = '/home/adrian/Documents/MATLAB/projects/Analysis_SingleCP_DotsReversal/data/subj_metadata.json'

# List of questions
1. do I have as many timestamps in the metadata file and in my directory tree?
2. for all FIRA files, are the column names the same?
3. for all dots files, are the column names the same?
4. for all FIRA files, are the column types the same?
5. same for dots files
6. for each FIRA file, is the taskID column consistent with the metadata?
7. for each FIRA file, is the trialIndex column consistent with the metadata?

In [5]:
def md5(fname):
    """
    function taken from here
    https://stackoverflow.com/a/3431838
    :param fname: filename
    :return: string of hexadecimal number
    """
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def get_files_and_hashes(show=False, hash_map=False):
    """
    builds and returns a list of dicts with fields 'FIRA', 'dots' and 'session'. The values are as follows:
    FIRA: list of pairs of the form (<path to .csv file>, <MD5 checksum for this file>)
    dots: same as for FIRA, but for dots data
    session: single string representing the timestamp of the session, in the format YYYY_MM_DD_HH_mm
    
    for the values corresponding to the FIRA and dots keys, the absence of any file is encoded as an empty list
    
    :param show: (bool) whether to print the resulting list or not
    :return: (list) described above
    """
    file_names = []
    if hash_map:
        hashes = {}
    for timestamp, folder_name in zip(timestamps, folder_list):

        # check that the standard FIRA exists
        filename = folder_name + timestamp + '_FIRA.csv'
        custom = folder_name + timestamp + 'customFIRA.csv'
        to_append = []
        if os.path.exists(filename):
            hash_val = md5(filename)
            to_append.append((filename, hash_val))
            if hash_map:
                hashes[filename] = hash_val
        if os.path.exists(custom):
            has_val = md5(custom)
            to_append.append((custom, hash_val))
            if hash_map:
                hashes[custom] = hash_val
                
        files = {'session': timestamp, 'FIRA': to_append}
        dots = folder_name + timestamp + '_dotsPositions.csv'
        if os.path.exists(dots):
            hash_val = md5(dots)
            string = [(dots, hash_val)] 
            if hash_map:
                hashes[dots] = hash_val
        else:
            string = []
        files['dots'] = string

        file_names.append(files)
    if show:
        pprint.pprint(file_names)
    if hash_map:
        return file_names, hashes
    else:
        return file_names

In [6]:
files_data, latest_hashes = get_files_and_hashes(show=False, hash_map=True)
# pprint.pprint(latest_hashes)

In [7]:
# hard code the hashes in case files change
ref_hashes = {
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_20_12_54/2019_06_20_12_54_FIRA.csv': '046ca06830aeebb62194e3c8d2e97046',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_20_12_54/2019_06_20_12_54_dotsPositions.csv': 'b3aff823355bb4cda726a0857fa1ba74',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_20_13_27/2019_06_20_13_27_FIRA.csv': '8b945181914a9f1c6a4784c8899d72ed',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_20_13_27/2019_06_20_13_27_dotsPositions.csv': '92c7f5484127202f8bc088e8e025cc9e',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_20_13_45/2019_06_20_13_45customFIRA.csv': 'fd77b1add52105f0ceba419f72b515b4',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_21_13_08/2019_06_21_13_08_FIRA.csv': '41485cf9922b03cd5176824887a2c99a',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_21_13_08/2019_06_21_13_08_dotsPositions.csv': '3073e147156bdbbffa593c190de19b91',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_21_13_34/2019_06_21_13_34customFIRA.csv': '92c7f5484127202f8bc088e8e025cc9e',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_21_14_25/2019_06_21_14_25customFIRA.csv': 'fd77b1add52105f0ceba419f72b515b4',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_24_12_38/2019_06_24_12_38_FIRA.csv': '20102a1b8e7d68305455bacccd6fc5cb',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_24_12_38/2019_06_24_12_38_dotsPositions.csv': '7299c4dc38bb2e519d797b4483a83c1f',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_24_12_38/2019_06_24_12_38customFIRA.csv': '20102a1b8e7d68305455bacccd6fc5cb',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_24_13_06/2019_06_24_13_06_FIRA.csv': 'c8423313d1e47ba799be602d3642e4eb',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_24_13_06/2019_06_24_13_06_dotsPositions.csv': '1be8454c740020101e4fcfc57d720b14',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_24_13_31/2019_06_24_13_31_FIRA.csv': '78daa9667716d994f0d75f7fb87e93b4',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_24_13_31/2019_06_24_13_31_dotsPositions.csv': '36ecb7509c192e14a68ec9476aca94ac',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_25_13_24/2019_06_25_13_24_FIRA.csv': '92d73437503dbc26a61e15c93b963773',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_25_13_24/2019_06_25_13_24_dotsPositions.csv': 'f1d59e1830260881749823abc670d469',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_25_14_06/2019_06_25_14_06customFIRA.csv': '1be8454c740020101e4fcfc57d720b14',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_06_27_11_33/2019_06_27_11_33customFIRA.csv': '36ecb7509c192e14a68ec9476aca94ac',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_07_03_15_03/2019_07_03_15_03_FIRA.csv': '7dc6c4e1c1dcef8e3bbacdb80e926435',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_07_03_15_03/2019_07_03_15_03_dotsPositions.csv': 'ddfd162fe111d466fff74d4b53657a46',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_07_03_16_32/2019_07_03_16_32_FIRA.csv': '4077575d11460a76a140f3f0ee4a3153',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_07_08_17_13/2019_07_08_17_13customFIRA.csv': '36ecb7509c192e14a68ec9476aca94ac',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_07_09_11_02/2019_07_09_11_02customFIRA.csv': 'ddfd162fe111d466fff74d4b53657a46',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_07_10_12_18/2019_07_10_12_18_FIRA.csv': '0e689f4bf92aa2c4f67eb74ee56f6046',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_07_10_12_18/2019_07_10_12_18_dotsPositions.csv': '235f0964a5d2fd088aeea66c4df97fd4',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_07_10_17_19/2019_07_10_17_19_FIRA.csv': 'b799cb9ceea49d9d7cb8616493228183',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_07_10_17_19/2019_07_10_17_19_dotsPositions.csv': '97b178d6c061d4347582f1981a420292',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_07_10_17_40/2019_07_10_17_40_FIRA.csv': 'e43b0bc86c6e02f957afbe335989c0e5',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_07_10_17_42/2019_07_10_17_42customFIRA.csv': '235f0964a5d2fd088aeea66c4df97fd4',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_07_11_11_21/2019_07_11_11_21customFIRA.csv': '97b178d6c061d4347582f1981a420292',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_07_12_11_11/2019_07_12_11_11_FIRA.csv': 'e7dd32800d1c34722fcd049f0426005d',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_07_12_11_11/2019_07_12_11_11_dotsPositions.csv': '20e2225a5eebca406372854a1beb91b7',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_07_17_17_17/2019_07_17_17_17_FIRA.csv': 'f6252353958133c4fd37aba7bc0d6570',
 '/home/adrian/SingleCP_DotsReversal/raw/2019_07_17_17_17/2019_07_17_17_17_dotsPositions.csv': 'fd77b1add52105f0ceba419f72b515b4'
}

# answer to question 1
do I have as many timestamps in the metadata file and in my directory tree?

--> YES

In [8]:
# number of timestamps in notebook variable
num_timestamps_nb = len(timestamps)
print('total timestamps in notebook', num_timestamps_nb)

total timestamps in notebook 23


In [9]:
# get checksum of metadata file ... 
meta_chksum = md5(metadata_file)
# print(meta_chksum)

In [10]:
# and hard code the first one in case file has changed
meta_chksum_ref = '24e31da81bd43f2e2cd51df0ef111689'
assert meta_chksum == meta_chksum_ref

In [11]:
# number of timestamps in metadafile
with open(metadata_file, 'r') as f:
    meta_data = json.load(f)
# recall: meta_data is a dict. Its keys are hash codes for subjects.
# its values are themselves dicts, with keys session names and values dicts with session info.
# So, to access the session info corresponding to the first session of the the first subject, do:
# meta_data[<subj code>]['session1']

In [12]:
counter = 0
for v in meta_data.values():
    for vv in v.values():
#         print(vv['sessionTag'])
        counter += 1
print('total timestamps in metadata file', counter)

total timestamps in metadata file 23


In [13]:
print('total number of directories in data folder')

total number of directories in data folder


In [14]:
!command ls /home/adrian/SingleCP_DotsReversal/raw/ | wc -l

23


# answer to questions 2 - 5
for all FIRA and DOTS files, are the column names and data types (by column) the same?

--> YES

In [15]:
def check_homogeneity():
    """
    checks that all FIRA files have same column names and data types (columnwise)
    same for DOTS files
    """
    counter_f = 0
    for file_dict in files_data:
        for k in ['FIRA','dots']:
            cols = []  # will be a list of dicts with key-value pairs <column name>: <dtype>
            list_of_files = file_dict[k]
            for filename, hhsh in list_of_files:          
                assert ref_hashes[filename] == hhsh
                table = pd.read_csv(filename)

                # build dict with key-value pairs <column name>: <dtype>
                types_dict = {}
                for col in table.columns:
                    types_dict[col] = table[col].dtype
                cols.append(types_dict)

            counter = 0
            for new in cols:
                if counter == 0:
                    last = new
                    counter += 1
                assert set(new.keys()) == set(last.keys())
                assert set(new.values()) == set(last.values())
    print('TEST PASSED')

In [16]:
check_homogeneity()

TEST PASSED


# answer to questions 6-7
for each FIRA file, are the `taskID` and `trialIndex` columns consistent with the metadata?

In [17]:
# pprint.pprint(meta_data)

In [18]:
def get_keys(timestamp):
    """
    for a give timestamp (i.e. a session), returns the subject code and session name required to index 
    the meta_data dict appropriately to find back the session. 
    
    Example:
        >>> c, s = get_keys('2019_06_21_11_52')
        >>> session_dict = meta_data[c][s]
    """
    for subj_code, sessions_dict in meta_data.items():
        for session_name, session_metadata in sessions_dict.items():
            if session_metadata['sessionTag'] == timestamp:
                return subj_code, session_name

In [19]:
# c, s = get_keys('2019_06_20_12_54')
# print(c, s)
# pprint.pprint(meta_data[c][s])

In [20]:
# the following dict should match the row order of DefaultBlockSequence.csv
mapping_task_type_id_name = {
    1: 'Tut1',
    2: 'Quest',
    3: 'Tut2',
    4: 'Block2',
    5: 'Tut3',
    6: 'Block3',
    7: 'Block4',
    8: 'Block5',
    9: 'Block6',
    10: 'Block7',
    11: 'Block8',
    12: 'Block9',
    13: 'Block10',
    14: 'Block11'
}
counter = 0
for file_dict in files_data:
    time_stamp = file_dict['session']
    c, s = get_keys(time_stamp)
    for filename, hhsh in file_dict['FIRA']:
        assert ref_hashes[filename] == hhsh
        
        table = pd.read_csv(filename)

        # check the taskID
        # taskID should start at 1 and increment by one for each task (block) performed in a single session
        task_ids = table['taskID'].unique()
        task_ids.sort()
        pprint.pprint((filename, 
                       list(task_ids), 
                       [mapping_task_type_id_name[x] for x in task_ids],
                       meta_data[c][s]))
        num_blocks = len(task_ids)
#         try:
#             assert (task_ids == np.arange(1, num_blocks)).sum() == num_blocks
#         except AssertionError:
#             print(filename, task_ids)

('/home/adrian/SingleCP_DotsReversal/raw/2019_06_20_12_54/2019_06_20_12_54_FIRA.csv',
 [2, 4],
 ['Quest', 'Block2'],
 {'Quest': {'QuestFit': [18.5, 2, 0.5, 0.001],
            'aborted': 0,
            'completed': 1,
            'numTrials': 80,
            'reward': 2},
  'Tut1': {'aborted': 0, 'completed': 1, 'numTrials': 20, 'reward': 0},
  'Tut2': {'aborted': 0, 'completed': 1, 'numTrials': 10, 'reward': 0},
  'sessionTag': '2019_06_20_12_54',
  'trialFolder': 'Blocks003'})
('/home/adrian/SingleCP_DotsReversal/raw/2019_06_21_13_08/2019_06_21_13_08_FIRA.csv',
 [2, 4],
 ['Quest', 'Block2'],
 {'Quest': {'QuestFit': [34.5, 2, 0.5, 0.001],
            'aborted': 0,
            'completed': 1,
            'numTrials': 80,
            'reward': 2},
  'Tut1': {'aborted': 0, 'completed': 1, 'numTrials': 20, 'reward': 0},
  'Tut2': {'aborted': 1, 'completed': 0, 'numTrials': 0, 'reward': 0},
  'sessionTag': '2019_06_21_13_08',
  'trialFolder': 'Blocks003'})
('/home/adrian/SingleCP_DotsRever

('/home/adrian/SingleCP_DotsReversal/raw/2019_07_17_17_17/2019_07_17_17_17_FIRA.csv',
 [2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14],
 ['Quest',
  'Block2',
  'Block3',
  'Block4',
  'Block5',
  'Block6',
  'Block7',
  'Block8',
  'Block9',
  'Block10',
  'Block11'],
 {'Block10': {'aborted': 0, 'completed': 1, 'numTrials': 204, 'reward': 4},
  'Block11': {'aborted': 0, 'completed': 1, 'numTrials': 204, 'reward': 4},
  'Block2': {'aborted': 0, 'completed': 1, 'numTrials': 204, 'reward': 2},
  'Block3': {'aborted': 1, 'completed': 0, 'numTrials': 0, 'reward': 0},
  'Block4': {'aborted': 1, 'completed': 0, 'numTrials': 0, 'reward': 0},
  'Block5': {'aborted': 1, 'completed': 0, 'numTrials': 0, 'reward': 0},
  'Block6': {'aborted': 1, 'completed': 0, 'numTrials': 0, 'reward': 0},
  'Block7': {'aborted': 1, 'completed': 0, 'numTrials': 0, 'reward': 0},
  'Block8': {'aborted': 0, 'completed': 1, 'numTrials': 204, 'reward': 4},
  'Block9': {'aborted': 0, 'completed': 1, 'numTrials': 204, 'reward': 4