In [1]:
import pandas as pd
import numpy as np
from docopt import docopt
import re
import os


#reads in log file and subtracts the initial TRs/MRI startup time
def read_in_logfile(path):
    log_file=pd.read_csv(path, sep='\t', skiprows=3)

    time_to_subtract=int(log_file.Duration[log_file.Code=='MRI_start'])

    log_file.Time=log_file.Time-time_to_subtract #subtracts mri start times from all onset times

    return log_file


#Grabs the starts of blocks and returns rows for them
def get_blocks(log,vid_info):
    #identifies the video trial types (as opposed to button press events etc)
    mask = ["vid" in log['Code'][i] for i in range(0,log.shape[0])]

    #creates the dataframe with onset times and event types
    df = pd.DataFrame({'onset':log.loc[mask]['Time'],
                  'trial_type':log.loc[mask]['Event Type'],
                  'movie_name':log.loc[mask]['Code']})
    #adds trial type info
    df['trial_type']=df['movie_name'].apply(lambda x: "circle_block" if "cvid" in x else "EA_block")
    #add durations and convert them into the units used here
    df['duration']=df['movie_name'].apply(lambda x: int(vid_info[x]['duration'])*10000 if x in vid_info else "n/a")
    #adds names of stim_files, according to the vid_info spreadsheet
    df['stim_file']=df['movie_name'].apply(lambda x: vid_info[x]['stim_file'] if x in vid_info else "n/a")
    #adds an end column to the beginning of blocks (it's useful for processing but will remove later)
    df['end']=df['onset']+df['duration']
    return(df)

#grabs stimulus metadata
def format_vid_info(vid):
    vid.columns = [c.lower() for c in vid.columns]	
    vid = vid.rename(index={0:"stim_file", 1:"duration"}) #grabs the file name and the durations from the info file
    vid = vid.to_dict()
    return(vid)

#Reads in gold standard answers
def read_in_standard(timing_path):
    df = pd.read_csv(timing_path).astype(str)
    df.columns = [c.lower() for c in df.columns]	
    df_dict = df.drop([0,0]).reset_index(drop=True).to_dict(orient='list') #drops the video name
    return(df_dict)

#grabs gold standards as a series
def get_series_standard(gold_standard, block_name):
    return([float(x) for x in gold_standard[block_name] if x != 'nan'])

#grabs partcipant ratings
def get_ratings(log):

    rating_mask = ["rating" in log['Code'][i] for i in range(0,log.shape[0])]

    #gives the time and value of the partiicipant rating
    df = pd.DataFrame({'onset':log['Time'].loc[rating_mask].values, 'participant_value':log.loc[rating_mask]['Code'].values, 'event_type':'button_press', 'duration':0})


    #gets rating substring from participant numbers
    df['participant_value'] = df['participant_value'].str.strip().str[-1]

    return(df)


    #combines the block rows with the ratings rows and sorts them
def combine_dfs(blocks,ratings):
    combo=blocks.append(ratings).sort_values("onset").reset_index(drop=True)

    mask = pd.notnull(combo['trial_type'])

    combo['rating_duration']=combo['onset'].shift(-1)-combo['onset'].where(mask==False)

    onsets=pd.Series(combo.onset)
    
    combo['space_b4_prev']=onsets.diff(periods=1)

    block_start_locs=combo[mask].index.values

    
    #this ends up not assigning a value for the final button press - there must be a more elegant way to do all this
    for i in range(len(block_start_locs)):
        if block_start_locs[i] != 0:
            #maybe i should calculate these vars separately for clarity
            combo.rating_duration[block_start_locs[i-1]]=combo.end[block_start_locs[i-1]] - combo.onset[block_start_locs[i-1]]
            print(combo.rating_duration[block_start_locs[i-1]])


#adds rows that contain the 5 second at the beginning default value
    for i in block_start_locs:
            new_row={'onset':combo.onset[i],
            'rating_duration':combo.onset[i+1] - combo.onset[i],
            'event_type':'default_rating',
            'duration':0,
            'participant_value':5}
            combo=combo.append(new_row,ignore_index=True)
    combo=combo.sort_values(by=["onset","event_type"],na_position='first').reset_index(drop=True)
    #combo = combo[(combo['space_b4_prev'] >200)]
    combo=combo.drop(combo[(combo['space_b4_prev']<1000) & (combo['event_type']=='button_press') & (combo['event_type'].shift()=='default_rating')].index)
    combo=combo.sort_values(by=["onset","event_type"],na_position='first').reset_index(drop=True)

    return(combo)



#calculates pearsons r by comparing participant ratings w a gold standard
def block_scores(ratings_dict,combo):
    list_of_rows=[]
    summary_vals = {}
    mask = pd.notnull(combo['trial_type']) #selects the beginning of trials/trial headers #i feel like im recalculating that in lots of places, seems bad maybe
    block_start_locs=combo[mask].index.values #i could just append the end to that
    block_start_locs= np.append(block_start_locs, combo.tail(1).index.values, axis=None)

    for idx in range(1, len(block_start_locs)):
            #df['trial_type']=df['movie_name'].apply(lambda x: "circle_block" if "cvid" in x else "EA_block")

        block_start=combo.onset[block_start_locs[idx-1]]
        block_end=combo.end[block_start_locs[idx-1]]

        #selects the rows between the start and the end that contain button presses
        #should just change this to select the rows, idk why not lol

        block = combo.iloc[block_start_locs[idx-1]:block_start_locs[idx]][pd.notnull(combo.event_type)]#between is inclusive by default
        block_name=combo.movie_name.iloc[block_start_locs[idx-1]:block_start_locs[idx]][pd.notnull(combo.movie_name)].reset_index(drop=True).astype(str).get(0)

        ###############################################################################################
        gold=get_series_standard(ratings_dict,block_name)

        if "cvid" in block_name:
            interval = np.arange(combo.onset[block_start_locs[idx-1]], combo.end[block_start_locs[idx-1]],step=40000) #AAA oh no this only applies to the vid not the cvid (put a conditional here)
        else:
            interval = np.arange(combo.onset[block_start_locs[idx-1]], combo.end[block_start_locs[idx-1]],step=20000) #AAA oh no this only applies to the vid not the cvid (put a conditional here)



        #todo: remove print statements lol, turn them into logger things.

        if len(gold) < len(interval):
            interval=interval[:len(gold)]
            #TODO: convert this to logger stuff eventually
            print("warning:gold standard is shorter than the number of pt ratings, pt ratings truncated", block_name)


        if len(interval) < len(gold):
            gold=gold[:len(interval)]
            #TODO: convert this to logger stuff eventually
            print("warning:number of pt ratings is shorter than the number of gold std,gold std truncated", block_name)

        interval=np.append(interval, block_end) #this is to append for the remaining fraction of a second (so that the loop goes to the end i guess...)- maybe i dont need to do this

        two_s_avg=[]
        for x in range(len(interval)-1):
            start=interval[x]
            end=interval[x+1]
            #things that start within the time interval plus the one that starts during the time interval
            sub_block= block[block['onset'].between(start,end) | block['onset'].between(start,end).shift(-1)]
            block_length=end-start
            if len(sub_block) !=0:
                ratings=[]
                last_val=sub_block.participant_value.iloc[[-1]]
                for index, row in sub_block.iterrows():
                    #for rows that are in the thing
                    if (row.onset < start): #and (row.onset+row.duration)>start: #what's the best order to do these conditionals in?
                        #if (row.onset+row.duration)>start: # this is just to be safe i guess, gonna see what happens if i comment it out
                        numerator=(row.onset+row.rating_duration)-start
                    else:#if row.onset>=start and row.onset<end: #ooo should i do row.onset<end for everything??
                        if (row.onset+row.rating_duration) <= end:
                            numerator=row.rating_duration
                        elif (row.onset+row.rating_duration) > end:
                            numerator = end - row.onset
                        else:
                            numerator=9999999 #add error here
                    last_row=row.participant_value
                    #okay so i want to change this to actually create the beginnings of an important row in our df!
                    ratings.append({'start':start,'end':end,'row_time':row.rating_duration, 'row_start': row.onset, 'block_length':block_length,'rating':row.participant_value, 'time_held':numerator})#, 'start': start, 'end':end})
                    nums=[float(d['rating']) for d in ratings]
                    times=[float(d['time_held'])/block_length for d in ratings]
                    avg=np.sum(np.multiply(nums,times))
            else:
                avg=last_row

            #okay so i want to change this to actually create the beginnings of an important row in our df!
            two_s_avg.append(float(avg))
            #list_of_rows.append({'event_type':"two_sec_avg",'block_name':block_name, 'participant_value':float(avg),'onset':start,'duration':end-start, 'gold_std': gold[x]})
            list_of_rows.append({'event_type':"running_avg", 'participant_value':float(avg),'onset':start,'duration':end-start, 'gold_std': gold[x]})
            #removed block_name from above

        n_button_press=len(block[block.event_type=='button_press'].index)
        block_score=np.corrcoef(gold,two_s_avg)[1][0]
        key=str(block_name)
        summary_vals.update({key:{'n_button_press':int(n_button_press),'block_score':block_score,'onset':block_start,'duration':block_end-block_start}})
        #summary_vals.append(block_name:{'block_score':block_score,'block_name':block_name,'onset':block_start,'duration':block_end-block_start}) #i can probably not recalculate duration, just gotta remember how
    return(list_of_rows,summary_vals)


  from .tslib import iNaT, NaT, Timestamp, Timedelta, OutOfBoundsDatetime
  from pandas._libs import (hashtable as _hashtable,
  from pandas._libs import algos, lib
  from pandas._libs import hashing, tslib
  from pandas._libs import (lib, index as libindex, tslib as libts,
  import pandas._libs.tslibs.offsets as liboffsets
  from pandas._libs import algos as libalgos, ops as libops
  from pandas._libs.interval import (
  from pandas._libs import internals as libinternals
  import pandas._libs.sparse as splib
  import pandas._libs.window as _window
  from pandas._libs import (lib, reduction,
  from pandas._libs import algos as _algos, reshape as _reshape
  import pandas._libs.parsers as parsers
  from pandas._libs import algos, lib, writers as libwriters


In [2]:
log = read_in_logfile('/archive/data/SPINS/task/SPN01_CMH_0001_01_01/SPN01_CMH_0001-UCLAEmpAcc_part1.log')
vid_in = pd.read_csv('EA-vid-lengths.csv')
log

Unnamed: 0,Subject,Trial,Event Type,Code,Time,TTime,Uncertainty,Duration,Uncertainty.1,ReqTime,ReqDur,Stim Type,Pair Index
0,SPN01_CMH_0001,1,Picture,MRI_start,-1050148,0,1,1050314.0,2.0,0.0,next,hit,2.0
1,SPN01_CMH_0001,1,Response,101,4,1050152,2,,,,,,
2,SPN01_CMH_0001,4,Video,vid_4,131392,0,1,,,0.0,,other,0.0
3,SPN01_CMH_0001,5,Picture,scale,131559,0,1,335.0,2.0,0.0,100,other,0.0
4,SPN01_CMH_0001,5,Response,103,256809,125250,2,,,,,,
5,SPN01_CMH_0001,6,Picture,rating_equal_to6,257102,0,1,334.0,3.0,0.0,100,other,0.0
6,SPN01_CMH_0001,6,Response,102,306328,49226,2,,,,,,
7,SPN01_CMH_0001,7,Picture,rating_equal_to5,306416,0,2,335.0,3.0,0.0,100,other,0.0
8,SPN01_CMH_0001,7,Response,102,319003,12587,1,,,,,,
9,SPN01_CMH_0001,8,Picture,rating_equal_to4,319121,0,1,335.0,2.0,0.0,100,other,0.0


In [3]:
vid_info = format_vid_info(vid_in)
vid_info

{'cvid_1': {'duration': '40', 'stim_file': 'circles1'},
 'cvid_2': {'duration': '40', 'stim_file': 'circles2'},
 'cvid_4': {'duration': '40', 'stim_file': 'circles4'},
 'cvid_5': {'duration': '40', 'stim_file': 'circles5'},
 'cvid_7': {'duration': '40', 'stim_file': 'circles7'},
 'cvid_8': {'duration': '40', 'stim_file': 'circles8'},
 'ucla_emp_accuracy_9_clip': {'duration': '0', 'stim_file': 'data_point'},
 'vid_10': {'duration': '130', 'stim_file': 'DH_6_anger'},
 'vid_12': {'duration': '137', 'stim_file': 'AR_3_delighted'},
 'vid_13': {'duration': '179', 'stim_file': 'TA_6_anger'},
 'vid_2': {'duration': '146', 'stim_file': 'AR_4_sad'},
 'vid_3': {'duration': '147', 'stim_file': 'ME_5_amuse'},
 'vid_4': {'duration': '170', 'stim_file': 'NW_6_delighted'},
 'vid_5': {'duration': '111', 'stim_file': 'TA_2_amuse'},
 'vid_6': {'duration': '144', 'stim_file': 'CT_3_anger'},
 'vid_8': {'duration': '119', 'stim_file': 'HR_1_sad'}}

In [4]:
blocks = get_blocks(log, vid_info)
blocks

Unnamed: 0,movie_name,onset,trial_type,duration,stim_file,end
2,vid_4,131392,EA_block,1700000,NW_6_delighted,1831392
28,cvid_1,1888657,circle_block,400000,circles1,2288657
86,vid_2,2340343,EA_block,1460000,AR_4_sad,3800343
118,cvid_2,3852039,circle_block,400000,circles2,4252039
168,vid_5,4303522,EA_block,1110000,TA_2_amuse,5413522


In [5]:
ratings = get_ratings(log)
ratings

Unnamed: 0,duration,event_type,onset,participant_value
0,0,button_press,257102,6
1,0,button_press,306416,5
2,0,button_press,319121,4
3,0,button_press,338011,3
4,0,button_press,396520,4
5,0,button_press,422096,5
6,0,button_press,481775,6
7,0,button_press,546135,7
8,0,button_press,565025,8
9,0,button_press,593777,9


In [6]:
combo=combine_dfs(blocks,ratings)
combo

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


1700000.0
400000.0
1460000.0
400000.0


Unnamed: 0,duration,end,event_type,movie_name,onset,participant_value,stim_file,trial_type,space_b4_prev,rating_duration
0,1700000,1831392.0,,vid_4,131392.0,,NW_6_delighted,EA_block,,1700000.0
1,0,,default_rating,,131392.0,5,,,,125710.0
2,0,,button_press,,257102.0,6,,,125710.0,49314.0
3,0,,button_press,,306416.0,5,,,49314.0,12705.0
4,0,,button_press,,319121.0,4,,,12705.0,18890.0
5,0,,button_press,,338011.0,3,,,18890.0,58509.0
6,0,,button_press,,396520.0,4,,,58509.0,25576.0
7,0,,button_press,,422096.0,5,,,25576.0,59679.0
8,0,,button_press,,481775.0,6,,,59679.0,64360.0
9,0,,button_press,,546135.0,7,,,64360.0,18890.0


In [7]:
ratings_dict=read_in_standard('EA-timing.csv')
ratings_dict

{'cvid_1': ['5',
  '4',
  '7',
  '4',
  '5',
  '8',
  '9',
  '6',
  '9',
  '7',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan'],
 'cvid_2': ['5',
  '6',
  '3',
  '4',
  '1',
  '2',
  '4',
  '2',
  '1',
  '3',
  'nan',
  'nan',
  'nan',
  'nan',
  

In [8]:
two_s_chunks,scores= block_scores(ratings_dict,combo) #okay so i need to fix the naming here
scores





{'cvid_1': {'block_score': 0.8996346008743014,
  'duration': 400000.0,
  'n_button_press': 28,
  'onset': 1888657.0},
 'cvid_2': {'block_score': 0.5329381161525056,
  'duration': 400000.0,
  'n_button_press': 24,
  'onset': 3852039.0},
 'vid_2': {'block_score': 0.2270645046604821,
  'duration': 1460000.0,
  'n_button_press': 15,
  'onset': 2340343.0},
 'vid_4': {'block_score': 0.44142400122715514,
  'duration': 1700000.0,
  'n_button_press': 12,
  'onset': 131392.0},
 'vid_5': {'block_score': 0.21079467082786044,
  'duration': 1110000.0,
  'n_button_press': 11,
  'onset': 4303522.0}}

In [9]:

combo['block_score']=np.nan
combo['n_button_press']=np.nan
combo['space_b4_prev']=np.nan

combo = combo.append(two_s_chunks).sort_values("onset").reset_index(drop=True) #this needs to be fixed etc #need to sort according to name too...
combo

Unnamed: 0,block_score,duration,end,event_type,gold_std,movie_name,n_button_press,onset,participant_value,rating_duration,space_b4_prev,stim_file,trial_type
0,,1700000.0,1831392.0,,,vid_4,,131392.0,,1700000.0,,NW_6_delighted,EA_block
1,,20000.0,,running_avg,5.000,,,131392.0,5,,,,
2,,0.0,,default_rating,,,,131392.0,5,125710.0,,,
3,,20000.0,,running_avg,5.000,,,151392.0,5,,,,
4,,20000.0,,running_avg,5.000,,,171392.0,5,,,,
5,,20000.0,,running_avg,5.000,,,191392.0,5,,,,
6,,20000.0,,running_avg,5.000,,,211392.0,5,,,,
7,,20000.0,,running_avg,5.000,,,231392.0,5,,,,
8,,20000.0,,running_avg,5.000,,,251392.0,5.7145,,,,
9,,0.0,,button_press,,,,257102.0,6,49314.0,,,


In [10]:
test = combo.ix[pd.notnull(combo.stim_file)]
for index, row in test.iterrows(): #adds n button presses
    combo.block_score.ix[index]=scores[row['movie_name']]['block_score']
    combo.n_button_press.ix[index]=scores[row['movie_name']]['n_button_press']
    combo.event_type.ix[index]='block_summary'
combo

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing th

Unnamed: 0,block_score,duration,end,event_type,gold_std,movie_name,n_button_press,onset,participant_value,rating_duration,space_b4_prev,stim_file,trial_type
0,0.441424,1700000.0,1831392.0,block_summary,,vid_4,12.0,131392.0,,1700000.0,,NW_6_delighted,EA_block
1,,20000.0,,running_avg,5.000,,,131392.0,5,,,,
2,,0.0,,default_rating,,,,131392.0,5,125710.0,,,
3,,20000.0,,running_avg,5.000,,,151392.0,5,,,,
4,,20000.0,,running_avg,5.000,,,171392.0,5,,,,
5,,20000.0,,running_avg,5.000,,,191392.0,5,,,,
6,,20000.0,,running_avg,5.000,,,211392.0,5,,,,
7,,20000.0,,running_avg,5.000,,,231392.0,5,,,,
8,,20000.0,,running_avg,5.000,,,251392.0,5.7145,,,,
9,,0.0,,button_press,,,,257102.0,6,49314.0,,,


In [11]:

cols=['onset', 'duration','trial_type','event_type','participant_value','gold_std','block_score','n_button_press', 'stim_file']
combo=combo[cols]

combo['onset']=combo.onset/10000.0
combo.duration=combo.duration/10000.0
combo = combo.sort_values(by=['onset', 'event_type']).reset_index(drop=True) #by sorting it makes the fill down accurate instead of mis-labeling (should possibly do this in a better way in future)
combo.stim_file=combo.stim_file.ffill(axis=0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [13]:
pd.options.display.max_rows = 400

combo

Unnamed: 0,onset,duration,trial_type,event_type,participant_value,gold_std,block_score,n_button_press,stim_file
0,13.1392,170.0,EA_block,block_summary,,,0.441424,12.0,NW_6_delighted
1,13.1392,0.0,,default_rating,5.0,,,,NW_6_delighted
2,13.1392,2.0,,running_avg,5.0,5.0,,,NW_6_delighted
3,15.1392,2.0,,running_avg,5.0,5.0,,,NW_6_delighted
4,17.1392,2.0,,running_avg,5.0,5.0,,,NW_6_delighted
5,19.1392,2.0,,running_avg,5.0,5.0,,,NW_6_delighted
6,21.1392,2.0,,running_avg,5.0,5.0,,,NW_6_delighted
7,23.1392,2.0,,running_avg,5.0,5.0,,,NW_6_delighted
8,25.1392,2.0,,running_avg,5.7145,5.0,,,NW_6_delighted
9,25.7102,0.0,,button_press,6.0,,,,NW_6_delighted


In [None]:
    log_head, log_tail =os.path.split(log_file)

    find=re.compile('RESOURCES\/(SPN01[^\/]*)')
    m = find.findall(log_head)
    find2=re.compile('(part\d).log')
    n = find2.findall(log_tail)
    if m and n:	
        part=n[0]	
        sub_id=m[0]	
    else:	
        part="NULL"	
        sub_id="NULL"	

    file_name='/projects/gherman/ea_parser/out2/{}/{}_EAtask_{}.tsv'.format(sub_id, sub_id,part)

    if not os.path.exists(os.path.dirname(file_name)):
        os.makedirs(os.path.dirname(file_name))


    combo.to_csv(file_name, sep='\t', na_rep='n/a', index=False)

    #writes stuff to csv
#    hs = open("/projects/gherman/ea_parser/out/generated_list.csv","a")
#    hs.write("{},{},{}_parsed.tsv\n".format(log_head,log_tail,file_name))
#    hs.close()



    EA_mask = combo.ix[combo.trial_type=="EA_block"]

    #score_file=open("/projects/gherman/ea_parser/out/compiled_scores.csv","a+")
    #for index, row in EA_mask.iterrows():
    #    score_file.write("\n{},{},{},{}".format(sub_id,EA_mask.stim_file.ix[index],EA_mask.block_score.ix[index], log_file))
    #score_file.close()
    #Do i also want to write a csv that says where each thing was generated from? probably.