In [None]:
#Goes subject by subject for a given list length. For each recalled item, calculates the difference between its serial position
#and the serial position of the next recalled item (aka lag), counts all lags that were possible from that item, and calculates
#the conditional response probability for each lag by dividing the actual count by the possible count. Returns a dataframe
#containing lags, possible counts, actual counts, and conditional response probability.

#Mask: qualifies items that you want to analyze based on the transitions that are possible from this item. For example, if your mask 
#contains 1, it means that you only want to analyze items where a +1 transition from this item is possible (aka serialpos + 1 has not 
#already been recalled, and serialpos + 1 is within the range specified by the list length). Most typically, a [-1, 1] mask is used,
#which is equivalent to analyzing only items that come after a first order error.
def lagCRP_curve(dataframe, list_length, how = '', when = ''):
    """Build a per-subject lag conditional-response-probability table.

    For every viable recalled item (not an intrusion, not a repetition), the
    function counts which lags were still possible from that item and, when
    the next event is itself a viable recall, which lag was actually taken.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Event table. The columns ``list_length``, ``type``, ``subject``,
        ``intrusion`` and ``serialpos`` are read; ``how`` / ``when`` are
        read only when the matching argument is non-empty.
    list_length : int
        Only events whose ``list_length`` equals this value are analysed.
    how : str, optional
        If non-empty, keep only rows whose ``how`` column equals it.
        ``'bwd'`` measures lag as ``current - next``; every other value,
        including the default ``''``, uses ``next - current``.
    when : str, optional
        If non-empty, keep only rows whose ``when`` column equals it.

    Returns
    -------
    pandas.DataFrame
        One row per subject and lag, indexed by lag, with columns
        ``x_values``, ``actual_count``, ``possible_count``, ``probability``,
        ``subject``, ``how`` and ``when``. Only lags -3..3 that exist for
        ``list_length`` are returned, grouped by lag in ascending order.
        The frame is empty when no event matches the filters.
    """
    # Presentation ('WORD') events are dropped up front; only recall-phase rows matter.
    events = dataframe[(dataframe['list_length'] == list_length) & (dataframe['type'] != 'WORD')]
    if how != '':
        events = events[events['how'] == how]
    if when != '':
        events = events[events['when'] == when]

    columns = ['x_values', 'actual_count', 'possible_count', 'probability', 'subject', 'how', 'when']
    lags = list(range(1 - list_length, list_length))  # every transition that could be made
    serialvals = range(1, list_length + 1)
    direction = -1 if how == 'bwd' else 1
    boundary_types = ('START_RECALL', 'END_RECALL')

    df_list = []
    for subject in events['subject'].unique():
        subject_df = events[events['subject'] == subject]
        # Pull the columns out once; per-cell .iloc access inside the loop is very slow.
        types = subject_df['type'].tolist()
        intrusions = subject_df['intrusion'].tolist()
        positions = [_valid_serialpos(value, list_length) for value in subject_df['serialpos'].tolist()]
        n_events = len(types)

        possible = dict.fromkeys(lags, 0)
        actual = dict.fromkeys(lags, 0)
        recalled = set()  # serial positions already recalled in the current list

        for j, (event_type, intrusion, current) in enumerate(zip(types, intrusions, positions)):
            if event_type in boundary_types:
                recalled = set()  # a new recall period starts: forget earlier recalls
                continue
            # Skip intrusions, rows without a usable serial position, and repetitions.
            if intrusion != 0.0 or current is None or current in recalled:
                continue
            recalled.add(current)

            # Every not-yet-recalled serial position is a transition that was available.
            for candidate in serialvals:
                if candidate not in recalled:
                    possible[direction * (candidate - current)] += 1

            # The actual transition only counts when the next event is a viable recall.
            k = j + 1
            if k >= n_events or types[k] in boundary_types:
                continue
            following = positions[k]
            if intrusions[k] != 0.0 or following is None or following in recalled:
                continue
            actual[direction * (following - current)] += 1

        subject_result = pd.DataFrame(
            {
                'x_values': lags,
                'actual_count': [actual[lag] for lag in lags],
                'possible_count': [possible[lag] for lag in lags],
            },
            index=lags,
        )
        # Lags that were never possible yield NaN (0/0) rather than raising.
        subject_result['probability'] = subject_result['actual_count'] / subject_result['possible_count']
        subject_result['subject'] = subject
        subject_result['how'] = how
        subject_result['when'] = when
        df_list.append(subject_result)

    if not df_list:
        # pd.concat raises on an empty list; hand back an empty, correctly shaped frame.
        return pd.DataFrame(columns=columns)

    final_return = pd.concat(df_list)
    # For clarity only lags -3 through 3 are reported; short lists keep the lags they have.
    shown_lags = [lag for lag in range(-3, 4) if -list_length < lag < list_length]
    return final_return.loc[shown_lags]


def _valid_serialpos(value, list_length):
    """Return ``value`` as an int when it is a whole number in 1..list_length, else None."""
    try:
        position = int(value)
    except (TypeError, ValueError, OverflowError):  # None, NaN, inf, non-numeric text
        return None
    if position != value or not 1 <= position <= list_length:
        return None
    return position