In [None]:
from crim_intervals import *
import pandas as pd
import ast
import matplotlib
from itertools import tee, combinations
import numpy as np

In [140]:
# THIS IS DEV COPY for use with CLOSE/EXACT Matches

import numpy as np

# Converts lists to tuples

def lists_to_tuples_a(el):
    if isinstance(el, list):
        return tuple(el)
    else:
        return el

# Filters for the length of the Presentation Type in the Classifier

def limit_offset_size(array, limit):
    under_limit = np.cumsum(array) <= limit
    return array[: sum(under_limit)]

# Gets the the list of offset differences for each group 

def get_offset_difference_list_a(group):
    # if we do sort values as part of the func call, then we don't need this first line
    group = group.sort_values("start_offset")
    group["next_offset"] = group.start_offset.shift(-1)
    offset_difference_list = (group.next_offset - group.start_offset).dropna().tolist()
    return offset_difference_list

# The classifications are done here

def classify_offsets_a(offset_difference_list):
    """
    Put logic for classifying an offset list here
    """
    #
    offset_difference_list = limit_offset_size(offset_difference_list, 80)
    
    alt_list = offset_difference_list[::2]
    
    if len(set(offset_difference_list)) == 1 and len(offset_difference_list) > 1:
        return ("PEN", offset_difference_list)
    # elif (len(offset_difference_list) %2 != 0) and (len(set(alt_list)) == 1):
    elif (len(offset_difference_list) %2 != 0) and (len(set(alt_list)) == 1) and (len(offset_difference_list) >= 3):
        return ("ID", offset_difference_list)
    elif len(offset_difference_list) >= 1:
        return ("Fuga", offset_difference_list)
    else: 
        return ("Singleton", offset_difference_list)
    
# adds predicted type, offsets and entry numbers to the results

def predict_type_a(group):
    offset_differences = get_offset_difference_list_a(group)
    predicted_type, offsets = classify_offsets_a(offset_differences)

    group["predicted_type"] = [predicted_type for i in range(len(group))]
    group["offset_diffs"] = [offsets for i in range(len(group))]
    group["entry_number"] = [i + 1 for i in range(len(group))]

    return group

In [None]:
df = pd.read_csv(('ave_test_set.csv'), index_col=0)
df = df.drop(columns=["ema", "ema_url", "end_measure", "end_beat"])
df.head()


In [99]:
# lists for views.  These are used when calling the df below
simple_view = ["piece_title", "part", "pattern_generating_match", "pattern_matched", "start_offset"]
offset_details = ["start_measure", "start_beat", "end_offset", "note_durations", "prev_entry_off", "next_entry_off"]
drop_cols = ["pattern_matched", "part", "pattern_generating_match", "piece_title", "start_measure", "start_beat", "end_offset", "note_durations", "prev_entry_off", "next_entry_off"]
ready_classify = ["pattern_generating_match", "pattern_matched", "piece_title", "part", "start_measure", "start_beat", "start_offset", "sum_durs", "sub_group_id"]

In [130]:
# now process the original match data df by:
# sorting by start offset, then group by pattern generating match and applying the function above for prev entry
# also give each group a number
# sort by group number and start offset
# so that we can find PARALLEL (=0) Forward Gaps (the distance to NEXT entry), and Backward Gaps (distance to PREVIOUS Entry)
# All Proximate Matches are part of same sub Group (and go to classifier)
# Parallels are part of Sub Group but NOT part of Classified.  We will filter them OUT before classification
# Forward ONLY means the NEXT entry is a GAP.  No problem for Forward ONLY, since these are part of the previous set.
# Backward ONLY means the LAST entry was a GAP.  So these are NEW subgroups, since a new Presentation Type begins
# Forward AND Backward Gaps are SINGLETONS:  We will filter then OUT before Classification
# If an entry is the LAST in a Group and also has a BACKWARD gap it is also a SINGLETON

df2 = df.sort_values("start_offset")
df2["group_number"] = df2.groupby('pattern_generating_match').ngroup()
df2 = df2.sort_values(['group_number', 'start_offset'])
df2["prev_entry_off"] = df2["start_offset"].shift(1)
df2["next_entry_off"] = df2["start_offset"].shift(-1)
y = df2.drop_duplicates(subset=["pattern_matched"], keep='first').index
df2["is_first"] = df2.index.isin(y)
z = df2.drop_duplicates(subset=["pattern_matched"], keep='last').index
df2["is_last"] = df2.index.isin(z)
df2["last_off_diff"] = df2["start_offset"] - df2["prev_entry_off"]
df2["next_off_diff"] = df2["next_entry_off"] - df2["start_offset"]
df2["parallel"] = df2["last_off_diff"] == 0
df2["forward_gapped"] = df2["next_off_diff"] >= 80
df2["back_gapped"] = df2["last_off_diff"] >= 80
df2["singleton"] = ((df2['forward_gapped'] == True) & (df2['back_gapped'] == True) | (df2['back_gapped'] == True) & (df2["is_last"]))
df2["split_group"] = (df2['forward_gapped'] == False) & (df2['back_gapped'] == True)


#now mask out Parallels and Singletons
df2 = df2[df2["parallel"] != True]
df2 = df2[df2["singleton"] != True]
df2["next_off_diff"] = df2["next_off_diff"].abs()
df2["last_off_diff"] = df2["last_off_diff"].abs()
df2.head()



Unnamed: 0,pattern_generating_match,pattern_matched,piece_title,part,start_measure,start_beat,start_offset,end_offset,note_durations,sum_durs,...,next_entry_off,is_first,is_last,last_off_diff,next_off_diff,parallel,forward_gapped,back_gapped,singleton,split_group
18,"(-2, -2, -2, -2, -2)","[-2, -2, -2, -2, -2]",Ave Maria,[Superius],19,1.0,144.0,156.0,"[3.0, 1.0, 2.0, 2.0, 4.0, 2.0]",14.0,...,158.0,True,False,,14.0,False,False,False,False,False
19,"(-2, -2, -2, -2, -2)","[-2, -2, -2, -2, -2]",Ave Maria,[Superius],20,4.0,158.0,172.0,"[3.0, 1.0, 4.0, 2.0, 4.0, 4.0]",18.0,...,216.0,False,False,14.0,58.0,False,False,False,False,False
21,"(-2, -2, -2, -2, -2)","[-2, -2, -2, -2, -2]",Ave Maria,Tenor,28,1.0,216.0,225.0,"[3.0, 1.0, 1.0, 1.0, 3.0, 1.0]",10.0,...,672.0,False,False,58.0,456.0,False,True,False,False,False
73,"(-2, -2, -2, -2, -3)","[-2, -2, -2, -2, -3]",Ave Maria,Tenor,42,1.0,328.0,348.0,"[6.0, 2.0, 4.0, 4.0, 4.0, 2.0]",22.0,...,328.0,True,False,344.0,0.0,False,False,False,False,False
177,"(-2, -2, -2, -2, 1)","[-2, -2, -2, -2, 1]",Ave Maria,Altus,95,1.0,756.0,776.0,"[6.0, 2.0, 2.0, 2.0, 8.0, 4.0]",24.0,...,804.0,True,False,132.0,48.0,False,False,False,False,False


In [131]:

df2["combined_group"] = (df2.split_group | df2.is_first)
df2.loc[(df2["combined_group"]), "sub_group_id"] = range(df2.combined_group.sum())
df2["sub_group_id"] = df2["sub_group_id"].ffill()
df2.head()


Unnamed: 0,pattern_generating_match,pattern_matched,piece_title,part,start_measure,start_beat,start_offset,end_offset,note_durations,sum_durs,...,is_last,last_off_diff,next_off_diff,parallel,forward_gapped,back_gapped,singleton,split_group,combined_group,sub_group_id
18,"(-2, -2, -2, -2, -2)","[-2, -2, -2, -2, -2]",Ave Maria,[Superius],19,1.0,144.0,156.0,"[3.0, 1.0, 2.0, 2.0, 4.0, 2.0]",14.0,...,False,,14.0,False,False,False,False,False,True,0.0
19,"(-2, -2, -2, -2, -2)","[-2, -2, -2, -2, -2]",Ave Maria,[Superius],20,4.0,158.0,172.0,"[3.0, 1.0, 4.0, 2.0, 4.0, 4.0]",18.0,...,False,14.0,58.0,False,False,False,False,False,False,0.0
21,"(-2, -2, -2, -2, -2)","[-2, -2, -2, -2, -2]",Ave Maria,Tenor,28,1.0,216.0,225.0,"[3.0, 1.0, 1.0, 1.0, 3.0, 1.0]",10.0,...,False,58.0,456.0,False,True,False,False,False,False,0.0
73,"(-2, -2, -2, -2, -3)","[-2, -2, -2, -2, -3]",Ave Maria,Tenor,42,1.0,328.0,348.0,"[6.0, 2.0, 4.0, 4.0, 4.0, 2.0]",22.0,...,False,344.0,0.0,False,False,False,False,False,True,1.0
177,"(-2, -2, -2, -2, 1)","[-2, -2, -2, -2, 1]",Ave Maria,Altus,95,1.0,756.0,776.0,"[6.0, 2.0, 2.0, 2.0, 8.0, 4.0]",24.0,...,False,132.0,48.0,False,False,False,False,False,True,2.0


In [132]:
df3 = df2[ready_classify]
df3.head()

Unnamed: 0,pattern_generating_match,pattern_matched,piece_title,part,start_measure,start_beat,start_offset,sum_durs,sub_group_id
18,"(-2, -2, -2, -2, -2)","[-2, -2, -2, -2, -2]",Ave Maria,[Superius],19,1.0,144.0,14.0,0.0
19,"(-2, -2, -2, -2, -2)","[-2, -2, -2, -2, -2]",Ave Maria,[Superius],20,4.0,158.0,18.0,0.0
21,"(-2, -2, -2, -2, -2)","[-2, -2, -2, -2, -2]",Ave Maria,Tenor,28,1.0,216.0,10.0,0.0
73,"(-2, -2, -2, -2, -3)","[-2, -2, -2, -2, -3]",Ave Maria,Tenor,42,1.0,328.0,22.0,1.0
177,"(-2, -2, -2, -2, 1)","[-2, -2, -2, -2, 1]",Ave Maria,Altus,95,1.0,756.0,24.0,2.0


In [141]:
classified2 = df3.applymap(lists_to_tuples_a).groupby("sub_group_id").apply(predict_type_a)
classified2.drop(classified2[classified2['predicted_type'] == "Singleton"].index, inplace = True)

classified2 = classified2[classified2["predicted_type"] == "ID"]

classified2.head(50)
# classified2.to_csv('test4')


Unnamed: 0,pattern_generating_match,pattern_matched,piece_title,part,start_measure,start_beat,start_offset,sum_durs,sub_group_id,predicted_type,offset_diffs,entry_number
129,"(-2, 2, 2, -3, -2)","[-2, 2, 2, -3, -2]",Ave Maria,[Superius],55,1.0,432.0,23.0,14.0,ID,"[4.0, 36.0, 4.0]",1
130,"(-2, 2, 2, -3, -2)","[-2, 2, 2, -3, -2]",Ave Maria,Altus,55,3.0,436.0,23.0,14.0,ID,"[4.0, 36.0, 4.0]",2
131,"(-2, 2, 2, -3, -2)","[-2, 2, 2, -3, -2]",Ave Maria,Tenor,60,1.0,472.0,23.0,14.0,ID,"[4.0, 36.0, 4.0]",3
132,"(-2, 2, 2, -3, -2)","[-2, 2, 2, -3, -2]",Ave Maria,Bassus,60,3.0,476.0,23.0,14.0,ID,"[4.0, 36.0, 4.0]",4
144,"(-3, 2, -3, 2, -2)","[-3, 2, -3, 2, -2]",Ave Maria,[Superius],65,1.0,512.0,20.0,16.0,ID,"[16.0, 8.0, 16.0]",1
145,"(-3, 2, -3, 2, -2)","[-3, 2, -3, 2, -2]",Ave Maria,Altus,67,1.0,528.0,20.0,16.0,ID,"[16.0, 8.0, 16.0]",2
146,"(-3, 2, -3, 2, -2)","[-3, 2, -3, 2, -2]",Ave Maria,Tenor,68,1.0,536.0,20.0,16.0,ID,"[16.0, 8.0, 16.0]",3
147,"(-3, 2, -3, 2, -2)","[-3, 2, -3, 2, -2]",Ave Maria,Bassus,70,1.0,552.0,20.0,16.0,ID,"[16.0, 8.0, 16.0]",4
164,"(1, 1, 2, 1, 2)","[1, 1, 2, 1, 2]",Ave Maria,[Superius],94,1.0,744.0,32.0,21.0,ID,"[8.0, 40.0, 8.0]",1
166,"(1, 1, 2, 1, 2)","[1, 1, 2, 1, 2]",Ave Maria,Tenor,94,3.0,752.0,32.0,21.0,ID,"[8.0, 40.0, 8.0]",2


In [None]:
classified = df.applymap(lists_to_tuples_a).sort_values("start_offset").groupby("sub_group_id").apply(predict_type_a)
classified["group_number"] = classified.groupby('pattern_generating_match').ngroup()
classified = classified[["group_number", "pattern_generating_match", "pattern_matched", "part", "start_measure", "start_beat", "entry_number", "start_offset", "sum_durs", "offset_diffs", "predicted_type"]]

# remove singleton entries

classified.drop(classified[classified['predicted_type'] == "Singleton"].index, inplace = True)

# now classified results, in order by OFFSET and ENTRY NUMBER, but the group numbers can overlap with each other.
classified.head(25)

classified = classified.sort_values(["group_number", "entry_number"])
classified= classified[["group_number", "entry_number", "pattern_matched", "part", "start_measure", "start_beat", "start_offset", "sum_durs", "offset_diffs", "predicted_type"]]
classified["offset_list_length"] = classified["offset_diffs"].apply(len) + 1
classified.head(50)


In [None]:
for name, group in df3.groupby("group_number"):
    df3.loc[df3.split_group, "subgroup_id"] = range(1, df3.split_group.sum() + 1)
df3.head(20)

In [None]:
df2.loc[df2.split_group, "new_subgroup_id"] = range(2, df2.split_group.sum() + 2)

df2["new_subgroup_id"] = df2["new_subgroup_id"].ffill().fillna(1)
df2.head(5)

In [None]:
# Now we need to interate over each group in order to split the groups at wherever "split_group" is TRUE
# # initalize sub-group counter
sub_counter = 0

# the split-group column tells us if we need to start a new sub-group for that set of matches
# if TRUE, then we need to add a new sub_group_number
# if FALSE, then sub_group is the same
for name, group in df2.groupby("group_number"):
    if df2["split_group"] is False:
        df2["sub_group_id"] = sub_counter   
    else:
        sub_counter = sub_counter + 1
        df2["sub_group_id"] = sub_counter

df2.head(10)



In [None]:
classified = classified[classified.entry_number <= classified.offset_list_length]
classified

In [None]:


classify = df.sort_values("start_offset").groupby("pattern_matched")
for pattern_matched, group in classify: 
    classify["prev_ent_offset"] = classify["start_offset"]
    print(prev_ent_offset) 
    # print(group) 

In [None]:
# now run the classifer, but on DF from which the Parallel Entries have been removed!

classified2 = df3.applymap(lists_to_tuples_a).sort_values("start_offset").groupby("pattern_matched").apply(predict_type_a)
classified2["group_number"] = classified2.groupby('pattern_generating_match').ngroup()
classified2 = classified2[["group_number", "pattern_generating_match", "pattern_matched", "part", "start_measure", "start_beat", "entry_number", "start_offset", "prev_entry_off","sum_durs", "offset_diffs", "predicted_type"]]

# remove singleton entries

# classified2.drop(classified[classified['predicted_type'] == "Singleton"].index, inplace = True)

# now classified results, in order by OFFSET and ENTRY NUMBER, but the group numbers can overlap with each other.

classified2 = classified2.sort_values(["group_number", "entry_number"])
classified2= classified2[["group_number", "entry_number", "pattern_matched", "part", "start_measure", "start_beat", "start_offset", "prev_entry_off", "sum_durs", "offset_diffs", "predicted_type"]]
# classified2 = classified2[classified2["predicted_type"] == "PEN"]

classified2.head()