In [1]:
import json
from difflib import SequenceMatcher
import itertools

In [2]:
def create_tokenizers(token_dict_1, token_dict_2):
    node_type_token_dict = {
        node_type: chr(i)
        for i, node_type in enumerate(
            set(
                token_dict_1.keys()
                | token_dict_2.keys()
            )
        )
    }

    token_node_type_dict = {token: node_type for node_type, token in node_type_token_dict.items()}
    return node_type_token_dict, token_node_type_dict

In [3]:
def tokenize_seqs(node_type_token_dict, seqs):
    def _tokenize(path) -> str:
        return "".join(list(map(lambda x: node_type_token_dict[x], path)))

    tokenized_seqs = [_tokenize(seq) for seq in seqs]
    return tokenized_seqs

In [4]:
def detokenize_seqs(token_node_type_dict, seqs):
    def _detokenize(tokenized_seq):
        return [token_node_type_dict[token] for token in tokenized_seq]

    detokenized_seqs = [_detokenize(seq) for seq in seqs]
    return detokenized_seqs

In [5]:
def recursive_submatching(seqs, seq_len):
    submatches = set(seqs)
    for i, j in itertools.combinations(seqs, 2):
        matcher = SequenceMatcher(None, i, j)
        matching_blocks = list(matcher.get_matching_blocks())
        for matching_block in matching_blocks:
            a, b, size = matching_block.a, matching_block.b, matching_block.size
            if matching_block.size > 3:
                match_1 = i[a : a + size]
                submatches.add(match_1)
                if i in submatches:
                    submatches.remove(i)
                if j in submatches:
                    submatches.remove(j)
    print(len(submatches))
    if seq_len - len(submatches) > 0:
        return recursive_submatching(set(submatches), len(submatches))
    return seqs

In [6]:
def get_unq_sequences(seqs):
    total_sequences = 0
    unique_sequences = set()
    for i, (_, sequences) in enumerate(seqs['results'].items()):
        total_sequences += len(sequences)
        unique_sequences |= set(sequences)

    unique_sequences_stripped = set()
    for seq in unique_sequences:
        new_seq = seq.strip(seqs['node_type_token_dict']['input'])
        new_seq = new_seq.strip(seqs['node_type_token_dict']['output'])
        if len(new_seq) > 2:
            unique_sequences_stripped.add(new_seq)

    unique_sequences_decoded = []
    for sequence in unique_sequences_stripped:
        decoded_sequence = [seqs['token_node_type_dict'][i] for i in sequence]
        unique_sequences_decoded.append(decoded_sequence)

    print(f"Total Sequences: {total_sequences:,}, Total Unique Sequences: {len(unique_sequences_stripped):,}, Total Paths Compared: {seqs['total_path_pairs_analyzed']:,}")
    return unique_sequences_stripped, unique_sequences_decoded, total_sequences 

# PyTorch

In [7]:
with open('./onnx_parsing_results/torch_mismatch_seq_match_results.json', 'r') as f:
    mismatched_seq = json.load(f)

with open('./onnx_parsing_results/torch_correct_mismatch_seq_match_results.json', 'r') as f:
    mismatched_correct = json.load(f)

with open('./onnx_parsing_results/torch_test_mismatch_seq_match_results.json', 'r') as f:
    mismatched_test = json.load(f)

In [8]:
print("Mismatched Ops:", len(mismatched_seq['node_type_token_dict'].keys()))
print("Mismatched-Correct Ops:", len(mismatched_correct['node_type_token_dict'].keys()))
print("Mismatched-Test Ops:", len(mismatched_test['node_type_token_dict'].keys()))


Mismatched Ops: 58
Mismatched-Correct Ops: 59
Mismatched-Test Ops: 62


In [9]:
mismatched_seq['node_type_token_dict'].keys() - mismatched_correct['node_type_token_dict'].keys()

set()

In [10]:
mismatched_test['node_type_token_dict'].keys() - mismatched_correct['node_type_token_dict'].keys()


{'DepthToSpace', 'Flatten', 'Unsqueeze'}

## Mismatched Sequences

In [11]:
(
    unique_mismatched_sequences,
    unique_mismatched_sequences_decoded,
    total_mismatched_sequences,
) = get_unq_sequences(mismatched_seq)

Total Sequences: 70,301, Total Unique Sequences: 980, Total Paths Compared: 213,258,432


In [12]:
len(recursive_submatching(unique_mismatched_sequences, len(unique_mismatched_sequences)))

654
640
640


640

## Correct-Mismatched Sequences

In [13]:
(
    unique_corr_mismatched_sequences,
    unique_corr_mismatched_sequences_decoded,
    total_corr_mismatched_sequences,
) = get_unq_sequences(mismatched_correct)

Total Sequences: 923,707, Total Unique Sequences: 4,243, Total Paths Compared: 2,358,066,840


In [14]:
len(recursive_submatching(unique_corr_mismatched_sequences, len(unique_corr_mismatched_sequences)))

3105
2991
2989
2989


2989

### Create New Tokenizers and Calculate Non-Overlapping

In [15]:
node_type_token_dict, token_node_type_dict = create_tokenizers(
    mismatched_correct["node_type_token_dict"], mismatched_seq["node_type_token_dict"]
)

In [16]:
mismatched_sequences = set(tokenize_seqs(node_type_token_dict, unique_mismatched_sequences_decoded))
mismatched_correct_sequences = set(tokenize_seqs(node_type_token_dict, unique_corr_mismatched_sequences_decoded))

#### Nonoverlapping

In [17]:
seqs = mismatched_sequences - mismatched_correct_sequences
len(seqs)

176

In [18]:
detokenize_seqs(token_node_type_dict, seqs)

[['Shape', 'Slice', 'Concat', 'Reshape', 'Expand', 'Mul'],
 ['Pad', 'AveragePool', 'ArgMin', 'Less', 'Cast', 'And'],
 ['Resize', 'Abs', 'ReduceMax'],
 ['ReduceMin', 'Cast', 'Where'],
 ['MatMul', 'MatMul', 'Where'],
 ['Mul', 'Softmax', 'MatMul'],
 ['Cos', 'Mul', 'Mul', 'Max'],
 ['Tan', 'Sub', 'Div'],
 ['Ceil', 'Sin', 'Max'],
 ['Clip', 'Clip', 'Mul'],
 ['Ceil', 'Cast', 'ArgMax'],
 ['Atan', 'Shape', 'Slice', 'Concat', 'Resize', 'Round'],
 ['ReduceSum', 'Cast', 'ArgMin'],
 ['Concat', 'ReduceMax', 'Reshape'],
 ['Cast', 'Abs', 'Less'],
 ['PRelu', 'Concat', 'Shape', 'Slice', 'Concat', 'Resize', 'ReduceSum'],
 ['Where', 'Resize', 'Concat'],
 ['ReduceSum', 'Resize', 'Concat'],
 ['Cast', 'Pad', 'Mul'],
 ['Mul', 'Mul', 'Clip', 'Cast'],
 ['Cast', 'LeakyRelu', 'Tan'],
 ['Sub', 'Sin', 'Div'],
 ['Cast', 'Floor', 'Ceil'],
 ['Reshape', 'Cast', 'MatMul'],
 ['Greater', 'Where', 'Slice'],
 ['Shape', 'Slice', 'Concat', 'Reshape', 'Cast', 'MatMul'],
 ['Mul', 'Mul', 'Concat', 'Transpose'],
 ['Mul', 'Mul', 'C

In [19]:
reduced_sequences = recursive_submatching(seqs, len(seqs))

146
131
131


In [20]:
detokenize_seqs(token_node_type_dict, reduced_sequences)

[['Resize', 'Abs', 'ReduceMax'],
 ['ReduceMin', 'Cast', 'Where'],
 ['MatMul', 'MatMul', 'Where'],
 ['Mul', 'Softmax', 'MatMul'],
 ['Cos', 'Mul', 'Mul', 'Max'],
 ['Tan', 'Sub', 'Div'],
 ['Ceil', 'Sin', 'Max'],
 ['Clip', 'Clip', 'Mul'],
 ['Ceil', 'Cast', 'ArgMax'],
 ['ReduceSum', 'Cast', 'ArgMin'],
 ['Concat', 'ReduceMax', 'Reshape'],
 ['Cast', 'Abs', 'Less'],
 ['Where', 'Resize', 'Concat'],
 ['ReduceSum', 'Resize', 'Concat'],
 ['Cast', 'Pad', 'Mul'],
 ['Mul', 'Mul', 'Clip', 'Cast'],
 ['Cast', 'LeakyRelu', 'Tan'],
 ['Sub', 'Sin', 'Div'],
 ['Cast', 'Floor', 'Ceil'],
 ['Reshape', 'Cast', 'MatMul'],
 ['Greater', 'Where', 'Slice'],
 ['Div', 'Abs', 'Atan', 'Div'],
 ['Conv', 'Squeeze', 'Greater'],
 ['MatMul', 'Cast', 'Mul', 'Mul'],
 ['Floor', 'Cast', 'Mul', 'Neg'],
 ['ReduceSum', 'MatMul', 'Max'],
 ['MatMul', 'Mul', 'ReduceMax'],
 ['Concat', 'Max', 'Mul', 'Mul'],
 ['Trilu', 'LeakyRelu', 'Sub'],
 ['ReduceSum', 'Resize', 'Div'],
 ['Add', 'Div', 'Greater'],
 ['Cast', 'Mul', 'Trilu'],
 ['Atan', 'S

In [21]:
filtered = list(
    filter(
        lambda x: (len(x) > 2),
        detokenize_seqs(token_node_type_dict, reduced_sequences),
    )
)
len(filtered)

131

In [22]:
filtered_ops = set()
for filt in filtered:
    filtered_ops = filtered_ops.union(filt)
filtered_ops

{'Abs',
 'Add',
 'ArgMax',
 'ArgMin',
 'Atan',
 'BatchNormalization',
 'Cast',
 'Ceil',
 'Clip',
 'Concat',
 'Conv',
 'Cos',
 'Div',
 'Equal',
 'Expand',
 'Floor',
 'Gather',
 'Greater',
 'If',
 'LeakyRelu',
 'Less',
 'MatMul',
 'Max',
 'MaxPool',
 'Min',
 'Mul',
 'Neg',
 'Or',
 'PRelu',
 'Pad',
 'ReduceMax',
 'ReduceMean',
 'ReduceMin',
 'ReduceSum',
 'Relu',
 'Reshape',
 'Resize',
 'Round',
 'Shape',
 'Sigmoid',
 'Sin',
 'Slice',
 'Softmax',
 'Squeeze',
 'Sub',
 'Tan',
 'Transpose',
 'Trilu',
 'Where',
 'Xor'}

### Find Unique Sequence Distribution by Pairs

In [23]:
tokenized_filtered = tokenize_seqs(mismatched_seq['node_type_token_dict'], filtered)

In [24]:
sequence_counts_by_pair = {}
for i, (pair, sequences) in enumerate(mismatched_seq['results'].items()):
    a, b = pair.split(",")
    unq_seq = set(sequences)
    unq_seq_stripped = set()
    for seq in unq_seq:
        new_seq = seq.strip(mismatched_seq['node_type_token_dict']['input'])
        new_seq = new_seq.strip(mismatched_seq['node_type_token_dict']['output'])
        if len(new_seq) > 2:
            unq_seq_stripped.add(new_seq)
    
    for filt_seq in tokenized_filtered:
        if filt_seq in unq_seq_stripped:
            if a not in sequence_counts_by_pair:
                sequence_counts_by_pair[a] = set([filt_seq])
            else:
                sequence_counts_by_pair[a] |= set([filt_seq])
            if b not in sequence_counts_by_pair:
                sequence_counts_by_pair[b] = set([filt_seq])
            else:
                sequence_counts_by_pair[b] |= set([filt_seq])
        else:
            if a not in sequence_counts_by_pair:
                sequence_counts_by_pair[a] = set()
            if b not in sequence_counts_by_pair:
                sequence_counts_by_pair[b] = set()
filtered_sorted_pairs = dict(sorted({key: len(val) for key, val in sequence_counts_by_pair.items()}.items(), key=lambda x: x[1], reverse=True))
filtered_sorted_pairs

{'0': 10,
 '34': 10,
 '14': 9,
 '87': 8,
 '92': 8,
 '80': 7,
 '97': 7,
 '4': 6,
 '25': 6,
 '27': 6,
 '29': 6,
 '36': 6,
 '65': 6,
 '70': 6,
 '88': 6,
 '91': 6,
 '9': 5,
 '33': 5,
 '35': 5,
 '41': 5,
 '51': 5,
 '55': 5,
 '62': 5,
 '78': 5,
 '81': 5,
 '84': 5,
 '86': 5,
 '90': 5,
 '93': 5,
 '1': 4,
 '10': 4,
 '11': 4,
 '17': 4,
 '31': 4,
 '47': 4,
 '49': 4,
 '50': 4,
 '67': 4,
 '68': 4,
 '73': 4,
 '75': 4,
 '89': 4,
 '99': 4,
 '7': 3,
 '8': 3,
 '12': 3,
 '20': 3,
 '22': 3,
 '23': 3,
 '28': 3,
 '46': 3,
 '52': 3,
 '53': 3,
 '57': 3,
 '59': 3,
 '72': 3,
 '76': 3,
 '79': 3,
 '82': 3,
 '95': 3,
 '3': 2,
 '5': 2,
 '16': 2,
 '21': 2,
 '26': 2,
 '30': 2,
 '37': 2,
 '38': 2,
 '39': 2,
 '40': 2,
 '42': 2,
 '43': 2,
 '44': 2,
 '48': 2,
 '54': 2,
 '56': 2,
 '60': 2,
 '61': 2,
 '63': 2,
 '71': 2,
 '74': 2,
 '77': 2,
 '96': 2,
 '2': 1,
 '13': 1,
 '15': 1,
 '19': 1,
 '24': 1,
 '58': 1,
 '64': 1,
 '66': 1,
 '69': 1,
 '83': 1,
 '85': 1,
 '94': 1,
 '98': 1,
 '6': 0,
 '18': 0,
 '32': 0,
 '45': 0}

In [25]:
len(filtered_sorted_pairs)

100

## Mismatched-Test Sequences

In [26]:
(
    unique_test_mismatched_sequences,
    unique_test_mismatched_sequences_decoded,
    total_test_mismatched_sequences,
) = get_unq_sequences(mismatched_test)

Total Sequences: 1,116, Total Unique Sequences: 2, Total Paths Compared: 166,053,120


In [27]:
len(recursive_submatching(unique_test_mismatched_sequences, len(unique_test_mismatched_sequences)))

2


2

### Create New Tokenizers and Calculate Non-Overlapping

In [28]:
node_type_token_dict, token_node_type_dict = create_tokenizers(
    mismatched_test["node_type_token_dict"], mismatched_seq["node_type_token_dict"]
)

In [29]:
mismatched_sequences = set(tokenize_seqs(node_type_token_dict, unique_mismatched_sequences_decoded))
mismatched_test_sequences = set(tokenize_seqs(node_type_token_dict, unique_test_mismatched_sequences_decoded))

#### Nonoverlapping

In [30]:
seqs = mismatched_sequences - mismatched_test_sequences
len(seqs)

979

In [31]:
reduced_sequences = recursive_submatching(seqs, len(seqs))

659
636
636


In [32]:
detokenize_seqs(token_node_type_dict, reduced_sequences)

[['Max', 'Concat', 'Div'],
 ['Where', 'Concat', 'Reshape'],
 ['Min', 'ArgMax', 'Cast'],
 ['Mul', 'Concat', 'Resize'],
 ['MatMul', 'Add', 'Div'],
 ['Cast', 'Max', 'Cast'],
 ['Ceil', 'Concat', 'Softmax'],
 ['Squeeze', 'Abs', 'Concat'],
 ['Atan', 'Mul', 'Mul'],
 ['MatMul', 'Mul', 'Concat'],
 ['Cast', 'Concat', 'Cast', 'Div'],
 ['Mul', 'Cast', 'Concat'],
 ['Atan', 'Resize', 'Round'],
 ['Cast', 'Add', 'Resize'],
 ['Ceil', 'Relu', 'Concat'],
 ['Mul', 'Concat', 'Concat'],
 ['Concat', 'Pad', 'AveragePool'],
 ['ArgMax', 'Greater', 'Where', 'Slice'],
 ['Softmax', 'MatMul', 'Add'],
 ['Cast', 'Mul', 'Trilu'],
 ['Concat', 'Cast', 'Sub'],
 ['ReduceMean', 'Trilu', 'Trilu'],
 ['Mul', 'Mul', 'Where'],
 ['Mul', 'Concat', 'Transpose'],
 ['Softmax', 'Reshape', 'Concat'],
 ['Cast', 'MatMul', 'Div'],
 ['Resize', 'Abs', 'ReduceMax'],
 ['Concat', 'Resize', 'Concat'],
 ['Mul', 'Mul', 'Max'],
 ['Resize', 'Mul', 'Mul'],
 ['MatMul', 'Add', 'Add'],
 ['Relu', 'Mul', 'Slice'],
 ['Ceil', 'Cast', 'ArgMax'],
 ['Mul', '

In [33]:
list(
    filter(
        lambda x: (len(x) > 2) and ("input" not in x) and ("output" not in x),
        detokenize_seqs(token_node_type_dict, reduced_sequences),
    )
)

[['Max', 'Concat', 'Div'],
 ['Where', 'Concat', 'Reshape'],
 ['Min', 'ArgMax', 'Cast'],
 ['Mul', 'Concat', 'Resize'],
 ['MatMul', 'Add', 'Div'],
 ['Cast', 'Max', 'Cast'],
 ['Ceil', 'Concat', 'Softmax'],
 ['Squeeze', 'Abs', 'Concat'],
 ['Atan', 'Mul', 'Mul'],
 ['MatMul', 'Mul', 'Concat'],
 ['Cast', 'Concat', 'Cast', 'Div'],
 ['Mul', 'Cast', 'Concat'],
 ['Atan', 'Resize', 'Round'],
 ['Cast', 'Add', 'Resize'],
 ['Ceil', 'Relu', 'Concat'],
 ['Mul', 'Concat', 'Concat'],
 ['Concat', 'Pad', 'AveragePool'],
 ['ArgMax', 'Greater', 'Where', 'Slice'],
 ['Softmax', 'MatMul', 'Add'],
 ['Cast', 'Mul', 'Trilu'],
 ['Concat', 'Cast', 'Sub'],
 ['ReduceMean', 'Trilu', 'Trilu'],
 ['Mul', 'Mul', 'Where'],
 ['Mul', 'Concat', 'Transpose'],
 ['Softmax', 'Reshape', 'Concat'],
 ['Cast', 'MatMul', 'Div'],
 ['Resize', 'Abs', 'ReduceMax'],
 ['Concat', 'Resize', 'Concat'],
 ['Mul', 'Mul', 'Max'],
 ['Resize', 'Mul', 'Mul'],
 ['MatMul', 'Add', 'Add'],
 ['Relu', 'Mul', 'Slice'],
 ['Ceil', 'Cast', 'ArgMax'],
 ['Mul', '

# tf2onnx

In [34]:
with open('./onnx_parsing_results/tf2onnx_mismatch_seq_match_results.json', 'r') as f:
    mismatched_seq = json.load(f)
with open('./onnx_parsing_results/tf2onnx_correct_mismatch_seq_match_results.json', 'r') as f:
    mismatched_correct = json.load(f)
with open('./onnx_parsing_results/tf2onnx_test_mismatch_seq_match_results.json', 'r') as f:
    mismatched_test = json.load(f)

In [35]:
print("Mismatched Ops:", len(mismatched_seq['node_type_token_dict'].keys()))
print("Mismatched-Correct Ops:", len(mismatched_correct['node_type_token_dict'].keys()))
print("Mismatched-Test Ops:", len(mismatched_test['node_type_token_dict'].keys()))


Mismatched Ops: 54
Mismatched-Correct Ops: 54
Mismatched-Test Ops: 65


## Mismatched Sequences

In [36]:
(
    unique_mismatched_sequences,
    unique_mismatched_sequences_decoded,
    total_mismatched_sequences,
) = get_unq_sequences(mismatched_seq)

Total Sequences: 156,218, Total Unique Sequences: 2,155, Total Paths Compared: 135,239,592


In [37]:
len(recursive_submatching(unique_mismatched_sequences, len(unique_mismatched_sequences)))

1265
1144
1118
1115
1115


1115

## Correct-Mismatched Sequences

In [38]:
(
    unique_corr_mismatched_sequences,
    unique_corr_mismatched_sequences_decoded,
    total_corr_mismatched_sequences,
) = get_unq_sequences(mismatched_correct)

Total Sequences: 57,890, Total Unique Sequences: 1,050, Total Paths Compared: 80,075,700


In [39]:
len(recursive_submatching(unique_corr_mismatched_sequences, len(unique_corr_mismatched_sequences)))

605
526
507
507


507

### Create New Tokenizers and Calculate Non-Overlapping

In [40]:
node_type_token_dict, token_node_type_dict = create_tokenizers(
    mismatched_correct["node_type_token_dict"], mismatched_seq["node_type_token_dict"]
)

In [41]:
mismatched_sequences = set(tokenize_seqs(node_type_token_dict, unique_mismatched_sequences_decoded))
mismatched_correct_sequences = set(tokenize_seqs(node_type_token_dict, unique_corr_mismatched_sequences_decoded))

#### Nonoverlapping

In [42]:
seqs = mismatched_sequences - mismatched_correct_sequences
len(seqs)

1527

In [43]:
reduced_sequences = recursive_submatching(seqs, len(seqs))

1032
901
871
868
865
865


In [44]:
detokenize_seqs(token_node_type_dict, reduced_sequences)

[['Mul', 'Add', 'ReduceMean'],
 ['Round', 'Reshape', 'ConvTranspose'],
 ['Atan', 'MatMul', 'Concat', 'Floor'],
 ['And', 'And', 'Not'],
 ['ReverseSequence', 'Squeeze', 'Unsqueeze'],
 ['Transpose', 'Conv', 'Conv', 'Transpose'],
 ['ReduceSum', 'Mul', 'Add', 'Concat'],
 ['ReduceMean', 'Reshape', 'Conv', 'Conv'],
 ['Conv', 'Transpose', 'ConvTranspose', 'Transpose'],
 ['Reshape', 'MatMul', 'Reshape', 'ReduceSum'],
 ['Conv', 'Relu', 'Min'],
 ['Mul', 'ReverseSequence', 'Reshape', 'ReverseSequence'],
 ['Add', 'Mul', 'MatMul'],
 ['LeakyRelu', 'Mul', 'Mul'],
 ['Conv', 'Reshape', 'Unsqueeze'],
 ['Transpose', 'Conv', 'ConvTranspose'],
 ['Conv', 'Conv', 'LeakyRelu'],
 ['Conv', 'Squeeze', 'Transpose'],
 ['Min', 'Max', 'Tan'],
 ['Mul', 'Abs', 'Max'],
 ['Round', 'Concat', 'Concat'],
 ['Cast', 'Mul', 'Add', 'Atan'],
 ['Div', 'Min', 'Max'],
 ['Reshape', 'Mul', 'Mul'],
 ['ConvTranspose', 'Max', 'Conv'],
 ['LeakyRelu', 'Min', 'Max', 'Mul'],
 ['Max', 'MatMul', 'Concat'],
 ['Transpose', 'Min', 'Reshape'],
 [

In [45]:
filtered = list(
    filter(
        lambda x: (len(x) > 2),
        detokenize_seqs(token_node_type_dict, reduced_sequences),
    )
)
len(filtered)

865

In [46]:
filtered_ops = set()
for filt in filtered:
    filtered_ops = filtered_ops.union(filt)
filtered_ops

{'Abs',
 'Add',
 'And',
 'ArgMax',
 'Atan',
 'BatchNormalization',
 'Cast',
 'Ceil',
 'Concat',
 'Conv',
 'ConvTranspose',
 'Cos',
 'DepthToSpace',
 'Div',
 'Erf',
 'Expand',
 'Floor',
 'Gather',
 'GlobalAveragePool',
 'Greater',
 'LRN',
 'LeakyRelu',
 'MatMul',
 'Max',
 'Min',
 'Mul',
 'Neg',
 'Not',
 'Or',
 'ReduceMax',
 'ReduceMean',
 'ReduceMin',
 'ReduceProd',
 'ReduceSum',
 'Relu',
 'Reshape',
 'ReverseSequence',
 'Round',
 'Shape',
 'Sigmoid',
 'Slice',
 'Softmax',
 'SpaceToDepth',
 'Squeeze',
 'Sub',
 'Tan',
 'Transpose',
 'Unsqueeze'}

### Find Unique Sequence Distribution by Pairs

In [47]:
tokenized_filtered = tokenize_seqs(mismatched_seq['node_type_token_dict'], filtered)

In [48]:
sequence_counts_by_pair = {}
for i, (pair, sequences) in enumerate(mismatched_seq['results'].items()):
    a, b = pair.split(",")
    unq_seq = set(sequences)
    unq_seq_stripped = set()
    for seq in unq_seq:
        new_seq = seq.strip(mismatched_seq['node_type_token_dict']['input'])
        new_seq = new_seq.strip(mismatched_seq['node_type_token_dict']['output'])
        if len(new_seq) > 2:
            unq_seq_stripped.add(new_seq)
    
    for filt_seq in tokenized_filtered:
        if filt_seq in unq_seq_stripped:
            if a not in sequence_counts_by_pair:
                sequence_counts_by_pair[a] = set([filt_seq])
            else:
                sequence_counts_by_pair[a] |= set([filt_seq])
            if b not in sequence_counts_by_pair:
                sequence_counts_by_pair[b] = set([filt_seq])
            else:
                sequence_counts_by_pair[b] |= set([filt_seq])
        else:
            if a not in sequence_counts_by_pair:
                sequence_counts_by_pair[a] = set()
            if b not in sequence_counts_by_pair:
                sequence_counts_by_pair[b] = set()
filtered_sorted_pairs = dict(sorted({key: len(val) for key, val in sequence_counts_by_pair.items()}.items(), key=lambda x: x[1], reverse=True))
filtered_sorted_pairs

{'71': 55,
 '29': 52,
 '64': 51,
 '82': 46,
 '196': 45,
 '208': 45,
 '50': 43,
 '77': 37,
 '140': 36,
 '166': 36,
 '32': 35,
 '2': 33,
 '193': 33,
 '144': 32,
 '164': 32,
 '69': 31,
 '156': 29,
 '216': 29,
 '1': 28,
 '41': 28,
 '60': 28,
 '46': 27,
 '91': 27,
 '211': 27,
 '72': 26,
 '94': 26,
 '118': 26,
 '151': 26,
 '126': 25,
 '133': 25,
 '161': 25,
 '28': 24,
 '70': 24,
 '183': 24,
 '201': 24,
 '15': 21,
 '90': 21,
 '112': 21,
 '136': 21,
 '182': 21,
 '88': 20,
 '56': 19,
 '86': 19,
 '96': 19,
 '202': 19,
 '35': 18,
 '37': 18,
 '203': 18,
 '212': 18,
 '38': 17,
 '117': 17,
 '129': 17,
 '13': 16,
 '100': 16,
 '121': 16,
 '157': 16,
 '30': 15,
 '102': 15,
 '109': 15,
 '51': 14,
 '53': 14,
 '79': 14,
 '101': 14,
 '128': 14,
 '172': 14,
 '173': 14,
 '174': 14,
 '27': 13,
 '52': 13,
 '58': 13,
 '76': 13,
 '85': 13,
 '186': 13,
 '21': 12,
 '55': 12,
 '113': 12,
 '127': 12,
 '146': 12,
 '209': 12,
 '0': 11,
 '99': 11,
 '124': 11,
 '131': 11,
 '162': 11,
 '170': 11,
 '190': 11,
 '197': 11,


In [49]:
len(sequence_counts_by_pair)

220

## Mismatched-Test Sequences

In [50]:
(
    unique_test_mismatched_sequences,
    unique_test_mismatched_sequences_decoded,
    total_test_mismatched_sequences,
) = get_unq_sequences(mismatched_test)

Total Sequences: 5,481, Total Unique Sequences: 35, Total Paths Compared: 0


In [51]:
len(recursive_submatching(unique_test_mismatched_sequences, len(unique_test_mismatched_sequences)))

35


35

### Create New Tokenizers and Calculate Non-Overlapping

In [52]:
node_type_token_dict, token_node_type_dict = create_tokenizers(
    mismatched_test["node_type_token_dict"], mismatched_seq["node_type_token_dict"]
)

In [53]:
mismatched_sequences = set(tokenize_seqs(node_type_token_dict, unique_mismatched_sequences_decoded))
mismatched_test_sequences = set(tokenize_seqs(node_type_token_dict, unique_test_mismatched_sequences_decoded))

#### Nonoverlapping

In [54]:
len(mismatched_sequences)

2155

In [55]:
len(mismatched_test_sequences)

35

In [56]:
len(mismatched_sequences - mismatched_test_sequences)

2135

In [57]:
seqs =  mismatched_sequences - mismatched_test_sequences
len(seqs)

2135

In [58]:
reduced_sequences = recursive_submatching(seqs, len(seqs))

1255
1129
1106
1102
1102


In [59]:
list(
    filter(
        lambda x: (len(x) > 2) and ("input" not in x) and ("output" not in x),
        detokenize_seqs(token_node_type_dict, reduced_sequences),
    )
)

[['Neg', 'Mul', 'Add'],
 ['ReduceSum', 'Mul', 'Add', 'Concat'],
 ['Reshape', 'Tan', 'Reshape'],
 ['Reshape', 'MatMul', 'Reshape', 'ReduceSum'],
 ['Transpose', 'Conv', 'Cast'],
 ['Min', 'MatMul', 'Concat'],
 ['Max', 'Reshape', 'Conv'],
 ['ReduceSum', 'Concat', 'Div'],
 ['Sigmoid', 'Mul', 'Add'],
 ['ReduceMax', 'Min', 'Max'],
 ['Round', 'Mul', 'Add'],
 ['Transpose', 'Concat', 'Concat'],
 ['ReduceProd', 'Reshape', 'SpaceToDepth'],
 ['Tan', 'Reshape', 'ConvTranspose'],
 ['Reshape', 'Slice', 'Transpose', 'Conv'],
 ['Max', 'Shape', 'Gather'],
 ['Conv', 'Reshape', 'Atan'],
 ['Tan', 'Round', 'Floor'],
 ['Transpose', 'Conv', 'ReduceMax', 'Transpose'],
 ['Transpose', 'Conv', 'Neg'],
 ['Transpose', 'Min', 'Max'],
 ['Min', 'Max', 'Tan', 'Sub'],
 ['Relu', 'Concat', 'Concat'],
 ['Concat', 'Concat', 'Cos'],
 ['ConvTranspose', 'Max', 'Conv'],
 ['Conv', 'ReduceProd', 'Reshape'],
 ['And', 'And', 'Not'],
 ['Reshape', 'MatMul', 'Reshape', 'Tan'],
 ['Conv', 'ReduceMax', 'Reshape'],
 ['Max', 'Conv', 'Transp