In [24]:
import pandas as pd
import numpy as np
import re
import math
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN, AgglomerativeClustering

In [25]:
# Load the pretrained sentence-embedding model; every `model.encode(...)` call below uses it.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [26]:
# Read the text file that maps each PROSE program to the transformation it performs.
# Every non-blank line is parsed as "<program>==><lhs>-><rhs>".
file_path = 'programs_one_to_one.txt'

with open(file_path, 'r') as file:
    data = file.read()

# One record per line. Filter out *all* blank lines: list.remove("") only dropped the
# first one and raised ValueError when the file had no trailing newline.
programs_transformations = [line for line in data.split("\n") if line.strip()]

programs = []
transformations = []
lhs = []
rhs = []

for record_number, item in enumerate(programs_transformations, start=1):
    # Split on the first separator only, so extra occurrences stay in the payload
    # instead of being silently discarded.
    program, program_sep, transformation = item.partition("==>")
    if not program_sep:
        raise ValueError(f"Record {record_number} has no '==>' separator: {item!r}")

    source_text, arrow_sep, target_text = transformation.partition("->")
    if not arrow_sep:
        raise ValueError(f"Record {record_number} has no '->' separator: {item!r}")

    programs.append(program)
    transformations.append(transformation)
    lhs.append(source_text)
    rhs.append(target_text)

# One row per record: the program, the full "lhs->rhs" text, and its two halves.
df = pd.DataFrame({
    'program': programs,
    'transformation': transformations,
    'lhs': lhs,
    'rhs': rhs
})

df

Unnamed: 0,program,transformation,lhs,rhs
0,"let columnName = ""0"" in let x = ChooseInput(vs...",Computers as Theatre->Computers As Theatre,Computers as Theatre,Computers As Theatre
1,"Concat(let columnName = ""0"" in let x = ChooseI...",Computers As Theatre->Computers as Theatre,Computers As Theatre,Computers as Theatre
2,"Concat(ConstStr(""\""""), Concat(let columnName =...","Brenda Laurel->""Laurel, Brenda""",Brenda Laurel,"""Laurel, Brenda"""
3,"Concat(let columnName = ""0"" in let x = ChooseI...","""Laurel, Brenda""->Brenda Laurel","""Laurel, Brenda""",Brenda Laurel
4,"Concat(let columnName = ""0"" in let x = ChooseI...",Tru64 Unix Troubleshooting Diagnosing &amp;->T...,Tru64 Unix Troubleshooting Diagnosing &amp;,Tru64 UNIX Troubleshooting : Diagnosing and Co...
...,...,...,...,...
485,"Concat(ConstStr(""\""""), Concat(let columnName =...","""Farin, Gerald""->""Gerald E. Farin,""","""Farin, Gerald""","""Gerald E. Farin,"""
486,"Concat(let columnName = ""0"" in let x = ChooseI...",Computer Systems Design and Architecture->Comp...,Computer Systems Design and Architecture,Computer Systems Design and Architecture (2nd ...
487,"let columnName = ""0"" in let x = ChooseInput(vs...",Computer Systems Design and Architecture (2nd ...,Computer Systems Design and Architecture (2nd ...,Computer Systems Design and Architecture
488,"Concat(ConstStr(""\""Vincent P. ""), Concat(let c...","Heuring->""Vincent P. Heuring, Harry F. Jordan""",Heuring,"""Vincent P. Heuring, Harry F. Jordan"""


In [27]:
# Save the parsed table as tab-separated text, without the index column.
# Note: this rebinds `file_path`, which is also used for the input file name elsewhere in the notebook.
file_path = 'transformation_feature.tsv'

df.to_csv(file_path, sep='\t', index=False)


In [28]:
# Embed the first column of SampleData.csv (read with no header row) and print
# the pairwise cosine-similarity matrix of those embeddings.
sample = pd.read_csv("SampleData.csv", header=None)
source = sample.iloc[:, 0]
print(source)
# One model.encode call per text; the list of vectors is the input matrix for cosine_similarity.
transformation_embeddings = [model.encode(text) for text in source]
cosine_similarities = cosine_similarity(transformation_embeddings)
print(cosine_similarities)

0    Concat(ConstStr("\""), Concat(let columnName =...
1    Concat(let columnName = "0" in let x = ChooseI...
2    Concat(let columnName = "0" in let x = ChooseI...
3    Concat(ConstStr("\""), Concat(let columnName =...
4    let columnName = "0" in let x = ChooseInput(vs...
5    Concat(ConstStr("\""), Concat(let columnName =...
Name: 0, dtype: object
[[1.0000002  0.9542012  0.9236101  0.98526764 0.82243955 0.76877034]
 [0.9542012  1.0000001  0.9513806  0.94090664 0.9172368  0.7299446 ]
 [0.9236101  0.9513806  1.         0.95094526 0.87627685 0.7260963 ]
 [0.98526764 0.94090664 0.95094526 1.0000002  0.81626046 0.75917137]
 [0.82243955 0.9172368  0.87627685 0.81626046 1.0000005  0.69265294]
 [0.76877034 0.7299446  0.7260963  0.75917137 0.69265294 1.0000005 ]]


In [29]:
def naturalize_code(code):
    """Rewrite PROSE DSL tokens in ``code`` as plain-English phrases.

    The (token, phrase) pairs are applied one after another, in the order
    listed, each as a literal substring substitution over the whole text.

    Args:
        code: Program text to rewrite.

    Returns:
        The text with every occurrence of each listed token replaced.
    """
    phrase_for_token = (
        ('ÃŽÂµ', 'end of string or no character'),
        ('Concat', 'join strings'),
        ('ConstStr', 'fixed string'),
        ('RegexPair', 'pattern matching'),
        ('let columnName = "0" in let x = ChooseInput(vs, columnName) in', 'select column "0" from the data'),
        ('\\"', 'quotation mark'),
        ('RegexPositionRelative', 'find position relative to pattern'),
        ('SubStr', 'extract substring'),
        ('PosPair', 'position pair indicating start and end'),
    )

    rewritten = code
    for token, phrase in phrase_for_token:
        # Tokens are literals and no phrase contains a backslash, so a plain
        # substring replace gives the same result as an escaped-regex substitution.
        rewritten = rewritten.replace(token, phrase)
    return rewritten

def affixes_code(code):
    """Collapse every ``ConstStr("...")`` call in ``code`` to the bare token ``ConstStr``.

    Args:
        code: Program text.

    Returns:
        The text with each ``ConstStr("<literal>")`` replaced by ``ConstStr``;
        everything else is left unchanged.
    """
    # The quoted literal may contain backslash escapes such as \" (for example
    # ConstStr("\"")). Consume "backslash + next character" as one unit so an
    # escaped quote does not end the literal early and leave the call un-collapsed.
    return re.sub(r'ConstStr\("(?:\\.|[^"\\])*"\)', 'ConstStr', code, flags=re.DOTALL)

# df['pseudo-coded'] = df.iloc[:, 0].apply(naturalize_code)
# df['clean-coded'] = df.iloc[:, 0].apply(affixes_code)

In [30]:
# Take column 0 of `df` as the texts to embed (the name `transformations` is reused from the loading cell).
transformations = df.iloc[:, 0]
# Encode each text separately; the result is a list with one embedding per row of `df`.
transformation_embeddings = [model.encode(text) for text in transformations]

In [31]:
# Pairwise cosine similarity between all embeddings; the printed shape is (rows, rows).
cosine_similarities = cosine_similarity(transformation_embeddings)
print(cosine_similarities.shape)

(490, 490)


In [32]:
# Similarity cut-off: two rows are linked when their cosine similarity is strictly above it.
# (The name is misspelled, but a later cell reads `simiarity_theshold`, so it must stay as is.)
simiarity_theshold = 0.98

# Parallel arrays of (row, column) positions whose similarity exceeds the threshold.
row_indices, col_indices = np.where(cosine_similarities > simiarity_theshold)

# groups: row index -> list of column indices above the threshold for that row.
# undirected_graph: the same links, recorded in both directions.
groups = {}
undirected_graph = {} 


# Iterate over unique row indices
for i in np.unique(row_indices):
    # Columns that np.where paired with row i.
    relevant_cols = col_indices[row_indices == i]

    if i not in undirected_graph:
        undirected_graph[i] = []
    undirected_graph[i].extend(relevant_cols.tolist())

    # Mirror every link so that each column also lists row i as a neighbour.
    for col in relevant_cols:
        if col not in undirected_graph:
            undirected_graph[col] = []
        undirected_graph[col].append(i)
    
    # Store relevant column indices in groups
    groups[i] = relevant_cols.tolist()

# Convert the lists to sets to remove duplicates in the undirected graph
for key in undirected_graph:
    undirected_graph[key] = list(set(undirected_graph[key]))

# Give every row of `df` an entry in both mappings. Rows missing so far get an
# empty list in `groups` and a self-link in `undirected_graph`.
for i in range(len(df)):
    if i not in groups:
        groups[i] = []
    if i not in undirected_graph:
        undirected_graph[i] = [i]

In [33]:
# Cluster rows by depth-first traversal of the thresholded similarity links.
# The traversal follows `groups` (row -> columns above the threshold); the copy is
# shallow, so the neighbour lists are shared with `groups`.
graph = groups.copy()
clusters = []
visited = set()

def dfs(v, curr_cluster):
    """Append to ``curr_cluster`` every not-yet-visited node reachable from ``v``.

    Nodes are added in depth-first pre-order, following ``graph[node]`` in list
    order, and are recorded in the module-level ``visited`` set.

    The traversal keeps an explicit stack of neighbour iterators instead of
    recursing, so a long chain of linked rows cannot raise RecursionError; the
    visiting order is the same as the recursive formulation.

    Args:
        v: Start node (a key of ``graph``).
        curr_cluster: List that receives the visited nodes; mutated in place.
    """
    visited.add(v)
    curr_cluster.append(v)
    pending = [iter(graph[v])]
    while pending:
        # Resume the neighbour iterator of the deepest node currently being explored.
        for w in pending[-1]:
            if w not in visited:
                visited.add(w)
                curr_cluster.append(w)
                # Descend: explore w's neighbours before the rest of the parent's.
                pending.append(iter(graph[w]))
                break
        else:
            # Neighbours exhausted: back up to the parent node.
            pending.pop()

# Start a new cluster from every node that no earlier traversal has reached.
for node in graph:
    if node not in visited:
        cluster = []
        dfs(node, cluster)
        clusters.append(cluster)

In [34]:
# Print every cluster as a list of graph node keys (row indices of `df`).
print(clusters)

[[0, 83, 85, 119, 159, 185], [1], [2, 19, 61, 89, 92, 64, 7, 178, 96, 101, 104, 108, 113, 129, 140, 171, 174, 187, 192, 200, 212, 216, 220, 232, 240, 244, 224, 347, 253, 264, 268, 277, 282, 286, 290, 299, 302, 318, 322, 366, 390, 362, 465, 395, 414, 418, 434, 443, 476, 484], [3, 93, 97, 100, 112, 141, 175, 186, 193, 208, 209, 213, 217, 221, 233, 252, 269, 287, 319, 367, 394, 415, 419, 435, 442, 473, 477], [4, 425, 263], [5], [6], [8], [9, 78, 102, 153, 32, 43, 45, 58, 87, 124, 162, 229, 272, 326, 330, 343, 345, 398, 351, 439, 210, 218, 230, 238, 254, 266, 300, 349, 352, 377, 429, 449, 483, 487], [10, 37, 36, 11, 14, 248, 261, 260, 179, 382, 48, 132, 49, 315, 133, 136, 307, 371, 314, 370, 378, 379, 383, 407, 426, 430, 431, 450, 468, 427, 333, 336, 475, 306, 406, 117], [12, 256], [13], [15, 128], [16], [17], [18, 88, 276, 472], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [33], [34], [35], [38, 99, 98, 400, 111, 478, 180, 304, 305, 181], [39], [40], [41], [42],

In [35]:
# Column 1 of `df`, written to the second file below.
str1Tostr2 = df.iloc[:, 1]

# Write one file per column. Each cluster becomes a line "[item; item; ... ]"
# followed by a blank line, with items looked up by position in the column.
for out_path, column in (('clusters.txt', transformations),
                         ('str1Tostr2Clusters.txt', str1Tostr2)):
    with open(out_path, 'w') as f:
        for clus in clusters:
            members = ''.join(f'{column.iloc[idx]}; ' for idx in clus)
            f.write(f'[{members}]\n\n')

In [36]:
from collections import Counter

# Size of each cluster, and a histogram of how many clusters have each size.
lengths = [len(members) for members in clusters]
size_histogram = Counter(lengths)
print(size_histogram)
unique_values_count = df.iloc[:, 0].nunique()
print("Number of unique values in program column:", unique_values_count)

# For this dataset a row left alone in a size-1 cluster counts as a false
# negative; every other row counts as a true positive.
total_entries = df.count()['program']
false_negatives = size_histogram[1]
true_positives = total_entries - false_negatives

Counter({1: 236, 2: 17, 3: 4, 4: 3, 6: 1, 50: 1, 27: 1, 34: 1, 36: 1, 10: 1, 9: 1, 11: 1, 5: 1, 8: 1})
Number of unique values in program column: 362


In [37]:
# Read the text file that maps each PROSE program to the transformation it performs.
# Every non-blank line is parsed as "<program>==><lhs>-><rhs>".
file_path = 'programs_one_to_one_conflict.txt'

with open(file_path, 'r') as file:
    data = file.read()

# One record per line. Filter out *all* blank lines: list.remove("") only dropped the
# first one and raised ValueError when the file had no trailing newline.
programs_transformations = [line for line in data.split("\n") if line.strip()]

programs = []
transformations = []
lhs = []
rhs = []

for record_number, item in enumerate(programs_transformations, start=1):
    # Split on the first separator only, so extra occurrences stay in the payload
    # instead of being silently discarded.
    program, program_sep, transformation = item.partition("==>")
    if not program_sep:
        raise ValueError(f"Record {record_number} has no '==>' separator: {item!r}")

    source_text, arrow_sep, target_text = transformation.partition("->")
    if not arrow_sep:
        raise ValueError(f"Record {record_number} has no '->' separator: {item!r}")

    programs.append(program)
    transformations.append(transformation)
    lhs.append(source_text)
    rhs.append(target_text)

# One row per record: the program, the full "lhs->rhs" text, and its two halves.
# This rebinds `df`, replacing the table loaded from the previous file.
df = pd.DataFrame({
    'program': programs,
    'transformation': transformations,
    'lhs': lhs,
    'rhs': rhs
})

df

Unnamed: 0,program,transformation,lhs,rhs
0,"Concat(ConstStr(""Hack ""), Concat(let columnNam...",ADVANCES IN COMPUTERS V56->Hack I.T. - Securit...,ADVANCES IN COMPUTERS V56,Hack I.T. - Security Through Penetration Testi...
1,"Concat(let columnName = ""0"" in let x = ChooseI...",Hack I.T. - Security Through Penetration Testi...,Hack I.T. - Security Through Penetration Testi...,ADVANCES IN COMPUTERS V56
2,"Concat(ConstStr(""\""Klevinsky, T. J.""), let col...","""Zelkowitz, Marvin""->""Klevinsky, T. J.""","""Zelkowitz, Marvin""","""Klevinsky, T. J."""
3,"Concat(ConstStr(""\""Zelkowitz, Marvin""), let co...","""Klevinsky, T. J.""->""Zelkowitz, Marvin""","""Klevinsky, T. J.""","""Zelkowitz, Marvin"""
4,"Concat(ConstStr(""Communication Networking an A...",Wi-Foo: The Secrets of Wireless Hacking->Commu...,Wi-Foo: The Secrets of Wireless Hacking,Communication Networking an Analytical A
...,...,...,...,...
495,"Concat(ConstStr(""\""Beveridge, Jim, Wiener, Rob...","""Fortier,Paul""->""Beveridge, Jim, Wiener, Robert""","""Fortier,Paul""","""Beveridge, Jim, Wiener, Robert"""
496,"Concat(ConstStr(""Accelerated ""), Concat(let co...",Data Structures With C++ Using Stl->Accelerate...,Data Structures With C++ Using Stl,Accelerated C++ Practical Prog By Exampl
497,"Concat(ConstStr(""Data Structures With ""), Conc...",Accelerated C++ Practical Prog By Exampl->Data...,Accelerated C++ Practical Prog By Exampl,Data Structures With C++ Using Stl
498,"Concat(ConstStr(""\""Koenig, Andrew""), let colum...","""Ford, William; Topp, William; Topp, William R...","""Ford, William; Topp, William; Topp, William R.""","""Koenig, Andrew"""


In [38]:
def naturalize_code(code):
    """Rewrite PROSE DSL tokens in ``code`` as plain-English phrases.

    The (token, phrase) pairs are applied one after another, in the order
    listed, each as a literal substring substitution over the whole text.

    Args:
        code: Program text to rewrite.

    Returns:
        The text with every occurrence of each listed token replaced.
    """
    phrase_for_token = (
        ('ÃŽÂµ', 'end of string or no character'),
        ('Concat', 'join strings'),
        ('ConstStr', 'fixed string'),
        ('RegexPair', 'pattern matching'),
        ('let columnName = "0" in let x = ChooseInput(vs, columnName) in', 'select column "0" from the data'),
        ('\\"', 'quotation mark'),
        ('RegexPositionRelative', 'find position relative to pattern'),
        ('SubStr', 'extract substring'),
        ('PosPair', 'position pair indicating start and end'),
    )

    # Apply all replacements to the code
    rewritten = code
    for token, phrase in phrase_for_token:
        # Tokens are literals and no phrase contains a backslash, so a plain
        # substring replace gives the same result as an escaped-regex substitution.
        rewritten = rewritten.replace(token, phrase)
    return rewritten

def affixes_code(code):
    """Collapse every ``ConstStr("...")`` call in ``code`` to the bare token ``ConstStr``.

    Args:
        code: Program text.

    Returns:
        The text with each ``ConstStr("<literal>")`` replaced by ``ConstStr``;
        everything else is left unchanged.
    """
    # The quoted literal may contain backslash escapes such as \" (for example
    # ConstStr("\"")). Consume "backslash + next character" as one unit so an
    # escaped quote does not end the literal early and leave the call un-collapsed.
    return re.sub(r'ConstStr\("(?:\\.|[^"\\])*"\)', 'ConstStr', code, flags=re.DOTALL)

# df['pseudo-coded'] = df.iloc[:, 0].apply(naturalize_code)
# df['clean-coded'] = df.iloc[:, 0].apply(affixes_code)

df

Unnamed: 0,program,transformation,lhs,rhs
0,"Concat(ConstStr(""Hack ""), Concat(let columnNam...",ADVANCES IN COMPUTERS V56->Hack I.T. - Securit...,ADVANCES IN COMPUTERS V56,Hack I.T. - Security Through Penetration Testi...
1,"Concat(let columnName = ""0"" in let x = ChooseI...",Hack I.T. - Security Through Penetration Testi...,Hack I.T. - Security Through Penetration Testi...,ADVANCES IN COMPUTERS V56
2,"Concat(ConstStr(""\""Klevinsky, T. J.""), let col...","""Zelkowitz, Marvin""->""Klevinsky, T. J.""","""Zelkowitz, Marvin""","""Klevinsky, T. J."""
3,"Concat(ConstStr(""\""Zelkowitz, Marvin""), let co...","""Klevinsky, T. J.""->""Zelkowitz, Marvin""","""Klevinsky, T. J.""","""Zelkowitz, Marvin"""
4,"Concat(ConstStr(""Communication Networking an A...",Wi-Foo: The Secrets of Wireless Hacking->Commu...,Wi-Foo: The Secrets of Wireless Hacking,Communication Networking an Analytical A
...,...,...,...,...
495,"Concat(ConstStr(""\""Beveridge, Jim, Wiener, Rob...","""Fortier,Paul""->""Beveridge, Jim, Wiener, Robert""","""Fortier,Paul""","""Beveridge, Jim, Wiener, Robert"""
496,"Concat(ConstStr(""Accelerated ""), Concat(let co...",Data Structures With C++ Using Stl->Accelerate...,Data Structures With C++ Using Stl,Accelerated C++ Practical Prog By Exampl
497,"Concat(ConstStr(""Data Structures With ""), Conc...",Accelerated C++ Practical Prog By Exampl->Data...,Accelerated C++ Practical Prog By Exampl,Data Structures With C++ Using Stl
498,"Concat(ConstStr(""\""Koenig, Andrew""), let colum...","""Ford, William; Topp, William; Topp, William R...","""Ford, William; Topp, William; Topp, William R.""","""Koenig, Andrew"""


In [39]:
# Display the current `df` again as the cell output.
df

Unnamed: 0,program,transformation,lhs,rhs
0,"Concat(ConstStr(""Hack ""), Concat(let columnNam...",ADVANCES IN COMPUTERS V56->Hack I.T. - Securit...,ADVANCES IN COMPUTERS V56,Hack I.T. - Security Through Penetration Testi...
1,"Concat(let columnName = ""0"" in let x = ChooseI...",Hack I.T. - Security Through Penetration Testi...,Hack I.T. - Security Through Penetration Testi...,ADVANCES IN COMPUTERS V56
2,"Concat(ConstStr(""\""Klevinsky, T. J.""), let col...","""Zelkowitz, Marvin""->""Klevinsky, T. J.""","""Zelkowitz, Marvin""","""Klevinsky, T. J."""
3,"Concat(ConstStr(""\""Zelkowitz, Marvin""), let co...","""Klevinsky, T. J.""->""Zelkowitz, Marvin""","""Klevinsky, T. J.""","""Zelkowitz, Marvin"""
4,"Concat(ConstStr(""Communication Networking an A...",Wi-Foo: The Secrets of Wireless Hacking->Commu...,Wi-Foo: The Secrets of Wireless Hacking,Communication Networking an Analytical A
...,...,...,...,...
495,"Concat(ConstStr(""\""Beveridge, Jim, Wiener, Rob...","""Fortier,Paul""->""Beveridge, Jim, Wiener, Robert""","""Fortier,Paul""","""Beveridge, Jim, Wiener, Robert"""
496,"Concat(ConstStr(""Accelerated ""), Concat(let co...",Data Structures With C++ Using Stl->Accelerate...,Data Structures With C++ Using Stl,Accelerated C++ Practical Prog By Exampl
497,"Concat(ConstStr(""Data Structures With ""), Conc...",Accelerated C++ Practical Prog By Exampl->Data...,Accelerated C++ Practical Prog By Exampl,Data Structures With C++ Using Stl
498,"Concat(ConstStr(""\""Koenig, Andrew""), let colum...","""Ford, William; Topp, William; Topp, William R...","""Ford, William; Topp, William; Topp, William R.""","""Koenig, Andrew"""


In [40]:
# Take column 0 of the current `df` as the texts to embed.
transformations = df.iloc[:, 0]
# Encode each text separately, then compute all pairwise cosine similarities.
transformation_embeddings = [model.encode(text) for text in transformations]
cosine_similarities = cosine_similarity(transformation_embeddings)
# The printed shape is (rows, rows).
print(cosine_similarities.shape)

(500, 500)


In [41]:
# Parallel arrays of (row, column) positions whose similarity exceeds the threshold.
# `simiarity_theshold` is not assigned in this cell; it must already be defined by an earlier cell.
row_indices, col_indices = np.where(cosine_similarities > simiarity_theshold)

# groups: row index -> list of column indices above the threshold for that row.
# undirected_graph: the same links, recorded in both directions.
groups = {}
undirected_graph = {} 


# Iterate over unique row indices
for i in np.unique(row_indices):
    # Columns that np.where paired with row i.
    relevant_cols = col_indices[row_indices == i]

    if i not in undirected_graph:
        undirected_graph[i] = []
    undirected_graph[i].extend(relevant_cols.tolist())

    # Mirror every link so that each column also lists row i as a neighbour.
    for col in relevant_cols:
        if col not in undirected_graph:
            undirected_graph[col] = []
        undirected_graph[col].append(i)
    
    # Store relevant column indices in groups
    groups[i] = relevant_cols.tolist()

# Convert the lists to sets to remove duplicates in the undirected graph
for key in undirected_graph:
    undirected_graph[key] = list(set(undirected_graph[key]))

# Give every row of `df` an entry in both mappings. Rows missing so far get an
# empty list in `groups` and a self-link in `undirected_graph`.
for i in range(len(df)):
    if i not in groups:
        groups[i] = []
    if i not in undirected_graph:
        undirected_graph[i] = [i]

In [42]:
# Cluster rows by depth-first traversal of the thresholded similarity links.
# The traversal follows `groups` (row -> columns above the threshold); the copy is
# shallow, so the neighbour lists are shared with `groups`.
graph = groups.copy()
clusters = []
visited = set()

def dfs(v, curr_cluster):
    """Append to ``curr_cluster`` every not-yet-visited node reachable from ``v``.

    Nodes are added in depth-first pre-order, following ``graph[node]`` in list
    order, and are recorded in the module-level ``visited`` set.

    The traversal keeps an explicit stack of neighbour iterators instead of
    recursing, so a long chain of linked rows cannot raise RecursionError; the
    visiting order is the same as the recursive formulation.

    Args:
        v: Start node (a key of ``graph``).
        curr_cluster: List that receives the visited nodes; mutated in place.
    """
    visited.add(v)
    curr_cluster.append(v)
    pending = [iter(graph[v])]
    while pending:
        # Resume the neighbour iterator of the deepest node currently being explored.
        for w in pending[-1]:
            if w not in visited:
                visited.add(w)
                curr_cluster.append(w)
                # Descend: explore w's neighbours before the rest of the parent's.
                pending.append(iter(graph[w]))
                break
        else:
            # Neighbours exhausted: back up to the parent node.
            pending.pop()

# Start a new cluster from every node that no earlier traversal has reached.
for node in graph:
    if node not in visited:
        cluster = []
        dfs(node, cluster)
        clusters.append(cluster)

In [43]:
# Size of each cluster, and a histogram of how many clusters have each size.
lengths = [len(members) for members in clusters]
size_histogram = Counter(lengths)
print(size_histogram)
unique_values_count = df.iloc[:, 0].nunique()
print("Number of unique values in program column:", unique_values_count)

# For this dataset a row left alone in a size-1 cluster counts as a true
# negative; every other row counts as a false positive.
total_entries = df.count()['program']
true_negatives = size_histogram[1]
false_positives = total_entries - true_negatives

Counter({1: 430, 2: 7, 3: 2, 46: 1, 4: 1})
Number of unique values in program column: 496


Counter({1: 430, 2: 7, 3: 2, 46: 1, 4: 1})

In [44]:
# Print every cluster as a list of graph node keys (row indices of `df`).
print(clusters)

[[0], [1], [2, 3, 19, 183, 71, 98, 31, 251, 130, 278, 355, 123, 295, 226, 383, 411, 18, 58, 491, 95, 211, 262, 298, 379, 466, 427, 490, 378, 498, 182, 410, 250, 294, 426, 494, 382, 386, 455, 235, 47, 147, 15, 219, 151, 334, 299], [4], [5], [6, 22], [7], [8], [9], [10], [11], [12], [13], [14], [16], [17], [20], [21], [23, 263], [24], [25], [26], [27], [28], [29], [30], [32], [33], [34], [35], [36], [37], [38], [39], [40], [41], [42], [43], [44], [45], [46], [48], [49], [50], [51], [52], [53], [54, 259, 483, 350], [55], [56], [57], [59], [60], [61], [62], [63], [64], [65], [66], [67], [68], [69], [70], [72], [73], [74], [75], [76], [77], [78], [79], [80], [81], [82], [83], [84], [85], [86], [87], [88], [89], [90], [91], [92], [93], [94], [96], [97], [99], [100], [101], [102], [103], [104], [105], [106], [107], [108], [109], [110], [111], [112], [113], [114], [115], [116], [117], [118], [119], [120], [121], [122], [124], [125], [126], [127, 191, 375], [128, 448], [129], [131], [132], [133

In [45]:
import math

# Marginal totals of the confusion matrix, built from the four counts that
# the two cluster-statistics cells computed.
predicted_positive = true_positives + false_positives
actual_positive = true_positives + false_negatives
actual_negative = true_negatives + false_positives
predicted_negative = true_negatives + false_negatives

precision = true_positives / predicted_positive
recall = true_positives / actual_positive

# Matthews correlation coefficient: (TP*TN - FP*FN) / sqrt(product of the four marginals).
mcc_numerator = true_positives * true_negatives - false_positives * false_negatives
mcc_denominator = math.sqrt(predicted_positive * actual_positive * actual_negative * predicted_negative)
MCC = mcc_numerator / mcc_denominator

print("Precision:", precision)
print("Recall:", recall)
print("MCC:", MCC)

Precision: 0.7839506172839507
Recall: 0.5183673469387755
MCC: 0.4031689450583762
