In [130]:
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN, AgglomerativeClustering

In [131]:
# Load the SentenceTransformer model once; later cells call model.encode() on each
# text to get the vectors that are compared with cosine similarity.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [132]:
# Load the PROSE program -> transformation pairs. Each non-blank line is parsed as
#     <program>==><lhs>-><rhs>
file_path = 'programs_one_to_one.txt'

with open(file_path, 'r') as file:
    data = file.read()

# One record per line. Blank lines are filtered out instead of calling
# list.remove(""), which raises ValueError when the file has no blank line and
# removes only the first one when there are several (the remaining blank lines
# would then fail in the loop below with an IndexError).
programs_transformations = [line for line in data.split("\n") if line.strip()]

programs = []
transformations = []
lhs = []
rhs = []

for item in programs_transformations:
    # Text before "==>" is stored as the program, the text after it as the transformation.
    item_split = item.split("==>")
    transformation = item_split[1]
    programs.append(item_split[0])
    transformations.append(transformation)
    # Within the transformation, "->" separates the left-hand side from the right-hand side.
    transformation_split = transformation.split("->")
    lhs.append(transformation_split[0])
    rhs.append(transformation_split[1])

# One row per parsed line; the four lists are index-aligned.
df = pd.DataFrame({
    'program': programs,
    'transformation': transformations,
    'lhs': lhs,
    'rhs': rhs
})

df

Unnamed: 0,program,transformation,lhs,rhs
0,"let columnName = ""0"" in let x = ChooseInput(vs...",Computers as Theatre->Computers As Theatre,Computers as Theatre,Computers As Theatre
1,"Concat(let columnName = ""0"" in let x = ChooseI...",Computers As Theatre->Computers as Theatre,Computers As Theatre,Computers as Theatre
2,"Concat(ConstStr(""\""""), Concat(let columnName =...","Brenda Laurel->""Laurel, Brenda""",Brenda Laurel,"""Laurel, Brenda"""
3,"Concat(let columnName = ""0"" in let x = ChooseI...","""Laurel, Brenda""->Brenda Laurel","""Laurel, Brenda""",Brenda Laurel
4,"Concat(let columnName = ""0"" in let x = ChooseI...",Tru64 Unix Troubleshooting Diagnosing &amp;->T...,Tru64 Unix Troubleshooting Diagnosing &amp;,Tru64 UNIX Troubleshooting : Diagnosing and Co...
...,...,...,...,...
485,"Concat(ConstStr(""\""""), Concat(let columnName =...","""Farin, Gerald""->""Gerald E. Farin,""","""Farin, Gerald""","""Gerald E. Farin,"""
486,"Concat(let columnName = ""0"" in let x = ChooseI...",Computer Systems Design and Architecture->Comp...,Computer Systems Design and Architecture,Computer Systems Design and Architecture (2nd ...
487,"let columnName = ""0"" in let x = ChooseInput(vs...",Computer Systems Design and Architecture (2nd ...,Computer Systems Design and Architecture (2nd ...,Computer Systems Design and Architecture
488,"Concat(ConstStr(""\""Vincent P. ""), Concat(let c...","Heuring->""Vincent P. Heuring, Harry F. Jordan""",Heuring,"""Vincent P. Heuring, Harry F. Jordan"""


In [133]:
# Export the parsed table as a tab-separated file; index=False leaves the row index out.
# NOTE: this reuses `file_path`, which until now held the input file name.
file_path = 'transformation_feature.tsv'

df.to_csv(file_path, sep='\t', index=False)


In [134]:
# Small-scale check: embed every entry in the first column of SampleData.csv and
# print the pairwise cosine-similarity matrix of those embeddings.
sample = pd.read_csv("SampleData.csv", header=None)
source = sample.iloc[:, 0]
print(source)

# Encode one text at a time, collecting the vectors in a plain list.
transformation_embeddings = []
for text in source:
    transformation_embeddings.append(model.encode(text))

cosine_similarities = cosine_similarity(transformation_embeddings)
print(cosine_similarities)

0    Concat(ConstStr("\""), Concat(let columnName =...
1    Concat(let columnName = "0" in let x = ChooseI...
2    Concat(let columnName = "0" in let x = ChooseI...
3    Concat(ConstStr("\""), Concat(let columnName =...
4    let columnName = "0" in let x = ChooseInput(vs...
5    Concat(ConstStr("\""), Concat(let columnName =...
Name: 0, dtype: object
[[1.0000002  0.9542012  0.9236101  0.98526764 0.82243955 0.76877034]
 [0.9542012  1.0000001  0.9513806  0.94090664 0.9172368  0.7299446 ]
 [0.9236101  0.9513806  1.         0.95094526 0.87627685 0.7260963 ]
 [0.98526764 0.94090664 0.95094526 1.0000002  0.81626046 0.75917137]
 [0.82243955 0.9172368  0.87627685 0.81626046 1.0000005  0.69265294]
 [0.76877034 0.7299446  0.7260963  0.75917137 0.69265294 1.0000005 ]]


In [135]:
# DISABLED EXPERIMENT: rewrite PROSE operators into plain-English phrases and store the
# result in a 'pseudo-coded' column. Everything in this cell is commented out, so running
# it does nothing and no 'pseudo-coded' column is created.
# NOTE(review): the first replacement key below looks like a mis-encoded character
# (possibly the Greek letter epsilon) -- confirm against the program file's encoding
# before re-enabling this code.
# def naturalize_code(code):
#     replacements = {
#         'ÃŽÂµ': 'end of string or no character',
#         'Concat': 'join strings',
#         'ConstStr': 'fixed string',
#         'RegexPair': 'pattern matching',
#         'let columnName = "0" in let x = ChooseInput(vs, columnName) in': 'select column "0" from the data',
#         '\\"': 'quotation mark',
#         'RegexPositionRelative': 'find position relative to pattern',
#         'SubStr': 'extract substring',
#         'PosPair': 'position pair indicating start and end'
#     }

#     # Apply all replacements to the code
#     for old, new in replacements.items():
#         code = re.sub(re.escape(old), new, code)

#     return code

# df['pseudo-coded'] = df.iloc[:, 0].apply(naturalize_code)


# df


Unnamed: 0,program,transformation,lhs,rhs,pseudo-coded
0,"let columnName = ""0"" in let x = ChooseInput(vs...",Computers as Theatre->Computers As Theatre,Computers as Theatre,Computers As Theatre,"select column ""0"" from the data ToSimpleTitleC..."
1,"Concat(let columnName = ""0"" in let x = ChooseI...",Computers As Theatre->Computers as Theatre,Computers As Theatre,Computers as Theatre,"join strings(select column ""0"" from the data e..."
2,"Concat(ConstStr(""\""""), Concat(let columnName =...","Brenda Laurel->""Laurel, Brenda""",Brenda Laurel,"""Laurel, Brenda""","join strings(fixed string(""quotation mark""), j..."
3,"Concat(let columnName = ""0"" in let x = ChooseI...","""Laurel, Brenda""->Brenda Laurel","""Laurel, Brenda""",Brenda Laurel,"join strings(select column ""0"" from the data e..."
4,"Concat(let columnName = ""0"" in let x = ChooseI...",Tru64 Unix Troubleshooting Diagnosing &amp;->T...,Tru64 Unix Troubleshooting Diagnosing &amp;,Tru64 UNIX Troubleshooting : Diagnosing and Co...,"join strings(select column ""0"" from the data e..."
...,...,...,...,...,...
485,"Concat(ConstStr(""\""""), Concat(let columnName =...","""Farin, Gerald""->""Gerald E. Farin,""","""Farin, Gerald""","""Gerald E. Farin,""","join strings(fixed string(""quotation mark""), j..."
486,"Concat(let columnName = ""0"" in let x = ChooseI...",Computer Systems Design and Architecture->Comp...,Computer Systems Design and Architecture,Computer Systems Design and Architecture (2nd ...,"join strings(select column ""0"" from the data x..."
487,"let columnName = ""0"" in let x = ChooseInput(vs...",Computer Systems Design and Architecture (2nd ...,Computer Systems Design and Architecture (2nd ...,Computer Systems Design and Architecture,"select column ""0"" from the data extract substr..."
488,"Concat(ConstStr(""\""Vincent P. ""), Concat(let c...","Heuring->""Vincent P. Heuring, Harry F. Jordan""",Heuring,"""Vincent P. Heuring, Harry F. Jordan""","join strings(fixed string(""quotation markVince..."


In [136]:
# NOTE: despite its name, `transformations` is column 0 of df, which is the 'program'
# column when df comes from the parsing cell above. Each entry is encoded on its own.
transformations = df.iloc[:, 0]
transformation_embeddings = list(map(model.encode, transformations))

In [137]:
# Persist the embeddings to 'transformation_embeddings.npy'.
np.save('transformation_embeddings.npy', transformation_embeddings)

In [138]:
# Pairwise cosine similarity between all embeddings (a single argument means the
# collection is compared with itself); print the shape of the resulting matrix.
cosine_similarities = cosine_similarity(transformation_embeddings)
print(cosine_similarities.shape)

(490, 490)


In [139]:
# Build the similarity graph: entry j is a neighbour of entry i when
# cosine_similarities[i, j] is strictly greater than the threshold.
similarity_threshold = 0.98
simiarity_theshold = similarity_threshold  # original (misspelled) name, kept for any cell that still reads it

# np.where reports the matching (row, col) pairs in row-major order: rows ascending and,
# within a row, columns ascending.
row_indices, col_indices = np.where(cosine_similarities > similarity_threshold)

# groups[i]           -> columns j with cosine_similarities[i, j] > threshold
# undirected_graph[i] -> row-wise and column-wise matches of i, de-duplicated
groups = {}
undirected_graph = {}

# Single pass over the pairs instead of re-masking col_indices for every row.
# .tolist() yields plain Python ints, so the dictionary keys (and therefore the
# cluster members derived from them) are ordinary ints rather than NumPy scalars.
for row, col in zip(row_indices.tolist(), col_indices.tolist()):
    groups.setdefault(row, []).append(col)

# Record every match from both endpoints.
for row, cols in groups.items():
    undirected_graph.setdefault(row, []).extend(cols)
    for col in cols:
        undirected_graph.setdefault(col, []).append(row)

# Convert the lists to sets and back to remove duplicates in the undirected graph
for key in undirected_graph:
    undirected_graph[key] = list(set(undirected_graph[key]))

# Guarantee an entry for every row of df, even when nothing exceeded the threshold.
for i in range(len(df)):
    if i not in groups:
        groups[i] = []
    if i not in undirected_graph:
        undirected_graph[i] = [i]

In [140]:
# Group entries into clusters: each cluster is the set of nodes reachable from a start
# node by following the neighbour lists in `graph`.
# NOTE(review): the traversal uses `groups` (row-wise matches), not `undirected_graph`.
# Both give the same components only if the thresholded matrix is symmetric -- confirm
# that this is the intended graph.
graph = groups.copy()
clusters = []
visited = set()

def dfs(v, curr_cluster):
    """Append to `curr_cluster` every not-yet-visited node reachable from `v`.

    Nodes are appended in depth-first pre-order (the same order a recursive walk
    produces) and are added to the module-level `visited` set. An explicit stack of
    neighbour iterators replaces recursion so that a large component cannot exceed
    Python's recursion limit.

    Args:
        v: start node; must be a key of `graph`.
        curr_cluster: list that receives the visited nodes (mutated in place).
    """
    visited.add(v)
    curr_cluster.append(v)
    pending = [iter(graph[v])]
    while pending:
        for w in pending[-1]:
            if w not in visited:
                visited.add(w)
                curr_cluster.append(w)
                # Descend into w; the iterator below it resumes where it stopped.
                pending.append(iter(graph[w]))
                break
        else:
            # Neighbour list exhausted: backtrack.
            pending.pop()

# Start a new cluster from every node that no earlier traversal reached.
for node in graph:
    if node not in visited:
        cluster = []
        dfs(node, cluster)
        clusters.append(cluster)

# print(clusters)

In [141]:
# Show the clusters; each inner list holds the node indices that were grouped together.
print(clusters)

[[0, 83, 85, 119, 159, 185], [1], [2, 89, 92, 96, 101, 113, 140, 174, 187, 192, 212, 216, 220, 232, 244, 64, 7, 178, 224, 347, 362, 390, 465, 253, 268, 277, 286, 302, 318, 366, 395, 414, 418, 434, 443, 476, 484], [3, 93, 97, 100, 112, 141, 175, 186, 193, 209, 213, 217, 221, 233, 252, 269, 287, 319, 367, 394, 415, 419, 435, 442, 477], [4], [5], [6], [8], [9, 78, 102, 210, 218, 230, 238, 254, 266, 300, 349, 352, 377, 429, 449, 483, 487], [10, 37, 49, 427], [11, 117, 36, 132, 133, 314, 315, 136, 307, 383, 371, 48, 378, 370, 379, 382, 260, 426, 430, 431, 468, 407, 248, 261, 306, 406], [12], [13], [14], [15], [16], [17], [18, 88, 276, 472], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32, 45, 124, 162, 229, 272, 326, 330, 343, 398, 439], [33], [34], [35], [38, 180, 304, 305, 478, 400], [39], [40], [41], [42], [43, 58, 87, 345], [44], [46], [47], [50], [51], [52, 53, 334], [54], [55], [56], [57], [59], [60, 105, 109, 170, 201, 283, 291, 298, 323], [61, 104, 

In [142]:
def _write_clusters(path, values):
    """Write `clusters` to `path`, one bracketed block per cluster.

    Every member index is looked up positionally in `values` and written followed
    by '; '; each cluster is closed with ']' and a blank line.
    """
    with open(path, 'w') as f:
        for clus in clusters:
            f.write('[')
            f.writelines(f'{values.iloc[idx]}; ' for idx in clus)
            f.write(']\n\n')


# Clusters rendered with the text that was embedded.
_write_clusters('clusters.txt', transformations)

# The same clusters rendered with column 1 of df.
str1Tostr2 = df.iloc[:, 1]
_write_clusters('str1Tostr2Clusters.txt', str1Tostr2)

In [143]:
from collections import Counter

# Histogram of cluster sizes: maps a cluster size to the number of clusters of that size.
lengths = [len(cluster) for cluster in clusters]
print(Counter(lengths))

# Number of distinct values in column 0 of df, for comparison with the cluster count.
unique_values_count = df.iloc[:, 0].nunique()
print("Number of unique values in program column:", unique_values_count)

Counter({1: 284, 2: 9, 4: 3, 6: 2, 3: 2, 8: 2, 37: 1, 25: 1, 17: 1, 26: 1, 11: 1, 9: 1, 12: 1, 5: 1})
Number of unique values in 'isbn' column: 362


Counter({1: 236, 2: 17, 3: 4, 4: 3, 6: 1, 50: 1, 27: 1, 34: 1, 36: 1, 10: 1, 9: 1, 11: 1, 5: 1, 8: 1})

In [144]:
# Load the second set of PROSE program -> transformation pairs. Each non-blank line is
# parsed as
#     <program>==><lhs>-><rhs>
file_path = 'programs_one_to_one_conflict.txt'

with open(file_path, 'r') as file:
    data = file.read()

# One record per line. Blank lines are filtered out instead of calling
# list.remove(""), which raises ValueError when the file has no blank line and
# removes only the first one when there are several (the remaining blank lines
# would then fail in the loop below with an IndexError).
programs_transformations = [line for line in data.split("\n") if line.strip()]

programs = []
transformations = []
lhs = []
rhs = []

for item in programs_transformations:
    # Text before "==>" is stored as the program, the text after it as the transformation.
    item_split = item.split("==>")
    transformation = item_split[1]
    programs.append(item_split[0])
    transformations.append(transformation)
    # Within the transformation, "->" separates the left-hand side from the right-hand side.
    transformation_split = transformation.split("->")
    lhs.append(transformation_split[0])
    rhs.append(transformation_split[1])

# One row per parsed line; the four lists are index-aligned. This replaces the
# DataFrame previously bound to `df`.
df = pd.DataFrame({
    'program': programs,
    'transformation': transformations,
    'lhs': lhs,
    'rhs': rhs
})

df

Unnamed: 0,program,transformation,lhs,rhs
0,"Concat(ConstStr(""Hack ""), Concat(let columnNam...",ADVANCES IN COMPUTERS V56->Hack I.T. - Securit...,ADVANCES IN COMPUTERS V56,Hack I.T. - Security Through Penetration Testi...
1,"Concat(let columnName = ""0"" in let x = ChooseI...",Hack I.T. - Security Through Penetration Testi...,Hack I.T. - Security Through Penetration Testi...,ADVANCES IN COMPUTERS V56
2,"Concat(ConstStr(""\""Klevinsky, T. J.""), let col...","""Zelkowitz, Marvin""->""Klevinsky, T. J.""","""Zelkowitz, Marvin""","""Klevinsky, T. J."""
3,"Concat(ConstStr(""\""Zelkowitz, Marvin""), let co...","""Klevinsky, T. J.""->""Zelkowitz, Marvin""","""Klevinsky, T. J.""","""Zelkowitz, Marvin"""
4,"Concat(ConstStr(""Communication Networking an A...",Wi-Foo: The Secrets of Wireless Hacking->Commu...,Wi-Foo: The Secrets of Wireless Hacking,Communication Networking an Analytical A
...,...,...,...,...
495,"Concat(ConstStr(""\""Beveridge, Jim, Wiener, Rob...","""Fortier,Paul""->""Beveridge, Jim, Wiener, Robert""","""Fortier,Paul""","""Beveridge, Jim, Wiener, Robert"""
496,"Concat(ConstStr(""Accelerated ""), Concat(let co...",Data Structures With C++ Using Stl->Accelerate...,Data Structures With C++ Using Stl,Accelerated C++ Practical Prog By Exampl
497,"Concat(ConstStr(""Data Structures With ""), Conc...",Accelerated C++ Practical Prog By Exampl->Data...,Accelerated C++ Practical Prog By Exampl,Data Structures With C++ Using Stl
498,"Concat(ConstStr(""\""Koenig, Andrew""), let colum...","""Ford, William; Topp, William; Topp, William R...","""Ford, William; Topp, William; Topp, William R.""","""Koenig, Andrew"""


In [145]:
# DISABLED EXPERIMENT: rewrite PROSE operators into plain-English phrases and store the
# result in a 'pseudo-coded' column. The function and its application are commented out,
# so this cell adds no column; it only displays df.
# NOTE(review): the first replacement key below looks like a mis-encoded character
# (possibly the Greek letter epsilon) -- confirm against the program file's encoding
# before re-enabling this code.
# def naturalize_code(code):
#     replacements = {
#         'ÃŽÂµ': 'end of string or no character',
#         'Concat': 'join strings',
#         'ConstStr': 'fixed string',
#         'RegexPair': 'pattern matching',
#         'let columnName = "0" in let x = ChooseInput(vs, columnName) in': 'select column "0" from the data',
#         '\\"': 'quotation mark',
#         'RegexPositionRelative': 'find position relative to pattern',
#         'SubStr': 'extract substring',
#         'PosPair': 'position pair indicating start and end'
#     }

#     # Apply all replacements to the code
#     for old, new in replacements.items():
#         code = re.sub(re.escape(old), new, code)

#     return code

# df['pseudo-coded'] = df.iloc[:, 0].apply(naturalize_code)


df

Unnamed: 0,program,transformation,lhs,rhs,pseudo-coded
0,"Concat(ConstStr(""Hack ""), Concat(let columnNam...",ADVANCES IN COMPUTERS V56->Hack I.T. - Securit...,ADVANCES IN COMPUTERS V56,Hack I.T. - Security Through Penetration Testi...,"join strings(fixed string(""Hack ""), join strin..."
1,"Concat(let columnName = ""0"" in let x = ChooseI...",Hack I.T. - Security Through Penetration Testi...,Hack I.T. - Security Through Penetration Testi...,ADVANCES IN COMPUTERS V56,"join strings(select column ""0"" from the data T..."
2,"Concat(ConstStr(""\""Klevinsky, T. J.""), let col...","""Zelkowitz, Marvin""->""Klevinsky, T. J.""","""Zelkowitz, Marvin""","""Klevinsky, T. J.""","join strings(fixed string(""quotation markKlevi..."
3,"Concat(ConstStr(""\""Zelkowitz, Marvin""), let co...","""Klevinsky, T. J.""->""Zelkowitz, Marvin""","""Klevinsky, T. J.""","""Zelkowitz, Marvin""","join strings(fixed string(""quotation markZelko..."
4,"Concat(ConstStr(""Communication Networking an A...",Wi-Foo: The Secrets of Wireless Hacking->Commu...,Wi-Foo: The Secrets of Wireless Hacking,Communication Networking an Analytical A,"join strings(fixed string(""Communication Netwo..."
...,...,...,...,...,...
495,"Concat(ConstStr(""\""Beveridge, Jim, Wiener, Rob...","""Fortier,Paul""->""Beveridge, Jim, Wiener, Robert""","""Fortier,Paul""","""Beveridge, Jim, Wiener, Robert""","join strings(fixed string(""quotation markBever..."
496,"Concat(ConstStr(""Accelerated ""), Concat(let co...",Data Structures With C++ Using Stl->Accelerate...,Data Structures With C++ Using Stl,Accelerated C++ Practical Prog By Exampl,"join strings(fixed string(""Accelerated ""), joi..."
497,"Concat(ConstStr(""Data Structures With ""), Conc...",Accelerated C++ Practical Prog By Exampl->Data...,Accelerated C++ Practical Prog By Exampl,Data Structures With C++ Using Stl,"join strings(fixed string(""Data Structures Wit..."
498,"Concat(ConstStr(""\""Koenig, Andrew""), let colum...","""Ford, William; Topp, William; Topp, William R...","""Ford, William; Topp, William; Topp, William R.""","""Koenig, Andrew""","join strings(fixed string(""quotation markKoeni..."


In [146]:
# NOTE: despite its name, `transformations` is column 0 of df, which is the 'program'
# column when df comes from the parsing cell above. Each entry is encoded on its own.
transformations = df.iloc[:, 0]
transformation_embeddings = list(map(model.encode, transformations))

# Pairwise cosine similarity of the embeddings with themselves, then report the matrix shape.
cosine_similarities = cosine_similarity(transformation_embeddings)
print(cosine_similarities.shape)

(500, 500)


In [147]:
# Build the similarity graph: entry j is a neighbour of entry i when
# cosine_similarities[i, j] is strictly greater than the threshold.
similarity_threshold = 0.99

# np.where reports the matching (row, col) pairs in row-major order: rows ascending and,
# within a row, columns ascending.
row_indices, col_indices = np.where(cosine_similarities > similarity_threshold)

# groups[i]           -> columns j with cosine_similarities[i, j] > threshold
# undirected_graph[i] -> row-wise and column-wise matches of i, de-duplicated
groups = {}
undirected_graph = {}

# Single pass over the pairs instead of re-masking col_indices for every row.
# .tolist() yields plain Python ints, so the dictionary keys (and therefore the
# cluster members derived from them) are ordinary ints rather than NumPy scalars.
for row, col in zip(row_indices.tolist(), col_indices.tolist()):
    groups.setdefault(row, []).append(col)

# Record every match from both endpoints.
for row, cols in groups.items():
    undirected_graph.setdefault(row, []).extend(cols)
    for col in cols:
        undirected_graph.setdefault(col, []).append(row)

# Convert the lists to sets and back to remove duplicates in the undirected graph
for key in undirected_graph:
    undirected_graph[key] = list(set(undirected_graph[key]))

# Guarantee an entry for every row of df, even when nothing exceeded the threshold.
for i in range(len(df)):
    if i not in groups:
        groups[i] = []
    if i not in undirected_graph:
        undirected_graph[i] = [i]

In [148]:
# Group entries into clusters: each cluster is the set of nodes reachable from a start
# node by following the neighbour lists in `graph`.
# NOTE(review): the traversal uses `groups` (row-wise matches), not `undirected_graph`.
# Both give the same components only if the thresholded matrix is symmetric -- confirm
# that this is the intended graph.
graph = groups.copy()
clusters = []
visited = set()

def dfs(v, curr_cluster):
    """Append to `curr_cluster` every not-yet-visited node reachable from `v`.

    Nodes are appended in depth-first pre-order (the same order a recursive walk
    produces) and are added to the module-level `visited` set. An explicit stack of
    neighbour iterators replaces recursion so that a large component cannot exceed
    Python's recursion limit.

    Args:
        v: start node; must be a key of `graph`.
        curr_cluster: list that receives the visited nodes (mutated in place).
    """
    visited.add(v)
    curr_cluster.append(v)
    pending = [iter(graph[v])]
    while pending:
        for w in pending[-1]:
            if w not in visited:
                visited.add(w)
                curr_cluster.append(w)
                # Descend into w; the iterator below it resumes where it stopped.
                pending.append(iter(graph[w]))
                break
        else:
            # Neighbour list exhausted: backtrack.
            pending.pop()

# Start a new cluster from every node that no earlier traversal reached.
for node in graph:
    if node not in visited:
        cluster = []
        dfs(node, cluster)
        clusters.append(cluster)

In [149]:
# Histogram of cluster sizes: maps a cluster size to the number of clusters of that size.
# (Counter is imported in the earlier cluster-size cell.)
lengths = [len(cluster) for cluster in clusters]
print(Counter(lengths))

# Number of distinct values in column 0 of df. The label previously said 'isbn' column,
# but df has no such column; the value counted is the program column.
unique_values_count = df.iloc[:, 0].nunique()
print("Number of unique values in program column:", unique_values_count)

Counter({1: 445, 2: 6, 4: 2, 29: 1, 6: 1})
Number of unique values in 'isbn' column: 496


Counter({1: 488, 2: 6})