# RQ3: Distribution 

## Euler Diagram

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib_set_diagrams import EulerDiagram, get_subsets

# Load the per-line project data.
data_dir = '../project_data/'
df = pd.read_csv(data_dir + "all_projects.csv")

# Key every row by project + class + line number, joined without a separator.
# NOTE(review): without a separator, distinct rows could in principle produce
# the same key (e.g. a class name ending in a digit) — confirm this cannot
# happen in the data.
df["unique_identifier"] = (
    df["project"].astype(str)
    + df["class"].astype(str)
    + df["line_no"].astype(str)
)
df = df.sort_values('unique_identifier', ascending=True).reset_index(drop=True)


def _ids_where(flag_column):
    """Return the set of unique identifiers of rows whose flag column equals True."""
    flagged = df[flag_column] == True  # noqa: E712 - element-wise comparison
    return set(df.loc[flagged, 'unique_identifier'].unique())


# One identifier set per criterion column.
covered = _ids_where('is_clover_covered')
porbs = _ids_where('on_porbs_slice')
slicer4j = _ids_where('on_slicer4j_slice')
pseudosweep = _ids_where('required_ps')
cov = _ids_where('covered_ps')

print(porbs)

# `covered` is computed above but is not one of the sets that get drawn.
set_list = [porbs, slicer4j, pseudosweep, cov]

fig, ax = plt.subplots()

# Labels are given in the same order as the sets in `set_list`.
diagram = EulerDiagram.from_sets(
    set_list,
    ax=ax,
    set_labels=[
        "Observational\nSlice",
        "Dynamic\nSlice",
        "Required",
        "\n\n\n\n\nPS_Covered",
    ],
)

print(get_subsets(set_list))

# Make every drawn subset region semi-transparent with a black outline.
for region in diagram.subset_artists.values():
    region.set_alpha(0.4)
    region.set_edgecolor("black")

plt.title("Lines Fulfilling Each Criterion")
plt.tight_layout()


## Calculating Slice and Required Intersections


In [None]:
# Restrict each slice set to the identifiers that are also in `cov`.
cov_porbs = porbs & cov
cov_slicer4j = slicer4j & cov

# Identifiers in `cov` that are shared by two of the three criteria.
cov_porbs_slicer4j = cov_porbs & cov_slicer4j
cov_porbs_pseudosweep = cov_porbs & pseudosweep
cov_slicer4j_pseudosweep = cov_slicer4j & pseudosweep

# Identifiers in `cov` that satisfy all three criteria.
overall_intersection = cov_porbs_slicer4j & pseudosweep

# Report the size of each intersection.
for label, members in (
    ("cov_porbs_slicer4j", cov_porbs_slicer4j),
    ("cov_porbs_pseudosweep", cov_porbs_pseudosweep),
    ("cov_slicer4j_pseudosweep", cov_slicer4j_pseudosweep),
    ("overall_intersection", overall_intersection),
):
    print(label, len(members))

## Calculating Gap Intersections

In [None]:
# A "gap" is the part of `cov` that a given criterion's set does not contain.
gap_porbs = cov - porbs
gap_slicer4j = cov - slicer4j
gap_pseudosweep = cov - pseudosweep

# Identifiers that fall in the gap of two criteria at once.
gap_porbs_slicer4j = gap_porbs & gap_slicer4j
gap_porbs_pseudosweep = gap_porbs & gap_pseudosweep
gap_slicer4j_pseudosweep = gap_slicer4j & gap_pseudosweep

# Identifiers that fall in the gap of all three criteria.
overall_gap_intersection = gap_porbs & gap_slicer4j & gap_pseudosweep

# Print every set in full, individual gaps first, then pairs, then the triple.
print("gap_porbs", gap_porbs)
print("gap_slicer4j", gap_slicer4j)
print("gap_pseudosweep", gap_pseudosweep)

print("gap_porbs_slicer4j", gap_porbs_slicer4j)
print("gap_porbs_pseudosweep", gap_porbs_pseudosweep)
print("gap_slicer4j_pseudosweep", gap_slicer4j_pseudosweep)

print("overall_gap_intersection", overall_gap_intersection)

## Visualising the Coverage Gaps


In [None]:
def get_symbol(prefix, line) -> "tuple[str | None, str]":
    """Return the LaTeX marker and gap-set name for one line.

    The identifier is ``prefix`` immediately followed by ``line`` and is
    looked up in the module-level gap sets, from the three-way intersection
    through the pairwise intersections down to the individual gaps. The first
    set that contains the identifier decides the result.

    Args:
        prefix: Text placed before the line number to form the identifier.
        line: Line number (or any value whose string form completes the key).

    Returns:
        A ``(symbol, gap_name)`` pair, where ``symbol`` is a LaTeX math-mode
        string and ``gap_name`` is the name of the matching set. If the
        identifier is in none of the sets, ``(None, "")`` is returned.
    """
    identifier = f"{prefix}{line}"

    # Ordered from most specific to least specific; first match wins.
    priority = (
        # overall intersection
        (overall_gap_intersection, r"$\blacksquare$", "overall_gap_intersection"),
        # overlaps
        (gap_porbs_pseudosweep, r"$\boxtimes$", "gap_porbs_pseudosweep"),
        (gap_porbs_slicer4j, r"$\boxminus$", "gap_porbs_slicer4j"),
        (gap_slicer4j_pseudosweep, r"$\boxplus$", "gap_slicer4j_pseudosweep"),
        # individual
        (gap_porbs, r"$\circledcirc$", "gap_porbs"),
        (gap_pseudosweep, r"$\circleddash$", "gap_pseudosweep"),
        (gap_slicer4j, r"$\circledast$", "gap_slicer4j"),
    )
    for members, symbol, gap_name in priority:
        if identifier in members:
            return symbol, gap_name
    return None, ""
    

In [None]:
# Build one table row per (project, class) holding the gap symbols found on
# that class's lines, then emit the table as LaTeX.

# Raw string: "\h" is not a valid escape sequence in a normal string literal.
HGAP = r"\hgap"
# A row keeps the class name plus at most 59 symbols.
MAX_ROW_CELLS = 60

row_frames = []

statement_types = pd.DataFrame(columns=["Type", "Gap"])

for project, clazz in set(zip(df["project"], df["class"])):
    prefix = f"{project}{clazz}"
    # Select this class's rows once instead of re-filtering the whole frame
    # for every line.
    class_rows = df.loc[(df["project"] == project) & (df["class"] == clazz)]
    line_total = len(class_rows)

    symbols = []
    # NOTE(review): this is reset for every class, so after the loop it only
    # holds the rows of whichever class was iterated last — confirm intended.
    statement_types = pd.DataFrame(columns=["Type", "Gap"])

    # Stmt gap locations.
    # Assumes line numbers run from 1 to the class's row count — TODO confirm.
    for line in range(1, line_total + 1):
        symbol_latex, gap = get_symbol(prefix, line)
        if symbol_latex is not None:
            symbols.append(HGAP + symbol_latex.strip() + HGAP)

        statement_type = class_rows.loc[
            class_rows["line_no"] == line, "statement_type"
        ]
        # Skip lines with no row or with a missing ("nan") statement type.
        if len(statement_type) > 0 and str(statement_type.values[0]) != "nan":
            statement_type = statement_type.values[0]
            statement_types.loc[len(statement_types)] = [statement_type, gap]

    symbols.sort()
    symbols.insert(0, clazz)

    row_frames.append(pd.DataFrame([symbols[:MAX_ROW_CELLS]]))

# Concatenate once at the end rather than growing the frame inside the loop.
df_visualize = pd.concat(row_frames) if row_frames else pd.DataFrame()

# Output latex
# Column 0 is the class name, column 1 the first symbol; column 1 is absent
# when no class has any gap symbol, so only sort by columns that exist.
sort_columns = [column for column in (0, 1) if column in df_visualize.columns]
if sort_columns:
    df_visualize.sort_values(sort_columns, inplace=True)
if 0 in df_visualize.columns:
    df_visualize.drop([0], axis=1, inplace=True)

# to_latex returns the table as a string when no buffer is given, so it has
# to be printed explicitly to be visible.
latex_table = df_visualize.to_latex(
    index=False,
    na_rep=HGAP * 2,
)
print(latex_table)
print(df_visualize)

In [None]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity of two sets.

    This is the size of the intersection divided by the size of the union.
    When the union is empty (both sets are empty) the result is 0.0.
    """
    shared = set1.intersection(set2)
    combined = set1.union(set2)
    if not combined:
        return 0.0
    return len(shared) / len(combined)

def jaccard_distance(set1, set2):
    """Return the Jaccard distance of two sets: one minus their similarity."""
    similarity = jaccard_similarity(set1, set2)
    return 1 - similarity

# The three pairings of gap sets, in the order they are reported below.
gap_pairs = (
    (gap_porbs, gap_slicer4j),
    (gap_porbs, gap_pseudosweep),
    (gap_slicer4j, gap_pseudosweep),
)

# Jaccard similarity for each pairing.
(
    jaccard_porbs_slicer4j,
    jaccard_porbs_pseudosweep,
    jaccard_slicer4j_pseudosweep,
) = (jaccard_similarity(left, right) for left, right in gap_pairs)

# Jaccard distance for each pairing.
(
    distance_porbs_slicer4j,
    distance_porbs_pseudosweep,
    distance_slicer4j_pseudosweep,
) = (jaccard_distance(left, right) for left, right in gap_pairs)

# Report similarities first, then a blank line, then distances.
print(f"Jaccard similarity (gap_porbs, gap_slicer4j): {jaccard_porbs_slicer4j}")
print(f"Jaccard similarity (gap_porbs, gap_pseudosweep): {jaccard_porbs_pseudosweep}")
print(f"Jaccard similarity (gap_slicer4j, gap_pseudosweep): {jaccard_slicer4j_pseudosweep}")
print()

print(f"Jaccard distance (gap_porbs, gap_slicer4j): {distance_porbs_slicer4j}")
print(f"Jaccard distance (gap_porbs, gap_pseudosweep): {distance_porbs_pseudosweep}")
print(f"Jaccard distance (gap_slicer4j, gap_pseudosweep): {distance_slicer4j_pseudosweep}")

