In [1]:
import json
import os
import pandas as pd
from collections import defaultdict

def load_json_files(file_paths):
    """Parse each file in ``file_paths`` as JSON and collect the results.

    Files are opened as UTF-8 text. A file whose content is not valid JSON is
    reported on stdout and left out of the result, so the returned list can be
    shorter than ``file_paths``.

    Args:
        file_paths: Iterable of paths to JSON files.

    Returns:
        list: The parsed documents, in input order, excluding unparsable files.
    """
    parsed_documents = []
    for file_path in file_paths:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            try:
                document = json.load(handle)
            except json.JSONDecodeError:
                # Report and move on; the remaining files are still processed.
                print(f"Json Parsing Error: File {file_path} not a valid json format")
            else:
                parsed_documents.append(document)
    return parsed_documents

def count_source_pairs(json_data):
    """Count graph links grouped by the unordered pair of endpoint 'Source' values.

    Each node's 'id' is mapped to its 'Source'. Every link whose 'source' and
    'target' ids both resolve to a truthy Source adds one to the count for the
    sorted (Source, Source) tuple, so (A, B) and (B, A) share one key.

    Malformed entries are skipped rather than raising: nodes lacking 'id' or
    'Source', links lacking 'source' or 'target', and links that reference an
    unknown node id. Missing or null 'nodes'/'links' are treated as empty.

    Args:
        json_data: Mapping with optional 'nodes' and 'links' lists.

    Returns:
        defaultdict(int): Maps each sorted source pair (a 2-tuple) to the
        number of links observed between nodes of those sources.
    """
    # Node id -> Source label; incomplete nodes cannot contribute to any pair.
    source_by_node = {
        node['id']: node['Source']
        for node in (json_data.get('nodes') or [])
        if 'id' in node and 'Source' in node
    }

    pair_counts = defaultdict(int)
    for link in (json_data.get('links') or []):
        first_source = source_by_node.get(link.get('source'))
        second_source = source_by_node.get(link.get('target'))

        # Unknown endpoints and empty Source values yield no pair.
        if first_source and second_source:
            # Sorting makes the pair order-independent.
            pair = tuple(sorted((first_source, second_source)))
            pair_counts[pair] += 1

    return pair_counts

def main(file_paths):
    """Build and display a table of source-pair link counts per file and overall.

    Each path is loaded with ``load_json_files`` and counted with
    ``count_source_pairs``. The results are combined into a DataFrame indexed
    by (Source 1, Source 2), with one column per file plus a "Total" column,
    sorted by "Total" in descending order, and shown with ``display``.

    A file that ``load_json_files`` could not parse is skipped, so the
    remaining files are still tabulated.

    Args:
        file_paths: Iterable of paths to graph JSON files. Each column label is
            the file's basename with "_graph.json" removed.

    Returns:
        None. Progress lines are printed and the table is displayed.
    """
    total_pair_counts = defaultdict(int)
    per_file_counts = {}

    for file_path in file_paths:
        loaded = load_json_files([file_path])
        if not loaded:
            # load_json_files already reported the parse error and returned an
            # empty list; indexing [0] here would raise IndexError.
            continue
        json_data = loaded[0]

        pair_counts = count_source_pairs(json_data)
        file_label = os.path.basename(file_path).replace("_graph.json", "")
        per_file_counts[file_label] = pair_counts

        print(f"\n✅ Processed: {file_label}, found {len(pair_counts)} source pairs.")

        for pair, count in pair_counts.items():
            total_pair_counts[pair] += count

    # 🔄 Merge the per-file counts into a single DataFrame
    all_pairs = sorted({pair for counts in per_file_counts.values() for pair in counts})

    # One column per file; pairs absent from a file count as 0.
    df_data = {
        file: [counts.get(pair, 0) for pair in all_pairs]
        for file, counts in per_file_counts.items()
    }
    df_data["Total"] = [total_pair_counts[pair] for pair in all_pairs]

    df = pd.DataFrame(df_data, index=pd.MultiIndex.from_tuples(all_pairs, names=["Source 1", "Source 2"]))
    df = df.sort_values(by="Total", ascending=False)

    print("\n📊 Source Pair Overlap Table:")
    display(df)
    # df.to_excel("source_pair_overlap.xlsx")  # optionally save to file

if __name__ == "__main__":
    # Graph exports to analyse; each is expected in the working directory.
    file_paths = ["npm_graph.json", "pypi_graph.json", "ruby_graph.json"]
    main(file_paths)


✅ Processed: npm, found 15 source pairs.

✅ Processed: pypi, found 16 source pairs.

✅ Processed: ruby, found 1 source pairs.

📊 Source Pair Overlap Table:


Unnamed: 0_level_0,Unnamed: 1_level_0,npm,pypi,ruby,Total
Source 1,Source 2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Backstabber-Knife,Mal-PyPI Dataset,0,2897,0,2897
Backstabber-Knife,Phylum,48,918,0,966
Mal-PyPI Dataset,Phylum,0,918,0,918
Backstabber-Knife,Maloss,115,216,37,368
Phylum,Tianwen,269,3,0,272
Mal-PyPI Dataset,Maloss,0,201,0,201
Snyk.io,Tianwen,0,106,0,106
Maloss,Tianwen,61,8,0,69
Backstabber-Knife,Blogs,36,0,0,36
Backstabber-Knife,Tianwen,1,35,0,36


In [None]:
# # Result

# npm_graph_name.json  source pairs Num.:
# ('Backstabber-Knife', 'Maloss'): 115
# ('Backstabber-Knife', 'Phylum'): 48
# ('Backstabber-Knife', 'Socket'): 1
# ('Backstabber-Knife', 'Blogs'): 36
# ('Blogs', 'Maloss'): 6
# ('Backstabber-Knife', 'Snyk.io'): 1
# ('Maloss', 'Snyk.io'): 1
# ('Backstabber-Knife', 'Tianwen'): 1
# ('Maloss', 'Tianwen'): 61
# ('Maloss', 'Socket'): 3
# ('GitHub Advisory', 'Maloss'): 6
# ('Blogs', 'GitHub Advisory'): 1
# ('Phylum', 'Snyk.io'): 9
# ('Phylum', 'Tianwen'): 269
# ('Socket', 'Tianwen'): 2

# pypi_graph_name.json  source pairs Num.:
# ('Backstabber-Knife', 'Mal-PyPI Dataset'): 2897
# ('Backstabber-Knife', 'Maloss'): 216
# ('Mal-PyPI Dataset', 'Maloss'): 201
# ('Backstabber-Knife', 'Phylum'): 918
# ('Mal-PyPI Dataset', 'Phylum'): 918
# ('Backstabber-Knife', 'Tianwen'): 35
# ('Mal-PyPI Dataset', 'Tianwen'): 32
# ('Maloss', 'Tianwen'): 8
# ('Backstabber-Knife', 'DataDog'): 7
# ('DataDog', 'Mal-PyPI Dataset'): 7
# ('DataDog', 'Phylum'): 15
# ('Backstabber-Knife', 'Snyk.io'): 2
# ('Snyk.io', 'Tianwen'): 106
# ('Phylum', 'Snyk.io'): 8
# ('Phylum', 'Tianwen'): 3
# ('Blogs', 'DataDog'): 1

# ruby_graph_name.json  source pairs Num.:
# ('Backstabber-Knife', 'Maloss'): 37

# Total Count Of All Files
# ('Backstabber-Knife', 'Maloss'): 368
# ('Backstabber-Knife', 'Phylum'): 966
# ('Backstabber-Knife', 'Socket'): 1
# ('Backstabber-Knife', 'Blogs'): 36
# ('Blogs', 'Maloss'): 6
# ('Backstabber-Knife', 'Snyk.io'): 3
# ('Maloss', 'Snyk.io'): 1
# ('Backstabber-Knife', 'Tianwen'): 36
# ('Maloss', 'Tianwen'): 69
# ('Maloss', 'Socket'): 3
# ('GitHub Advisory', 'Maloss'): 6
# ('Blogs', 'GitHub Advisory'): 1
# ('Phylum', 'Snyk.io'): 17
# ('Phylum', 'Tianwen'): 272
# ('Socket', 'Tianwen'): 2
# ('Backstabber-Knife', 'Mal-PyPI Dataset'): 2897
# ('Mal-PyPI Dataset', 'Maloss'): 201
# ('Mal-PyPI Dataset', 'Phylum'): 918
# ('Mal-PyPI Dataset', 'Tianwen'): 32
# ('Backstabber-Knife', 'DataDog'): 7
# ('DataDog', 'Mal-PyPI Dataset'): 7
# ('DataDog', 'Phylum'): 15
# ('Snyk.io', 'Tianwen'): 106
# ('Blogs', 'DataDog'): 1 
