In [None]:
import os
import json
from datetime import datetime
import pandas as pd
import math
import numpy
import matplotlib as mpl
import numpy as np
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
import sys

In [None]:
# STEP1: sort nodes by time

# time_name = 'time'
# Name of the node field holding the timestamp used for ordering.
time_name = 'published_at'
def sort_nodes(nodes):
    """Return the nodes in chronological order, with undated nodes last.

    A node counts as undated when its ``time_name`` field equals the string
    ``"Not available"``. Dated nodes are ordered by their parsed timestamp;
    undated nodes follow, ordered case-insensitively by ``name``.

    Args:
        nodes: Iterable of dict-like nodes. Every node must contain the
            ``time_name`` key; undated nodes must also contain ``name``.

    Returns:
        A new list with the dated nodes first and the undated nodes after.
    """
    def _timestamp(node):
        # Every "Z" is rewritten to an explicit "+00:00" offset before parsing.
        return datetime.fromisoformat(node[time_name].replace("Z", "+00:00"))

    dated, undated = [], []
    for node in nodes:
        bucket = dated if node[time_name] != "Not available" else undated
        bucket.append(node)

    dated = sorted(dated, key=_timestamp)
    undated = sorted(undated, key=lambda node: node['name'].lower())
    return dated + undated

def process_json_files_in_folder(folder_path):
    """Sort the ``nodes`` list of every ``.json`` file in ``folder_path`` in place.

    Each JSON file is loaded, its ``nodes`` entry (when present) is replaced by
    the result of ``sort_nodes``, and the file is written back with a 4-space
    indent and non-ASCII characters kept as-is. Files without a ``nodes`` key
    are still rewritten with that formatting. One progress line is printed per
    file.

    Args:
        folder_path: Directory whose ``.json`` files are processed.
    """
    for entry in os.listdir(folder_path):
        if not entry.endswith('.json'):
            continue

        target = os.path.join(folder_path, entry)
        with open(target, 'r', encoding='utf-8') as source:
            payload = json.load(source)

        if 'nodes' in payload:
            payload['nodes'] = sort_nodes(payload['nodes'])

        # The same file is overwritten with the re-ordered content.
        with open(target, 'w', encoding='utf-8') as sink:
            json.dump(payload, sink, ensure_ascii=False, indent=4)

        print(f"Processed and sorted nodes in {entry}")

def process_multiple_folders(folder_paths):
    """Run ``process_json_files_in_folder`` on each folder that exists.

    Args:
        folder_paths: Iterable of directory paths. A path that does not exist
            is reported on stdout and skipped.
    """
    for folder_path in folder_paths:
        if not os.path.exists(folder_path):
            print(f"Folder {folder_path} does not exist.")
            continue
        process_json_files_in_folder(folder_path)


# Folders whose JSON files are re-sorted by this cell. Exactly one group of
# entries is meant to be active at a time: comment/uncomment the
# "dependent-hidden" and "co-existing" groups to switch between graph types.
folder_paths = [
# # # dependent-hidden graph
#   '../KG/Dependency_Edge/metadata_dep/npm',
#   '../KG/Dependency_Edge/sourcecode_dep/npm',
#   '../KG/Dependency_Edge/metadata_dep/pypi',
#   '../KG/Dependency_Edge/sourcecode_dep/pypi',
#   '../KG/Dependency_Edge/metadata_dep/ruby',
#   '../KG/Dependency_Edge/sourcecode_dep/ruby'

# co-existing graph
  '../KG/Co-existing_Edge/source_report/npm',
  '../KG/Co-existing_Edge/WebCrawler_report/npm',
  '../KG/Co-existing_Edge/source_report/pypi',
  '../KG/Co-existing_Edge/WebCrawler_report/pypi',
  '../KG/Co-existing_Edge/source_report/ruby',
  '../KG/Co-existing_Edge/WebCrawler_report/ruby'
]

# Runs the sort over every listed folder; the JSON files are overwritten in place.
process_multiple_folders(folder_paths)

In [None]:
# STEP2: generate evolution for dependent-hidden/co-existing graphs
def replace_nan_with_none(data):
    """Return ``data`` with every float NaN replaced by ``None``.

    Dicts and lists are rebuilt recursively (dict keys are left untouched);
    any other value, including tuples, is returned unchanged.
    """
    if isinstance(data, float) and math.isnan(data):
        return None
    if isinstance(data, dict):
        return {key: replace_nan_with_none(value) for key, value in data.items()}
    if isinstance(data, list):
        return list(map(replace_nan_with_none, data))
    # Everything else (None included) passes through as-is.
    return data
    
# Module-level tallies, one per change type, incremented by compare_nodes.
# total_change_num is initialised here as well, although no visible code
# updates the global of that name.
total_name_change = total_version_change = total_description_change = 0
total_dependencies_change = total_code_change = total_change_num = 0

def compare_nodes(prev_node, curr_node):
    """Compare two consecutive nodes and bump the module-level change counters.

    Counting rules:
      * name change: the ``name`` values differ;
      * version change: the names match but ``version`` differs (a missing
        version is treated as ``'none'``);
      * description / code / dependencies change: only evaluated when both
        nodes have a ``SHA-256`` value other than ``'none'``; they compare
        ``description``, ``SHA-256`` and ``dependencies`` respectively.

    Both nodes are first passed through ``replace_nan_with_none``.
    """
    global total_name_change, total_version_change
    global total_description_change, total_dependencies_change, total_code_change

    before = replace_nan_with_none(prev_node)
    after = replace_nan_with_none(curr_node)

    if before['name'] != after['name']:
        total_name_change += 1
    elif before.get('version', 'none') != after.get('version', 'none'):
        total_version_change += 1

    sha_before = before.get('SHA-256', 'none')
    sha_after = after.get('SHA-256', 'none')
    if sha_before == 'none' or sha_after == 'none':
        # Without a hash on both sides the content-level checks are skipped.
        return

    if before['description'] != after['description']:
        total_description_change += 1
    if sha_before != sha_after:
        total_code_change += 1
    if before['dependencies'] != after['dependencies']:
        total_dependencies_change += 1

def process_json_files_in_folder(folder_path):
    """Feed every pair of consecutive nodes in each ``.json`` file to ``compare_nodes``.

    Args:
        folder_path: Directory to scan. Each ``.json`` file is loaded and the
            entries of its ``nodes`` list (empty when the key is absent) are
            compared pairwise in file order.
    """
    for entry in os.listdir(folder_path):
        if not entry.endswith('.json'):
            continue

        with open(os.path.join(folder_path, entry), 'r', encoding='utf-8') as source:
            payload = json.load(source)

        nodes = payload.get('nodes', [])

        # Walk adjacent pairs: (0, 1), (1, 2), ...
        for position in range(len(nodes) - 1):
            compare_nodes(nodes[position], nodes[position + 1])

def process_multiple_folders(folder_paths):
    """Aggregate change counts over all folders and export the change-type ratios.

    Every existing folder is passed to ``process_json_files_in_folder``, which
    updates the module-level ``total_*_change`` counters. Each counter is then
    divided by the number of name changes plus version changes, and the five
    ratios (CN, CV, CD, CDep, CC) are written to ``./evolution.xlsx`` and
    printed.

    Args:
        folder_paths: Iterable of directory paths. Paths that are not existing
            directories are reported on stdout and skipped.

    Notes:
        When no name or version change was counted the denominator is zero;
        the ratios are then reported as NaN (undefined) with a warning instead
        of raising ``ZeroDivisionError``.
    """
    for folder_path in folder_paths:
        # Skip missing folders rather than aborting the whole run with
        # FileNotFoundError from os.listdir.
        if not os.path.isdir(folder_path):
            print(f"Folder {folder_path} does not exist.")
            continue
        process_json_files_in_folder(folder_path)

    counts = {
        "CN": total_name_change,
        "CV": total_version_change,
        "CD": total_description_change,
        "CDep": total_dependencies_change,
        "CC": total_code_change,
    }

    # Denominator: only name and version changes are counted (local variable;
    # the module-level total_change_num is not modified).
    total_change_num = total_name_change + total_version_change
    if total_change_num == 0:
        print("Warning: no name or version changes were counted; ratios are undefined (NaN).")

    final_results = {
        label: (count / total_change_num if total_change_num else float('nan'))
        for label, count in counts.items()
    }

    # Convert to DataFrame
    df = pd.DataFrame(list(final_results.items()), columns=['Change Type', 'Value'])

    output_file = './evolution.xlsx'
    df.to_excel(output_file, index=False)

    print(f"Final aggregated results saved to {output_file}")
    print("Final aggregated results across all folders:", final_results)

# Folders whose JSON node files are compared by this cell. Exactly one group
# of entries is meant to be active at a time: comment/uncomment the
# "dependent-hidden" and "co-existing" groups to switch between graph types.
folder_paths = [
# # # dependent-hidden graph
#   '../KG/Dependency_Edge/metadata_dep/npm',
#   '../KG/Dependency_Edge/sourcecode_dep/npm',
#   '../KG/Dependency_Edge/metadata_dep/pypi',
#   '../KG/Dependency_Edge/sourcecode_dep/pypi',
#   '../KG/Dependency_Edge/metadata_dep/ruby',
#   '../KG/Dependency_Edge/sourcecode_dep/ruby'

# co-existing graph
  '../KG/Co-existing_Edge/source_report/npm',
  '../KG/Co-existing_Edge/WebCrawler_report/npm',
  '../KG/Co-existing_Edge/source_report/pypi',
  '../KG/Co-existing_Edge/WebCrawler_report/pypi',
  '../KG/Co-existing_Edge/source_report/ruby',
  '../KG/Co-existing_Edge/WebCrawler_report/ruby'
]

# Aggregates the change counts over the listed folders and writes ./evolution.xlsx.
process_multiple_folders(folder_paths)


In [None]:
# STEP2: generate evolution for similar graphs
import pandas as pd

def replace_nan_with_none(node):
    """Return a plain dict copy of ``node`` with NaN-like values set to ``None``.

    ``node`` only needs an ``items()`` method. A value is treated as NaN-like
    when it compares unequal to itself. The check is not recursive.
    """
    cleaned = {}
    for key, value in node.items():
        is_nan = value != value
        cleaned[key] = None if is_nan else value
    return cleaned

def process_multiple_excel(excel_files):
    """Count node changes inside each cluster of every workbook and export the ratios.

    For each Excel file the rows are grouped by ``cluster``, ordered by
    ``published_at`` within the group, and every pair of consecutive rows is
    compared:

      * name change: ``name`` differs;
      * version change: names match but ``version`` differs;
      * description / code / dependencies change: only evaluated when both
        rows have a ``SHA-256`` value other than ``'none'``.

    The counts of all files are summed, each sum is divided by the total
    number of name plus version changes, and the five ratios (CN, CV, CD,
    CDep, CC) are written to ``./evolution.xlsx`` and printed.

    Args:
        excel_files: Iterable of workbook paths. A workbook lacking any of the
            required columns is reported on stdout and skipped.

    Notes:
        When the summed name + version change count is zero (for example when
        every file was skipped) the ratios are reported as NaN with a warning
        instead of raising ``ZeroDivisionError``.
    """
    required_columns = ['cluster', 'published_at', 'name', 'version', 'SHA-256', 'description', 'dependencies']
    # One row per processed file:
    # [file, name, version, description, dependencies, code, name + version].
    # NOTE(review): these per-file statistics are only aggregated below and are
    # never written anywhere.
    all_results = []

    for excel_file in excel_files:
        df = pd.read_excel(excel_file)

        if not all(col in df.columns for col in required_columns):
            print(f"Error: The Excel file {excel_file} must contain 'cluster', 'published_at', 'name', 'version', 'SHA-256', 'description', and 'dependencies' columns.")
            continue

        total_name_change = 0
        total_version_change = 0
        total_description_change = 0
        total_dependencies_change = 0
        total_code_change = 0

        for _, group in df.groupby('cluster'):
            prev_node = None
            for _, row in group.sort_values(by='published_at').iterrows():
                # Normalise NaN cells once per row so both sides of every
                # comparison use None for missing values.
                curr_node = replace_nan_with_none(row)

                if prev_node is not None:
                    if prev_node['name'] != curr_node['name']:
                        total_name_change += 1
                    if prev_node['name'] == curr_node['name'] and prev_node.get('version', 'none') != curr_node.get('version', 'none'):
                        total_version_change += 1

                    prev_sha = prev_node.get('SHA-256', 'none')
                    curr_sha = curr_node.get('SHA-256', 'none')
                    # Content-level checks need a hash on both sides.
                    if prev_sha != 'none' and curr_sha != 'none':
                        if prev_node['description'] != curr_node['description']:
                            total_description_change += 1
                        if prev_sha != curr_sha:
                            total_code_change += 1
                        if prev_node['dependencies'] != curr_node['dependencies']:
                            total_dependencies_change += 1

                prev_node = curr_node

        total_change_num = total_name_change + total_version_change
        all_results.append([
            excel_file,
            total_name_change,
            total_version_change,
            total_description_change,
            total_dependencies_change,
            total_code_change,
            total_change_num
        ])

    # Aggregate totals across all files
    total_name = sum(row[1] for row in all_results)
    total_version = sum(row[2] for row in all_results)
    total_description = sum(row[3] for row in all_results)
    total_dependencies = sum(row[4] for row in all_results)
    total_code = sum(row[5] for row in all_results)
    total_changes = sum(row[6] for row in all_results)

    if total_changes == 0:
        print("Warning: no name or version changes were counted; ratios are undefined (NaN).")

    def _ratio(count):
        # A zero denominator leaves the ratio undefined rather than crashing.
        return count / total_changes if total_changes else float('nan')

    # Output change type ratio
    ratio_df = pd.DataFrame([
        ['CN', _ratio(total_name)],
        ['CV', _ratio(total_version)],
        ['CD', _ratio(total_description)],
        ['CDep', _ratio(total_dependencies)],
        ['CC', _ratio(total_code)],
    ], columns=['Change Type', 'Value'])

    ratio_df.to_excel('./evolution.xlsx', index=False)
    print("Final aggregated results across all folders:", ratio_df)
    print("Change type ratio saved to evolution.xlsx")

# Workbooks analysed for the similar-edge graphs, one per listed path.
excel_files = [
    '../KG/Similar_Edge/npm_end.xlsx', 
    '../KG/Similar_Edge/pypi_end.xlsx',  
    '../KG/Similar_Edge/ruby_end.xlsx',
]
# Computes the change-type ratios over all workbooks and writes ./evolution.xlsx.
process_multiple_excel(excel_files)

In [None]:
# STEP3: paint
# Shared plotting configuration for the chart drawn by this cell.

# Adds the relative directory 'figure' to the module search path.
# NOTE(review): nothing visible in this notebook imports from it — confirm it is still needed.
sys.path.append('figure')

# Font sizes. single_bar reads label_fontsize (axis labels) and tick_fontsize
# (tick labels); barlabel_fontsize and legend_fontsize are not referenced by
# the code visible here.
label_fontsize = 30
tick_fontsize = 30
barlabel_fontsize = 24
legend_fontsize = 24
# Line/marker styling; not referenced by the code visible here.
linewidth = 3
markersize = 12

# Indicator-line styling; not referenced by the code visible here.
indicator_linewidth = 2
indicator_color = 'grey'
indicator_linestyle = 'dotted'

# dpi is only used by the commented-out savefig call in single_bar;
# figsize is the default value of single_bar's figsize parameter.
dpi = 600
figsize = (9, 6)
# mpl.use('TkAgg')
# Use Times New Roman as the default font family for subsequent figures.
plt.rc('font', family='Times New Roman')

def single_bar(x, y, path, ylabel, xlabel=None, width=0.5, figsize=figsize, rotation=0, ylim=None, **kwargs):
    """Draw and show a single-series bar chart with percentage labels on the bars.

    Args:
        x: Sequence of bar positions/labels; also used as the x tick labels.
        y: Sequence of bar heights. Each bar is annotated with ``value * 100``
            formatted to two decimals and followed by ``%``.
        path: Intended output path of the figure. Currently unused because
            the ``savefig`` call at the end of the function is commented out.
        ylabel: Label of the y axis.
        xlabel: Label of the x axis (optional).
        width: Bar width passed to ``plt.bar``.
        figsize: Figure size passed to ``plt.figure``.
        rotation: Rotation of the x tick labels.
        ylim: Value passed to ``plt.ylim``.
        **kwargs: Optional tick overrides. When ``x_step`` is given, ``x_min``
            and ``x_max`` must be given too and the x ticks are placed at
            ``np.arange(x_min, x_max, x_step)``; ``y_step`` / ``y_min`` /
            ``y_max`` work the same way for the y axis.
    """
    plt.figure(figsize=figsize)
    plt.bar(x, y, width=width, color='#14517C')

    # Annotate each bar with its height expressed as a percentage.
    for position, height in zip(x, y):
        plt.text(position, height, "{:.2f}%".format(height * 100), ha='center', va='bottom', fontsize=25)

    if 'x_step' in kwargs:
        plt.xticks(np.arange(kwargs['x_min'], kwargs['x_max'], kwargs['x_step']),
                   x,
                   rotation=rotation,
                   size=tick_fontsize)
    else:
        plt.xticks(np.arange(len(x)), x, rotation=rotation, size=tick_fontsize)
    plt.xlabel(xlabel, size=label_fontsize)

    if 'y_step' in kwargs:
        y_ticks = np.arange(kwargs['y_min'], kwargs['y_max'], kwargs['y_step'])
        plt.yticks(y_ticks,
                   labels=[str(tick) for tick in y_ticks],
                   size=tick_fontsize)
    else:
        plt.yticks(size=tick_fontsize)
    plt.ylabel(ylabel, size=label_fontsize)
    plt.ylim(ylim)

    plt.tight_layout()
    # plt.savefig(path, dpi=dpi)
    plt.show()

def evolution():
    """Plot the change-type ratios stored in ``evolution.xlsx`` as a bar chart.

    The sheet ``Sheet1`` is read without a header row; its first row (the
    column titles) is dropped, the first column supplies the bar labels and
    the second column the bar heights.
    """
    table = pd.read_excel('evolution.xlsx', sheet_name='Sheet1', header=None)
    rows = [(row[0], row[1]) for _, row in table.iloc[1:].iterrows()]

    labels = np.array([label for label, _ in rows])
    ratios = np.array([ratio for _, ratio in rows])

    single_bar(
        x=labels,
        y=ratios,
        ylabel="Percentage",
        rotation=0,
        path="./evolution.pdf",
        ylim=(0, 1),
    )

# Render the change-type bar chart from the ratios stored in evolution.xlsx.
evolution()