In [None]:
import os
import json
import pandas as pd
from datetime import datetime
import numpy
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from shapely.geometry import LineString
import matplotlib.ticker as ticker
import statsmodels.api as sm

In [None]:
# for similar graph

def process_multiple_excels(input_excel_paths, output_excel_path):
    """Compute the active period (in days) of every cluster in several workbooks.

    Every input workbook must contain a ``cluster`` column and a
    ``published_at`` column. Rows whose ``published_at`` equals the literal
    string ``'Not available'`` are discarded. For each remaining cluster the
    active period is the span between its earliest and latest parseable
    timestamp, expressed in days with two decimals. Clusters with fewer than
    two parseable timestamps get ``"0.00"``.

    Args:
        input_excel_paths: Iterable of paths to the input ``.xlsx`` files.
        output_excel_path: Path of the workbook to write. It receives two
            columns: ``cluster`` (``"<input file stem>_<cluster key>"``) and
            ``time`` (the formatted number of days, stored as a string).
    """
    all_results = []

    for input_path in input_excel_paths:
        print(f"⏳ Processing: {input_path}")
        source_name = os.path.splitext(os.path.basename(input_path))[0]

        raw = pd.read_excel(input_path)

        # Take an explicit copy: assigning a column on a filtered slice of
        # ``raw`` triggers pandas' SettingWithCopyWarning.
        records = raw[raw['published_at'] != 'Not available'].copy()

        # Unparseable values become NaT; utc=True puts naive and
        # offset-carrying timestamps on one comparable time line.
        records['published_at'] = pd.to_datetime(
            records['published_at'], errors='coerce', utc=True
        )

        for cluster, group in records.groupby('cluster'):
            # Count parseable timestamps rather than rows, so a cluster made
            # only of unparseable dates does not produce the string "nan".
            stamps = group['published_at'].dropna()

            if len(stamps) > 1:
                span = stamps.max() - stamps.min()
                time_diff_str = f"{span.total_seconds() / 86400:.2f}"  # 86400 s = 1 day
            else:
                time_diff_str = "0.00"

            all_results.append([f"{source_name}_{cluster}", time_diff_str])

    result_df = pd.DataFrame(all_results, columns=['cluster', 'time'])
    result_df.to_excel(output_excel_path, index=False)
    print(f"✅ All results saved to: {output_excel_path}")


if __name__ == "__main__":
    # Workbooks holding the similar-edge clusters, one per package ecosystem.
    input_excel_paths = [
        '../KG/Similar_Edge/ruby_end.xlsx',
        '../KG/Similar_Edge/npm_end.xlsx',
        '../KG/Similar_Edge/pypi_end.xlsx'
    ]
    # active_time_similar() reads "active_time_similar.xlsx", and the JSON
    # processing step writes its own results to "./active_time.xlsx"; writing
    # here under the name the plot expects keeps the two result sets apart.
    output_excel_path = './active_time_similar.xlsx'
    process_multiple_excels(input_excel_paths, output_excel_path)

In [None]:
# for dependent-hidden/co-existing graph

# Key under which every node of a subgraph stores its timestamp.
time_name = 'published_at'
# time_name = 'time'


def _parse_node_time(raw_value):
    """Turn one node timestamp into a naive ``datetime``, or ``None`` if unavailable.

    Accepts ISO-8601 strings with or without a trailing ``Z``. Timestamps that
    carry a UTC offset are shifted to UTC and made naive so that all results
    can be compared with each other.
    """
    if raw_value is None or raw_value == "Not available":
        return None

    if isinstance(raw_value, datetime):
        parsed = raw_value
    else:
        text = raw_value.strip()
        # Only drop the last character when it really is the UTC designator;
        # chopping it unconditionally corrupts timestamps that lack it.
        if text[-1:] in ('Z', 'z'):
            text = text[:-1]
        parsed = datetime.fromisoformat(text)

    offset = parsed.utcoffset()
    if offset is not None:
        parsed = (parsed - offset).replace(tzinfo=None)
    return parsed


def extract_time_difference_from_subgraph(subgraph):
    """Return the time span covered by a subgraph's nodes, in days.

    Args:
        subgraph: Mapping with a ``'nodes'`` list. Every node must have the
            key named by the module-level ``time_name``; a missing key raises
            ``KeyError``. The value is an ISO-8601 string or the literal
            ``"Not available"``.

    Returns:
        The difference between the latest and earliest available timestamp as
        a string with two decimals (e.g. ``"2.50"``), or the string ``'None'``
        when no node has an available timestamp.

    The input is not modified, so the function can be called repeatedly on
    the same subgraph.
    """
    parsed_times = (_parse_node_time(node[time_name]) for node in subgraph['nodes'])
    dates = [moment for moment in parsed_times if moment is not None]

    if not dates:
        return 'None'

    time_diff = max(dates) - min(dates)
    time_diff_in_days = time_diff.total_seconds() / 86400  # 86400s = 1d
    return f"{time_diff_in_days:.2f}"

def process_json_files_in_folders(folders, output_path='./active_time.xlsx'):
    """Compute the time span of every JSON subgraph found under ``folders``.

    Each folder is walked recursively. Every ``*.json`` file is loaded and
    passed to ``extract_time_difference_from_subgraph``; the results are
    written to an Excel workbook with the columns ``file_path`` and ``time``.

    Args:
        folders: Iterable of directory paths to scan.
        output_path: Workbook to write; defaults to ``'./active_time.xlsx'``.
    """
    time_differences = []

    for folder_path in folders:
        # os.walk yields nothing for a missing directory, so a mistyped path
        # would otherwise drop its data without any hint.
        if not os.path.isdir(folder_path):
            print(f"Warning: folder not found, skipped: {folder_path}")
            continue

        for root, dirs, files in os.walk(folder_path):
            dirs.sort()  # in-place sort makes the traversal order deterministic
            for filename in sorted(files):
                if not filename.endswith('.json'):
                    continue

                file_path = os.path.join(root, filename)
                # Decode as UTF-8 explicitly instead of relying on the
                # platform's default encoding.
                with open(file_path, 'r', encoding='utf-8') as file:
                    subgraph = json.load(file)

                time_differences.append({
                    'file_path': file_path,
                    'time': extract_time_difference_from_subgraph(subgraph)
                })

    # Naming the columns keeps the header present even when nothing was found.
    df = pd.DataFrame(time_differences, columns=['file_path', 'time'])
    df.to_excel(output_path, index=False)
    print(f"Time differences saved to {output_path}")

# TODO: pick the graph family to process by leaving exactly one of the two
# folder groups below uncommented.
folders = [
# # dependent-hidden graph
#   '../KG/Dependency_Edge/metadata_dep/npm',
#   '../KG/Dependency_Edge/sourcecode_dep/npm',
#   '../KG/Dependency_Edge/metadata_dep/pypi',
#   '../KG/Dependency_Edge/sourcecode_dep/pypi',
#   '../KG/Dependency_Edge/metadata_dep/ruby',
#   '../KG/Dependency_Edge/sourcecode_dep/ruby'

# # co-existing graph
  '../KG/Co-existing_Edge/source_report/npm',
  '../KG/Co-existing_Edge/WebCrawler_report/npm',
  # This entry used to lack the 'KG/' prefix shared by all other folders.
  '../KG/Co-existing_Edge/source_report/pypi',
  '../KG/Co-existing_Edge/WebCrawler_report/pypi',
  '../KG/Co-existing_Edge/source_report/ruby',
  '../KG/Co-existing_Edge/WebCrawler_report/ruby'
]
process_json_files_in_folders(folders)

In [None]:
# paint

# ---- Text sizes used by the figures ----
label_fontsize = 27      # axis labels
tick_fontsize = 22       # tick labels
barlabel_fontsize = 24   # bar labels
legend_fontsize = 24     # legend entries

# ---- Curve appearance ----
linewidth = 3
markersize = 12

# ---- Dotted guide lines drawn where a curve meets the `hline` level ----
indicator_linewidth = 2
indicator_color = 'grey'
indicator_linestyle = 'dotted'

# ---- Figure output ----
dpi = 600
figsize = (9, 6)

# ---- Backend and font ----
mpl.use('TkAgg')
plt.rcParams['font.family'] = 'Times New Roman'

def xlsx_to_csv_pd(path_xls):
    """Convert an Excel workbook to a UTF-8 CSV file stored next to it.

    The first column of the sheet is used as the index and is written as the
    first CSV column.

    Args:
        path_xls: Path of the workbook to convert.

    Returns:
        Path of the CSV file: ``path_xls`` with its extension replaced by
        ``.csv`` (or with ``.csv`` appended when there is no extension).
    """
    # os.path.splitext only looks at the final path component; splitting the
    # whole string on its last '.' breaks when a directory name contains a
    # dot and the file itself has no extension.
    stem, _ = os.path.splitext(path_xls)
    path_csv = stem + '.csv'

    data_xls = pd.read_excel(path_xls, index_col=0)
    data_xls.to_csv(path_csv, encoding='utf-8')
    return path_csv

def _intersection_coords(geometry):
    """Return the (x, y) coordinates of a shapely intersection result as a list.

    Works for empty results, single geometries (Point, LineString) and
    multi-part results (MultiPoint, MultiLineString, GeometryCollection).
    """
    if geometry.is_empty:
        return []
    parts = getattr(geometry, 'geoms', None)
    if parts is None:
        parts = [geometry]
    return [(coord[0], coord[1]) for part in parts for coord in part.coords]


def cdf(path, x, data, xl, xr, colors, linestyle, ylabel,rotation=0,
        xlabel=None, figsize=figsize, labels=None,
        **kwargs):
    """Plot one or more CDF curves over a shared x axis, save and show the figure.

    Args:
        path: Output file passed to ``savefig``.
        x: X values shared by all curves.
        data: Sequence of y-series, one per curve.
        xl, xr: Lower and upper x-axis limits.
        colors: One colour per curve, or ``None`` for matplotlib's defaults.
        linestyle: One line style per curve.
        ylabel: Y-axis label.
        rotation: Rotation of the x tick labels.
        xlabel: X-axis label.
        figsize: Figure size; defaults to the module-level ``figsize``.
        labels: One legend label per curve, or ``None`` for no labels.
        **kwargs: Optional ``x_step`` (spacing of explicit x ticks from ``xl``
            to ``xr``) and ``hline`` (y level; dotted guide lines are drawn
            from both axes to every point where a curve meets that level).

    Raises:
        ValueError: If ``labels`` or ``colors`` is given with a length that
            differs from ``len(data)``.

    Font sizes, line widths, guide-line styling and ``dpi`` come from the
    module-level settings.
    """
    n_series = len(data)
    if labels is not None and len(labels) != n_series:
        raise ValueError('label num error')
    if colors is not None and len(colors) != n_series:
        raise ValueError('colors num error')

    fig, ax = plt.subplots(figsize=figsize)
    if 'x_step' in kwargs:
        ax.set_xticks(np.arange(xl, xr + 1, kwargs['x_step']))

    out_format = '{x:,.0f}'
    if isinstance(xl, np.float64):
        out_format = '{x:,.1f}'

    for i in range(n_series):
        # labels / colors are optional; None lets matplotlib use its defaults.
        la = labels[i] if labels is not None else None
        color = colors[i] if colors is not None else None
        ax.plot(x, data[i], color=color, label=la, linewidth=linewidth, linestyle=linestyle[i])

    if n_series:
        # The formatter does not depend on the series, so set it once.
        ax.xaxis.set_major_formatter(mpl.ticker.StrMethodFormatter(out_format))

    ax.locator_params(axis='x', nbins=10)
    ax.set_xlim(xl, xr)
    plt.xticks(size=tick_fontsize,rotation=rotation)
    ax.set_xlabel(xlabel, size=label_fontsize)

    ax.set_ylim(0, 1)
    plt.yticks(size=tick_fontsize)
    ax.set_ylabel(ylabel, size=label_fontsize)

    if labels is not None and len(labels) > 1:
        ax.legend(fontsize=legend_fontsize, loc=4)
    fig.tight_layout()

    if 'hline' in kwargs:
        hline = kwargs['hline']
        x_span = xr - xl
        level_line = LineString(np.column_stack((x, np.full(len(x), hline))))

        for series in data:
            curve = LineString(np.column_stack((x, series)))
            crossing = curve.intersection(level_line)

            for x_i, y_i in _intersection_coords(crossing):
                # ymax is an axes fraction; it equals the data value because
                # the y axis is fixed to [0, 1].
                ax.axvline(x=x_i, ymin=0, ymax=hline, linewidth=indicator_linewidth, color=indicator_color,
                           linestyle=indicator_linestyle)
                # xmax is an axes fraction too, so it has to be measured from
                # xl; dividing by xr alone is only right when xl == 0.
                x_fraction = (x_i - xl) / x_span if x_span else 1.0
                ax.axhline(y=y_i, xmin=0, xmax=x_fraction, linewidth=indicator_linewidth, color=indicator_color,
                           linestyle=indicator_linestyle)

    fig.savefig(path, dpi=dpi)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

def _plot_active_time_cdf(path_xls, path_pdf, x_max):
    """Plot the empirical CDF of a workbook's ``time`` column over ``[0, x_max]`` days.

    The workbook is first converted with ``xlsx_to_csv_pd`` (which leaves a
    ``.csv`` file next to it) and the figure is produced by ``cdf``.

    Args:
        path_xls: Workbook holding a ``time`` column (active period in days).
        path_pdf: Output path of the figure.
        x_max: Upper limit of the x axis, in days.

    Raises:
        ValueError: If the ``time`` column holds no numeric value.
    """
    path_csv = xlsx_to_csv_pd(path_xls)
    # xlsx_to_csv_pd writes UTF-8, so read it back with the same encoding.
    df = pd.read_csv(path_csv, encoding='utf-8')

    # The producer steps store the period as text and use placeholders such
    # as 'None' when no timestamp is available; keep only real numbers so the
    # placeholders do not distort the ECDF.
    times = pd.to_numeric(df['time'], errors='coerce').dropna()
    if times.empty:
        raise ValueError(f"no numeric 'time' values found in {path_xls}")

    x = np.linspace(0, x_max)
    ecdf_all = sm.distributions.ECDF(times.to_numpy())
    y = ecdf_all(x)

    cdf(path=path_pdf,
        x=x, data=[y], xl=0, xr=x_max,
        xlabel='Active Period(Day)',
        hline=0.8,
        ylabel='CDF',
        labels=['Count'],
        colors=['blue'],
        linestyle=['-'])


def active_time_similar():
    """Plot the active-period CDF of the similar graph (0-60 days)."""
    _plot_active_time_cdf("active_time_similar.xlsx", "active_time_similar.pdf", 60)


def active_time_dep():
    """Plot the active-period CDF of the dependent-hidden graph (0-140 days)."""
    _plot_active_time_cdf("active_time.xlsx", "active_time_dep.pdf", 140)


def active_time_coexist():
    """Plot the active-period CDF of the co-existing graph (0-100 days)."""
    # Gets its own output file; it used to be saved as "active_time_dep.pdf",
    # overwriting the dependent-hidden figure.
    _plot_active_time_cdf("active_time.xlsx", "active_time_coexist.pdf", 100)
               

# TODO: choose the figure to produce by leaving exactly one call uncommented.
# active_time_dep() and active_time_coexist() both read "active_time.xlsx", so
# that workbook must have been generated for the matching graph family first.

active_time_similar()
# active_time_dep()
# active_time_coexist()