# 5.2 Experimental results and analysis of software heterogeneous coupling network model construction

In [1]:
import shutil

from matplotlib import pyplot as plt

# Global matplotlib settings: pick the sans-serif font used for labels and
# draw the minus sign as an ASCII hyphen instead of the Unicode glyph.
plt.rcParams.update({
    'font.sans-serif': ['Microsoft YaHei'],
    'axes.unicode_minus': False,
})

# Marker and colour palettes; plotting cells index them by the position of
# each enabled system.
shapes = list('ov^<>sp*h+')
colors = [
    'red', 'blue', 'green', 'orange', 'purple',
    'cyan', 'magenta', 'darkgreen', 'chocolate',
]

## Read network data

In [2]:
from experiment.functions import func_get_lab_graph
from experiment.lab2._0_config_ import LabSpace, SoftwareConfigs1
import os

# Per-system caches reused by the later cells, all keyed by software name.
Graphs = {}
Labels = {}
Types = {}

for softwareName, (status, language, src_dir) in SoftwareConfigs1.items():
    if not status:
        # Systems switched off in the configuration are not loaded.
        continue

    modelDir = os.path.join(LabSpace, softwareName)
    labDir = os.path.join(modelDir, '.lab2')

    # The third value returned by the loader is not used in this notebook.
    G, labelDict, _, typeDict = func_get_lab_graph(modelDir, labDir, softwareName)
    Graphs[softwareName], Labels[softwareName], Types[softwareName] = G, labelDict, typeDict
    print(f'{softwareName} reading completed ')

## 5.2.1 Network structure characteristics

In [3]:
import pandas as pd
import networkx as nx
import os

from ccft.util.utils import serialize
from experiment.lab2._0_config_ import SoftwareConfigs1, LabSpace


# Directory that receives the summary tables written by this notebook.
InfoDir = os.path.join(LabSpace, '.st')
os.makedirs(InfoDir, exist_ok=True)

# Diameter (d), average path length (L) and the maximum per-component
# clustering (mC) require shortest paths inside every connected component.
# They are off by default, in which case the three columns are reported as 0
# (the notebook's previous behaviour). Set to True to compute them.
ComputeComponentMetrics = False

graphColumns = ['name', 'N', 'E', 'E/N', 'k', 'd', 'L', 'C', 'mC', 'uC']
graphRecords = []

for softwareName, (status, language, src_dir) in SoftwareConfigs1.items():
    if not status:
        continue

    G = Graphs[softwareName]
    print(f'{softwareName} 读取完成 ')

    unDirectedGraph = nx.to_undirected(G)

    N = G.number_of_nodes()
    E = G.number_of_edges()

    d = 0
    L = 0
    mC = 0
    if ComputeComponentMetrics:
        for component in nx.connected_components(unDirectedGraph):
            subgraph = unDirectedGraph.subgraph(component)
            d = max(d, nx.diameter(subgraph))
            # Weight each component's mean path length by its node count;
            # the sum is divided by N below.
            L += nx.average_shortest_path_length(subgraph) * subgraph.number_of_nodes()
            mC = max(mC, nx.average_clustering(subgraph))

    if N > 0:
        k = sum(dict(G.degree()).values()) / N
        EtN = round(E / N, 2)
        L = round(L / N, 2)
        C = round(nx.average_clustering(G), 4)
        uC = round(nx.average_clustering(unDirectedGraph), 4)
    else:
        # An empty graph has no per-node averages; report zeros instead of
        # dividing by zero.
        k = EtN = L = C = uC = 0
    mC = round(mC, 4)

    graphInfo = {
        'name': softwareName,
        'N': N,
        'E': E,
        'E/N': EtN,
        'k': k,
        'd': d,
        'L': L,
        'C': C,
        'mC': mC,
        'uC': uC
    }

    print(graphInfo)
    graphRecords.append(graphInfo)

# Build the table once from the collected rows rather than growing it
# row by row.
graphInfos = pd.DataFrame(graphRecords, columns=graphColumns)

binPath = os.path.join(InfoDir, 'infos.bin')
csvPath = os.path.join(InfoDir, 'infos.csv')
serialize(binPath, graphInfos)
graphInfos.to_csv(csvPath, index=False)
print(f'Network information is stored in {binPath}, and structural information is stored in {csvPath}')


In [4]:
# Display the structural-metrics table assembled in the previous cell.
graphInfos

## 5.2.2 Scale-free characteristic analysis

### Network degree analysis

#### Read Degree Information

In [5]:
import pandas as pd
import numpy as np

# softwareName -> (degree values, node ids, node labels, node types);
# the four arrays are index-aligned.
Degrees = {}
df = pd.DataFrame(columns=['name', 'ave', 'max', 'min'])
idx = 0
for softwareName, (status, language, src_dir) in SoftwareConfigs1.items():
    if not status:
        continue

    G = Graphs[softwareName]
    labels = Labels[softwareName]
    types = Types[softwareName]

    # Total degree of every node (G.degree(), not split into in/out).
    degrees = dict(G.degree())
    degreeVals = np.asarray(list(degrees.values()))
    degreeIds = np.asarray(list(degrees.keys()))
    degreeLabs = np.asarray([labels[i] for i in degreeIds])
    degreeTypes = np.asarray([types[i] for i in degreeIds])
    Degrees[softwareName] = (degreeVals, degreeIds, degreeLabs, degreeTypes)

    meanDegree = round(degreeVals.sum() / degreeVals.size, 2)
    df.loc[idx] = [softwareName, meanDegree, degreeVals.max(), degreeVals.min()]
    idx += 1

# One column per system, one row per statistic.
degreeDf = df.T

csvPath = os.path.join(InfoDir, 'degree-info.csv')
degreeDf.to_csv(csvPath)
print(f'The degree information is saved in {csvPath}')


In [6]:
# Display the transposed degree summary (one column per system).
degreeDf

#### Fitting results of power-law distribution

In [7]:
import powerlaw

# One row per enabled system: fitted exponent, lower cutoff (xmin) and the
# fit's D and sigma attributes.
df = pd.DataFrame(columns=['name', 'alpha', 'k-min', 'D', 'sigma'])
idx = 0
for softwareName, (status, language, src_dir) in SoftwareConfigs1.items():
    if not status:
        continue

    # Discrete power-law fit on the total-degree sample of this system.
    fit = powerlaw.Fit(Degrees[softwareName][0], discrete=True)
    fitRow = [softwareName, round(fit.alpha, 2), int(fit.xmin), round(fit.D, 3), round(fit.sigma, 3)]
    df.loc[idx] = fitRow
    idx += 1

# One column per system, one row per fit statistic.
powerlawDf = df.T

csvPath = os.path.join(InfoDir, 'powerlaw.csv')
powerlawDf.to_csv(csvPath)
print(f'Power law fitting information is saved in {csvPath}')


In [8]:
# Display the transposed power-law fit table (one column per system).
powerlawDf


#### Scatter plot of degree distribution

In [9]:
from experiment.lab2.lab2_func import get_pdf

# Log-log scatter of the degree distribution, one series per enabled system.
plt.figure(figsize=(8, 6))

idx = 0
for softwareName, (status, language, src_dir) in SoftwareConfigs1.items():
    if not status:
        continue

    # get_pdf presumably returns (degree values, probabilities) -- confirm
    # against experiment.lab2.lab2_func.
    k, Pk = get_pdf(Degrees[softwareName][0])
    plt.scatter(k, Pk, label=softwareName, marker=shapes[idx], color=colors[idx])
    idx += 1

plt.xscale('log')
plt.yscale('log')
plt.xlabel("$k$")
plt.ylabel("$P(k)$")
plt.legend()
plt.show()

### Analysis of in degree and out degree of nodes

#### Extraction of in degree and out degree

In [10]:
import pandas as pd
import numpy as np

def _degreeColumns(degreeView, labels, types):
    """Turn a degree view into four index-aligned arrays.

    Returns (degree values, node ids, node labels, node types), where labels
    and types are looked up per node id in the given mappings.
    """
    table = dict(degreeView)
    vals = np.asarray(list(table.values()))
    ids = np.asarray(list(table.keys()))
    labs = np.asarray([labels[i] for i in ids])
    kinds = np.asarray([types[i] for i in ids])
    return vals, ids, labs, kinds


# softwareName -> (values, ids, labels, types) for in- and out-degree.
InDegrees = {}
OutDegrees = {}

df = pd.DataFrame(columns=['name', 'k-in ave', 'k-in max', 'k-out ave', 'k-out max'])
idx = 0
for softwareName, (status, language, src_dir) in SoftwareConfigs1.items():
    if not status:
        continue

    G = Graphs[softwareName]
    labels = Labels[softwareName]
    types = Types[softwareName]

    InDegrees[softwareName] = _degreeColumns(G.in_degree(), labels, types)
    OutDegrees[softwareName] = _degreeColumns(G.out_degree(), labels, types)

    inDegreeVals = InDegrees[softwareName][0]
    outDegreeVals = OutDegrees[softwareName][0]

    df.loc[idx] = [
        softwareName,
        round(inDegreeVals.sum() / inDegreeVals.size, 2), inDegreeVals.max(),
        round(outDegreeVals.sum() / outDegreeVals.size, 2), outDegreeVals.max(),
    ]
    idx += 1


In [11]:
# Display the in/out-degree summary transposed: one column per system.
df.T


#### Fitting results of in-degree and out-degree power-law distributions

In [12]:
# Columns come in in/out pairs: exponent, lower cutoff, D and sigma.
df = pd.DataFrame(columns=['name', 'alpha in', 'alpha out', 'k-min in', 'k-min out', 'D in', 'D out', 'sigma in', 'sigma out'])
idx = 0
for softwareName, (status, language, src_dir) in SoftwareConfigs1.items():
    if not status:
        continue

    # Every degree is shifted by +1 before fitting -- presumably so that
    # zero-degree nodes stay in the fitted sample (confirm).
    inDegreeVals = InDegrees[softwareName][0]
    outDegreeVals = OutDegrees[softwareName][0]
    inFit = powerlaw.Fit(inDegreeVals + 1, discrete=True)
    outFit = powerlaw.Fit(outDegreeVals + 1, discrete=True)

    fitRow = [softwareName]
    fitRow += [round(inFit.alpha, 3), round(outFit.alpha, 3)]
    fitRow += [int(inFit.xmin), int(outFit.xmin)]
    fitRow += [round(inFit.D, 3), round(outFit.D, 3)]
    fitRow += [round(inFit.sigma, 3), round(outFit.sigma, 3)]
    df.loc[idx] = fitRow
    idx += 1

# Show the table with one column per system.
df.T

#### In degree and out degree scatter plots

In [13]:
def _plotDegreePdf(degreeTable, title):
    """Draw one log-log degree-distribution scatter for all enabled systems.

    degreeTable maps softwareName to a tuple whose first element is the array
    of degree values (the layout of InDegrees / OutDegrees). title is shown
    above the figure so the in- and out-degree plots can be told apart.
    """
    plt.figure(figsize=(8, 6))
    idx = 0
    for softwareName, (status, language, src_dir) in SoftwareConfigs1.items():
        if not status:
            continue

        degreeVals = degreeTable[softwareName][0]
        K, PK = get_pdf(degreeVals)
        maxDeg = degreeVals.max()
        # Cycle through the palettes so more systems than palette entries do
        # not raise IndexError.
        plt.scatter(K, PK, label=f'{softwareName} max: {maxDeg}',
                    marker=shapes[idx % len(shapes)], color=colors[idx % len(colors)])
        idx += 1

    plt.title(title)
    plt.xlabel("$k$")
    plt.ylabel("$P(k)$")
    plt.xscale("log")
    plt.yscale("log")
    plt.legend()
    plt.show()


# Same figure twice, once per degree direction.
_plotDegreePdf(InDegrees, 'In-degree distribution')
_plotDegreePdf(OutDegrees, 'Out-degree distribution')

### Node weighted-degree analysis

#### Node weighted-degree extraction

In [14]:
import networkx as nx
import pandas as pd
import numpy as np

# softwareName -> (weighted degree values, node ids, node labels, node types);
# the four arrays are index-aligned.
WDegrees = {}
df = pd.DataFrame(columns=['name', 'ave', 'max', 'min'])
idx = 0
for softwareName, (status, language, src_dir) in SoftwareConfigs1.items():
    if not status:
        continue

    G = Graphs[softwareName]
    labels = Labels[softwareName]
    types = Types[softwareName]

    # Degree computed with the 'weight' edge attribute.
    degrees = dict(nx.degree(G, weight='weight'))
    wDegreeVals = np.asarray(list(degrees.values()))
    degreeIds = np.asarray(list(degrees.keys()))
    degreeLabs = np.asarray([labels[i] for i in degreeIds])
    degreeTypes = np.asarray([types[i] for i in degreeIds])
    WDegrees[softwareName] = (wDegreeVals, degreeIds, degreeLabs, degreeTypes)

    meanWDegree = round(wDegreeVals.sum() / wDegreeVals.size, 2)
    df.loc[idx] = [softwareName, meanWDegree, wDegreeVals.max(), wDegreeVals.min()]
    idx += 1

# Show the table with one column per system.
df.T

#### Fitting of node weighted degree power-law distribution

In [15]:
# One row per enabled system: fitted exponent, lower cutoff (xmin) and the
# fit's D and sigma attributes, for the weighted-degree sample.
df = pd.DataFrame(columns=['name', 'alpha', 'k-min', 'D', 'sigma'])
idx = 0
for softwareName, (status, language, src_dir) in SoftwareConfigs1.items():
    if not status:
        continue

    wDegreeVals = WDegrees[softwareName][0]
    fit = powerlaw.Fit(wDegreeVals, discrete=True)
    fitRow = [softwareName, round(fit.alpha, 3), int(fit.xmin), round(fit.D, 3), round(fit.sigma, 3)]
    df.loc[idx] = fitRow
    idx += 1

In [16]:
# Display the weighted-degree fit table transposed: one column per system.
df.T


#### Weighted degree scatter plot of nodes

In [17]:
# One log-log panel per enabled system, laid out three per row.
activeNames = [name for name, (status, language, src_dir) in SoftwareConfigs1.items() if status]

nCols = 3
# Ceiling division; never fewer than the original three rows, but grow the
# grid instead of raising IndexError when more than nine systems are enabled.
nRows = max(3, -(-len(activeNames) // nCols))
fig, axs = plt.subplots(nRows, nCols, figsize=(12, 4 * nRows), squeeze=False)
panels = axs.ravel()  # row-major: panel i sits at (i // nCols, i % nCols)

for idx, (ax, softwareName) in enumerate(zip(panels, activeNames)):
    wDegreeVals = WDegrees[softwareName][0]
    K, PK = get_pdf(wDegreeVals)
    maxDeg = wDegreeVals.max()

    ax.scatter(K, PK, label=f'{softwareName} max: {maxDeg}',
               marker=shapes[idx % len(shapes)], color=colors[idx % len(colors)])
    ax.set_title(softwareName)
    ax.set_xscale('log')
    ax.set_yscale('log')
    # The label carries the maximum weighted degree; without a legend it
    # would never be shown.
    ax.legend()

# Hide grid cells that received no data instead of drawing empty frames.
for ax in panels[len(activeNames):]:
    ax.set_visible(False)

plt.show()

### Weight analysis of nodes

### Cumulative distribution curve of function node weights

In [12]:
from ccft.core.constant import ENode
# softwareName -> (method node ids, their 'weight' attributes), index-aligned.
FuncNodeWs = {}

for softwareName, (status, language, src_dir) in SoftwareConfigs1.items():
    if not status:
        continue

    G = Graphs[softwareName]
    types = Types[softwareName]

    # Keep only nodes whose recorded type is ENode.Method.
    methodPairs = [
        (n, data['weight'])
        for n, data in G.nodes(data=True)
        if types[n] == ENode.Method.name
    ]
    funcNodes = [node for node, _weight in methodPairs]
    funcNodeWs = [weight for _node, weight in methodPairs]
    FuncNodeWs[softwareName] = (funcNodes, funcNodeWs)


In [19]:
# Output directory for the per-system CDF tables written by the CDF plotting cell.
# NOTE: the name `saveDir` is reassigned by the 'Path-PDF' cell later in the
# notebook, so re-run this cell before the CDF cell when executing out of order.
saveDir = os.path.join(LabSpace, 'graph', 'ENC-CDF')
# Remove any previous output so the directory only holds files from this run.
if os.path.isdir(saveDir):
    shutil.rmtree(saveDir)
os.makedirs(saveDir)    

In [20]:
from experiment.lab2.lab2_func import get_pdf
import numpy as np
import pandas as pd

# Reference guides drawn on the figure: a horizontal line at this cumulative
# level and a vertical line at this weight value.
CdfGuideLevel = 0.9
WeightGuide = 20

plt.figure(figsize=(8, 6))

idx = 0
maxW = 0
for softwareName, (status, language, src_dir) in SoftwareConfigs1.items():
    if not status:
        continue

    funcNodes, funcNodeWs = FuncNodeWs[softwareName]
    if len(funcNodeWs) == 0:
        # No method nodes: there is nothing to plot, and max() / p[0] would
        # fail. Still advance idx so the other systems keep their colours.
        idx += 1
        continue

    # get_pdf presumably returns the distinct weights in ascending order with
    # their probabilities -- the running sum below relies on that (confirm).
    k, p = get_pdf(funcNodeWs)
    maxW = max(maxW, max(funcNodeWs))
    cdf = np.cumsum(p).tolist()

    # Persist the curve; a dedicated name keeps the shared `df` untouched.
    cdfTable = pd.DataFrame({'nec': k, 'cdf': cdf})
    cdfTable.to_csv(os.path.join(saveDir, f'{softwareName}.csv'))

    plt.plot(k, cdf, color=colors[idx], label=softwareName)
    idx += 1

# Horizontal guide from x = 1 to the largest weight seen in any system.
plt.plot([1, maxW], [CdfGuideLevel, CdfGuideLevel], color='gray', linestyle='--')
# Vertical guide across the full probability range.
plt.plot([WeightGuide, WeightGuide], [0, 1], color='gray', linestyle='--')
plt.xlabel("k")
plt.ylabel("$P(k)$")
plt.xscale('log')
plt.legend()
plt.show()

## 5.2.3 Analysis of Small World Characteristics

### Calculate the average shortest path and average clustering coefficient of nodes, as well as the average shortest path and average clustering coefficient of random networks


In [15]:
def cal_ave_shortest_path_length(_graph_, include_self=True):
    """Collect shortest-path statistics over all reachable ordered node pairs.

    For every node, the hop distances returned by
    nx.single_source_shortest_path_length are accumulated. Unreachable pairs
    are simply absent, so disconnected graphs are handled without error.

    Parameters
    ----------
    _graph_ : graph accepted by networkx
    include_self : bool, default True
        networkx reports the source itself at distance 0. With the default,
        these zero-length self-paths are counted, exactly as this function
        always did: the mean (sum / count) is lower than the usual
        pairs-only definition and the distribution contains a d = 0 entry.
        Pass False to count only paths between distinct nodes.

    Returns
    -------
    (d_lengthDict, i_sumLength, i_pathCount)
        d_lengthDict maps each path length to its share of the counted paths,
        i_sumLength is the sum of all counted lengths and i_pathCount the
        number of counted paths. An empty graph yields ({}, 0, 0); callers
        must not divide by the count in that case.
    """
    i_pathCount = 0
    i_sumLength = 0
    d_lengthDict = dict()
    for _n_ in _graph_.nodes:
        for _target_, _length_ in nx.single_source_shortest_path_length(_graph_, _n_).items():
            if not include_self and _target_ == _n_:
                continue
            i_pathCount += 1
            i_sumLength += _length_
            d_lengthDict[_length_] = d_lengthDict.get(_length_, 0) + 1

    # Turn raw counts into fractions of all counted paths.
    for _length_ in d_lengthDict:
        d_lengthDict[_length_] = d_lengthDict[_length_] / i_pathCount

    return d_lengthDict, i_sumLength, i_pathCount

### Draw the shortest path distribution

In [16]:
# Output directory for the per-system shortest-path distribution tables.
# NOTE: this rebinds `saveDir`, which the 'ENC-CDF' cells also use; run the
# cells in order so each plot writes into its own directory.
saveDir = os.path.join(LabSpace, 'graph', 'Path-PDF')
# Remove any previous output so the directory only holds files from this run.
if os.path.isdir(saveDir):
    shutil.rmtree(saveDir)
os.makedirs(saveDir)    

In [38]:
import pandas as pd
import networkx as nx

# Per-system results on the undirected graph, reused by the small-world cell:
# path-length distribution, mean path length and mean clustering coefficient.
Lengths = dict()
AveLengths = dict()
AveClus = dict()

fig, axs = plt.subplots(3, 3, figsize=(18, 18))
# Panel spacing does not depend on the data, so set it once.
plt.subplots_adjust(hspace=0.25, wspace=0.23)
labs = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']

idx = 0
for softwareName, (status, language, src_dir) in SoftwareConfigs1.items():
    if not status:
        continue

    G = Graphs[softwareName]
    UnG = G.to_undirected()

    lengthDict, sumLength, pathCount = cal_ave_shortest_path_length(UnG)
    aveLength = sumLength / pathCount
    aveClus = nx.average_clustering(UnG)

    Lengths[softwareName] = lengthDict
    AveClus[softwareName] = aveClus
    AveLengths[softwareName] = aveLength

    # Persist the distribution; the 'count' column holds the values of
    # lengthDict (fractions of paths, as returned by the helper).
    pathTable = pd.DataFrame()
    pathTable['len'] = list(lengthDict.keys())
    pathTable['count'] = list(lengthDict.values())
    pathTable.to_csv(os.path.join(saveDir, f'{softwareName}.csv'))

    row, col = divmod(idx, 3)
    ax = axs[row, col]
    ax.bar(list(lengthDict.keys()), list(lengthDict.values()), color=colors[idx])
    ax.set_xlabel('$d$')
    ax.set_ylabel('$p(d)$')
    # Caption placed below the panel, e.g. "a) name".
    ax.set_title(f'{labs[idx]}) {softwareName}', y=-0.2, fontdict={'family': 'Times New Roman', 'size': 18})
    idx += 1

# Hide grid cells that received no data instead of drawing empty frames.
for unusedAx in axs.ravel()[idx:]:
    unusedAx.set_visible(False)

plt.show()

### Generate random networks (`RandomGraphCount` per system, currently 10) and calculate small-world coefficients

In [22]:
# Number of random reference graphs generated per system; their path length
# and clustering coefficient are averaged in the small-world cell.
RandomGraphCount = 10

In [23]:
# Base seed for the random reference graphs, so a re-run reproduces the table.
RandomSeed = 42

df = pd.DataFrame(columns=['name', 'ave len', 'ave clus', 'ave r len', 'ave r clus', 'gamma'])
idx = 0
for softwareName, (status, language, src_dir) in SoftwareConfigs1.items():
    if not status:
        continue

    G = Graphs[softwareName]
    UnG = G.to_undirected()

    # AveLengths / AveClus were measured on the undirected graph, so the
    # random baseline must match ITS size. The directed edge count is larger
    # whenever reciprocal edges exist and would yield a denser baseline.
    N = UnG.number_of_nodes()
    E = UnG.number_of_edges()

    sumRLength = 0
    sumRClus = 0
    for randomGraphCount in range(1, RandomGraphCount + 1):
        randomGraph = nx.gnm_random_graph(N, E, seed=RandomSeed + randomGraphCount)
        _, t_sumLength, t_pathCount = cal_ave_shortest_path_length(randomGraph)
        sumRLength += t_sumLength / t_pathCount
        sumRClus += nx.average_clustering(randomGraph)
        print(f'{softwareName} generates a random network {randomGraphCount}')

    aveClus = AveClus[softwareName]
    aveLength = AveLengths[softwareName]
    aveRLength = sumRLength / RandomGraphCount
    aveRClus = sumRClus / RandomGraphCount

    # gamma = (C / C_rand) / (L / L_rand). Sparse random graphs can have zero
    # clustering; report NaN rather than raising ZeroDivisionError.
    if aveRClus > 0 and aveRLength > 0 and aveLength > 0:
        gamma = (aveClus / aveRClus) / (aveLength / aveRLength)
    else:
        gamma = float('nan')
    df.loc[idx] = [softwareName, round(aveLength, 2), round(aveClus, 4), round(aveRLength, 2), round(aveRClus, 4), gamma]

    idx += 1


In [25]:
# Transpose into a separate name (as done for degreeDf / powerlawDf) so that
# re-running this cell does not flip the table back and overwrite the CSV in
# the wrong orientation.
smallWorldDf = df.T

# NOTE(review): the file name says 'small-word'; it is left unchanged because
# other tooling may read this path.
csvPath = os.path.join(InfoDir, 'small-word-sim.csv')
smallWorldDf.to_csv(csvPath)
print(f'Small world information is saved in {csvPath}')
smallWorldDf
