In [13]:
from rdflib import Graph, URIRef, Literal, Namespace
import networkx as nx
import pandas as pd
import time

In [4]:
from main import *
from sa_helper import *

In [5]:
# Load the graph from the input file
graph = Graph()
graph.parse('aifb_fixed_complete.nt', format="nt") #graph.parse(input_file, format=format)

<Graph identifier=N8bb28fa8008c4e4fa31ea461380b2ed4 (<class 'rdflib.graph.Graph'>)>

In [6]:
G = nx.DiGraph()  # Create a directed graph

for s, p, o in graph:
    G.add_edge(s, o, predicate=p)

In [7]:
# Count the neighbor node number for each nodes in the graph
neighbor_counts = {node: len(list(G.neighbors(node))) for node in G.nodes()}

### Select seed nodes

In [8]:
# load training and testing entities
train_df = pd.read_csv('trainingSet.tsv', sep='\t')
test_df = pd.read_csv('testSet.tsv', sep='\t')
complete_df = pd.read_csv('completeDataset.tsv', sep='\t')

train_seed_entities = train_df['person'].tolist() 
test_seed_entities = test_df['person'].tolist()
complete_seed_entities = complete_df['person'].tolist()

### Confirm all entities from complete_seed_entities exist in the loaded RDF graph AIFB

In [10]:
for e in complete_seed_entities:
    found = False
    for node in G.nodes():
        if node==URIRef(e):
            found = True
    if not found:
        print(e)

## The property of the AIFB

In [26]:
#Exhibihate the size of subgraph：The number of nodes and edges in the subgraph.
num_nodes = len(G.nodes())
num_edges = len(G.edges())
print("AIFB has " + str(num_nodes) + " nodes and " + str(num_edges) + " edges.")

AIFB has 8285 nodes and 29146 edges.


In [7]:
#Exhibihate the density of subgraph: The ratio of the number of edges to the number of nodes in AIFB.
density = num_edges / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0
print("The density of AIFB is " + str(density))

The density of AIFB is 0.0004246648912315282


In [12]:
#Exhibihate the coverage(only train_seed_entities): Percentage of the seed entities that are included in AIFB.
seed_entities_in_AIFB = [entity for entity in train_seed_entities if URIRef(entity) in G.nodes()]
coverage = len(seed_entities_in_AIFB) / len(train_seed_entities)
print("The coverage for only train seed entities of AIFB is " + str(coverage))

The coverage for only train seed entities of AIFB is 1.0


In [13]:
#Exhibihate the coverage(complete_seed_entities): Percentage of the seed entities that are included in AIFB.
seed_entities_in_AIFB = [entity for entity in complete_seed_entities if URIRef(entity) in G.nodes()]
coverage = len(seed_entities_in_AIFB) / len(complete_seed_entities)
print("The coverage for complete seed entities of AIFB is " + str(coverage))

The coverage for complete seed entities of AIFB is 1.0


In [30]:
#Exhibihate the Relevance: How many of the nodes and edges in the subgraph are relevant
avg_degree = sum(dict(G.degree()).values()) / num_nodes if num_nodes != 0 else 0
print("The relevance(Avg. degree) of AIFB is " + str(avg_degree))

The relevance(Avg. degree) of AIFB is 7.035847917923959


## Baseline:α=1, β=0, max_hops=2

### with BFS

In [14]:
# Initialize the resulting DataFrame
columns = ['Alpha', 'Beta', 'Max hops', 'Node number', 'Edge number', 'Density', 'Coverage(train)', 
           'Coverage(complete)', 'Avg. degree', 'Runtime']
results_df_ba = pd.DataFrame(columns=columns)

alpha = 1
beta = 0
max_hops = 2

start_time = time.time()

subgraph = spreading_activation_BFS(graph, G, train_seed_entities, neighbor_counts, 
                                    alpha, beta, max_hops, extraction_threshold=0,
                                    fan_out=True, excl=True, pop=True)
G_sub = nx.DiGraph()  # Create a directed graph for the subgraph

for s, p, o in subgraph:
    G_sub.add_edge(s, o, predicate=p)

#Exhibihate the size of subgraph: The number of nodes and edges in the subgraph.
num_nodes = len(G_sub.nodes())
num_edges = len(G_sub.edges())

#Exhibihate the density of subgraph: The ratio of the number of edges to the number of nodes in the subgraph.
density = num_edges / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

#Exhibihate the coverage(only train_seed_entities): Percentage of the seed entities that are included in the subgraph.
seed_entities_in_subgraph = [entity for entity in train_seed_entities if URIRef(entity) in G_sub.nodes()]
coverage_train = len(seed_entities_in_subgraph) / len(train_seed_entities)

#Exhibihate the coverage(complete_seed_entities): Percentage of the seed entities that are included in AIFB.
seed_entities_in_subgraph = [entity for entity in complete_seed_entities if URIRef(entity) in G_sub.nodes()]
coverage_complete = len(seed_entities_in_subgraph) / len(complete_seed_entities)

#Exhibihate the Relevance: How many of the nodes and edges in the subgraph are relevant
avg_degree = sum(dict(G_sub.degree()).values()) / num_nodes if num_nodes != 0 else 0

end_time = time.time()
runtime = end_time - start_time

# Add results to DataFrame
results_df_ba = results_df_ba.append({
    'Alpha': alpha,
    'Beta': beta,
    'Max hops': max_hops,
    'Node number': num_nodes,
    'Edge number': num_edges,
    'Density': density,
    'Coverage(train)': coverage_train,
    'Coverage(complete)': coverage_complete,
    'Avg. degree': avg_degree,
    'Runtime': runtime
}, ignore_index=True)
                
# Export to CSV
results_df_ba.to_csv("Performance of baseline parameter combination.csv", index=False)

print(results_df_ba)    

   Alpha  Beta  Max hops  Node number  Edge number   Density  Coverage(train)  \
0    1.0   0.0       2.0       1536.0       9650.0  0.004093              1.0   

   Coverage(complete)  Avg. degree     Runtime  
0            0.909091    12.565104  303.432675  


### with DFS

In [15]:
# Initialize the resulting DataFrame
columns = ['Alpha', 'Beta', 'Max depth', 'Node number', 'Edge number', 'Density', 'Coverage(train)', 
           'Coverage(complete)', 'Avg. degree', 'Runtime']
results_df_ba = pd.DataFrame(columns=columns)

alpha = 1
beta = 0
max_depth = 2

start_time = time.time()

subgraph = spreading_activation_DFS(graph, G, train_seed_entities, neighbor_counts, 
                                    alpha, beta, max_depth, extraction_threshold=0,
                                    fan_out=True, excl=True, pop=True)
G_sub = nx.DiGraph()  # Create a directed graph for the subgraph

for s, p, o in subgraph:
    G_sub.add_edge(s, o, predicate=p)

#Exhibihate the size of subgraph: The number of nodes and edges in the subgraph.
num_nodes = len(G_sub.nodes())
num_edges = len(G_sub.edges())

#Exhibihate the density of subgraph: The ratio of the number of edges to the number of nodes in the subgraph.
density = num_edges / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

#Exhibihate the coverage(only train_seed_entities): Percentage of the seed entities that are included in the subgraph.
seed_entities_in_subgraph = [entity for entity in train_seed_entities if URIRef(entity) in G_sub.nodes()]
coverage_train = len(seed_entities_in_subgraph) / len(train_seed_entities)

#Exhibihate the coverage(complete_seed_entities): Percentage of the seed entities that are included in AIFB.
seed_entities_in_subgraph = [entity for entity in complete_seed_entities if URIRef(entity) in G_sub.nodes()]
coverage_complete = len(seed_entities_in_subgraph) / len(complete_seed_entities)

#Exhibihate the Relevance: How many of the nodes and edges in the subgraph are relevant
avg_degree = sum(dict(G_sub.degree()).values()) / num_nodes if num_nodes != 0 else 0

end_time = time.time()
runtime = end_time - start_time

# Add results to DataFrame
results_df_ba = results_df_ba.append({
    'Alpha': alpha,
    'Beta': beta,
    'Max depth': max_depth,
    'Node number': num_nodes,
    'Edge number': num_edges,
    'Density': density,
    'Coverage(train)': coverage_train,
    'Coverage(complete)': coverage_complete,
    'Avg. degree': avg_degree,
    'Runtime': runtime
}, ignore_index=True)
                
# Export to CSV
results_df_ba.to_csv("Performance of baseline parameter combination_1.csv", index=False)

print(results_df_ba)   

   Alpha  Beta  Max depth  Node number  Edge number   Density  \
0    1.0   0.0        2.0       2206.0      15361.0  0.003158   

   Coverage(train)  Coverage(complete)  Avg. degree     Runtime  
0              1.0            0.954545    13.926564  786.407043  


## Parameter impact research

In [9]:
#Copy the original graph 
graph_copy = Graph()
for triple in graph:
    graph_copy.add(triple)

### SA with graph traversal method BFS

In [18]:
alpha_values = [0.1, 0.5, 0.9]
beta_values = [0.0025, 0.005, 0.0075, 0.01]
max_hops_values = [2, 3, 4]
#extraction_threshold_values = [0, 0.0001, 0.001, 0.01, 0.1]
#for extraction_threshold in extraction_threshold_values:

In [11]:
from tqdm import tqdm

In [21]:
# Initialize the resulting DataFrame
columns = ['α', 'β', 'Max hops', 'Node number', 'Edge number', 'Density', 'Coverage(train)', 
           'Coverage(complete)', 'Avg. degree', 'Runtime']
results_df = pd.DataFrame(columns=columns)

for alpha in tqdm(alpha_values, desc="Alpha loop"):
    for beta in beta_values:
        for max_hops in max_hops_values:
                
            # Skip the undesired combination of alpha and beta
            if alpha == 0.9 and beta == 0.0025:
                continue

            start_time = time.time()

            subgraph = spreading_activation_BFS(graph_copy, G, train_seed_entities, neighbor_counts, 
                                                alpha, beta, max_hops, extraction_threshold=0,
                                                fan_out=True, excl=True, pop=True)
            G_sub = nx.DiGraph()  # Create a directed graph for the subgraph

            for s, p, o in subgraph:
                G_sub.add_edge(s, o, predicate=p)

            #Exhibihate the size of subgraph: The number of nodes and edges in the subgraph.
            num_nodes = len(G_sub.nodes())
            num_edges = len(G_sub.edges())

            #Exhibihate the density of subgraph: The ratio of the number of edges to the number of nodes in the subgraph.
            density = num_edges / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

            #Exhibihate the coverage(only train_seed_entities): Percentage of the seed entities that are included in the subgraph.
            seed_entities_in_subgraph = [entity for entity in train_seed_entities if URIRef(entity) in G_sub.nodes()]
            coverage_train = len(seed_entities_in_subgraph) / len(train_seed_entities)
            
            #Exhibihate the coverage(complete_seed_entities): Percentage of the seed entities that are included in AIFB.
            seed_entities_in_subgraph = [entity for entity in complete_seed_entities if URIRef(entity) in G_sub.nodes()]
            coverage_complete = len(seed_entities_in_subgraph) / len(complete_seed_entities)

            #Exhibihate the Relevance: How many of the nodes and edges in the subgraph are relevant
            avg_degree = sum(dict(G_sub.degree()).values()) / num_nodes if num_nodes != 0 else 0

            end_time = time.time()
            runtime = end_time - start_time

            # Add results to DataFrame
            results_df = results_df.append({
                'α': alpha,
                'β': beta,
                'Max hops': max_hops,
                'Node number': num_nodes,
                'Edge number': num_edges,
                'Density': density,
                'Coverage(train)': coverage_train,
                'Coverage(complete)': coverage_complete,
                'Avg. degree': avg_degree,
                'Runtime': runtime
            }, ignore_index=True)
                
# Export to CSV
results_df.to_csv("parameter_impact_ex1.csv", index=False)

print(results_df)    

Alpha loop: 100%|████████████████████████████████████████████████████████████████████| 3/3 [4:41:18<00:00, 5626.31s/it]

      α       β  Max hoops  Node number  Edge number   Density  \
0   0.1  0.0025        2.0       1654.0      13044.0  0.004771   
1   0.1  0.0025        3.0       1654.0      13044.0  0.004771   
2   0.1  0.0025        4.0       1654.0      13044.0  0.004771   
3   0.1  0.0050        2.0       1522.0      12206.0  0.005273   
4   0.1  0.0050        3.0       1522.0      12206.0  0.005273   
5   0.1  0.0050        4.0       1522.0      12206.0  0.005273   
6   0.1  0.0075        2.0       1477.0      11753.0  0.005391   
7   0.1  0.0075        3.0       1477.0      11753.0  0.005391   
8   0.1  0.0075        4.0       1477.0      11753.0  0.005391   
9   0.1  0.0100        2.0       1460.0      11430.0  0.005366   
10  0.1  0.0100        3.0       1460.0      11430.0  0.005366   
11  0.1  0.0100        4.0       1460.0      11430.0  0.005366   
12  0.5  0.0025        2.0       1895.0      14188.0  0.003953   
13  0.5  0.0025        3.0       1895.0      14188.0  0.003953   
14  0.5  0




In [14]:
alpha_values = [0.1, 0.25, 0.5, 0.9]
beta_values = [0.0025, 0.005, 0.0075, 0.01]
max_hops_values = [2, 3, 4]
#extraction_threshold_values = [0, 0.0001, 0.001, 0.01, 0.1]
#for extraction_threshold in extraction_threshold_values:

In [15]:
# Initialize the resulting DataFrame
columns = ['α', 'β', 'Max hops', 'Node number', 'Edge number', 'Density', 'Coverage(train)', 
           'Coverage(complete)', 'Avg. degree', 'Runtime']
results_df_hop = pd.DataFrame(columns=columns)

for alpha in tqdm(alpha_values, desc="Alpha loop"):
    for beta in beta_values:
        for max_hops in max_hops_values:
                
            # Skip the undesired combination of alpha and beta
            if alpha == 0.9 and beta == 0.0025:
                continue

            start_time = time.time()

            subgraph = spreading_activation_BFS(graph_copy, G, train_seed_entities, neighbor_counts, 
                                                alpha, beta, max_hops, extraction_threshold=0,
                                                fan_out=True, excl=True, pop=True)
            G_sub = nx.DiGraph()  # Create a directed graph for the subgraph

            for s, p, o in subgraph:
                G_sub.add_edge(s, o, predicate=p)

            #Exhibihate the size of subgraph: The number of nodes and edges in the subgraph.
            num_nodes = len(G_sub.nodes())
            num_edges = len(G_sub.edges())

            #Exhibihate the density of subgraph: The ratio of the number of edges to the number of nodes in the subgraph.
            density = num_edges / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

            #Exhibihate the coverage(only train_seed_entities): Percentage of the seed entities that are included in the subgraph.
            seed_entities_in_subgraph = [entity for entity in train_seed_entities if URIRef(entity) in G_sub.nodes()]
            coverage_train = len(seed_entities_in_subgraph) / len(train_seed_entities)
            
            #Exhibihate the coverage(complete_seed_entities): Percentage of the seed entities that are included in AIFB.
            seed_entities_in_subgraph = [entity for entity in complete_seed_entities if URIRef(entity) in G_sub.nodes()]
            coverage_complete = len(seed_entities_in_subgraph) / len(complete_seed_entities)

            #Exhibihate the Relevance: How many of the nodes and edges in the subgraph are relevant
            avg_degree = sum(dict(G_sub.degree()).values()) / num_nodes if num_nodes != 0 else 0

            end_time = time.time()
            runtime = end_time - start_time

            # Add results to DataFrame
            results_df_hop = results_df_hop.append({
                'α': alpha,
                'β': beta,
                'Max hops': max_hops,
                'Node number': num_nodes,
                'Edge number': num_edges,
                'Density': density,
                'Coverage(train)': coverage_train,
                'Coverage(complete)': coverage_complete,
                'Avg. degree': avg_degree,
                'Runtime': runtime
            }, ignore_index=True)
                
# Export to CSV
results_df_hop.to_csv("parameter_impact_ex_2.csv", index=False)

print(results_df_hop)    

Alpha loop: 100%|████████████████████████████████████████████████████████████████████| 4/4 [3:34:36<00:00, 3219.19s/it]

       α       β  Max hops  Node number  Edge number   Density  \
0   0.10  0.0025       2.0       1394.0       8734.0  0.004498   
1   0.10  0.0025       3.0       1598.0      12653.0  0.004958   
2   0.10  0.0025       4.0       1601.0      12603.0  0.004920   
3   0.10  0.0050       2.0       1387.0       8709.0  0.004530   
4   0.10  0.0050       3.0       1493.0      11784.0  0.005290   
5   0.10  0.0050       4.0       1504.0      11816.0  0.005227   
6   0.10  0.0075       2.0       1386.0       8703.0  0.004534   
7   0.10  0.0075       3.0       1464.0      11456.0  0.005349   
8   0.10  0.0075       4.0       1465.0      11463.0  0.005345   
9   0.10  0.0100       2.0       1384.0       8689.0  0.004540   
10  0.10  0.0100       3.0       1437.0      10760.0  0.005214   
11  0.10  0.0100       4.0       1454.0      11073.0  0.005241   
12  0.25  0.0025       2.0       1456.0       9449.0  0.004460   
13  0.25  0.0025       3.0       1690.0      13284.0  0.004654   
14  0.25  




In [1]:
alpha_values = [0.9]
beta_values = [0.0025]
max_hops_values = [2, 3, 4]
#extraction_threshold_values = [0, 0.0001, 0.001, 0.01, 0.1]

In [12]:
# Initialize the resulting DataFrame
columns = ['α', 'β', 'Max hops', 'Node number', 'Edge number', 'Density', 'Coverage(train)', 
           'Coverage(complete)', 'Avg. degree', 'Runtime']
results_df_thr = pd.DataFrame(columns=columns)

for alpha in tqdm(alpha_values, desc="Alpha loop"):
    for beta in beta_values:
        for max_hops in max_hops_values:
                
            # Skip the undesired combination of alpha and beta
            #if alpha == 0.9 and beta == 0.0025:
                #continue

            start_time = time.time()

            subgraph = spreading_activation_BFS(graph_copy, G, train_seed_entities, neighbor_counts, 
                                                alpha, beta, max_hops, extraction_threshold=0,
                                                fan_out=True, excl=True, pop=True)
            G_sub = nx.DiGraph()  # Create a directed graph for the subgraph

            for s, p, o in subgraph:
                G_sub.add_edge(s, o, predicate=p)

            #Exhibihate the size of subgraph: The number of nodes and edges in the subgraph.
            num_nodes = len(G_sub.nodes())
            num_edges = len(G_sub.edges())

            #Exhibihate the density of subgraph: The ratio of the number of edges to the number of nodes in the subgraph.
            density = num_edges / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

            #Exhibihate the coverage(only train_seed_entities): Percentage of the seed entities that are included in the subgraph.
            seed_entities_in_subgraph = [entity for entity in train_seed_entities if URIRef(entity) in G_sub.nodes()]
            coverage_train = len(seed_entities_in_subgraph) / len(train_seed_entities)
            
            #Exhibihate the coverage(complete_seed_entities): Percentage of the seed entities that are included in AIFB.
            seed_entities_in_subgraph = [entity for entity in complete_seed_entities if URIRef(entity) in G_sub.nodes()]
            coverage_complete = len(seed_entities_in_subgraph) / len(complete_seed_entities)

            #Exhibihate the Relevance: How many of the nodes and edges in the subgraph are relevant
            avg_degree = sum(dict(G_sub.degree()).values()) / num_nodes if num_nodes != 0 else 0

            end_time = time.time()
            runtime = end_time - start_time

            # Add results to DataFrame
            results_df_thr = results_df_thr.append({
                'α': alpha,
                'β': beta,
                'Max hops': max_hops,
                'Node number': num_nodes,
                'Edge number': num_edges,
                'Density': density,
                'Coverage(train)': coverage_train,
                'Coverage(complete)': coverage_complete,
                'Avg. degree': avg_degree,
                'Runtime': runtime
            }, ignore_index=True)
                
# Export to CSV
results_df_thr.to_csv("parameter_impact_ex_3.csv", index=False)

print(results_df_thr)    

Alpha loop: 100%|██████████████████████████████████████████████████████████████████████| 1/1 [19:54<00:00, 1194.20s/it]

     α       β  Max hops  Node number  Edge number   Density  Coverage(train)  \
0  0.9  0.0025       2.0       1466.0       9438.0  0.004394              1.0   
1  0.9  0.0025       3.0       1827.0      13696.0  0.004105              1.0   
2  0.9  0.0025       4.0       1864.0      14036.0  0.004042              1.0   

   Coverage(complete)  Avg. degree     Runtime  
0            0.903409    12.875853  241.518181  
1            0.948864    14.992885  449.151833  
2            0.948864    15.060086  503.529024  





### SA with graph traversal method DFS

In [34]:
alpha_values = [0.1, 0.25, 0.5, 0.9]
beta_values = [0.0025, 0.005, 0.0075, 0.01]
max_depth_values = [2, 3]
#extraction_threshold_values = [0, 0.0001, 0.001, 0.01, 0.1]
#for extraction_threshold in extraction_threshold_values:

In [35]:
# Initialize the resulting DataFrame
columns = ['Alpha', 'Beta', 'Max depth', 'Node number', 'Edge number', 'Density', 'Coverage(train)', 
           'Coverage(complete)', 'Avg. degree', 'Runtime']
results_df_DFS = pd.DataFrame(columns=columns)

for alpha in tqdm(alpha_values, desc="Alpha loop"):
    for beta in beta_values:
        for max_depth in max_depth_values:
                
            # Skip the undesired combination of alpha and beta
            if alpha == 0.9 and beta == 0.0025:
                continue

            start_time = time.time()

            subgraph = spreading_activation_DFS(graph_copy, G, train_seed_entities, neighbor_counts, 
                                                alpha, beta, max_depth, extraction_threshold=0,
                                                fan_out=True, excl=True, pop=True)
            G_sub = nx.DiGraph()  # Create a directed graph for the subgraph

            for s, p, o in subgraph:
                G_sub.add_edge(s, o, predicate=p)

            #Exhibihate the size of subgraph: The number of nodes and edges in the subgraph.
            num_nodes = len(G_sub.nodes())
            num_edges = len(G_sub.edges())

            #Exhibihate the density of subgraph: The ratio of the number of edges to the number of nodes in the subgraph.
            density = num_edges / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

            #Exhibihate the coverage(only train_seed_entities): Percentage of the seed entities that are included in the subgraph.
            seed_entities_in_subgraph = [entity for entity in train_seed_entities if URIRef(entity) in G_sub.nodes()]
            coverage_train = len(seed_entities_in_subgraph) / len(train_seed_entities)
            
            #Exhibihate the coverage(complete_seed_entities): Percentage of the seed entities that are included in AIFB.
            seed_entities_in_subgraph = [entity for entity in complete_seed_entities if URIRef(entity) in G_sub.nodes()]
            coverage_complete = len(seed_entities_in_subgraph) / len(complete_seed_entities)

            #Exhibihate the Relevance: How many of the nodes and edges in the subgraph are relevant
            avg_degree = sum(dict(G_sub.degree()).values()) / num_nodes if num_nodes != 0 else 0

            end_time = time.time()
            runtime = end_time - start_time

            # Add results to DataFrame
            results_df_DFS = results_df_DFS.append({
                'Alpha': alpha,
                'Beta': beta,
                'Max depth': max_depth,
                'Node number': num_nodes,
                'Edge number': num_edges,
                'Density': density,
                'Coverage(train)': coverage_train,
                'Coverage(complete)': coverage_complete,
                'Avg. degree': avg_degree,
                'Runtime': runtime
            }, ignore_index=True)
                
# Export to CSV
results_df_DFS.to_csv("parameter_impact_ex3.csv", index=False)

print(results_df_DFS)    

Alpha loop: 100%|████████████████████████████████████████████████████████████████████| 4/4 [7:06:30<00:00, 6397.59s/it]

    Alpha    Beta  Max depth  Node number  Edge number   Density  \
0    0.10  0.0025        2.0       1812.0      13935.0  0.004246   
1    0.10  0.0025        3.0       2077.0      14947.0  0.003466   
2    0.10  0.0050        2.0       1721.0      13608.0  0.004597   
3    0.10  0.0050        3.0       1910.0      14296.0  0.003921   
4    0.10  0.0075        2.0       1700.0      13444.0  0.004655   
5    0.10  0.0075        3.0       1819.0      13871.0  0.004195   
6    0.10  0.0100        2.0       1632.0      13065.0  0.004908   
7    0.10  0.0100        3.0       1742.0      13506.0  0.004453   
8    0.25  0.0025        2.0       1907.0      14369.0  0.003953   
9    0.25  0.0025        3.0       2237.0      15421.0  0.003083   
10   0.25  0.0050        2.0       1834.0      14042.0  0.004177   
11   0.25  0.0050        3.0       2160.0      15215.0  0.003263   
12   0.25  0.0075        2.0       1806.0      13915.0  0.004269   
13   0.25  0.0075        3.0       2076.0      1




In [10]:
alpha_values = [0.1, 0.25, 0.5, 0.9]
beta_values = [0.0025, 0.005, 0.0075, 0.01]
max_depth_values = [4]

In [11]:
# Initialize the resulting DataFrame
columns = ['Alpha', 'Beta', 'Max depth', 'Node number', 'Edge number', 'Density', 'Coverage(train)', 
           'Coverage(complete)', 'Avg. degree', 'Runtime']
results_df_DFS = pd.DataFrame(columns=columns)

for alpha in tqdm(alpha_values, desc="Alpha loop"):
    for beta in beta_values:
        for max_depth in max_depth_values:
                
            # Skip the undesired combination of alpha and beta
            if alpha == 0.9 and beta == 0.0025:
                continue

            start_time = time.time()

            subgraph = spreading_activation_DFS(graph_copy, G, train_seed_entities, neighbor_counts, 
                                                alpha, beta, max_depth, extraction_threshold=0,
                                                fan_out=True, excl=True, pop=True)
            G_sub = nx.DiGraph()  # Create a directed graph for the subgraph

            for s, p, o in subgraph:
                G_sub.add_edge(s, o, predicate=p)

            #Exhibihate the size of subgraph: The number of nodes and edges in the subgraph.
            num_nodes = len(G_sub.nodes())
            num_edges = len(G_sub.edges())

            #Exhibihate the density of subgraph: The ratio of the number of edges to the number of nodes in the subgraph.
            density = num_edges / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

            #Exhibihate the coverage(only train_seed_entities): Percentage of the seed entities that are included in the subgraph.
            seed_entities_in_subgraph = [entity for entity in train_seed_entities if URIRef(entity) in G_sub.nodes()]
            coverage_train = len(seed_entities_in_subgraph) / len(train_seed_entities)
            
            #Exhibihate the coverage(complete_seed_entities): Percentage of the seed entities that are included in AIFB.
            seed_entities_in_subgraph = [entity for entity in complete_seed_entities if URIRef(entity) in G_sub.nodes()]
            coverage_complete = len(seed_entities_in_subgraph) / len(complete_seed_entities)

            #Exhibihate the Relevance: How many of the nodes and edges in the subgraph are relevant
            avg_degree = sum(dict(G_sub.degree()).values()) / num_nodes if num_nodes != 0 else 0

            end_time = time.time()
            runtime = end_time - start_time

            # Add results to DataFrame
            results_df_DFS = results_df_DFS.append({
                'Alpha': alpha,
                'Beta': beta,
                'Max depth': max_depth,
                'Node number': num_nodes,
                'Edge number': num_edges,
                'Density': density,
                'Coverage(train)': coverage_train,
                'Coverage(complete)': coverage_complete,
                'Avg. degree': avg_degree,
                'Runtime': runtime
            }, ignore_index=True)
                
# Export to CSV
results_df_DFS.to_csv("parameter_impact_ex4.csv", index=False)

print(results_df_DFS)    

Alpha loop: 100%|████████████████████████████████████████████████████████████████████| 4/4 [4:34:42<00:00, 4120.68s/it]

    Alpha    Beta  Max depth  Node number  Edge number   Density  \
0    0.10  0.0025        4.0       2088.0      15015.0  0.003446   
1    0.10  0.0050        4.0       1938.0      14535.0  0.003872   
2    0.10  0.0075        4.0       1841.0      14122.0  0.004169   
3    0.10  0.0100        4.0       1781.0      13855.0  0.004370   
4    0.25  0.0025        4.0       2216.0      15378.0  0.003133   
5    0.25  0.0050        4.0       2153.0      15211.0  0.003283   
6    0.25  0.0075        4.0       2077.0      14990.0  0.003476   
7    0.25  0.0100        4.0       2000.0      14762.0  0.003692   
8    0.50  0.0025        4.0       2310.0      15641.0  0.002932   
9    0.50  0.0050        4.0       2265.0      15504.0  0.003023   
10   0.50  0.0075        4.0       2217.0      15384.0  0.003131   
11   0.50  0.0100        4.0       2200.0      15348.0  0.003173   
12   0.90  0.0050        4.0       2328.0      15667.0  0.002892   
13   0.90  0.0075        4.0       2306.0      1




In [16]:
alpha_values = [0.1]
beta_values = [0.01]
max_depth_values = [2, 3, 4]

In [17]:
# Initialize the resulting DataFrame
columns = ['Alpha', 'Beta', 'Max depth', 'Node number', 'Edge number', 'Density', 'Coverage(train)', 
           'Coverage(complete)', 'Avg. degree', 'Runtime']
results_df_DFS_d = pd.DataFrame(columns=columns)

for alpha in tqdm(alpha_values, desc="Alpha loop"):
    for beta in beta_values:
        for max_depth in max_depth_values:
                
            # Skip the undesired combination of alpha and beta
            if alpha == 0.9 and beta == 0.0025:
                continue

            start_time = time.time()

            subgraph = spreading_activation_DFS(graph_copy, G, train_seed_entities, neighbor_counts, 
                                                alpha, beta, max_depth, extraction_threshold=0,
                                                fan_out=True, excl=True, pop=True)
            G_sub = nx.DiGraph()  # Create a directed graph for the subgraph

            for s, p, o in subgraph:
                G_sub.add_edge(s, o, predicate=p)

            #Exhibihate the size of subgraph: The number of nodes and edges in the subgraph.
            num_nodes = len(G_sub.nodes())
            num_edges = len(G_sub.edges())

            #Exhibihate the density of subgraph: The ratio of the number of edges to the number of nodes in the subgraph.
            density = num_edges / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

            #Exhibihate the coverage(only train_seed_entities): Percentage of the seed entities that are included in the subgraph.
            seed_entities_in_subgraph = [entity for entity in train_seed_entities if URIRef(entity) in G_sub.nodes()]
            coverage_train = len(seed_entities_in_subgraph) / len(train_seed_entities)
            
            #Exhibihate the coverage(complete_seed_entities): Percentage of the seed entities that are included in AIFB.
            seed_entities_in_subgraph = [entity for entity in complete_seed_entities if URIRef(entity) in G_sub.nodes()]
            coverage_complete = len(seed_entities_in_subgraph) / len(complete_seed_entities)

            #Exhibihate the Relevance: How many of the nodes and edges in the subgraph are relevant
            avg_degree = sum(dict(G_sub.degree()).values()) / num_nodes if num_nodes != 0 else 0

            end_time = time.time()
            runtime = end_time - start_time

            # Add results to DataFrame
            results_df_DFS_d = results_df_DFS_d.append({
                'Alpha': alpha,
                'Beta': beta,
                'Max depth': max_depth,
                'Node number': num_nodes,
                'Edge number': num_edges,
                'Density': density,
                'Coverage(train)': coverage_train,
                'Coverage(complete)': coverage_complete,
                'Avg. degree': avg_degree,
                'Runtime': runtime
            }, ignore_index=True)
                
# Export to CSV
results_df_DFS_d.to_csv("parameter_impact_ex6.csv", index=False)

print(results_df_DFS_d)    

Alpha loop: 100%|██████████████████████████████████████████████████████████████████████| 1/1 [41:50<00:00, 2510.68s/it]

   Alpha  Beta  Max depth  Node number  Edge number   Density  \
0    0.1  0.01        2.0       1643.0      13230.0  0.004904   
1    0.1  0.01        3.0       1759.0      13668.0  0.004420   
2    0.1  0.01        4.0       1781.0      13855.0  0.004370   

   Coverage(train)  Coverage(complete)  Avg. degree      Runtime  
0              1.0            0.914773    16.104687   684.011912  
1              1.0            0.943182    15.540648   699.555607  
2              1.0            0.943182    15.558675  1127.109778  





## Test impact of 3 boolean parameters (fan_out, excl, pop)

### when fan_out=False, excl=True, pop=True

In [17]:
# Initialize the resulting DataFrame
columns = ['Alpha', 'Beta', 'Max hops', 'Node number', 'Edge number', 'Density', 'Coverage(train)', 
           'Coverage(complete)', 'Avg. degree', 'Runtime']
results_df_bool = pd.DataFrame(columns=columns)

alpha = 0.186
beta = 0.006
max_hops = 3

start_time = time.time()

subgraph = spreading_activation_BFS(graph, G, train_seed_entities, neighbor_counts, 
                                    alpha=0.186, beta=0.006, max_hops=3, extraction_threshold=0,
                                    fan_out=False, excl=True, pop=True)
G_sub = nx.DiGraph()  # Create a directed graph for the subgraph

for s, p, o in subgraph:
    G_sub.add_edge(s, o, predicate=p)

#Exhibihate the size of subgraph: The number of nodes and edges in the subgraph.
num_nodes = len(G_sub.nodes())
num_edges = len(G_sub.edges())

#Exhibihate the density of subgraph: The ratio of the number of edges to the number of nodes in the subgraph.
density = num_edges / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

#Exhibihate the coverage(only train_seed_entities): Percentage of the seed entities that are included in the subgraph.
seed_entities_in_subgraph = [entity for entity in train_seed_entities if URIRef(entity) in G_sub.nodes()]
coverage_train = len(seed_entities_in_subgraph) / len(train_seed_entities)

#Exhibihate the coverage(complete_seed_entities): Percentage of the seed entities that are included in AIFB.
seed_entities_in_subgraph = [entity for entity in complete_seed_entities if URIRef(entity) in G_sub.nodes()]
coverage_complete = len(seed_entities_in_subgraph) / len(complete_seed_entities)

#Exhibihate the Relevance: How many of the nodes and edges in the subgraph are relevant
avg_degree = sum(dict(G_sub.degree()).values()) / num_nodes if num_nodes != 0 else 0

end_time = time.time()
runtime = end_time - start_time

# Add results to DataFrame
results_df_bool = results_df_bool.append({
    'Alpha': alpha,
    'Beta': beta,
    'Max hops': max_hops,
    'Node number': num_nodes,
    'Edge number': num_edges,
    'Density': density,
    'Coverage(train)': coverage_train,
    'Coverage(complete)': coverage_complete,
    'Avg. degree': avg_degree,
    'Runtime': runtime
}, ignore_index=True)
                
# Export to CSV
results_df_bool.to_csv("Impact of 3 boolean parameters_1.csv", index=False)

print(results_df_bool)    

   Alpha   Beta  Max hops  Node number  Edge number   Density  \
0  0.186  0.006       3.0       2127.0      14906.0  0.003296   

   Coverage(train)  Coverage(complete)  Avg. degree     Runtime  
0              1.0            0.948864    14.015985  807.009719  


### when fan_out=True, excl=False, pop=True

In [18]:
# Initialize the resulting DataFrame
columns = ['Alpha', 'Beta', 'Max hops', 'Node number', 'Edge number', 'Density', 'Coverage(train)', 
           'Coverage(complete)', 'Avg. degree', 'Runtime']
results_df_bool = pd.DataFrame(columns=columns)

alpha = 0.186
beta = 0.006
max_hops = 3

start_time = time.time()

subgraph = spreading_activation_BFS(graph, G, train_seed_entities, neighbor_counts, 
                                    alpha=0.186, beta=0.006, max_hops=3, extraction_threshold=0,
                                    fan_out=True, excl=False, pop=True)
G_sub = nx.DiGraph()  # Create a directed graph for the subgraph

for s, p, o in subgraph:
    G_sub.add_edge(s, o, predicate=p)

#Exhibihate the size of subgraph: The number of nodes and edges in the subgraph.
num_nodes = len(G_sub.nodes())
num_edges = len(G_sub.edges())

#Exhibihate the density of subgraph: The ratio of the number of edges to the number of nodes in the subgraph.
density = num_edges / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

#Exhibihate the coverage(only train_seed_entities): Percentage of the seed entities that are included in the subgraph.
seed_entities_in_subgraph = [entity for entity in train_seed_entities if URIRef(entity) in G_sub.nodes()]
coverage_train = len(seed_entities_in_subgraph) / len(train_seed_entities)

#Exhibihate the coverage(complete_seed_entities): Percentage of the seed entities that are included in AIFB.
seed_entities_in_subgraph = [entity for entity in complete_seed_entities if URIRef(entity) in G_sub.nodes()]
coverage_complete = len(seed_entities_in_subgraph) / len(complete_seed_entities)

#Exhibihate the Relevance: How many of the nodes and edges in the subgraph are relevant
avg_degree = sum(dict(G_sub.degree()).values()) / num_nodes if num_nodes != 0 else 0

end_time = time.time()
runtime = end_time - start_time

# Add results to DataFrame
results_df_bool = results_df_bool.append({
    'Alpha': alpha,
    'Beta': beta,
    'Max hops': max_hops,
    'Node number': num_nodes,
    'Edge number': num_edges,
    'Density': density,
    'Coverage(train)': coverage_train,
    'Coverage(complete)': coverage_complete,
    'Avg. degree': avg_degree,
    'Runtime': runtime
}, ignore_index=True)
                
# Export to CSV
results_df_bool.to_csv("Impact of 3 boolean parameters_2.csv", index=False)

print(results_df_bool)    

   Alpha   Beta  Max hops  Node number  Edge number   Density  \
0  0.186  0.006       3.0       1547.0      12185.0  0.005095   

   Coverage(train)  Coverage(complete)  Avg. degree    Runtime  
0              1.0              0.9375     15.75307  35.938984  


### when fan_out=True, excl=True, pop=False

In [19]:
# Initialize the resulting DataFrame
columns = ['Alpha', 'Beta', 'Max hops', 'Node number', 'Edge number', 'Density', 'Coverage(train)', 
           'Coverage(complete)', 'Avg. degree', 'Runtime']
results_df_bool = pd.DataFrame(columns=columns)

alpha = 0.186
beta = 0.006
max_hops = 3

start_time = time.time()

subgraph = spreading_activation_BFS(graph, G, train_seed_entities, neighbor_counts, 
                                    alpha=0.186, beta=0.006, max_hops=3, extraction_threshold=0,
                                    fan_out=True, excl=True, pop=False)
G_sub = nx.DiGraph()  # Create a directed graph for the subgraph

for s, p, o in subgraph:
    G_sub.add_edge(s, o, predicate=p)

#Exhibihate the size of subgraph: The number of nodes and edges in the subgraph.
num_nodes = len(G_sub.nodes())
num_edges = len(G_sub.edges())

#Exhibihate the density of subgraph: The ratio of the number of edges to the number of nodes in the subgraph.
density = num_edges / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

#Exhibihate the coverage(only train_seed_entities): Percentage of the seed entities that are included in the subgraph.
seed_entities_in_subgraph = [entity for entity in train_seed_entities if URIRef(entity) in G_sub.nodes()]
coverage_train = len(seed_entities_in_subgraph) / len(train_seed_entities)

#Exhibihate the coverage(complete_seed_entities): Percentage of the seed entities that are included in AIFB.
seed_entities_in_subgraph = [entity for entity in complete_seed_entities if URIRef(entity) in G_sub.nodes()]
coverage_complete = len(seed_entities_in_subgraph) / len(complete_seed_entities)

#Exhibihate the Relevance: How many of the nodes and edges in the subgraph are relevant
avg_degree = sum(dict(G_sub.degree()).values()) / num_nodes if num_nodes != 0 else 0

end_time = time.time()
runtime = end_time - start_time

# Add results to DataFrame
results_df_bool = results_df_bool.append({
    'Alpha': alpha,
    'Beta': beta,
    'Max hops': max_hops,
    'Node number': num_nodes,
    'Edge number': num_edges,
    'Density': density,
    'Coverage(train)': coverage_train,
    'Coverage(complete)': coverage_complete,
    'Avg. degree': avg_degree,
    'Runtime': runtime
}, ignore_index=True)
                
# Export to CSV
results_df_bool.to_csv("Impact of 3 boolean parameters_3.csv", index=False)

print(results_df_bool)    

   Alpha   Beta  Max hops  Node number  Edge number   Density  \
0  0.186  0.006       3.0       1434.0      10714.0  0.005214   

   Coverage(train)  Coverage(complete)  Avg. degree     Runtime  
0              1.0            0.897727    14.942817  129.537624  


### when fan_out=True, excl=False, pop=False

In [20]:
# Initialize the resulting DataFrame
columns = ['Alpha', 'Beta', 'Max hops', 'Node number', 'Edge number', 'Density', 'Coverage(train)', 
           'Coverage(complete)', 'Avg. degree', 'Runtime']
results_df_bool = pd.DataFrame(columns=columns)

alpha = 0.186
beta = 0.006
max_hops = 3

start_time = time.time()

subgraph = spreading_activation_BFS(graph, G, train_seed_entities, neighbor_counts, 
                                    alpha=0.186, beta=0.006, max_hops=3, extraction_threshold=0,
                                    fan_out=True, excl=False, pop=False)
G_sub = nx.DiGraph()  # Create a directed graph for the subgraph

for s, p, o in subgraph:
    G_sub.add_edge(s, o, predicate=p)

#Exhibihate the size of subgraph: The number of nodes and edges in the subgraph.
num_nodes = len(G_sub.nodes())
num_edges = len(G_sub.edges())

#Exhibihate the density of subgraph: The ratio of the number of edges to the number of nodes in the subgraph.
density = num_edges / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

#Exhibihate the coverage(only train_seed_entities): Percentage of the seed entities that are included in the subgraph.
seed_entities_in_subgraph = [entity for entity in train_seed_entities if URIRef(entity) in G_sub.nodes()]
coverage_train = len(seed_entities_in_subgraph) / len(train_seed_entities)

#Exhibihate the coverage(complete_seed_entities): Percentage of the seed entities that are included in AIFB.
seed_entities_in_subgraph = [entity for entity in complete_seed_entities if URIRef(entity) in G_sub.nodes()]
coverage_complete = len(seed_entities_in_subgraph) / len(complete_seed_entities)

#Exhibihate the Relevance: How many of the nodes and edges in the subgraph are relevant
avg_degree = sum(dict(G_sub.degree()).values()) / num_nodes if num_nodes != 0 else 0

end_time = time.time()
runtime = end_time - start_time

# Add results to DataFrame
results_df_bool = results_df_bool.append({
    'Alpha': alpha,
    'Beta': beta,
    'Max hops': max_hops,
    'Node number': num_nodes,
    'Edge number': num_edges,
    'Density': density,
    'Coverage(train)': coverage_train,
    'Coverage(complete)': coverage_complete,
    'Avg. degree': avg_degree,
    'Runtime': runtime
}, ignore_index=True)
                
# Export to CSV
results_df_bool.to_csv("Impact of 3 boolean parameters_4.csv", index=False)

print(results_df_bool)    

   Alpha   Beta  Max hops  Node number  Edge number   Density  \
0  0.186  0.006       3.0       1470.0      11421.0  0.005289   

   Coverage(train)  Coverage(complete)  Avg. degree    Runtime  
0              1.0            0.931818    15.538776  11.308762  
