In [26]:
%load_ext autoreload
%autoreload 1

import sys
sys.path.append("../../utils/")

from GraphAPI import GraphCreator
from graph_helpers import *

%aimport GraphAPI
%aimport graph_helpers

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Generating Graph from Entry Point

1. We initialize our GraphCreator class and check how many new nodes we will need to query. 

In [38]:
# Build the graph around the "Decision tree" entry-point article.
# Note: `gc` here is the GraphCreator instance, not the stdlib `gc` module.
gc = GraphCreator("Decision tree")
# Size of the frontier still to be queried, followed by the primary-node mapping.
print("Number of Links to Search:", len(gc.next_links))
print(gc.primary_nodes)

Number of Links to Search: 257
{'Design rationale': True, 'Algorithm': True, 'Tree (graph theory)': True, 'Random forest': True, 'Behavior tree (artificial intelligence, robotics and control)': True, 'Odds algorithm': True, 'Decision tree learning': True, 'Topological combinatorics': True, 'Boosting (machine learning)': True, 'Truth table': True, 'Decision list': True, 'Goal': True, 'DRAKON': True, 'Decision tree model': True, 'Decision analysis': True, 'Operations research': True, 'Causal model': True, 'Utility': True, 'Decision table': True, 'Markov chain': True, 'Decision cycle': True, 'Decision support system': True, 'Probability': True}


2. We query all the nodes linked to/from the entry point (expand our network one level for each node).

In [39]:
# Expand the network one level by querying the nodes linked to/from the entry point.
# NOTE(review): group_size and timeout presumably control request batching and the
# per-request timeout — confirm against GraphCreator.expand_network in GraphAPI.
gc.expand_network(group_size=2, timeout=5, log_progress=False)

3. Since some nodes will likely link to articles through a redirect, we need to traverse our graph and ensure that all redirects are assigned to the correct nodes. Once all redirects have been dealt with, we remove the old redirect nodes.

In [40]:
# Resolve redirect links onto their proper nodes before any weights are computed.
gc.redraw_redirects()

4. Edges are weighted by how many categories two connected nodes have in common. Once we have all our nodes, and we have dealt with redirects, we can add edge weights for our entire graph. 

In [41]:
# Compute edge weights for the whole graph (run after redirects are resolved),
# then preview the first rows of the resulting edge-weight table.
gc.update_edge_weights()
gc.get_edge_weights().head()

Unnamed: 0,source_node,target_node,edge_weight
0,Hysteria Project 2,Hysteria Project,7
1,Hysteria Project,Hysteria Project 2,7
2,Concept map,Mind map,5
3,Visual analytics,Visualization (graphics),5
4,Mind map,Concept map,5


# Getting Our Feature Set

There are two options when generating the feature set:

1. We can generate a standard feature set containing only the features themselves. To do this, set the `rank` parameter to `False`.
2. We can generate a ranked feature set (set `rank` to `True`). For each feature, the values are ranked from _best_ to _worst_ (this could be ascending or descending, depending on the context of the feature).

After running `get_features_df`, the feature set will also be saved in the `GraphCreator` instance as `features_df`.

In [42]:
# rank=False requests the standard (unranked) feature set.
features_df = gc.get_features_df(rank=False)

In [43]:
# Show only the ten nodes that share the most neighbours with the entry point,
# rather than dumping the entire feature table into the notebook output.
features_df.sort_values("shared_neighbors_with_entry_score", ascending=False).head(10)

Unnamed: 0,node,degree,category_matches_with_source,in_edges,out_edges,shared_neighbors_with_entry_score,centrality,page_rank,adjusted_reciprocity,shortest_path_length_from_entry,shortest_path_length_to_entry,primary_link
78,Decision tree,255,1,169.0,86.0,1.000000,8.854455e-02,0.001043,68.0,0.0,0.0,0
107,International Standard Book Number,135,0,133.0,2.0,0.594595,1.153773e-01,0.001201,4.0,1.0,2.0,0
113,Digital object identifier,126,0,124.0,2.0,0.554054,1.110512e-01,0.000965,4.0,1.0,2.0,0
127,Issue tree,98,0,38.0,60.0,0.238494,3.007667e-02,0.000213,46.0,1.0,1.0,0
111,Morphological analysis (problem-solving),130,0,67.0,63.0,0.233716,2.508264e-02,0.000377,58.0,1.0,1.0,0
139,Problem structuring methods,74,0,33.0,41.0,0.229730,1.973649e-02,0.000152,44.0,1.0,1.0,0
141,Diagrammatic reasoning,70,0,31.0,39.0,0.229730,1.529013e-02,0.000108,36.0,1.0,1.0,0
132,Business decision mapping,91,0,36.0,55.0,0.227848,1.786681e-02,0.000138,42.0,1.0,1.0,0
126,Information mapping,99,0,40.0,59.0,0.227273,2.063346e-02,0.000205,46.0,1.0,1.0,0
161,PubMed Identifier,51,0,,,0.225225,5.910859e-02,0.000380,0.0,1.0,,0


## Similarity Rank

Two articles are more similar the more categories they share and the closer they are to each other. 

In [44]:
# Run the similarity ranking (presumably what adds the `similarity_rank` column
# to gc.features_df), then list the ten highest-ranked nodes with the columns
# that feed into the ranking. Ties on similarity_rank are ordered by centrality.
gc.rank_similarity()
(
    gc.features_df
    .loc[:, [
        "node",
        "category_matches_with_source",
        "primary_link",
        "shared_neighbors_with_entry_score",
        "shortest_path_length_from_entry",
        "shortest_path_length_to_entry",
        "centrality",
        "similarity_rank",
    ]]
    .sort_values(by=["similarity_rank", "centrality"], ascending=False)
    .head(10)
)

Unnamed: 0,node,category_matches_with_source,primary_link,shared_neighbors_with_entry_score,shortest_path_length_from_entry,shortest_path_length_to_entry,centrality,similarity_rank
78,Decision tree,1,0,1.0,0.0,0.0,0.088545,inf
33,Design rationale,0,1,0.098847,1.0,1.0,0.034343,1.098847
60,Decision tree learning,1,1,0.067442,2.0,2.0,0.126444,1.033721
116,Decision analysis,1,1,0.065359,2.0,2.0,0.012034,1.03268
66,Random forest,1,1,0.056098,2.0,2.0,0.14331,1.028049
57,Tree (graph theory),0,1,0.018933,1.0,1.0,0.014493,1.018933
144,Decision cycle,0,1,0.018182,1.0,1.0,0.00216,1.018182
159,Decision table,1,1,0.034884,2.0,2.0,0.005249,1.017442
343,Decision list,0,1,0.017167,1.0,1.0,0.01763,1.017167
172,Decision tree model,1,1,0.02,2.0,2.0,0.008457,1.01


In [None]:
# Display the primary-node mapping held by the GraphCreator instance.
gc.primary_nodes

# Basic Plotting

In [None]:
# Pairwise plots across the feature columns.
# NOTE(review): `sns` is not imported in any visible cell — presumably it arrives
# via `from graph_helpers import *`; confirm or add an explicit seaborn import.
sns.pairplot(features_df)

In [None]:
# Correlate only the numeric/boolean feature columns. `features_df` also holds
# the string-valued `node` column, and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0 (older versions silently dropped such columns).
sns.heatmap(features_df.select_dtypes(include=["number", "bool"]).corr())

# Intro Node Values

In [None]:
# Collect the feature rows of every intro node, in the order of `gc.intro_nodes`.
intro_nodes = gc.intro_nodes

# Build every per-node slice first and concatenate once. The previous loop tested
# an undefined name (`df`) and re-concatenated the growing frame on each pass.
intro_node_frames = [features_df[features_df.node == node] for node in intro_nodes]

# With no intro nodes, fall back to an empty frame that keeps the feature columns,
# so downstream cells receive a DataFrame rather than None.
intro_nodes_df = (
    pd.concat(intro_node_frames) if intro_node_frames else features_df.iloc[0:0]
)

intro_nodes_df

In [None]:
# Pairwise plots restricted to the intro-node rows, coloured by node name.
sns.pairplot(intro_nodes_df, hue="node")