In [1]:
%load_ext autoreload
%autoreload 1

import sys
sys.path.append("../../utils/")

from GraphAPI import GraphCreator
from graph_helpers import *
from evaluations import *

%aimport GraphAPI
%aimport graph_helpers
%aimport evaluations

## Generating Graph from Entry Point

1. We initialize our GraphCreator class and check how many new nodes we will need to query. 

In [83]:
# Build a GraphCreator rooted at the "Decision tree" article.
# NOTE(review): include_see_also=False presumably keeps the article's "See also"
# links out of the graph so they can act as validation targets later in this
# notebook — confirm against GraphAPI.
gc = GraphCreator("Decision tree", include_see_also=False)
# Number of linked articles that still have to be queried when the network is expanded.
print("Number of Links to Search:", len(gc.next_links), "\n\n")
# Titles currently registered as primary nodes.
print(list(gc.primary_nodes.keys()), "\n\n")
# The see_also_articles list; it is passed as `targets` in the validation cell below.
print(gc.see_also_articles)

Number of Links to Search: 257 


['Causal model', 'Decision analysis', 'Algorithm', 'Operations research', 'Tree (graph theory)', 'Probability', 'Decision support system', 'Goal', 'Decision tree learning', 'Utility'] 


['Behavior tree (artificial intelligence, robotics and control)', 'Boosting (machine learning)', 'Decision cycle', 'Decision list', 'Decision table', 'Decision tree model', 'Design rationale', 'DRAKON', 'Markov chain', 'Random forest', 'Odds algorithm', 'Topological combinatorics', 'Truth table']


2. We query all the nodes linked to/from the entry point (expand our network one level for each node).

In [84]:
# Query the pending links so the network grows by one level around each node.
# NOTE(review): group_size=2, timeout=5 and log_progress=False are not explained
# in this notebook — presumably request batch size, a per-request timeout and a
# progress-logging switch; confirm meaning and units in GraphAPI.
gc.expand_network(group_size=2, timeout=5, log_progress=False)

3. Since some nodes will likely link to articles through a redirect, we need to traverse our graph and ensure that all redirects are assigned to the correct nodes. Once all redirects have been dealt with, we remove any old redirect nodes. 

In [85]:
# Reassign redirect links to the correct article nodes and remove the leftover
# redirect nodes, as described in step 3 above.
# NOTE(review): the return value is discarded, so this presumably mutates gc in place.
gc.redraw_redirects()

4. Edges are weighted by how many categories two connected nodes have in common. Once we have all our nodes, and we have dealt with redirects, we can add edge weights for our entire graph. 

In [86]:
# Add edge weights for the whole graph; intended to run once all nodes are
# present and redirects have been resolved (see step 4 above).
gc.update_edge_weights()
# Preview the first rows of the weighted edge table.
gc.get_edge_weights().head()

Unnamed: 0,source_node,target_node,edge_weight
0,Hysteria Project 2,Hysteria Project,7
1,Hysteria Project,Hysteria Project 2,7
2,Mind map,Concept map,5
3,Glossary of artificial intelligence,Deep learning,5
4,Visualization (graphics),Visual analytics,5


# Getting Our Feature Set

There are two options when generating the feature set:

1. We can generate a standard feature set with only the features themselves. To do this, set the `rank` parameter to `False`.
2. We can generate a ranked feature set (set `rank` to `True`). For each feature, the nodes are ranked from _best_ to _worst_ (this could be ascending or descending, depending on the context of the feature).

After running `get_features_df`, the feature set will be saved in the GraphCreator instance as `features_df`.

In [87]:
# rank=False requests the standard (unranked) feature set described above.
# Later cells read the feature set from the instance as gc.features_df.
features_df = gc.get_features_df(rank=False)

In [88]:
# List nodes from highest to lowest degree to see which articles are the biggest hubs.
features_df.sort_values("degree", ascending=False)

Unnamed: 0,node,degree,category_matches_with_source,in_edges,out_edges,shared_neighbors_with_entry_score,centrality,page_rank,adjusted_reciprocity,shortest_path_length_from_entry,shortest_path_length_to_entry,jaccard_similarity,primary_link
0,PubMed,18773,0,18684.0,89.0,0.002806,6.618804e-02,0.130279,0.030428,1.0,3.0,0.001913,0
1,Logic,3492,0,2794.0,698.0,0.005084,9.545089e-02,0.019050,0.214215,2.0,1.0,0.003726,0
2,Algorithm,3381,0,2993.0,388.0,0.011272,1.690803e-01,0.014804,0.062682,1.0,2.0,0.008291,1
3,Index of philosophy articles (D–H),3034,0,13.0,3021.0,0.002162,3.453011e-04,0.000080,0.006086,3.0,1.0,0.005525,0
4,Time,2896,0,1970.0,926.0,0.003509,5.513420e-02,0.013656,0.330452,1.0,3.0,0.000468,0
5,Dungeon Master,2676,0,2342.0,334.0,0.000384,7.487214e-06,0.026458,0.178310,5.0,1.0,0.000000,0
6,Machine learning,2609,0,2080.0,529.0,0.023376,2.859254e-01,0.008701,0.150316,2.0,1.0,0.017647,0
7,Game theory,2555,0,1795.0,760.0,0.013363,1.396371e-01,0.010582,0.304892,2.0,1.0,0.006663,0
8,Causality,2280,0,1280.0,1000.0,0.007481,6.790412e-02,0.008155,0.292721,1.0,2.0,0.002768,0
9,Glossary of artificial intelligence,2022,0,319.0,1703.0,0.020329,1.922587e-01,0.001952,0.082156,2.0,1.0,0.027368,0


## Similarity Rank

Two articles are more similar the more categories they share and the closer they are to each other. 

In [89]:
# Adds the `similarity_rank` column to gc.features_df.
gc.rank_similarity()

# Most similar articles first, renumbered from 0 so each row label is its rank.
(
    gc.features_df
    .sort_values("similarity_rank", ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,node,degree,category_matches_with_source,in_edges,out_edges,shared_neighbors_with_entry_score,centrality,page_rank,adjusted_reciprocity,shortest_path_length_from_entry,shortest_path_length_to_entry,jaccard_similarity,primary_link,similarity_rank
0,Decision tree,255,1,169.0,86.0,1.000000,8.850722e-02,0.000686,0.020691,0.0,0.0,1.000000,0,4.295497
1,Decision analysis,119,1,70.0,49.0,0.065359,1.203684e-02,0.000280,0.008520,2.0,2.0,0.039130,1,0.935841
2,Decision tree learning,328,1,172.0,156.0,0.067442,1.260537e-01,0.000635,0.054771,2.0,2.0,0.052469,1,0.854511
3,Causal model,89,0,14.0,75.0,0.026667,5.477489e-03,0.000068,0.001217,1.0,2.0,0.000000,1,0.618289
4,Tree (graph theory),399,0,314.0,85.0,0.018933,1.455990e-02,0.001675,0.017040,1.0,1.0,0.010460,1,0.594951
5,Influence diagram,54,1,20.0,34.0,0.063492,1.999828e-02,0.000130,0.004260,2.0,2.0,0.038462,0,0.517049
6,"Behavior tree (artificial intelligence, roboti...",25,1,6.0,19.0,0.016807,2.105911e-02,0.000044,0.002434,2.0,2.0,0.017442,0,0.507942
7,Information gain in decision trees,54,1,11.0,43.0,0.042802,1.905330e-02,0.000050,0.004260,2.0,2.0,0.028571,0,0.506728
8,ID3 algorithm,80,1,20.0,60.0,0.061151,1.953274e-02,0.000079,0.003651,2.0,2.0,0.044199,0,0.503419
9,Decision tree model,39,1,14.0,25.0,0.020000,8.440935e-03,0.000092,0.003043,2.0,2.0,0.022346,0,0.497116


# Validation

In [90]:
# Compare the candidate metrics by how highly each one places the
# see_also_articles targets.
# NOTE(review): in the recorded output, shortest_path_length_from_entry puts 0%
# of targets in the top 20% — confirm that evaluate_metrics ranks
# lower-is-better metrics in the intended direction.
evaluate_metrics(
    gc.features_df,
    on=[
        "similarity_rank",
        "centrality",
        "adjusted_reciprocity",
        "page_rank",
        "shortest_path_length_from_entry",
        "jaccard_similarity",
    ],
    targets=gc.see_also_articles,
)

Metric Score,score,max score possible,difference,total targets,% targets in top 1%,% targets in top 5%,% targets in top 10%,% targets in top 20%
similarity_rank,0.952936,0.999786,0.04685,13.0,0.769231,1.0,1.0,1.0
centrality,0.893202,0.999786,0.106584,13.0,0.615385,0.846154,0.923077,1.0
adjusted_reciprocity,0.891288,0.999786,0.108497,13.0,0.923077,0.923077,0.923077,1.0
page_rank,0.963807,0.999786,0.035979,13.0,0.846154,1.0,1.0,1.0
shortest_path_length_from_entry,0.654135,0.999786,0.345651,13.0,0.0,0.0,0.0,0.0
jaccard_similarity,0.698609,0.999786,0.301176,13.0,0.230769,0.538462,0.846154,0.846154


In [91]:
# Order every node by similarity (best first). After the reset, a row's
# positional label is that node's 0-based similarity rank.
df = gc.features_df.sort_values("similarity_rank", ascending=False).reset_index(drop=True)

# Build the node -> rank lookup once instead of filtering the whole frame for
# each article. setdefault keeps the first (best) position if a title repeats.
rank_by_node = {}
for rank, node in enumerate(df.node):
    rank_by_node.setdefault(node, rank)

# (title, rank) for each "See also" article. A title that is absent from the
# feature frame (e.g. removed as a redirect) is reported with rank None rather
# than raising IndexError.
# The (misspelled) variable name is kept so later cells that use it still work.
see_also_indeces = [(node, rank_by_node.get(node)) for node in gc.see_also_articles]

see_also_indeces

[('Behavior tree (artificial intelligence, robotics and control)', 6),
 ('Boosting (machine learning)', 59),
 ('Decision cycle', 584),
 ('Decision list', 323),
 ('Decision table', 10),
 ('Decision tree model', 9),
 ('Design rationale', 46),
 ('DRAKON', 649),
 ('Markov chain', 171),
 ('Random forest', 11),
 ('Odds algorithm', 2853),
 ('Topological combinatorics', 726),
 ('Truth table', 251)]

In [92]:
# Total number of nodes in the graph, for scale when reading the rank positions above.
len(gc.graph.nodes)

60632