In [1]:
%load_ext autoreload
%autoreload 1

import sys

import numpy as np
import pandas as pd

sys.path.append("../../utils/")

from GraphAPI import GraphCreator

%aimport GraphAPI

## Generating Graph from Entry Point

1. We initialize our GraphCreator class and check how many new nodes we will need to query. 

In [17]:
# Build the graph from the "Random forest" entry point, then report how many
# linked pages still need to be queried and which nodes were found in the intro.
# NOTE(review): the name `gc` would shadow the standard-library `gc` module if
# that module were ever imported in this notebook.
gc = GraphCreator("Random forest")
print("Number of Links to Search:", len(gc.next_links))
print(gc.intro_nodes)

Number of Links to Search: 294
['Ensemble learning', 'Statistical classification', 'Regression analysis', 'Decision tree learning', 'Mode (statistics)', 'Overfitting', 'Test set', 'Tin Kam Ho', 'Random subspace method', 'Leo Breiman', 'Adele Cutler', 'Trademark', 'Minitab', 'Bootstrap aggregating', 'Donald Geman']


2. We query all the nodes linked to from the entry point (expand our network one level for each node).

In [18]:
# Expand the network by querying the pages linked from the entry point.
# NOTE(review): group_size / timeout are defined by GraphAPI — presumably the
# request batch size and a timeout in seconds; confirm against the module.
gc.expand_network(group_size=2, timeout=5, log_progress=False)

3. Since some nodes will likely have linked to articles through a redirect link, we need to traverse our graph to ensure that all redirects are assigned to the correct nodes. Once all redirects have been dealt with, we remove any old redirect nodes. 

In [19]:
# Resolve redirect links so edges point at the real article nodes
# (implemented in GraphAPI; must run after the network has been expanded).
gc.redraw_redirects()

4. Edges are weighted by how many categories two connected nodes have in common. Once we have all our nodes, and we have dealt with redirects, we can add edge weights for our entire graph. 

In [20]:
# Recompute the edge weights for the graph, then preview the first rows of the
# resulting edge-weight table.
gc.update_edge_weights()
gc.get_edge_weights().head()

Unnamed: 0,source_node,target_node,edge_weight
0,Caffe (software),Deeplearning4j,11
1,Deeplearning4j,Caffe (software),11
2,BigDL,Caffe (software),10
3,BigDL,Deeplearning4j,10
4,Robert Tibshirani,Trevor Hastie,7


# Getting Our Feature Set

There are two options when generating the feature set:

1. We can generate a standard feature set with only the features themselves. To do this, have the `rank` parameter set to `False`.
2. We can generate a ranked feature set (set `rank` equal to `True`). For each feature, this will rank the nodes in order of _best_ to _worst_ (this could be ascending or descending, depending on the context of the feature).

After running `get_features_df`, the feature set will be saved in the GraphCreator instance as `features_df`

In [21]:
# Build the unranked feature table; the return value is kept in a local name
# that the cells below operate on, while the copy stored on `gc` is displayed.
# NOTE(review): assumes the returned frame matches `gc.features_df` — confirm.
# NOTE(review): a later cell reads `*_ranked` columns, which presumably only
# exist when rank=True — confirm before running it.
features_df = gc.get_features_df(rank=False)
gc.features_df

Unnamed: 0,node,degree,category_matches_with_source,in_edges,out_edges,centrality,dispersion,page_rank,adjusted_reciprocity,shortest_path_length_from_source
0,Mathematical Reviews,7631,0,7597.0,34.0,0.020324,0.000000,0.065673,16.0,1.0
1,Melting point,6119,0,5905.0,214.0,0.000155,,0.071935,238.0,3.0
2,Wikipedia,3631,0,2829.0,802.0,0.003955,3.222222,0.035580,644.0,1.0
3,Machine learning,2603,1,2081.0,522.0,0.146751,6.510000,0.012884,504.0,2.0
4,Trademark,2545,0,2305.0,240.0,0.001404,0.333333,0.025271,224.0,1.0
5,Glossary of artificial intelligence,1996,1,319.0,1677.0,0.142135,5.300000,0.002345,268.0,2.0
6,Regression analysis,1845,0,1201.0,644.0,0.126552,3.500000,0.007207,998.0,1.0
7,Data mining,1540,0,1052.0,488.0,0.116360,4.787234,0.007846,520.0,1.0
8,R (programming language),1392,0,1041.0,351.0,0.032128,2.250000,0.010947,446.0,1.0
9,Toxicology,1345,0,949.0,396.0,0.001794,,0.010258,382.0,2.0


## Similarity Rank

Two articles are more similar the more categories they share and the closer they are to each other. 

In [22]:
# Score similarity on the GraphCreator, then show the ten highest-scoring nodes,
# ordered by similarity rank and then adjusted reciprocity (both descending).
gc.rank_similarity()
display_columns = ["node", "category_matches_with_source", "similarity_rank", "adjusted_reciprocity"]
sort_keys = ["similarity_rank", "adjusted_reciprocity"]
gc.features_df[display_columns].sort_values(sort_keys, ascending=False).head(10)

Unnamed: 0,node,category_matches_with_source,similarity_rank,adjusted_reciprocity
132,Gradient boosting,3,1.5,20.0
18,Statistical classification,2,1.0,652.0
19,Artificial neural network,2,1.0,270.0
57,Decision tree learning,2,1.0,186.0
73,Boosting (machine learning),2,1.0,160.0
97,Bootstrap aggregating,2,1.0,152.0
129,AdaBoost,2,0.666667,30.0
163,Out-of-bag error,2,0.666667,6.0
433,Random subspace method,2,0.666667,4.0
22,Linear discriminant analysis,1,0.5,644.0


In [23]:
# Show the nodes found in the entry article's introduction.
# NOTE(review): the saved output for this cell lists music-theory articles,
# which does not match the "Random forest" intro nodes printed earlier —
# presumably stale from a different run; re-run the notebook before sharing.
gc.intro_nodes

['Musical form',
 'Classical music era',
 'Movement (music)',
 'Tonality',
 'Exposition (music)',
 'Musical development',
 'Recapitulation (music)',
 'Introduction (music)',
 'Coda (music)',
 'Sonata',
 'Symphony',
 'Concerto',
 'String quartet',
 'Musical analysis']

In [None]:
def similarity_rank(row):
    """Score how similar a node is to the source article.

    The score is the number of categories the node shares with the source
    divided by its shortest path length from the source, so nodes that are
    further away are penalized.

    Args:
        row: A feature row exposing ``category_matches_with_source`` and
            ``shortest_path_length_from_source`` as attributes (e.g. a row
            passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        The similarity score, or 0 when the score is NaN or when the division
        raises ``TypeError`` / ``ZeroDivisionError``.

    Raises:
        AttributeError: If ``row`` lacks one of the two required fields.
    """
    try:
        # similarity is penalized by longer paths
        sim_score = row.category_matches_with_source / row.shortest_path_length_from_source
        # if a path from the source does not exist, it is given a similarity score of 0
        return 0 if np.isnan(sim_score) else sim_score
    except (TypeError, ZeroDivisionError):
        # Only data problems (e.g. None values, a zero path length) fall back to 0.
        # A bare `except:` here would also hide NameError (missing numpy import) and
        # missing-column errors, silently scoring every node as 0.
        return 0
    
# Score every row, then list the nodes from highest to lowest similarity
# alongside the two inputs the score is derived from.
features_df['similarity_rank'] = features_df.apply(similarity_rank, axis=1)
similarity_columns = ['node', "similarity_rank", "category_matches_with_source", "shortest_path_length_from_source"]
features_df[similarity_columns].sort_values("similarity_rank", ascending=False)

In [None]:
def average_rank(row):
    """Combine a row's ranked features into a single score.

    The degree and dispersion ranks are averaged, and that mean is then scaled
    by the category-match rank and the shortest-path-length rank. The
    centrality, page-rank and adjusted-reciprocity ranks are currently left
    out of the average.

    Args:
        row: A feature row exposing the ``*_ranked`` attributes read below.

    Returns:
        The mean of the averaged ranks multiplied by the two scaling ranks.
    """
    averaged_ranks = (row.degree_ranked, row.dispersion_ranked)
    mean_rank = np.mean(averaged_ranks)
    return mean_rank * row.category_matches_with_source_ranked * row.shortest_path_length_from_source_ranked

# Attach the combined rank to every row and list the rows in ascending order of it.
# NOTE(review): average_rank reads `*_ranked` columns; presumably these only
# exist when the features were generated with rank=True — confirm.
rank_average = features_df.apply(average_rank, axis=1)
features_df["rank_average"] = rank_average
features_df.sort_values(by="rank_average")

In [None]:
# features_df.dispersion = features_df.dispersion.fillna(0.0)
# features_df.shortest_path_length_from_source = features_df.shortest_path_length_from_source.fillna(-1)

# Basic Plotting

In [None]:
# Pairwise plots of the feature columns.
# NOTE(review): `sns` is not imported in any visible cell — presumably
# `import seaborn as sns`; add it to the import cell at the top.
sns.pairplot(features_df)

In [None]:
# Correlation heatmap of the numeric feature columns.
# `features_df` also carries the string `node` column, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0, so keep numeric columns only.
# NOTE(review): `sns` is not imported in any visible cell — presumably
# `import seaborn as sns`; add it to the import cell at the top.
sns.heatmap(features_df.select_dtypes(include="number").corr())

# Intro Node Values

In [None]:
# Collect the feature rows for the intro nodes, in the order the nodes are listed.
intro_nodes = gc.intro_nodes
# Build every per-node slice first and concatenate once. The previous loop tested
# an undefined name (`df`) and therefore raised NameError on its first iteration.
intro_node_frames = [features_df[features_df.node == node] for node in intro_nodes]
# pd.concat rejects an empty list, so fall back to an empty frame with the same columns.
intro_nodes_df = pd.concat(intro_node_frames) if intro_node_frames else features_df.iloc[0:0]

intro_nodes_df

In [None]:
# Pairwise plots of the intro-node features, coloured by node name.
# NOTE(review): `sns` is not imported in any visible cell — presumably
# `import seaborn as sns`; add it to the import cell at the top.
sns.pairplot(intro_nodes_df, hue="node")