##  PREDICTING THE LINKS BETWEEN RESEARCH PAPERS

###  installing libraries

In [1]:
!pip install arxiv



In [2]:
!pip install node2vec



### collecting research papers

In [3]:
import networkx as nx
import pandas as pd
import numpy as np
import arxiv

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, classification_report
from itertools import product
from sklearn.metrics.pairwise import cosine_similarity
from node2vec import Node2Vec as n2v

# constants
queries = [
    'automl', 'machinelearning', 'data', 'phyiscs','mathematics', 'recommendation system', 'nlp', 'neural networks'
]

def search_arxiv(queries, max_results = 100):
    '''
    This function will search arxiv associated to a set of queries and store
    the latest 10000 (max_results) associated to that search.
    
    params:
        queries (List -> Str) : A list of strings containing keywords you want
                                to search on Arxiv
        max_results (Int) : The maximum number of results you want to see associated
                            to your search. Default value is 1000, capped at 300000
                            
    returns:
        This function will return a DataFrame holding the following columns associated
        to the queries the user has passed. 
            `title`, `date`, `article_id`, `url`, `main_topic`, `all_topics`
    
    example:
        research_df = search_arxiv(
            queries = ['automl', 'recommender system', 'nlp', 'data science'],
            max_results = 10000
        )
    '''
    d = []
    searches = []
    # hitting the API
    for query in queries:
        search = arxiv.Search(
          query = query,
          max_results = max_results,
          sort_by = arxiv.SortCriterion.SubmittedDate,
          sort_order = arxiv.SortOrder.Descending
        )
        searches.append(search)
    
    # Converting search result into df
    for search in searches:
        for res in search.results():
            data = {
                'title' : res.title,
                'date' : res.published,
                'article_id' : res.entry_id,
                'url' : res.pdf_url,
                'main_topic' : res.primary_category,
                'all_topics' : res.categories,
                'authors' : res.authors
            }
            d.append(data)
        
    d = pd.DataFrame(d)
    d['year'] = pd.DatetimeIndex(d['date']).year
    
    # change article id from url to integer
    unique_article_ids = d.article_id.unique()
    article_mapping = {art:idx for idx,art in enumerate(unique_article_ids)}
    d['article_id'] = d['article_id'].map(article_mapping)
    return d
  
research_df = search_arxiv(
    queries = queries,
    max_results = 100
)
print(research_df.shape)

(647, 8)


In [4]:
research_df

Unnamed: 0,title,date,article_id,url,main_topic,all_topics,authors,year
0,Automated Imbalanced Learning,2022-11-01 10:43:48+00:00,0,http://arxiv.org/pdf/2211.00376v1,cs.LG,"[cs.LG, cs.AI]","[Prabhant Singh, Joaquin Vanschoren]",2022
1,Efficient Automatic Machine Learning via Desig...,2022-10-21 21:25:59+00:00,1,http://arxiv.org/pdf/2210.12257v1,cs.LG,"[cs.LG, cs.AI]","[Shirley Wu, Jiaxuan You, Jure Leskovec, Rex Y...",2022
2,The Dark Side of AutoML: Towards Architectural...,2022-10-21 18:13:23+00:00,2,http://arxiv.org/pdf/2210.12179v1,cs.CR,"[cs.CR, cs.LG]","[Ren Pang, Changjiang Li, Zhaohan Xi, Shouling...",2022
3,Extensible Proxy for Efficient NAS,2022-10-17 22:18:22+00:00,3,http://arxiv.org/pdf/2210.09459v1,cs.LG,"[cs.LG, cs.AI, cs.CV]","[Yuhong Li, Jiajie Li, Cong Han, Pan Li, Jinju...",2022
4,Multi-Agent Automated Machine Learning,2022-10-17 13:32:59+00:00,4,http://arxiv.org/pdf/2210.09084v1,cs.LG,"[cs.LG, cs.CV]","[Zhaozhi Wang, Kefan Su, Jian Zhang, Huizhu Ji...",2022
...,...,...,...,...,...,...,...,...
642,A 5G Enabled Adaptive Computing Workflow for G...,2022-10-31 21:27:36+00:00,560,http://arxiv.org/pdf/2211.00150v1,eess.SY,"[eess.SY, cs.SY]","[Yousu Chen, Liwei Wang, Xiaoyuan Fan, Dexin W...",2022
643,A Machine Learning Tutorial for Operational Me...,2022-10-31 21:10:48+00:00,561,http://arxiv.org/pdf/2211.00147v1,cs.LG,"[cs.LG, cs.CV, physics.ao-ph]","[Randy J. Chase, David R. Harrison, Gary Lackm...",2022
644,Reconfigurable nonlinear optical element using...,2022-10-31 20:51:24+00:00,562,http://arxiv.org/pdf/2211.00136v1,physics.optics,"[physics.optics, physics.app-ph]","[Vahid Nikkhah, Mario Junior Mencagli, Nader E...",2022
645,A New Concept of the Value Function,2022-10-31 20:43:00+00:00,563,http://arxiv.org/pdf/2211.00131v1,econ.GN,"[econ.GN, q-fin.EC]",[Kazuo Sano],2022



###  Creating Network

In [5]:
def generate_network(df, node_col = 'article_id', edge_col = 'main_topic'):
    '''
    This function will generate a article to article network given an input DataFrame.
    It will do so by creating an edge_dictionary where each key is going to be a node
    referenced by unique values in node_col and the values will be a list of other nodes
    connected to the key through the edge_col.
    
    params:
        df (DataFrame) : The dataset which holds the node and edge columns
        node_col (String) : The column name associated to the nodes of the network
        edge_col (String) : The column name associated to the edges of the network
        
    returns:
        A networkx graph corresponding to the input dataset
        
    example:
        generate_network(
            research_df,
            node_col = 'article_id',
            edge_col = 'main_topic'
        )
    '''
    edge_dct = {}
    for i,g in df.groupby(node_col):
        topics = g[edge_col].unique()
        edge_df = df[(df[node_col] != i) & (df[edge_col].isin(topics))]
        edges = list(edge_df[node_col].unique())
        edge_dct[i] = edges
    
    # create nx network
    g = nx.Graph(edge_dct, create_using = nx.MultiGraph)
    return g
  
all_tp = research_df.explode('all_topics').copy()

tp_nx = generate_network(
    all_tp, 
    node_col = 'article_id', 
    edge_col = 'all_topics'
)

print(nx.info(tp_nx))

Name: 
Type: Graph
Number of nodes: 565
Number of edges: 30783
Average degree: 108.9664



### Applying Node2Vec

In [6]:
g_emb = n2v(tp_nx, dimensions=16)

WINDOW = 1 # Node2Vec fit window
MIN_COUNT = 1 # Node2Vec min. count
BATCH_WORDS = 4 # Node2Vec batch words

mdl = g_emb.fit(
    window=WINDOW,
    min_count=MIN_COUNT,
    batch_words=BATCH_WORDS
)

# create embeddings dataframe
emb_df = (
    pd.DataFrame(
        [mdl.wv.get_vector(str(n)) for n in tp_nx.nodes()],
        index = tp_nx.nodes
    )
)

print(emb_df.head())

Computing transition probabilities:   0%|          | 0/565 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 10/10 [00:11<00:00,  1.13s/it]


         0         1         2         3         4         5         6   \
0 -0.149058 -0.402597 -0.334330  0.433153  0.066343  0.498890  1.447854   
1 -0.150856 -0.355169 -0.376519  0.737697  0.029109  0.635733  1.470077   
2  0.779876 -0.384114 -0.467994  0.420550  0.215246  0.292632  1.305728   
3 -0.652150 -0.029106  0.415992  0.675087  0.188229  0.565279  1.183562   
4 -0.294912  0.008023  0.649466  1.210001  0.262638  0.378264  1.385447   

         7         8         9         10        11        12        13  \
0  0.437668  0.681068  0.208751 -0.263348 -1.023139  0.218350 -0.686383   
1  0.451108  0.590127  0.017675 -0.504675 -0.944975  0.400907 -0.670767   
2  0.279769  0.408319 -0.192753 -0.053831 -0.453482  0.052763 -0.682086   
3  0.731556  0.614214  0.436885 -0.160814 -0.432583  0.172820 -0.616740   
4  0.915668  0.306889  0.031628  0.015953  0.031025 -0.020863 -0.322636   

         14        15  
0 -0.073022  0.544622  
1 -0.357767  0.358673  
2 -0.357404  0.624114  
3 

### create training and testing

In [9]:
unique_nodes = list(tp_nx.nodes())
all_possible_edges = [(x,y) for (x,y) in product(unique_nodes, unique_nodes)]

# generate edge features for all pairs of nodes
edge_features = [
    (mdl.wv.get_vector(str(i)) + mdl.wv.get_vector(str(j))) for i,j in all_possible_edges
]

# get current edges in the network
edges = list(tp_nx.edges())

# create target list, 1 if the pair exists in the network, 0 otherwise
is_con = [1 if e in edges else 0 for e in all_possible_edges]

print(sum(is_con))

30783



###  model creation

In [10]:
# get training and target data
X = np.array(edge_features)
y = is_con

# train test split
x_train, x_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size = 0.3
)

# GBC classifier
clf = GradientBoostingClassifier()

# train the model
clf.fit(x_train, y_train)

### Evaluation

In [12]:
y_pred = clf.predict(x_test)
y_true = y_test

y_pred = clf.predict(x_test)
x_pred = clf.predict(x_train)
test_acc = accuracy_score(y_test, y_pred)
train_acc = accuracy_score(y_train, x_pred)
print("Testing Accuracy : ", test_acc)
print("Training Accuracy : ", train_acc)

print("MCC Score : ", matthews_corrcoef(y_true, y_pred))

print("Test Confusion Matrix : ")
print(confusion_matrix(y_pred,y_test))

print("Test Classification Report : ")
print(classification_report(y_test, clf.predict(x_test)))

Testing Accuracy :  0.8992460947289282
Training Accuracy :  0.9049123545022085
MCC Score :  0.30117577179485283
Test Confusion Matrix : 
[[83667  6870]
 [ 2779  2452]]
Test Classification Report : 
              precision    recall  f1-score   support

           0       0.92      0.97      0.95     86446
           1       0.47      0.26      0.34      9322

    accuracy                           0.90     95768
   macro avg       0.70      0.62      0.64     95768
weighted avg       0.88      0.90      0.89     95768



### Prediction

In [14]:
pred_ft = [(mdl.wv.get_vector(str('42'))+mdl.wv.get_vector(str('210')))]
print(clf.predict(pred_ft)[0])

print(clf.predict_proba(pred_ft))

0
[[0.50118293 0.49881707]]
