In [12]:
# import transformers
# from transformers import RobertaTokenizer, TFRobertaModel

In [1]:
#Importing relevant packages
import glob
import os
import io
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import math
import itertools
import random
import csv

from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

import nltk
from nltk.stem import WordNetLemmatizer

from sentence_transformers import SentenceTransformer, util

from itertools import islice
import seaborn as sns
from collections import Counter

In [6]:
stop_words = set(nltk.corpus.stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

## Import the data

In [43]:
training = pd.read_csv('../data/training_set.txt', sep = ' ', header = None)
training.columns = ['source_id', 'target_id', 'Y']

testing = pd.read_csv('../data/testing_set.txt', sep = ' ', header = None)
testing.columns = ['source_id', 'target_id']

node_info = pd.read_csv('../data/node_information.csv', header = None)
node_info.columns  = ['id', 'year', 'title', 'authors', 'journal_name', 'abstract']
node_info['authors'] = node_info['authors'].fillna(value='')
node_info['journal_name'] = node_info['journal_name'].fillna(value='')

In [3]:
training.shape

(615512, 3)

## Create nodes and edges file - No need to run

In [45]:
nodes_df = node_info.loc[:,['id', 'title']]
nodes_df.to_csv('../data/nodes.csv',index=False)

In [37]:
edges_df = training.loc[training['Y'] == 1,['source_id','target_id'] ]
edges_df.to_csv('../data/edges.csv',index=False)

## Process node info

In [4]:
node_info.shape

(27770, 6)

In [48]:
node_info.head()

Unnamed: 0,id,year,title,authors,journal_name,abstract
0,1001,2000,compactification geometry and duality,Paul S. Aspinwall,,these are notes based on lectures given at tas...
1,1002,2000,domain walls and massive gauged supergravity p...,"M. Cvetic, H. Lu, C.N. Pope",Class.Quant.Grav.,we point out that massive gauged supergravity ...
2,1003,2000,comment on metric fluctuations in brane worlds,"Y.S. Myung, Gungwon Kang",,recently ivanov and volovich hep-th 9912242 cl...
3,1004,2000,moving mirrors and thermodynamic paradoxes,Adam D. Helfer,Phys.Rev.,quantum fields responding to moving mirrors ha...
4,1005,2000,bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert",,proceedings of lie iii clausthal july 1999 var...


In [49]:
def remove_brackets_authors(name):
    if name == "":
        return ""
    
    name = re.sub(r'\(.*?\)',"",name)
    clean_name = []
    for x in name.split(","):
        if x == "" or x == " ":
            continue
            
        if x.find('(') == -1:
            clean_name.append(x.strip())
            continue
            
        clean_name.append(x.replace(x[x.find('('):], "").strip())
    return ",".join(clean_name)             

def process_title(title):
    title_tokens = [token for token in title.split(" ") if token not in stop_words]
    title_tokens = [lemmatizer.lemmatize(token) for token in title_tokens]
    return " ".join(title_tokens)

def process_abstract(abstract):
    abstract_tokens = [token for token in abstract.split(" ") if token not in stop_words]
    abstract_tokens = [lemmatizer.lemmatize(token) for token in abstract_tokens]
    return " ".join(abstract_tokens)

In [50]:
def process_node_info():
    
    # Process authors
    node_info['authors'] = node_info['authors'].apply(lambda names: remove_brackets_authors(names).lower()) 
    
    # Process journal name
    node_info['journal_name'] = node_info['journal_name'].apply(lambda jn: jn[:-1].lower() if "," in jn else jn.lower())
    
    # Process title
    node_info['title'] = node_info['title'].apply(lambda title: process_title(title))
    
    # Process abstract
    node_info['abstract'] = node_info['abstract'].apply(lambda title: process_abstract(title))
    

In [51]:
process_node_info()

In [52]:
node_info.head()

Unnamed: 0,id,year,title,authors,journal_name,abstract
0,1001,2000,compactification geometry duality,paul s. aspinwall,,note based lecture given tasi99 review geometr...
1,1002,2000,domain wall massive gauged supergravity potential,"m. cvetic,h. lu,c.n. pope",class.quant.grav.,point massive gauged supergravity potential ex...
2,1003,2000,comment metric fluctuation brane world,"y.s. myung,gungwon kang",,recently ivanov volovich hep-th 9912242 claime...
3,1004,2000,moving mirror thermodynamic paradox,adam d. helfer,phys.rev.,quantum field responding moving mirror predict...
4,1005,2000,bundle chiral block boundary condition cft,"j. fuchs,c. schweigert",,proceeding lie iii clausthal july 1999 various...


In [53]:
node_info.to_csv("../data/updated_node_information.csv", index=False)

## Process training data

In [19]:
training = pd.merge(training, node_info[['id', 'year', 'authors', 'title', 'journal_name', 'abstract']],how='left', left_on='source_id', right_on='id') \
.drop('id', axis=1) \
.rename(columns={"year":"source_year", 'authors':'source_authors', 'title':'source_title', 'journal_name':'source_jn', 'abstract':'source_abstract'})

training = pd.merge(training, node_info[['id', 'year', 'authors', 'title', 'journal_name', 'abstract']],how='left', left_on='target_id', right_on='id') \
.drop('id', axis=1) \
.rename(columns={"year":"target_year", 'authors':'target_authors', 'title':'target_title', 'journal_name':'target_jn', 'abstract':'target_abstract'})

In [20]:
testing = pd.merge(testing, node_info[['id', 'year', 'authors', 'title', 'journal_name', 'abstract']],how='left', left_on='source_id', right_on='id') \
.drop('id', axis=1) \
.rename(columns={"year":"source_year", 'authors':'source_authors', 'title':'source_title', 'journal_name':'source_jn', 'abstract':'source_abstract'})

testing = pd.merge(testing, node_info[['id', 'year', 'authors', 'title', 'journal_name', 'abstract']],how='left', left_on='target_id', right_on='id') \
.drop('id', axis=1) \
.rename(columns={"year":"target_year", 'authors':'target_authors', 'title':'target_title', 'journal_name':'target_jn', 'abstract':'target_abstract'})

In [21]:
training.head(2)

Unnamed: 0,source_id,target_id,Y,source_year,source_authors,source_title,source_jn,source_abstract,target_year,target_authors,target_title,target_jn,target_abstract
0,9510123,9502114,1,1995,,an infinite number of potentials surrounding 2...,phys.lett.,we found an infinite number of potentials surr...,1995,"won t. kim,julian lee,young jai park",stability analysis of the dilatonic black hole...,phys.lett.,we explicitly show that the net number of degr...
1,9707075,9604178,1,1997,"l.e.ibanez,a.m.uranga",d 6 n 1 string vacua and duality,,winter school on duality mt sorak korea februa...,1996,"atish dabholkar,jaemo park",strings on orientifolds,nucl.phys.,we construct several examples of compactificat...


### Year diff

In [22]:
training['year_diff'] = training['source_year'] - training['target_year']
training['year_diff'] = training['year_diff'].apply(lambda diff : -1 if diff < 0 else diff)

In [23]:
testing['year_diff'] = testing['source_year'] - testing['target_year']
testing['year_diff'] = testing['year_diff'].apply(lambda diff : -1 if diff < 0 else diff)

### Common Authors

In [24]:
def common_authors(source, target):
    if source == "" or target == "":
        return 0
    source = source.split(",")
    target = target.split(",")
    return len(set(source) & (set(target)))

In [25]:
training['common_authors'] = training.apply(lambda row: common_authors(row.source_authors, row.target_authors), axis = 1)

In [26]:
testing['source_authors'] = testing['source_authors'].fillna(value='')
testing['target_authors'] = testing['target_authors'].fillna(value='')

In [27]:
testing['common_authors'] = testing.apply(lambda row: common_authors(row.source_authors, row.target_authors), axis = 1)

### Same Journal

In [28]:
training['same_journal'] = training.apply(lambda row: 1 if row.source_jn != "" and row.target_jn != "" and row.source_jn == row.target_jn else 0, axis = 1)

In [29]:
testing['same_journal'] = testing.apply(lambda row: 1 if row.source_jn != "" and row.target_jn != "" and row.source_jn == row.target_jn else 0, axis = 1)

### Title and Abstract TFIDF similarity

In [30]:
stop_words = set(nltk.corpus.stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def tokenize_string(string):
    new_string = string.lower().split(" ")
    return [lemmatizer.lemmatize(token) for token in new_string if token not in stop_words]

vectorizer = TfidfVectorizer(tokenizer=tokenize_string)
tfidf_abstract = vectorizer.fit_transform(node_info["abstract"])

vectorizer = TfidfVectorizer(tokenizer=tokenize_string)
tfidf_title = vectorizer.fit_transform(node_info["title"])

In [31]:
tfidf_title = tfidf_title.todense()
tfidf_abstract = tfidf_abstract.todense()

def compute_similarity(src, trgt, vectorizer):
    src_pos = np.where(node_info.id == src)[0].item()
    trgt_pos = np.where(node_info.id == trgt)[0].item()
    
    if vectorizer == 'title':
        src = np.asarray(tfidf_title[src_pos])
        trgt = np.asarray(tfidf_title[trgt_pos])
    elif vectorizer == 'abstract':
        src = np.asarray(tfidf_abstract[src_pos])
        trgt = np.asarray(tfidf_abstract[trgt_pos])
    else:
        return 0
    
    return cosine_similarity(src, trgt).item()


In [32]:
%%time
training['title_similarity'] = training.apply(lambda row: compute_similarity(row.source_id, row.target_id, 'title'), axis = 1)

CPU times: user 2min 25s, sys: 490 ms, total: 2min 26s
Wall time: 2min 26s


In [33]:
%%time
testing['title_similarity'] = testing.apply(lambda row: compute_similarity(row.source_id, row.target_id, 'title'), axis = 1)

CPU times: user 7.75 s, sys: 14.5 ms, total: 7.77 s
Wall time: 7.77 s


In [34]:
%%time
training['abstract_similarity'] = training.apply(lambda row: compute_similarity(row.source_id, row.target_id, 'abstract'), axis = 1)

CPU times: user 3min 36s, sys: 2.87 s, total: 3min 39s
Wall time: 3min 41s


In [35]:
%%time
testing['abstract_similarity'] = testing.apply(lambda row: compute_similarity(row.source_id, row.target_id, 'abstract'), axis = 1)

CPU times: user 11.3 s, sys: 106 ms, total: 11.5 s
Wall time: 11.4 s


### Title and Abstract embeddings 

In [54]:
roberta_model = SentenceTransformer('stsb-roberta-large')

In [55]:
print(node_info.shape)
node_info.head()

(27770, 6)


Unnamed: 0,id,year,title,authors,journal_name,abstract
0,1001,2000,compactification geometry duality,paul s. aspinwall,,note based lecture given tasi99 review geometr...
1,1002,2000,domain wall massive gauged supergravity potential,"m. cvetic,h. lu,c.n. pope",class.quant.grav.,point massive gauged supergravity potential ex...
2,1003,2000,comment metric fluctuation brane world,"y.s. myung,gungwon kang",,recently ivanov volovich hep-th 9912242 claime...
3,1004,2000,moving mirror thermodynamic paradox,adam d. helfer,phys.rev.,quantum field responding moving mirror predict...
4,1005,2000,bundle chiral block boundary condition cft,"j. fuchs,c. schweigert",,proceeding lie iii clausthal july 1999 various...


In [58]:
def get_embeddings(values):
    return roberta_model.encode(values, show_progress_bar = True)

In [59]:
%%time
title_embeddings = get_embeddings(list(node_info.title))
abstract_embeddings = get_embeddings(list(node_info.abstract))

Batches:   0%|          | 0/868 [00:00<?, ?it/s]

Batches:   0%|          | 0/868 [00:00<?, ?it/s]

CPU times: user 19h 17min 35s, sys: 1h 4min 41s, total: 20h 22min 17s
Wall time: 2h 38min 48s


In [88]:
def find_cosine_similarity(src_id, trgt_id, embeddings):
    src_idx = node_info[node_info['id']==src_id].index.values.astype(int)[0]
    trgt_idx = node_info[node_info['id']==trgt_id].index.values.astype(int)[0]
    
    if embeddings == "title":
        src_embed = title_embeddings[src_idx]
        trgt_embed = title_embeddings[trgt_idx]
        
    if embeddings == "abstract":
        src_embed = abstract_embeddings[src_idx]
        trgt_embed = abstract_embeddings[trgt_idx]
        
    cosine_score = util.cos_sim(src_embed, trgt_embed)
    
    return cosine_score.item()

In [89]:
%time
training['title_embed_sim'] = training.apply(lambda row: find_cosine_similarity(row.source_id,row.target_id, "title"), axis = 1)
training['abstract_embed_sim'] = training.apply(lambda row: find_cosine_similarity(row.source_id,row.target_id, "abstract"), axis = 1)

testing['title_embed_sim'] = testing.apply(lambda row: find_cosine_similarity(row.source_id,row.target_id, "title"), axis = 1)
testing['abstract_sembed_sim'] = testing.apply(lambda row: find_cosine_similarity(row.source_id,row.target_id, "abstract"), axis = 1)


CPU times: user 5 µs, sys: 3 µs, total: 8 µs
Wall time: 14.8 µs


In [116]:
testing.rename(columns={'abstract_sembed_sim':'abstract_embed_sim'}, inplace=True)

In [117]:
testing.columns

Index(['source_id', 'target_id', 'source_year', 'source_authors',
       'source_title', 'source_jn', 'source_abstract', 'target_year',
       'target_authors', 'target_title', 'target_jn', 'target_abstract',
       'year_diff', 'common_authors', 'same_journal', 'title_similarity',
       'abstract_similarity', 'source_in_centrality', 'source_out_centrality',
       'target_in_centrality', 'target_out_centrality',
       'source_degree_centrality', 'target_degree_centrality', 'source_k_core',
       'target_k_core', 'pref_attach_directed', 'jacc_index',
       'pref_attach_undirected', 'common_neighbors', 'adamic_adar',
       'distance_nodes', 'distance_nodes2', 'distance_nodes3',
       'target_pagerank', 'source_hub_score', 'target_authority_score',
       'target_hub_score', 'source_authority_score', 'common_neighbors_b',
       'title_embed_sim', 'abstract_embed_sim'],
      dtype='object')

In [60]:
# title_embeds = roberta_model.encode(node_info.title.values, show_progress_bar = True)

In [None]:
len(title_embeds)

In [67]:
%%time
#Compute embedding for both lists
embeddings1 = roberta_model.encode('I like dogs', convert_to_tensor=True)
embeddings2 = roberta_model.encode('i do not like dogs', convert_to_tensor=True)

#Compute cosine-similarits
cosine_scores = util.cos_sim(embeddings1, embeddings2)

CPU times: user 3.1 s, sys: 242 ms, total: 3.34 s
Wall time: 502 ms


In [68]:
cosine_scores.item()

0.5931638479232788

In [7]:
training.to_csv("../data/cleaned_trainig_set.csv", index=False)

In [65]:
testing.to_csv("../data/cleaned_testing_set.csv", index=False)

## Network properties

In [34]:
training = pd.read_csv("../data/cleaned_trainig_set.csv")

In [35]:
testing = pd.read_csv("../data/cleaned_testing_set.csv")

In [36]:
node_info= pd.read_csv("../data/updated_node_information.csv")

In [13]:
nodes = list(node_info.id)

In [38]:
print(training.shape)
print(node_info.shape)

(615512, 18)
(27770, 6)


In [39]:
testing.shape

(32648, 17)

### Directed Graph

In [14]:
#Graph
training_values = training.values.tolist()
edges = [(node_pair[0], node_pair[1]) for node_pair in training_values if node_pair[2] == 1]

G = nx.DiGraph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)

In [7]:
out_degree_centrality = nx.out_degree_centrality(G)
in_degree_centrality = nx.in_degree_centrality(G)
deg_centrality = nx.degree_centrality(G)
# betweeness_centrality = nx.betweenness_centrality(G, k=10)
k_core = nx.core_number(G)

In [9]:
%%time
training['source_in_centrality'] = training.apply(lambda row: in_degree_centrality[row.source_id], axis = 1)
training['source_out_centrality'] = training.apply(lambda row: out_degree_centrality[row.source_id], axis = 1)
training['target_in_centrality'] = training.apply(lambda row: in_degree_centrality[row.target_id], axis = 1)
training['target_out_centrality'] = training.apply(lambda row: out_degree_centrality[row.target_id], axis = 1)

training['source_degree_centrality'] = training.apply(lambda row: deg_centrality[row.source_id], axis = 1)
training['target_degree_centrality'] = training.apply(lambda row: deg_centrality[row.target_id], axis = 1)

training['source_k_core'] = training.apply(lambda row: k_core[row.source_id], axis = 1)
training['target_k_core'] = training.apply(lambda row: k_core[row.target_id], axis = 1)

# Preferential Attachment
training['pref_attach_directed'] = training.apply(lambda row: row.source_out_centrality * row.target_in_centrality, axis = 1)

CPU times: user 28.2 s, sys: 607 ms, total: 28.8 s
Wall time: 28.9 s


In [68]:
%%time
testing['source_in_centrality'] = testing.apply(lambda row: in_degree_centrality[row.source_id], axis = 1)
testing['source_out_centrality'] = testing.apply(lambda row: out_degree_centrality[row.source_id], axis = 1)
testing['target_in_centrality'] = testing.apply(lambda row: in_degree_centrality[row.target_id], axis = 1)
testing['target_out_centrality'] = testing.apply(lambda row: out_degree_centrality[row.target_id], axis = 1)

testing['source_degree_centrality'] = testing.apply(lambda row: deg_centrality[row.source_id], axis = 1)
testing['target_degree_centrality'] = testing.apply(lambda row: deg_centrality[row.target_id], axis = 1)

testing['source_k_core'] = testing.apply(lambda row: k_core[row.source_id], axis = 1)
testing['target_k_core'] = testing.apply(lambda row: k_core[row.target_id], axis = 1)

# Preferential Attachment
testing['pref_attach_directed'] = testing.apply(lambda row: row.source_out_centrality * row.target_in_centrality, axis = 1)

CPU times: user 1.54 s, sys: 34.9 ms, total: 1.58 s
Wall time: 1.58 s


In [15]:
# HITS algorithm
hub_score, authority_score = nx.hits(G)

In [19]:
%%time
training['source_hub_score'] = training.apply(lambda row: hub_score[row.source_id], axis = 1)
training['target_hub_score'] = training.apply(lambda row: hub_score[row.target_id], axis = 1)
testing['source_hub_score'] = testing.apply(lambda row: hub_score[row.source_id], axis = 1)
testing['target_hub_score'] = testing.apply(lambda row: hub_score[row.target_id], axis = 1)

training['source_authority_score'] = training.apply(lambda row: authority_score[row.source_id], axis = 1)
training['target_authority_score'] = training.apply(lambda row: authority_score[row.target_id], axis = 1)
testing['target_authority_score'] = testing.apply(lambda row: authority_score[row.target_id], axis = 1)
testing['source_authority_score'] = testing.apply(lambda row: authority_score[row.source_id], axis = 1)

CPU times: user 13 s, sys: 363 ms, total: 13.4 s
Wall time: 13.4 s


### Undirected Graph

In [10]:
G_ud = G.to_undirected()

In [11]:
# Jaccobian Index
def jacc_index(G, source, target):
    jacc = 0
    if (source not in G.nodes() or target not in G.nodes()):
        return -1
    preds = nx.jaccard_coefficient(G, [(source, target)])
    for _, _, jacc in preds:
        return jacc

In [None]:
%%time
training['jacc_index'] = training.apply(lambda row: jacc_index(G_ud, row.source_id, row.target_id), axis = 1)

In [69]:
%%time
testing['jacc_index'] = testing.apply(lambda row: jacc_index(G_ud, row.source_id, row.target_id), axis = 1)

CPU times: user 1.45 s, sys: 55.9 ms, total: 1.51 s
Wall time: 1.53 s


In [12]:
%%time
# Preferencial Attachment
def pref_attach(G, source, target):
    if (source not in G.nodes() or target not in G.nodes()):
        return -1
    preds = nx.preferential_attachment(G, [(source, target)])
    for _, _, pref in preds:
        return pref

training['pref_attach_undirected'] = training.apply(lambda row: pref_attach(G_ud, row.source_id, row.target_id), axis = 1)

CPU times: user 7.04 s, sys: 101 ms, total: 7.14 s
Wall time: 7.14 s


In [70]:
%%time
testing['pref_attach_undirected'] = testing.apply(lambda row: pref_attach(G_ud, row.source_id, row.target_id), axis = 1)

CPU times: user 402 ms, sys: 11.6 ms, total: 414 ms
Wall time: 411 ms


In [13]:
training.head(1)

Unnamed: 0,source_id,target_id,Y,source_year,source_authors,source_title,source_jn,source_abstract,target_year,target_authors,...,source_out_centrality,target_in_centrality,target_out_centrality,source_degree_centrality,target_degree_centrality,source_k_core,target_k_core,pref_attach_directed,jacc_index,pref_attach_undirected
0,9510123,9502114,1,1995,,infinit number potenti surround 2d black hole,phys.lett.,found infinit number potenti surround 2d black...,1995,"won t. kim,julian lee,young jai park",...,0.000108,0.000288,0.000144,0.000216,0.000432,5,8,3.112367e-08,0.058824,72


In [14]:
%%time
# Common Neighbors and Common Neighbors Count
def common_neighbors(G, source, target):
    if (source not in G.nodes() or target not in G.nodes):
        return -1
    neighbors = list(nx.common_neighbors(G, source, target))
    return len(neighbors)

training['common_neighbors'] = training.apply(lambda row: common_neighbors(G_ud, row.source_id, row.target_id), axis = 1)

CPU times: user 21.2 s, sys: 144 ms, total: 21.4 s
Wall time: 21.4 s


In [71]:
%%time
testing['common_neighbors'] = testing.apply(lambda row: common_neighbors(G_ud, row.source_id, row.target_id), axis = 1)

CPU times: user 1.14 s, sys: 13 ms, total: 1.15 s
Wall time: 1.15 s


In [26]:
def common_neighbors_b(cn):
    if cn > 0:
        return True
    return False

In [27]:
training['common_neighbors_b'] = training.apply(lambda row: common_neighbors_b(row.common_neighbors), axis = 1)
testing['common_neighbors_b'] = testing.apply(lambda row: common_neighbors_b(row.common_neighbors), axis = 1)

In [15]:
%%time
# Adamic Adar Index
def adamic_adar(G, source, target):
    if (source not in G.nodes() or target not in G.nodes()):
        return -1
    preds = nx.adamic_adar_index(G, [(source, target)])
    for _, _, aa in preds:
        return aa

training['adamic_adar'] = training.apply(lambda row: adamic_adar(G_ud, row.source_id, row.target_id), axis = 1)

CPU times: user 25.7 s, sys: 120 ms, total: 25.8 s
Wall time: 25.8 s


In [72]:
%%time
testing['adamic_adar'] = testing.apply(lambda row: adamic_adar(G_ud, row.source_id, row.target_id), axis = 1)

CPU times: user 1.42 s, sys: 17.3 ms, total: 1.44 s
Wall time: 1.44 s


In [16]:
%%time
#Distance Between Nodes
def shortest_path(G, source, target, k=2):
    try:
        paths = list(islice(nx.shortest_simple_paths(G, source, target), k))
    except:
        return -1
    paths = nx.shortest_simple_paths(G, source, target)
    for path in paths:
        if len(path)!=2:
            return len(path)
    return -1

training['distance_nodes'] = training.apply(lambda row: shortest_path(G_ud, row.source_id, row.target_id), axis = 1)

CPU times: user 4min 36s, sys: 1.15 s, total: 4min 37s
Wall time: 4min 37s


In [73]:
%%time
testing['distance_nodes'] = testing.apply(lambda row: shortest_path(G_ud, row.source_id, row.target_id), axis = 1)

CPU times: user 15.6 s, sys: 80.2 ms, total: 15.6 s
Wall time: 15.6 s


In [17]:
%%time
def shortest_path2(G, source, target, k=3):
    c=0
    try:
        paths = list(islice(nx.shortest_simple_paths(G, source, target), k))
    except:
        return -1
    for path in paths:
        if len(path)!=2:
            c+=1
            if c==2:
                return len(path)
    return -1


training['distance_nodes2'] = training.apply(lambda row: shortest_path2(G_ud, row.source_id, row.target_id), axis = 1)

CPU times: user 7min 57s, sys: 2.32 s, total: 7min 59s
Wall time: 7min 59s


In [74]:
%%time
testing['distance_nodes2'] = testing.apply(lambda row: shortest_path2(G_ud, row.source_id, row.target_id), axis = 1)

CPU times: user 26.7 s, sys: 133 ms, total: 26.8 s
Wall time: 26.8 s


In [18]:
%%time
def shortest_path3(G, source, target, k=4):
    c=0
    try:
        paths = list(islice(nx.shortest_simple_paths(G, source, target), k))
    except:
        return -1
    for path in paths:
        if len(path)!=2:
            c+=1
            if c==3:
                return len(path)
    return -1

training['distance_nodes3'] = training.apply(lambda row: shortest_path3(G_ud, row.source_id, row.target_id), axis = 1)

CPU times: user 12min 5s, sys: 3.2 s, total: 12min 9s
Wall time: 12min 9s


In [75]:
%%time
testing['distance_nodes3'] = testing.apply(lambda row: shortest_path3(G_ud, row.source_id, row.target_id), axis = 1)

CPU times: user 40.3 s, sys: 181 ms, total: 40.4 s
Wall time: 40.5 s


In [44]:
# Page rank
page_rank = nx.pagerank(G)
training['target_pagerank'] = training.apply(lambda row: page_rank[row.target_id], axis = 1)
testing['target_pagerank'] = testing.apply(lambda row: page_rank[row.target_id], axis = 1)

In [None]:
training.columns

In [None]:
training.shape

In [90]:
training.to_csv("../data/final_trainig_set1.csv", index=False)

In [118]:
testing.to_csv("../data/final_testing_set1.csv", index=False)

## Model training

In [92]:
temp_training = pd.read_csv("../data/final_trainig_set1.csv")
temp_testing = pd.read_csv("../data/final_testing_set1.csv")

In [71]:
training = pd.read_csv("../data/final_trainig_set.csv")
testing = pd.read_csv("../data/final_testing_set.csv")

In [93]:
training_df = temp_training.drop(labels=['source_year','source_authors',
                                    'source_title','source_jn','source_abstract',
                                    'target_year','target_authors',
                                    'target_title','target_jn','target_abstract'], axis=1)
testing_df = temp_testing.drop(labels=['source_year','source_authors',
                                    'source_title','source_jn','source_abstract',
                                    'target_year','target_authors',
                                    'target_title','target_jn','target_abstract'], axis=1)

In [122]:
print(len(training_df.columns))
print(len(testing_df.columns))


print(training_df.columns)
print(testing_df.columns)

32
31
Index(['source_id', 'target_id', 'Y', 'year_diff', 'common_authors',
       'same_journal', 'title_similarity', 'abstract_similarity',
       'source_in_centrality', 'source_out_centrality', 'target_in_centrality',
       'target_out_centrality', 'source_degree_centrality',
       'target_degree_centrality', 'source_k_core', 'target_k_core',
       'pref_attach_directed', 'jacc_index', 'pref_attach_undirected',
       'common_neighbors', 'adamic_adar', 'distance_nodes', 'distance_nodes2',
       'distance_nodes3', 'target_pagerank', 'source_hub_score',
       'target_authority_score', 'target_hub_score', 'source_authority_score',
       'common_neighbors_b', 'title_embed_sim', 'abstract_embed_sim'],
      dtype='object')
Index(['source_id', 'target_id', 'year_diff', 'common_authors', 'same_journal',
       'title_similarity', 'abstract_similarity', 'source_in_centrality',
       'source_out_centrality', 'target_in_centrality',
       'target_out_centrality', 'source_degree_centra

In [170]:
training[['source_id','Y','source_title','target_title','source_abstract','target_abstract','title_embed_sim','abstract_embed_sim']].loc[4,:]

source_id                                                       9701033
Y                                                                     0
source_title                       quantum gravit measur three-geometri
target_title          israel condit gauss-bonnet theori friedmann equat
source_abstract       modif publish gravit measur arbitrari topolog ...
target_abstract       brane univers addit bulk field assum einstein-...
title_embed_sim                                                 0.17473
abstract_embed_sim                                             0.730622
Name: 4, dtype: object

In [172]:
training[['source_abstract','target_abstract']]

Unnamed: 0,source_abstract,target_abstract
0,found infinit number potenti surround 2d black...,explicitli show net number degre freedom two-d...
1,winter school dualiti mt sorak korea februari ...,construct sever exampl compactif type iib theo...
2,examin correspond conform field theori boundar...,reissner-nordstr om black hole result follow h...
3,supersymmetr scale invari theori discuss gener...,show dijkgraaf-vafa matrix model propos extend...
4,modif publish gravit measur arbitrari topolog ...,brane univers addit bulk field assum einstein-...
...,...,...
615507,non-perturb method quantis light-con le houch ...,kresimir puhep1 princeton edu pupt-1427 iassns...
615508,investig scatter electron infinit thin infinit...,studi metric minim area punctur riemann surfac...
615509,construct boundari state d-brane su 2 group ma...,method construct canon gaug invari quantum for...
615510,start n 1 scalar supermultiplet 2 1 dimens bui...,model bled slovenia juli 17-27 2001 old idea p...


In [99]:
X = training_df.drop(columns = 'Y')
y = training_df['Y']

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 41)

### XGBoost

In [26]:
%%time
xgb = XGBClassifier(learning_rate = 0.1, n_estimators = 1000, max_depth = 25)
xgb.fit(X_train, y_train)



CPU times: user 1h 9min 9s, sys: 8min 48s, total: 1h 17min 58s
Wall time: 10min 1s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=25, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=1000, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [27]:
y_pred = xgb.predict(X_test)

In [28]:
f1_score(y_pred, y_test)

0.9816732113543262

## Light GBM

In [173]:
def lightGBMClassifier1(X_train, y_train):

    grid_params = {'n_estimators': [1175, 1200, 1250],
                   'num_leaves': [250, 300, 350],
                   'min_child_samples': [80, 100]}

    rf_clf = LGBMClassifier()

    rf_cv = GridSearchCV(estimator = rf_clf, 
                         param_grid=grid_params, 
                         cv =5, 
                         verbose=1, 
                         scoring = 'f1_weighted', 
                         n_jobs=-1)
    
    rf_cv.fit(X_train, y_train.values.ravel())
    print("best_score_", rf_cv.best_score_)
    print("best_params_", rf_cv.best_params_)
    
    rf_best = rf_cv.best_estimator_
    return rf_best
#     y_predicted = rf_best.predict(X_test)
#     rf_accuracy = accuracy_score(y_test, y_predicted)
#     rf_f1 = f1_score(y_test, y_predicted, average="weighted")
#     print("rf_accuracy, rf_f1", rf_accuracy, rf_f1)




In [174]:
%%time
gbm = lightGBMClassifier1(X,y)

# best_score_ 0.9808501773477625
# best_params_ {'min_child_samples': 90, 'n_estimators': 1100, 'num_leaves': 400}

# best_score_ 0.9808827083203641
# best_params_ {'min_child_samples': 80, 'n_estimators': 1200, 'num_leaves': 405}

# best_score_ 0.9808940299134872
# best_params_ {'min_child_samples': 81, 'n_estimators': 1250, 'num_leaves': 405}

# best_score_ 0.9809020231922009
# best_params_ {'min_child_samples': 82, 'n_estimators': 1250, 'num_leaves': 406}

# best_score_ 0.9809166500036139
# best_params_ {'min_child_samples': 82, 'n_estimators': 1252, 'num_leaves': 406}

# best_score_ 0.980924780052322
# best_params_ {'min_child_samples': 82, 'n_estimators': 1256, 'num_leaves': 406}


# Fitting 5 folds for each of 18 candidates, totalling 90 fits
# best_score_ 0.9826454820577499
# best_params_ {'min_child_samples': 80, 'n_estimators': 1200, 'num_leaves': 350}
# CPU times: user 4min 49s, sys: 1min 3s, total: 5min 52s
# Wall time: 44min 34s

Fitting 5 folds for each of 18 candidates, totalling 90 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already

In [40]:
gbm = LGBMClassifier(min_child_samples=82, n_estimators=1256, num_leaves=406)
gbm.fit(X, y)

LGBMClassifier(min_child_samples=82, n_estimators=1256, num_leaves=406)

In [43]:
ypreds = gbm.predict(X_test)
print(accuracy_score(y_test, ypreds))

1.0


In [77]:
def lightGBMClassifier(X_train, X_test, y_train, y_test):

    grid_params = {'n_estimators': [300, 500, 700],
                   'num_leaves': [100, 150, 200],
                   'min_child_samples': [50, 100, 150]}

    rf_clf = LGBMClassifier()

    rf_cv = GridSearchCV(estimator = rf_clf, 
                         param_grid=grid_params, 
                         cv =2, 
                         verbose=2, 
                         scoring = 'f1_weighted', 
                         n_jobs=-1)
    
    rf_cv.fit(X_train, y_train.values.ravel())
    print("rf_cv.best_params_", rf_cv.best_params_)
    
    rf_best = rf_cv.best_estimator_
    y_predicted = rf_best.predict(X_test)
    rf_accuracy = accuracy_score(y_test, y_predicted)
    rf_f1 = f1_score(y_test, y_predicted, average="weighted")
    print("rf_accuracy, rf_f1", rf_accuracy, rf_f1)

    return rf_accuracy, rf_f1

In [78]:
%%time
a,f = lightGBMClassifier(X_train, X_test, y_train, y_test)
# rf_cv.best_params_ {'min_child_samples': 100, 'n_estimators': 500, 'num_leaves': 200}
# rf_accuracy, rf_f1 0.9801899769298255 0.9801950694901748

Fitting 2 folds for each of 27 candidates, totalling 54 fits
rf_cv.best_params_ {'min_child_samples': 100, 'n_estimators': 500, 'num_leaves': 200}
rf_accuracy, rf_f1 0.9801899769298255 0.9801950694901748
CPU times: user 1min 15s, sys: 14.4 s, total: 1min 29s
Wall time: 5min 9s


In [31]:
%%time
gbm = LGBMClassifier(num_leaves = 200, n_estimators = 500, min_child_samples = 100)
gbm.fit(X_train, y_train)
# LGBMClassifier(min_child_samples=100, n_estimators=500, num_leaves=200)

CPU times: user 1min, sys: 9.88 s, total: 1min 10s
Wall time: 9.07 s


LGBMClassifier(min_child_samples=100, n_estimators=500, num_leaves=200)

In [32]:
y_pred = gbm.predict(X_test)
f1_score(y_pred, y_test)

# 0.9810149635018287

0.9817532448098009

In [73]:
%%time
gbm = LGBMClassifier(num_leaves = 200, n_estimators = 500, min_child_samples = 100)
gbm.fit(X, y)

CPU times: user 1min 17s, sys: 21 s, total: 1min 38s
Wall time: 14.6 s


LGBMClassifier(min_child_samples=100, n_estimators=500, num_leaves=200)

## Random Forest

In [147]:
def randomForestClassifier1(X_train, y_train):

    grid_params = {'n_estimators': [ 390, 400, 410],
                   'max_depth': [90, 100, 110],
                   'min_samples_leaf': [1]}

    rf_clf = RandomForestClassifier()

    rf_cv = GridSearchCV(estimator = rf_clf, 
                         param_grid=grid_params, 
                         cv =5, 
                         verbose=1, 
                         scoring = 'f1_weighted', 
                         n_jobs=-1)
    
    rf_cv.fit(X_train, y_train.values.ravel())
    print("best_score_", rf_cv.best_score_)
    print("best_params_", rf_cv.best_params_)
    
    rf_best = rf_cv.best_estimator_
    return rf_best

In [148]:
%%time
rf = randomForestClassifier1(X, y)

# best_score_ 0.9793063101578294
# best_params_ {'max_depth': 110, 'min_samples_leaf': 1, 'n_estimators': 400}

# best_score_ 0.9792916555649237
# best_params_ {'max_depth': 108, 'min_samples_leaf': 1, 'n_estimators': 400}

# best_score_ 0.9799314414321392
# best_params_ {'max_depth': 50, 'min_samples_leaf': 1, 'n_estimators': 450}

# Fitting 5 folds for each of 32 candidates, totalling 160 fits
# best_score_ 0.9797498064066816
# best_params_ {'max_depth': 100, 'min_samples_leaf': 1, 'n_estimators': 400}
# CPU times: user 9min 59s, sys: 2.8 s, total: 10min 1s
# Wall time: 5h 6min

Fitting 5 folds for each of 9 candidates, totalling 45 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already 

In [29]:
rf

RandomForestClassifier(max_depth=50, n_estimators=450)

In [22]:
# def randomForestClassifier(X_train, X_test, y_train, y_test):

#     grid_params = {'n_estimators': [450, 500, 550],
#                    'max_depth': [153, 155, 157],
#                    'min_samples_leaf': [1]}

#     rf_clf = RandomForestClassifier()

#     rf_cv = GridSearchCV(estimator = rf_clf, 
#                          param_grid=grid_params, 
#                          cv =2, 
#                          verbose=2, 
#                          scoring = 'f1_weighted', 
#                          n_jobs=-1)
    
#     rf_cv.fit(X_train, y_train.values.ravel())
#     print("rf_cv.best_params_", rf_cv.best_params_)
    
#     rf_best = rf_cv.best_estimator_
#     y_predicted = rf_best.predict(X_test)
#     rf_accuracy = accuracy_score(y_test, y_predicted)
#     rf_f1 = f1_score(y_test, y_predicted, average="weighted")
#     print("rf_accuracy, rf_f1", rf_accuracy, rf_f1)

#     return rf_accuracy, rf_f1

In [112]:
# %%time
# a,f = randomForestClassifier(X_train, X_test, y_train, y_test)
# print("a =", a)
# print("f =", f)

# # rf_cv.best_params_ {'max_depth': 155, 'min_samples_leaf': 1, 'n_estimators': 500}
# # rf_accuracy, rf_f1 0.9784570060762291 0.9784663816673383

# # rf_cv.best_params_ {'max_depth': 153, 'min_samples_leaf': 1, 'n_estimators': 500}
# # rf_accuracy, rf_f1 0.9784028507370541 0.9784122498967688

In [113]:
# %%time
# rf = RandomForestClassifier(max_depth= 153, min_samples_leaf= 1, n_estimators= 500)
# rf.fit(X, y)

In [None]:
# y_pred = rf.predict(X_test)
# f1_score(y_pred, y_test)

## Submission

In [176]:
gbm

LGBMClassifier(min_child_samples=80, n_estimators=1250, num_leaves=250)

In [181]:
submodel = gbm

In [182]:
y_pred = list(submodel.predict(testing_df))
data = {'id': range(len(y_pred)), 'category': y_pred}
submission = pd.DataFrame(data).set_index('id')

In [180]:
submission.to_csv('gbmsubmission2.csv')

In [179]:
submission.category.value_counts()

1    17075
0    15573
Name: category, dtype: int64

In [175]:
submission.category.value_counts()

1    16981
0    15667
Name: category, dtype: int64

In [132]:
testing[testing['source_year'] < testing['target_year']].index

Int64Index([    8,    16,    21,    24,    25,    29,    32,    35,    37,
               42,
            ...
            32608, 32610, 32613, 32621, 32629, 32630, 32639, 32641, 32644,
            32646],
           dtype='int64', length=6977)

In [159]:
testing.loc[8,:]

source_id                                                             9603027
target_id                                                              301251
source_year                                                              1996
source_authors                                        choonkyu lee,q-han park
source_title                                    gravit bp dyon witout dilaton
source_jn                                                          phys.lett.
source_abstract             describ curved-spac bp dyon solut adm mass sat...
target_year                                                              2003
target_authors                        v.p. akulov,oktay cebecioglu,a. pashnev
target_title                     superconform quantum mechan nonlinear realiz
target_jn                                                                 NaN
target_abstract             approach framework nonlinear realiz reder acti...
year_diff                                                       

In [155]:
nolink = testing[testing['source_year'] < testing['target_year']].index

In [156]:
print(nolink)

Int64Index([    8,    16,    21,    24,    25,    29,    32,    35,    37,
               42,
            ...
            32608, 32610, 32613, 32621, 32629, 32630, 32639, 32641, 32644,
            32646],
           dtype='int64', length=6977)


In [157]:
len(nolink)

6977

In [None]:
submission.category

In [141]:
# submission.loc[nolink,:].loc[ (submission.category > 0), :]

In [158]:
submission.iloc[nolink][submission.iloc[nolink]['category'] > 0]

Unnamed: 0_level_0,category
id,Unnamed: 1_level_1
53,1
126,1
624,1
784,1
815,1
...,...
30431,1
31044,1
31818,1
32005,1


In [160]:
submission.loc[nolink,'category'] = 0

In [161]:
submission.iloc[nolink][submission.iloc[nolink]['category'] > 0]

Unnamed: 0_level_0,category
id,Unnamed: 1_level_1


In [145]:
submission.shape

(32648, 1)