In [18]:
import pandas as pd
import networkx as nx
import random
import spacy
import en_core_web_lg
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
import lightgbm as lgb
import numpy as np
import igraph
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk
import csv
from sklearn.metrics.pairwise import pairwise_distances
from tqdm import tqdm

In [2]:
nlp = en_core_web_lg.load()

In [4]:
#loading training csv

training = pd.read_csv('C:/Users/priya/Downloads/ngsa/training_set.txt', sep = ' ', header = None)

(615512, 3)

In [6]:
#loading node information data and naming the columns

node_info = pd.read_csv('C:/Users/priya/Downloads/ngsa/node_info.csv', header= None)
node_info.columns = ['id', 'pub_year', 'title', 'authors', 'journal_name', 'abstract']
node_info = node_info.set_index('id')

Unnamed: 0_level_0,pub_year,title,authors,journal_name,abstract
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,2000,compactification geometry and duality,Paul S. Aspinwall,,these are notes based on lectures given at tas...
1002,2000,domain walls and massive gauged supergravity p...,"M. Cvetic, H. Lu, C.N. Pope",Class.Quant.Grav.,we point out that massive gauged supergravity ...
1003,2000,comment on metric fluctuations in brane worlds,"Y.S. Myung, Gungwon Kang",,recently ivanov and volovich hep-th 9912242 cl...
1004,2000,moving mirrors and thermodynamic paradoxes,Adam D. Helfer,Phys.Rev.,quantum fields responding to moving mirrors ha...
1005,2000,bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert",,proceedings of lie iii clausthal july 1999 var...
1006,2000,questions in quantum physics,Rudolf Haag,,an assessment of the present status of the the...
1007,2000,topological defects in 3-d euclidean gravity,"Sheng Li, Yong Zhang, Zhongyuan Zhu",,by making use of the complete decomposition of...
1008,2000,n 0 supersymmetry and the non-relativistic mon...,Donald Spector,Phys.Lett.,we study some of the algebraic properties of t...
1009,2000,gluon pair production from space-time dependen...,"Gouranga C. Nayak, Walter Greiner",,we compute the probabilty for the processes a ...
1010,2000,instantons euclidean supersymmetry and wick ro...,"A.V. Belitsky, S. V, oren, P. van Nieuwenhuizen",Phys.Lett.,we discuss the reality properties of the fermi...


In [8]:
#creating the graph network - nodes and edges

IDs = [node_id for node_id in node_info.index]

training_list = training.values.tolist() # training dataframe convertion for easy edges list comprehension below
edges = [(node_pair[0], node_pair[1]) for node_pair in training_list if node_pair[2] == 1]

G = nx.DiGraph()
G.add_nodes_from(IDs)
G.add_edges_from(edges)

print("Number of nodes : " + str(G.number_of_nodes()))
print("Number of edges : " + str(G.number_of_edges()))

Number of nodes : 27770
Number of edges : 335130


In [10]:
#subsetting the training set to facilitate computation on laptop

training_reduced = training.sample(frac=0.07) # We keep 7%
training_reduced.columns = ['source', 'target', 'Y']

len(training_reduced)

43086

In [11]:
# Degree Centrality features
out_degree_centrality = nx.out_degree_centrality(G)
in_degree_centrality = nx.in_degree_centrality(G)
total_centrality = nx.degree_centrality(G)
training_reduced['source_out_centrality'] = training_reduced.apply(lambda row: out_degree_centrality[row.source],axis=1)
training_reduced['target_in_centrality'] = training_reduced.apply(lambda row: in_degree_centrality[row.target],axis=1)
training_reduced['source_centrality'] = training_reduced.apply(lambda row: total_centrality[row.source],axis=1)
training_reduced['target_centrality'] = training_reduced.apply(lambda row: total_centrality[row.target],axis=1)

In [12]:
#eigen vector centrality
eigen_centrality = nx.eigenvector_centrality(G)
training_reduced['source_evc'] = training_reduced.apply(lambda row: eigen_centrality[row.source],axis=1)
training_reduced['target_evc'] = training_reduced.apply(lambda row: eigen_centrality[row.target],axis=1)

In [13]:
# Page rank
page_rank = nx.pagerank_scipy(G)
training_reduced['target_pagerank'] = training_reduced.apply(lambda row: page_rank[row.target],axis=1)

# Preferential Attachment
# For a directed graph, is equal to K_out_source * K_in_target with K the number of neighbors. Which is equivalent to multiply the available centralities.
training_reduced['preferencial_attachment'] = training_reduced.apply(lambda row: row.source_out_centrality * row.target_in_centrality,axis=1)

# HITS algorithm
hub_score, authority_score = nx.hits(G)
training_reduced['source_hub_score'] = training_reduced.apply(lambda row: hub_score[row.source],axis=1)
training_reduced['target_authority_score'] = training_reduced.apply(lambda row: authority_score[row.target],axis=1)

In [15]:
#feature engineering on node attributes - based on node information like title, abstract, published date

#difference in publication year

training_reduced['pub_year_difference'] = training_reduced.apply(lambda row: node_info.pub_year[row.source] - node_info.pub_year[row.target] ,axis=1)
training_reduced['pub_year_difference']=training_reduced['pub_year_difference'].where(training_reduced['pub_year_difference'] >= 0, -1)

# common Authors
node_info['authors'] = node_info['authors'].fillna(value='')
training_reduced['common_authors'] = training_reduced.apply(lambda row: len(set(node_info.authors[row.source].split(",")).intersection(set(node_info.authors[row.target].split(",")))) ,axis=1)

#number of common journal name
node_info['journal_name'] = node_info['journal_name'].fillna(value='')
training_reduced['common_journals'] = training_reduced.apply(lambda row: len(set(node_info.journal_name[row.source]).intersection(set(node_info.journal_name[row.target]))) ,axis=1)


In [17]:
# Title similarity-spacy
training_reduced['title_similarity'] = training_reduced.apply(lambda row: nlp(node_info.title[row.source]).similarity(nlp(node_info.title[row.target])) ,axis=1)

# Abstract similarity- spacy
training_reduced['abstract_similarity'] = training_reduced.apply(lambda row: nlp(node_info.abstract[row.source]).similarity(nlp(node_info.abstract[row.target])) ,axis=1)


In [25]:
#cosine distance of abstracts - tf-idf


nltk.download('punkt') # for tokenization
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()



def tfidf_abstract():

    tfidf_abstracts = []

    for i in range(len(node_info)):
        abstract = node_info.iloc[i]['abstract'].lower().split(" ")
        abstract = [token for token in abstract if token not in stpwds]
        abstract = [stemmer.stem(token) for token in abstract]
        tfidf_abstracts.append(" ".join(abstract))

    vectorizer = TfidfVectorizer(min_df=2)
    tfidf_abstracts = vectorizer.fit_transform(tfidf_abstracts)

    tfidf_abstracts = tfidf_abstracts.toarray()

    return tfidf_abstracts

tfidf_abstracts = tfidf_abstract()


training_distance_abs = []
    
for i in range(len(training_reduced)):
    source = training_reduced.iloc[i]['source']
    target = training_reduced.iloc[i]['target']

    index_source = IDs.index(source)
    index_target = IDs.index(target)

    source_info = tfidf_abstracts[index_source].reshape(1, -1)
    target_info = tfidf_abstracts[index_target].reshape(1, -1)

    training_distance_abs.append(pairwise_distances(source_info, target_info, metric='cosine', n_jobs=1))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\priya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\priya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
training_distance_abs2 = (np.asarray(training_distance_abs).reshape(len(training_reduced),1)).tolist()
training_distance_abs = [val for sublist in training_distance_abs2 for val in sublist]
training_reduced['training_dist_abs']=training_distance_abs

In [61]:
#coside distance for titles - tf-idf

def tfidf_title():

    tfidf_title = []

    for i in range(len(node_info)):
        title = node_info.iloc[i]['title'].lower().split(" ")
        title = [token for token in title if token not in stpwds]
        title = [stemmer.stem(token) for token in title]
        tfidf_title.append(" ".join(title))

    vectorizer = TfidfVectorizer(min_df=2)
    tfidf_title = vectorizer.fit_transform(tfidf_title)

    tfidf_title = tfidf_title.toarray()

    return tfidf_title

tfidf_title = tfidf_title()


training_distance_title = []
    
for i in range(len(training_reduced)):
    source = training_reduced.iloc[i]['source']
    target = training_reduced.iloc[i]['target']

    index_source = IDs.index(source)
    index_target = IDs.index(target)

    source_info = tfidf_title[index_source].reshape(1, -1)
    target_info = tfidf_title[index_target].reshape(1, -1)

    training_distance_title.append(pairwise_distances(source_info, target_info, metric='cosine', n_jobs=1))

In [62]:
training_distance_title2=(np.asarray(training_distance_title).reshape(len(training_reduced),1)).tolist()
training_distance_title = [val for sublist in training_distance_title2 for val in sublist]
training_reduced['training_dist_title']=training_distance_title

In [89]:
training_reduced

Unnamed: 0,source,target,Y,source_out_centrality,target_in_centrality,source_centrality,target_centrality,source_evc,target_evc,target_pagerank,preferencial_attachment,source_hub_score,target_authority_score,pub_year_difference,common_authors,common_journals,title_similarity,abstract_similarity,training_dist_abs,training_dist_title
507477,9705120,9608024,1,0.001044,0.013108,0.002809,0.014909,8.545371e-05,1.141593e-02,0.001015,1.368923e-05,0.000103,0.001667,1,0,9,0.756032,0.948874,0.809550,0.919319
70689,12228,209062,0,0.000000,0.000000,0.000684,0.000252,1.359490e-04,2.216651e-38,0.000011,0.000000e+00,0.000000,0.000000,-1,0,0,0.492747,0.907499,0.944549,1.000000
422505,11282,5040,1,0.001801,0.005330,0.002521,0.005834,1.799037e-10,2.722042e-08,0.000119,9.596464e-06,0.000181,0.000435,0,0,4,0.220738,0.914158,0.755405,0.795546
379959,9610252,9606193,1,0.000576,0.002413,0.001729,0.003349,2.683788e-05,5.874529e-04,0.000069,1.390190e-06,0.000081,0.000276,0,0,5,0.371616,0.938083,0.899424,1.000000
509173,101122,9410167,1,0.001512,0.025316,0.001548,0.026180,1.997202e-35,2.526680e-01,0.003507,3.828989e-05,0.000098,0.003029,7,0,6,0.391077,0.978837,0.742181,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264424,101162,11002,1,0.001152,0.000612,0.002377,0.001693,1.625915e-08,7.963036e-09,0.000018,7.054698e-07,0.000025,0.000027,1,0,1,0.793785,0.952679,0.864674,0.892545
165444,9411242,9709101,0,0.000180,0.000504,0.000828,0.000720,2.116976e-05,4.716742e-09,0.000025,9.077736e-08,0.000008,0.000009,-1,0,5,0.614609,0.963613,0.947215,1.000000
115845,205321,107028,1,0.003061,0.000180,0.003817,0.000684,2.762823e-18,4.940902e-14,0.000013,5.511483e-07,0.000165,0.000036,1,1,0,0.301396,0.924721,0.944288,1.000000
538105,3262,3184,1,0.000612,0.000684,0.000864,0.002485,2.604463e-09,9.312014e-10,0.000018,4.188727e-07,0.000022,0.000021,0,0,8,0.438315,0.952725,0.885956,1.000000


In [92]:
from collections import defaultdict

out = defaultdict(list)
inc = defaultdict(list)
for i, j in edges:
    out[i].append(j)
    inc[j].append(i)
    

In [93]:
def jaccard(source, target):
    try:
        denom = 1/len(set(out[source]) | set(inc[target]))
    except:
        denom = 1
    jac = len(set(out[source]) & set(inc[target]))*denom
    return jac

In [94]:
def comonneighbors(source, target):
    source = set(out[source]) | set(inc[source])
    target = set(out[target]) | set(inc[target])
    return len(target & source)

In [95]:
#jaccard

ls=[]
for i in range(training_reduced.shape[0]):
    ls.append(jaccard(training_reduced.iloc[i]['source'], training_reduced.iloc[i]['target']))
training_reduced['jacard'] = ls

In [96]:
#common neighbors

ls = [] 
for i in range(training_reduced.shape[0]):
    ls.append(comonneighbors(training_reduced.iloc[i]['source'], training_reduced.iloc[i]['target']))
training_reduced['comonneigh'] = ls

In [97]:
#in neighbors
#out neighbors

ls1=[]
ls2=[]
for i in range(training_reduced.shape[0]):
    ls1.append(len(out[training_reduced.iloc[i]['source']]))
    ls2.append(len(inc[training_reduced.iloc[i]['target']]))
training_reduced['outneighbors'] = ls1
training_reduced['inneighbors'] = ls2

In [169]:
#common successors
ls=[]
for i, rows in training_reduced.iterrows():
    ls.append(len(set(out[rows['source']]) & set(out[rows['target']])))
training_reduced['common_successors'] = ls

In [170]:
#common predessesors
ls=[]
for i, rows in training_reduced.iterrows():
    ls.append(len(set(inc[rows['source']]) & set(inc[rows['target']])))
training_reduced['common_pred'] = ls

In [171]:
#number of overlapping words in title
def overlapping_title(source, target):
    title = node_info.loc[source, 'title']
    title = [token for token in title.lower().split(" ") if token not in stpwds]
    source = [stemmer.stem(token) for token in title]
    title = node_info.loc[target, 'title']
    title = [token for token in title.lower().split(" ") if token not in stpwds]
    target = [stemmer.stem(token) for token in title]    
    return len(set(target) & set(source))
ls=[]
for i, rows in training_reduced.iterrows():
    ls.append(overlapping_title(rows['source'], rows['target']))

training_reduced['overlap_title'] = ls

In [172]:
#number of overlapping words( >= 9 letters) in abstracts
def overlapping_abstract(source, target):
    abstract = node_info.loc[source, 'abstract']
    abstract = [token for token in abstract.lower().split(" ") if token not in stpwds and len(token)>8]
    source = [stemmer.stem(token) for token in abstract]
    abstract = node_info.loc[target, 'abstract']
    abstract = [token for token in abstract.lower().split(" ") if token not in stpwds and len(token)>8]
    target = [stemmer.stem(token) for token in abstract]    
    return len(set(target) & set(source))
ls=[]
for i, rows in training_reduced.iterrows():
    ls.append(overlapping_abstract(rows['source'], rows['target']))

training_reduced['overlap_abstract'] = ls

In [173]:
#paths of length one
ls=[]
for i, rows in training_reduced.iterrows():
    try:
        short_path = nx.shortest_path_length(G,source=rows['source'],target=rows['target'])
    except:
        short_path = -1
    ls.append(short_path)
training_reduced['short_path'] = ls

In [174]:
#popularity
ls=[]
for i, rows in training_reduced.iterrows():
    ls.append(sum([len(inc[in_target]) for in_target in inc[rows['target']]]))
training_reduced['popularity'] = ls

In [253]:
training_reduced

Unnamed: 0,source,target,Y,source_out_centrality,target_in_centrality,source_centrality,target_centrality,source_evc,target_evc,target_pagerank,...,jacard,comonneigh,outneighbors,inneighbors,common_successors,common_pred,overlap_title,overlap_abstract,short_path,popularity
507477,9705120,9608024,1,0.001044,0.013108,0.002809,0.014909,8.545371e-05,1.141593e-02,0.001015,...,0.031496,30,29,364,1,17,1,5,1,15075
70689,12228,209062,0,0.000000,0.000000,0.000684,0.000252,1.359490e-04,2.216651e-38,0.000011,...,0.000000,0,0,0,0,0,0,2,-1,0
422505,11282,5040,1,0.001801,0.005330,0.002521,0.005834,1.799037e-10,2.722042e-08,0.000119,...,0.076087,22,50,148,0,9,1,1,1,2771
379959,9610252,9606193,1,0.000576,0.002413,0.001729,0.003349,2.683788e-05,5.874529e-04,0.000069,...,0.012195,4,16,67,3,0,0,2,1,1072
509173,101122,9410167,1,0.001512,0.025316,0.001548,0.026180,1.997202e-35,2.526680e-01,0.003507,...,0.017760,14,42,703,1,0,0,2,1,28479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264424,101162,11002,1,0.001152,0.000612,0.002377,0.001693,1.625915e-08,7.963036e-09,0.000018,...,0.020833,21,32,17,18,2,1,0,1,399
165444,9411242,9709101,0,0.000180,0.000504,0.000828,0.000720,2.116976e-05,4.716742e-09,0.000025,...,0.000000,0,5,14,0,0,0,2,-1,72
115845,205321,107028,1,0.003061,0.000180,0.003817,0.000684,2.762823e-18,4.940902e-14,0.000013,...,0.022727,10,85,5,8,0,0,1,1,42
538105,3262,3184,1,0.000612,0.000684,0.000864,0.002485,2.604463e-09,9.312014e-10,0.000018,...,0.000000,9,17,19,6,3,0,0,1,136


# Processing on testing set

In [77]:
testing = pd.read_csv('C:/Users/priya/Downloads/ngsa/testing_set.txt', sep = ' ', header = None)
testing.columns = ['source', 'target']

In [78]:
# Degree Centrality features
out_degree_centrality = nx.out_degree_centrality(G)
in_degree_centrality = nx.in_degree_centrality(G)
total_centrality = nx.degree_centrality(G)
testing['source_out_centrality'] = testing.apply(lambda row: out_degree_centrality[row.source],axis=1)
testing['target_in_centrality'] = testing.apply(lambda row: in_degree_centrality[row.target],axis=1)
testing['source_centrality'] = testing.apply(lambda row: total_centrality[row.source],axis=1)
testing['target_centrality'] = testing.apply(lambda row: total_centrality[row.target],axis=1)

In [79]:
#eigen vector centrality
eigen_centrality = nx.eigenvector_centrality(G)
testing['source_evc'] = testing.apply(lambda row: eigen_centrality[row.source],axis=1)
testing['target_evc'] = testing.apply(lambda row: eigen_centrality[row.target],axis=1)

In [80]:
# Page rank
page_rank = nx.pagerank_scipy(G)
testing['target_pagerank'] = testing.apply(lambda row: page_rank[row.target],axis=1)

# Preferential Attachment
# For a directed graph, is equal to K_out_source * K_in_target with K the number of neighbors. Which is equivalent to multiply the available centralities.
testing['preferencial_attachment'] = testing.apply(lambda row: row.source_out_centrality * row.target_in_centrality,axis=1)

# HITS algorithm
hub_score, authority_score = nx.hits(G)
testing['source_hub_score'] = testing.apply(lambda row: hub_score[row.source],axis=1)
testing['target_authority_score'] = testing.apply(lambda row: authority_score[row.target],axis=1)

In [81]:
#feature engineering on node attributes - based on node information like title, abstract, published date

#difference in publication year

testing['pub_year_difference'] = testing.apply(lambda row: node_info.pub_year[row.source] - node_info.pub_year[row.target] ,axis=1)
testing['pub_year_difference']=testing['pub_year_difference'].where(testing['pub_year_difference'] >= 0, -1)

# common Authors
node_info['authors'] = node_info['authors'].fillna(value='')
testing['common_authors'] = testing.apply(lambda row: len(set(node_info.authors[row.source].split(",")).intersection(set(node_info.authors[row.target].split(",")))) ,axis=1)

#number of common journal name
node_info['journal_name'] = node_info['journal_name'].fillna(value='')
testing['common_journals'] = testing.apply(lambda row: len(set(node_info.journal_name[row.source]).intersection(set(node_info.journal_name[row.target]))) ,axis=1)



In [82]:
# Title
testing['title_similarity'] = testing.apply(lambda row: nlp(node_info.title[row.source]).similarity(nlp(node_info.title[row.target])) ,axis=1)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mo

In [83]:
# Abstract similarity- spacy
testing['abstract_similarity'] = testing.apply(lambda row: nlp(node_info.abstract[row.source]).similarity(nlp(node_info.abstract[row.target])) ,axis=1)

In [84]:
#cosine distance - abstract
testing_distance_abs = []
    
for i in range(len(testing)):
    source = testing.iloc[i]['source']
    target = testing.iloc[i]['target']

    index_source = IDs.index(source)
    index_target = IDs.index(target)

    source_info = tfidf_abstracts[index_source].reshape(1, -1)
    target_info = tfidf_abstracts[index_target].reshape(1, -1)

    testing_distance_abs.append(pairwise_distances(source_info, target_info, metric='cosine', n_jobs=1))

In [85]:
testing_distance_abs2 = (np.asarray(testing_distance_abs).reshape(len(testing),1)).tolist()
testing_distance_abs = [val for sublist in testing_distance_abs2 for val in sublist]
testing['testing_dist_abs']=testing_distance_abs

In [86]:
#cosine distance- title

testing_distance_title = []
    
for i in range(len(testing)):
    source = testing.iloc[i]['source']
    target = testing.iloc[i]['target']

    index_source = IDs.index(source)
    index_target = IDs.index(target)

    source_info = tfidf_title[index_source].reshape(1, -1)
    target_info = tfidf_title[index_target].reshape(1, -1)

    testing_distance_title.append(pairwise_distances(source_info, target_info, metric='cosine', n_jobs=1))

In [87]:
testing_distance_title2=(np.asarray(testing_distance_title).reshape(len(testing),1)).tolist()
testing_distance_title = [val for sublist in testing_distance_title2 for val in sublist]
testing['training_dist_title']=testing_distance_title

In [104]:
#jaccard

ls=[]
for i in range(testing.shape[0]):
    ls.append(jaccard(testing.iloc[i]['source'], testing.iloc[i]['target']))
testing['jacard'] = ls

In [105]:
#common neighbors

ls = [] 
for i in range(testing.shape[0]):
    ls.append(comonneighbors(testing.iloc[i]['source'], testing.iloc[i]['target']))
testing['comonneigh'] = ls

In [106]:
#in neighbors
#out neighbors

ls1=[]
ls2=[]
for i in range(testing.shape[0]):
    ls1.append(len(out[testing.iloc[i]['source']]))
    ls2.append(len(inc[testing.iloc[i]['target']]))
testing['outneighbors'] = ls1
testing['inneighbors'] = ls2

In [180]:
#common successors
ls=[]
for i, rows in testing.iterrows():
    ls.append(len(set(out[rows['source']]) & set(out[rows['target']])))
testing['common_successors'] = ls

In [181]:
#common predessesors
ls=[]
for i, rows in testing.iterrows():
    ls.append(len(set(inc[rows['source']]) & set(inc[rows['target']])))
testing['common_pred'] = ls

In [182]:
#number of overlapping words in title
ls=[]
for i, rows in testing.iterrows():
    ls.append(overlapping_title(rows['source'], rows['target']))

testing['overlap_title'] = ls

In [183]:
#number of overlapping words( >= 9 letters) in abstracts
ls=[]
for i, rows in testing.iterrows():
    ls.append(overlapping_abstract(rows['source'], rows['target']))

testing['overlap_abstract'] = ls

In [184]:
#paths of length onele]
ls=[]
for i, rows in training_reduced.iterrows():
    try:    
        short_path = nx.shortest_path_length(G,source=rows['source'],target=rows['target'])
    except:
        short_path = -1
    if short_path == 1:
        G.remove_edge(rows['source'],rows['target'])
        try:    
            short_path = nx.shortest_path_length(G,source=rows['source'],target=rows['target'])
        except:
            short_path = -1
        G.add_edge(rows['source'],rows['target'])
    ls.append(short_path)
training_reduced['short_path'] = ls

In [185]:
#popularity
ls=[]
for i, rows in testing.iterrows():
    ls.append(sum([len(inc[in_target]) for in_target in inc[rows['target']]]))
testing['popularity'] = ls

In [252]:
testing

Unnamed: 0,source,target,source_out_centrality,target_in_centrality,source_centrality,target_centrality,source_evc,target_evc,target_pagerank,preferencial_attachment,...,jacard,comonneigh,outneighbors,inneighbors,common_successors,common_pred,overlap_title,overlap_abstract,short_path,popularity
0,9807076,9807139,0.000360,0.000108,0.002125,0.000648,9.082262e-06,5.472969e-14,0.000012,3.890458e-08,...,0.000000,0,10,3,0,0,0,1,16,5
1,109162,1182,0.007310,0.001404,0.010911,0.001621,2.993220e-11,3.777396e-07,0.000069,1.026692e-05,...,0.075556,24,203,39,6,1,2,3,2,1266
2,9702187,9510135,0.000504,0.026144,0.008031,0.026612,4.259980e-03,8.746000e-02,0.002588,1.318087e-05,...,0.006803,59,14,726,0,54,1,0,2,28210
3,111048,110115,0.001440,0.000576,0.001837,0.002341,1.489686e-10,1.513932e-10,0.000016,8.299645e-07,...,0.056604,21,40,16,14,4,1,6,2,214
4,9910176,9410073,0.000216,0.005186,0.000252,0.005402,1.032959e-35,5.801489e-02,0.000879,1.120452e-06,...,0.000000,0,6,144,0,0,0,1,3,8521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32643,9705209,9305083,0.001368,0.001693,0.001512,0.001873,1.494568e-06,2.939870e-02,0.000367,2.316120e-06,...,0.011905,1,38,47,0,0,3,0,2,1789
32644,9307023,9503118,0.000000,0.000216,0.000072,0.000432,4.822759e-07,1.318309e-08,0.000019,0.000000e+00,...,0.000000,0,0,6,0,0,0,0,-1,9
32645,9608095,9205058,0.000936,0.000648,0.001404,0.000648,5.156258e-05,5.108872e-03,0.000110,6.069115e-07,...,0.000000,0,26,18,0,0,0,2,3,324
32646,9407008,106256,0.000216,0.000540,0.000828,0.001404,5.007298e-03,4.114137e-12,0.000026,1.167138e-07,...,0.000000,0,6,15,0,0,0,0,-1,62


In [243]:
#cutting out few features for trial

x_train_cut= training_reduced.drop(['common_journals','source_centrality', 'target_centrality', 'source_evc','target_evc','short_path'], axis= 1)

In [213]:
#cutting out few features from test for trial

x_test_cut=testing.drop(['common_journals','source_centrality', 'target_centrality', 'source_evc','target_evc','short_path'], axis= 1)

In [277]:
#Scaling all the features

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

training_reduced_scale= scaler.fit_transform(training_reduced[['source_out_centrality','target_in_centrality','source_centrality','target_centrality','source_evc','target_evc','target_pagerank','preferencial_attachment','source_hub_score','target_authority_score', 'pub_year_difference','common_authors','common_journals','title_similarity','abstract_similarity','training_dist_abs','training_dist_title','jacard','comonneigh','outneighbors','inneighbors','common_successors','common_pred','overlap_title','overlap_abstract','popularity']])
training_reduced_scale = pd.DataFrame(training_reduced_scale)

testing_scale=scaler.fit_transform(testing[['source_out_centrality','target_in_centrality','source_centrality','target_centrality','source_evc','target_evc','target_pagerank','preferencial_attachment','source_hub_score','target_authority_score', 'pub_year_difference','common_authors','common_journals','title_similarity','abstract_similarity','testing_dist_abs','training_dist_title','jacard','comonneigh','outneighbors','inneighbors','common_successors','common_pred','overlap_title','overlap_abstract','popularity']])
testing_scale = pd.DataFrame(testing_scale)

In [240]:
#test train split for actual(unscaled) features

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x_train_cut.drop(['source', 'target', 'Y'], axis= 1), training_reduced.Y, test_size=0.2)

In [278]:
#test train split for scaled features

X_train, X_test, y_train, y_test = train_test_split(training_reduced_scale, training_reduced.Y, test_size=0.2)

In [303]:

RF =  RandomForestClassifier(n_estimators= 1000)
RF.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [304]:
RF.score(X_test, y_test)

0.9755163611046647

In [299]:
#hyperparameter grid search for randomforest
n_estimators = [200, 400, 600, 800, 1000]
max_depth = [5, 10, 15]
min_samples_split = [2,4,6]
min_samples_leaf = [1, 3, 6]

from sklearn.model_selection import GridSearchCV

hyperF = dict(n_estimators = n_estimators, 
              min_samples_split = min_samples_split,
             min_samples_leaf = min_samples_leaf)
clf = GridSearchCV(RandomForestClassifier(), hyperF, cv = 3, verbose = 1,
                      n_jobs = -1, scoring='accuracy')

clf.fit(X_test, y_test)
clf_best = clf.best_estimator_

Fitting 3 folds for each of 45 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:  4.9min finished


In [300]:
clf_best.score(X_test, y_test)

1.0

In [263]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [264]:
lr.score(X_test, y_test)

0.941749825945695

In [266]:
xgb= XGBClassifier(n_estimators=1000)
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [267]:
xgb.score(X_test, y_test)

0.9747041076815967

In [269]:
dt= DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [270]:
dt.score(X_test, y_test)

0.9543977721048967

In [272]:
svc= SVC()
svc.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [273]:
svc.score(X_test, y_test)

0.949872360176375

In [279]:
lsvc= LinearSVC()
lsvc.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [280]:
lsvc.score(X_test, y_test)

0.9554420979345556

In [282]:
et=ExtraTreesClassifier(n_estimators=1000)
et.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=1000,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [283]:
et.score(X_test, y_test)

0.9747041076815967

In [285]:
gb=GradientBoostingClassifier()
gb.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [286]:
gb.score(X_test, y_test)

0.9754003249013692

In [293]:
ab=AdaBoostClassifier(n_estimators=1000, learning_rate=0.8)
ab.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.8,
                   n_estimators=1000, random_state=None)

In [294]:
ab.score(X_test, y_test)

0.9720352750058018

In [296]:
#stacking classifiers

base_learners = [
                 ('lr', LogisticRegression()),
                 ('xgb', XGBClassifier(n_estimators=1000)),
                 ('rf', RandomForestClassifier(n_estimators=1000))
                     
                ]
stc = StackingClassifier(estimators=base_learners, final_estimator=LinearSVC())
stc.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


StackingClassifier(cv=None,
                   estimators=[('lr',
                                LogisticRegression(C=1.0, class_weight=None,
                                                   dual=False,
                                                   fit_intercept=True,
                                                   intercept_scaling=1,
                                                   l1_ratio=None, max_iter=100,
                                                   multi_class='auto',
                                                   n_jobs=None, penalty='l2',
                                                   random_state=None,
                                                   solver='lbfgs', tol=0.0001,
                                                   verbose=0,
                                                   warm_start=False)),
                               ('xgb',
                                XGBClassifier(base_score=0.5, booster='gbtree',
                     

In [297]:
stc.score(X_test, y_test)

0.9774889765606869

In [302]:
#writing predictions to csv

predictions = list(clf_best.predict(testing_scale))
pred_df = pd.DataFrame(predictions,columns =['category'])
pred_df.index.names = ['id']
pred_df.to_csv('C:/Users/priya/Downloads/ngsa/predictions_RFgridsearch_fullfeatures_scaled.csv')