In [None]:
import arxiv_scraper as scrap
import pandas as pd
import numpy as np
import pickle

#scraping coauthorship information of articles in Math (Quantum Algebra subcategory) from January 1, 2018 to January 20, 2018 
coauthorship_scraper = scrap.Scraper(category='math', date_from='2020-08-01', date_until='2020-08-10', t=10, filters={'categories':['math.qa'],'abstract':['quantum']})
scraped_records = coauthorship_scraper.scrape()

#saving scraped information into pandas dataframe
scraped_columns = ('id', 'title', 'authors')
data_frame = pd.DataFrame(scraped_records, columns = scraped_columns)

#saving pandas dataframe into a .pkl file
data_frame.to_pickle('data.pkl')

http://export.arxiv.org/oai2?verb=ListRecords&from=2020-08-01&until=2020-08-10&metadataPrefix=arXiv&set=math
fetching up to  1000 records...
fetching up to  2000 records...
Got 503. Retrying after 10 seconds.
fetching up to  2000 records...
fetching is completed in 30.8 seconds.
Total number of records 123


In [None]:
import pandas as pd
import numpy as np
import pickle
import networkx as nx
import matplotlib.pyplot as plt
import collections
import sys

#function for forming an edge list. Each element in the edge list is a pair of authors which means they are co-author
def form_edge_list(records):
    m, n = np.shape(records)
    edgelist = []

    for index in range(m):
        authors = records[index, 2]
        num_of_authors = len(authors)

        i = 0;
        while i < num_of_authors - 1:
            j = i + 1
            while j < num_of_authors:
                if (authors[i], authors[j]) not in edgelist and (authors[j], authors[i]) not in edgelist:
                    edgelist = edgelist + [(authors[i], authors[j])]
                j += 1
            i += 1
    return edgelist

In [None]:
#start of main block where execution will begin
if __name__ == '__main__':

    #open the file having authorship records
    file = None
    try:
        file = open("data.pkl", 'rb')
    except FileNotFoundError:
        sys.exit("Error: First run coauthorship_scraper.py to generate data.pkl file")
        
    data_frame = pickle.load(file)
    rows = data_frame.values;


    #call function form_edge_list to generate an edge list for the network
    edge_list = form_edge_list(rows)

In [None]:
pos = pd.DataFrame(edge_list, columns=['u1','u2'])

pos['link'] = 1

pos.head(2)

Unnamed: 0,u1,u2,link
0,creutzig,gao,1
1,creutzig,linshaw,1


In [None]:
G = nx.from_pandas_edgelist(pos, 'u1', 'u2', create_using=nx.Graph())

In [None]:
#generating negative examples - 

from itertools import combinations

elements = list(set([e for l in edge_list for e in l])) # find all unique elements

complete_list = list(combinations(elements, 2)) # generate all possible combinations

#convert to sets to negate the order

set1 = [set(l) for l in edge_list]
complete_set = [set(l) for l in complete_list]

# find sets in `complete_set` but not in `set1`
all_unconnnected = [list(l) for l in complete_set if l not in set1]

print(len(edge_list))
print(len(all_unconnnected))

209
18901


In [None]:
df_not_connected = pd.DataFrame(all_unconnnected, columns=['u1', 'u2'])
df_not_connected['link'] = 0
df_not_connected.head(3)

Unnamed: 0,u1,u2,link
0,liénardy,girelli,0
1,liénardy,yadavalli,0
2,liénardy,rahaman,0


In [None]:
data = pd.concat([pos, df_not_connected])

In [None]:
!pip install node2vec

Collecting node2vec
  Downloading https://files.pythonhosted.org/packages/c0/da/7f0c49433ef91033e21d523e82be1570074a5d6ab8c74f8771774e9d2fd1/node2vec-0.3.2-py3-none-any.whl
Installing collected packages: node2vec
Successfully installed node2vec-0.3.2


In [None]:
from node2vec import Node2Vec

# Generate walks
node2vec = Node2Vec(G, dimensions=100, walk_length=16, num_walks=50)

# train node2vec model
n2w_model = node2vec.fit(window=7, min_count=1)

Computing transition probabilities: 100%|██████████| 196/196 [00:00<00:00, 8946.68it/s]
Generating walks (CPU: 1): 100%|██████████| 50/50 [00:08<00:00,  5.58it/s]


In [None]:
x = [(n2w_model[str(i)]+n2w_model[str(j)]) for i,j in zip(data['u1'], data['u2'])]

  """Entry point for launching an IPython kernel.


In [None]:
#models

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(np.array(x), data['link'], 
                                                test_size = 0.3, 
                                                random_state = 35)

In [None]:
#Random Forest

rc = RandomForestClassifier(n_estimators= 20, max_depth=7, random_state=0, min_samples_leaf=20)

rc.fit(xtrain, ytrain)

y_pred=rc.predict(xtest)

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print (confusion_matrix(ytest, y_pred))
print("Accuracy:",metrics.accuracy_score(ytest, y_pred))

[[5674    0]
 [  59    0]]
Accuracy: 0.9897087039944182


In [None]:
#SVM
from sklearn import svm

clf = svm.SVC(random_state = 0, verbose = True)
#clf = svm.SVC()
clf.fit(xtrain,ytrain)
y_pred = (clf.predict(xtest))

print (confusion_matrix(ytest, y_pred))
print("Accuracy:",metrics.accuracy_score(ytest, y_pred))

[LibSVM][[5672    2]
 [  10   49]]
Accuracy: 0.9979068550497122


In [None]:
#naiye bayes
from sklearn.naive_bayes import GaussianNB


gnb = GaussianNB()
gnb.fit(xtrain, ytrain)

y_pred_gnb=rc.predict(xtest)

print (confusion_matrix(ytest, y_pred_gnb))
print("Accuracy:",metrics.accuracy_score(ytest, y_pred_gnb))

[[   0   56]
 [   0 5795]]
Accuracy: 0.9904289864980346


In [None]:
#neural network

clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1, max_iter = 1000)
		#clf = svm.SVC()
clf.fit(xtrain, ytrain)
		
y_pred=(clf.predict(xtest))


print (confusion_matrix(ytest, y_pred))
print("Accuracy:",metrics.accuracy_score(ytest, y_pred))

[[5674    0]
 [  59    0]]
Accuracy: 0.9897087039944182


In [None]:
#adaboost

clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                         algorithm="SAMME",
                         n_estimators=20)
		#clf = svm.SVC()
clf.fit(xtrain,ytrain)
		
y_pred = (clf.predict(xtest))

print (confusion_matrix(ytest, y_pred))
print("Accuracy:",metrics.accuracy_score(ytest, y_pred))

[[5670    4]
 [  34   25]]
Accuracy: 0.993371707657422


In [None]:
#KNN

clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(xtrain,ytrain)
y_pred = clf.predict(xtest)
print (confusion_matrix(ytest, y_pred))
print("Accuracy:",metrics.accuracy_score(ytest, y_pred))

[[5669    5]
 [  26   33]]
Accuracy: 0.9945927088784232


In [None]:
lr = LogisticRegression(class_weight="balanced", max_iter = 1000)

lr.fit(xtrain, ytrain)

predictions = lr.predict(xtest)

print (confusion_matrix(predictions, y_pred))
print("Accuracy:",metrics.accuracy_score(predictions, y_pred))

[[3528   11]
 [2167   27]]
Accuracy: 0.6200941915227629


In [None]:
import lightgbm as lgbm

train_data = lgbm.Dataset(xtrain, ytrain)
test_data = lgbm.Dataset(xtest, ytest)

# define parameters
parameters = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'num_threads' : 2,
    'seed' : 76
}

# train lightGBM model
model = lgbm.train(parameters,
                   train_data,
                   valid_sets=test_data,
                   num_boost_round=1000,
                   early_stopping_rounds=20)

[1]	valid_0's auc: 0.796771
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's auc: 0.830765
[3]	valid_0's auc: 0.84092
[4]	valid_0's auc: 0.861323
[5]	valid_0's auc: 0.865975
[6]	valid_0's auc: 0.864511
[7]	valid_0's auc: 0.865741
[8]	valid_0's auc: 0.867022
[9]	valid_0's auc: 0.874058
[10]	valid_0's auc: 0.869082
[11]	valid_0's auc: 0.868487
[12]	valid_0's auc: 0.871325
[13]	valid_0's auc: 0.877562
[14]	valid_0's auc: 0.887169
[15]	valid_0's auc: 0.885696
[16]	valid_0's auc: 0.885266
[17]	valid_0's auc: 0.891473
[18]	valid_0's auc: 0.889648
[19]	valid_0's auc: 0.887877
[20]	valid_0's auc: 0.888277
[21]	valid_0's auc: 0.884092
[22]	valid_0's auc: 0.812424
[23]	valid_0's auc: 0.822921
[24]	valid_0's auc: 0.80455
[25]	valid_0's auc: 0.766583
[26]	valid_0's auc: 0.755056
[27]	valid_0's auc: 0.743848
[28]	valid_0's auc: 0.635121
[29]	valid_0's auc: 0.636047
[30]	valid_0's auc: 0.690584
[31]	valid_0's auc: 0.666059
[32]	valid_0's auc: 0.587345
[33]	valid_0's auc: 0

In [None]:
#recommendations:

n2w_model.wv.most_similar('alekseev')

  if np.issubdtype(vec.dtype, np.int):


[('silvestrov', 0.9995461702346802),
 ('arutyunov', 0.9991171360015869),
 ('fadel', 0.5218631029129028),
 ('gepner', 0.50989830493927),
 ('corazza', 0.5018453001976013),
 ('schrohe', 0.5001792907714844),
 ('savin', 0.4962916672229767),
 ('belavin', 0.4957965910434723),
 ('wenzl', 0.4819181561470032),
 ('tang', 0.4756004810333252)]

In [None]:
#edge embedding

from node2vec.edges import HadamardEmbedder
edges_embs = HadamardEmbedder(keyed_vectors=n2w_model.wv)

In [None]:
edges_kv = edges_embs.as_keyed_vectors()
edges_kv.most_similar(str(('creutzig', 'gao')))

Generating edge features: 100%|██████████| 19306/19306.0 [00:00<00:00, 139824.86it/s]
  if np.issubdtype(vec.dtype, np.int):


[("('chen', 'creutzig')", 0.9966992735862732),
 ("('gao', 'linshaw')", 0.9966665506362915),
 ("('creutzig', 'wilde')", 0.9959723353385925),
 ("('creutzig', 'tan')", 0.9958452582359314),
 ("('chen', 'linshaw')", 0.9956979751586914),
 ("('linshaw', 'wilde')", 0.9939925670623779),
 ("('linshaw', 'tan')", 0.9933345913887024),
 ("('gao', 'wilde')", 0.9928609132766724),
 ("('gao', 'gao')", 0.9894586205482483),
 ("('wilde', 'wilde')", 0.9881450533866882)]

In [None]:
#DeepWalk
import random
def get_randomwalk(node, path_length):
    
    random_walk = [node]
    
    for i in range(path_length-1):
        temp = list(G.neighbors(node))
        temp = list(set(temp) - set(random_walk))    
        if len(temp) == 0:
            break

        random_node = random.choice(temp)
        random_walk.append(random_node)
        node = random_node
        
    return random_walk

In [None]:
get_randomwalk('alekseev', 10)

['alekseev', 'silvestrov', 'arutyunov']

In [None]:
from tqdm import tqdm
# get list of all nodes from the graph
all_nodes = list(G.nodes())

random_walks = []
for n in tqdm(all_nodes):
    for i in range(5):
        random_walks.append(get_randomwalk(n,10))
        
# count of sequences
len(random_walks)

100%|██████████| 196/196 [00:00<00:00, 8560.70it/s]


980

In [None]:
from gensim.models import Word2Vec

import warnings
warnings.filterwarnings('ignore')

# train skip-gram (word2vec) model
model = Word2Vec(window = 4, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

model.build_vocab(random_walks, progress_per=2)

model.train(random_walks, total_examples = model.corpus_count, epochs=20, report_delay=1)

(41934, 66740)

In [None]:
model.similar_by_word('silvestrov')

[('arutyunov', 0.9975031018257141),
 ('alekseev', 0.9961102604866028),
 ('einav', 0.9702726602554321),
 ('martín-ruiz', 0.9696007966995239),
 ('mason', 0.9693496823310852),
 ('lapa', 0.9679994583129883),
 ('dunne', 0.9673644304275513),
 ('de baerdemacker', 0.9671090245246887),
 ('fadel', 0.9670577049255371),
 ('de vos', 0.9669851064682007)]

In [None]:
n2w_model.most_similar('silvestrov')

[('alekseev', 0.9995461702346802),
 ('arutyunov', 0.9992952346801758),
 ('fadel', 0.5280466675758362),
 ('gepner', 0.5087328553199768),
 ('corazza', 0.5077739357948303),
 ('schrohe', 0.5052917003631592),
 ('savin', 0.5016587972640991),
 ('belavin', 0.49439486861228943),
 ('wenzl', 0.4805348515510559),
 ('doussal', 0.4745516777038574)]

In [None]:
x_new = [(model[str(i)]+model[str(j)]) for i,j in zip(data['u1'], data['u2'])]

In [None]:
x_train, x_test, y_train, y_test = train_test_split(np.array(x_new), data['link'], 
                                                test_size = 0.3, 
                                                random_state = 35)

In [None]:
#Random Forest deep walk

rc = RandomForestClassifier(n_estimators= 20, max_depth=7, random_state=0, min_samples_leaf=20)

rc.fit(x_train, y_train)

y_pred=rc.predict(x_test)

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print (confusion_matrix(y_test, y_pred))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

[[5674    0]
 [  59    0]]
Accuracy: 0.9897087039944182


In [None]:
#SVM deep walk
from sklearn import svm

clf = svm.SVC(random_state = 0, verbose = True)
#clf = svm.SVC()
clf.fit(x_train,y_train)
y_pred = (clf.predict(x_test))

print (confusion_matrix(y_test, y_pred))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

[LibSVM][[5674    0]
 [  53    6]]
Accuracy: 0.9907552764695622


In [None]:
#naiye bayes
from sklearn.naive_bayes import GaussianNB


gnb = GaussianNB()
gnb.fit(x_train, y_train)

y_pred_gnb=rc.predict(x_test)

print (confusion_matrix(y_test, y_pred_gnb))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_gnb))

[[5674    0]
 [  59    0]]
Accuracy: 0.9897087039944182


In [None]:
#neural network

clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1, max_iter = 1000)
		#clf = svm.SVC()
clf.fit(x_train, y_train)
		
y_pred=(clf.predict(x_test))


print (confusion_matrix(y_test, y_pred))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

[[5674    0]
 [  58    1]]
Accuracy: 0.9898831327402756


In [None]:
#adaboost

clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                         algorithm="SAMME",
                         n_estimators=20)
	
clf.fit(x_train,y_train)
		
y_pred = (clf.predict(x_test))

print (confusion_matrix(y_test, y_pred))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

[[5669    5]
 [  49   10]]
Accuracy: 0.9905808477237049


In [None]:
#KNN

clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)
print (confusion_matrix(y_test, y_pred))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

[[5670    4]
 [  28   31]]
Accuracy: 0.9944182801325658


In [None]:
lr = LogisticRegression(class_weight="balanced", max_iter = 1000)

lr.fit(x_train, y_train)

predictions = lr.predict(x_test)

print (confusion_matrix(predictions, y_pred))
print("Accuracy:",metrics.accuracy_score(predictions, y_pred))

[[3738    8]
 [1960   27]]
Accuracy: 0.6567242281527996


In [None]:
train_data = lgbm.Dataset(xtrain, ytrain)
test_data = lgbm.Dataset(xtest, ytest)

# define parameters
parameters = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'num_threads' : 2,
    'seed' : 76
}

# train lightGBM model
model = lgbm.train(parameters,
                   train_data,
                   valid_sets=test_data,
                   num_boost_round=1000,
                   early_stopping_rounds=20)

[1]	valid_0's auc: 0.796771
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's auc: 0.830765
[3]	valid_0's auc: 0.84092
[4]	valid_0's auc: 0.861323
[5]	valid_0's auc: 0.865975
[6]	valid_0's auc: 0.864511
[7]	valid_0's auc: 0.865741
[8]	valid_0's auc: 0.867022
[9]	valid_0's auc: 0.874058
[10]	valid_0's auc: 0.869082
[11]	valid_0's auc: 0.868487
[12]	valid_0's auc: 0.871325
[13]	valid_0's auc: 0.877562
[14]	valid_0's auc: 0.887169
[15]	valid_0's auc: 0.885696
[16]	valid_0's auc: 0.885266
[17]	valid_0's auc: 0.891473
[18]	valid_0's auc: 0.889648
[19]	valid_0's auc: 0.887877
[20]	valid_0's auc: 0.888277
[21]	valid_0's auc: 0.884092
[22]	valid_0's auc: 0.812424
[23]	valid_0's auc: 0.822921
[24]	valid_0's auc: 0.80455
[25]	valid_0's auc: 0.766583
[26]	valid_0's auc: 0.755056
[27]	valid_0's auc: 0.743848
[28]	valid_0's auc: 0.635121
[29]	valid_0's auc: 0.636047
[30]	valid_0's auc: 0.690584
[31]	valid_0's auc: 0.666059
[32]	valid_0's auc: 0.587345
[33]	valid_0's auc: 0

In [None]:

from sklearn.cluster import SpectralClustering

sc = SpectralClustering(2, affinity='precomputed', n_init=100)
sc.fit_predict(x_new)

SpectralClustering(affinity='precomputed', assign_labels='kmeans', coef0=1,
                   degree=3, eigen_solver=None, eigen_tol=0.0, gamma=1.0,
                   kernel_params=None, n_clusters=2, n_components=None,
                   n_init=100, n_jobs=None, n_neighbors=10, random_state=None)

In [3]:
list = [('axe',1),
        ('bullock', 2),
        ('goat', 3)]

for node,score in list:
  if  score >=2:
    print(node)

bullock
goat
