In [1]:
import random
import numpy as np
import pandas as pd
import jgraph
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk
import csv
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import keras
import xgboost as xgb


from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from matplotlib import pyplot as plt
import matplotlib


import datetime
import time

  from numpy.core.umath_tests import inner1d
Using TensorFlow backend.


In [2]:
nltk.download('punkt') # for tokenization
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dongwenjian\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dongwenjian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Function Definition

In [3]:
def test_classifier(X_train, X_test, y_train, y_test):

    df_recap= pd.DataFrame(np.zeros((2, 7)),columns=['LogReg', 'NaiveBayes','SVM', 'RandomForest', 'GBM', 'XGBoost', 'NNET'],
                       index = ['F1', 'Accuracy'])


    logit = LogisticRegression()
    naiveb = GaussianNB()
    svm_ = SVC()
    rf = RandomForestClassifier()
    gbm = GradientBoostingClassifier()
    xg = xgb.XGBClassifier(max_depth=5, n_estimators=500, learning_rate=0.05)
    nnet = MLPClassifier()
   
    classifiers= [logit, naiveb,svm_,rf,gbm ,xg, nnet]
    idx=0
    for classifier in classifiers: 
        
        model = classifier.fit(X_train, y_train)
        predictions = model.predict(X_test)    
        f1 = f1_score(y_test, predictions)
        accuracy = accuracy_score(y_test, predictions)
        
        df_recap.iloc[0,idx]=np.round(f1,3)
        df_recap.iloc[1,idx]=np.round(accuracy,3)
        idx+=1
    
    return df_recap

# Load Data

In [4]:
with open("Data/training_set.txt", "r") as f:
#with open("../Data/training_set.txt", "r") as f:
    reader = csv.reader(f)
    training_set  = list(reader)
    
training_set = [element[0].split(" ") for element in training_set]
training_set =  pd.DataFrame(training_set, columns=['NodeSrc', 'NodeDest', 'Edge'])
training_set.head(2)

Unnamed: 0,NodeSrc,NodeDest,Edge
0,9510123,9502114,1
1,9707075,9604178,1


In [5]:
type(training_set.NodeSrc[0])

str

In [6]:
with open("Data/node_information.csv", "r") as f:
    reader = csv.reader(f)
    node_info  = list(reader)

IDs = [element[0] for element in node_info]
node_info =  pd.DataFrame(node_info, columns=['ID', 'year_pub', 'title','authors','name_journal','abstract'])
node_info.head(2)

Unnamed: 0,ID,year_pub,title,authors,name_journal,abstract
0,1001,2000,compactification geometry and duality,Paul S. Aspinwall,,these are notes based on lectures given at tas...
1,1002,2000,domain walls and massive gauged supergravity p...,"M. Cvetic, H. Lu, C.N. Pope",Class.Quant.Grav.,we point out that massive gauged supergravity ...


In [7]:
type(node_info.iloc[0,0])

str

In [8]:
with open("Data/testing_set.txt", "r") as f:
    reader = csv.reader(f)
    testing_set  = list(reader)

testing_set = [element[0].split(" ") for element in testing_set]
testing_set =  pd.DataFrame(testing_set, columns=['NodeSrc', 'NodeDest'])
testing_set.shape

(32648, 2)

# Pre-processing

In [9]:
# randomly select 5% of training set to fit and validate the models 
# 95% of the remaining graph is used to create the graph features
to_keep = random.sample(range(len(training_set)), k=int(round(len(training_set)*0.05)))
dataset_train_val= training_set.iloc[to_keep]
df_rest = training_set.loc[~training_set.index.isin(to_keep)]

In [10]:
df_rest.shape

(584736, 3)

In [11]:
dataset_train_val.shape

(30776, 3)

In [12]:
nodes = df_rest.loc[df_rest['Edge']=='1']
nodes= nodes[['NodeSrc','NodeDest' ]]
nodes.to_csv('Data/nodes.txt', sep=' ', index=False, header=False)

In [13]:
import networkx as nx
DG=nx.DiGraph(directed=True)
DG=nx.read_edgelist('Data/nodes.txt', create_using=nx.DiGraph(), nodetype = str)


In [14]:
neighborsDict={}
for n in DG.nodes():
    neighborsDict[n]= list(DG.neighbors(n))

In [15]:
pr = nx.pagerank(DG, alpha=0.7)

# Training data frame

In [16]:
my_list = list(zip(dataset_train_val.NodeSrc, dataset_train_val.NodeDest))
dict_pairs = pd.Series( my_list , index=dataset_train_val.index).to_dict()

In [17]:
dataset= pd.DataFrame()
dataset['IDPairs']= dict_pairs.keys()
dataset['overlap_title'] = np.zeros(len(dataset))
dataset['overlap_abstract'] = np.zeros(len(dataset))
dataset['temp_diff'] = np.zeros(len(dataset))
dataset['comm_auth'] = np.zeros(len(dataset))
dataset['cossim_a_tfidf'] = np.zeros(len(dataset))
dataset['cossim_t_tfidf'] = np.zeros(len(dataset))
dataset['lsa_abstract'] = np.zeros(len(dataset))
dataset['lsa_title'] = np.zeros(len(dataset))
dataset['nb_cit_indiv'] = np.zeros(len(dataset)) 

dataset['common_out_neighbors'] = np.zeros(len(dataset))
dataset['common_in_neighbors'] = np.zeros(len(dataset))
dataset['jaccard_sim_out'] = np.zeros(len(dataset))
dataset['jaccard_sim_in'] = np.zeros(len(dataset))
dataset['shortest_path'] = np.zeros(len(dataset))
dataset['page_rank'] = np.zeros(len(dataset))
dataset['node2vec']   = np.zeros(len(dataset))


dataset.head(2)

Unnamed: 0,IDPairs,overlap_title,overlap_abstract,temp_diff,comm_auth,cossim_a_tfidf,cossim_t_tfidf,lsa_abstract,lsa_title,nb_cit_indiv,common_out_neighbors,common_in_neighbors,jaccard_sim_out,jaccard_sim_in,shortest_path,page_rank,node2vec
0,39605,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,196908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
dataset['center_distance_abstract'] =  np.zeros(len(dataset))

In [18]:
temp = pd.DataFrame.from_dict(dict_pairs, orient='index', columns=['NodeSrc','NodeDest'])
temp['IDPairs']=dict_pairs.keys()
dataset_train_val['IDPairs']=dataset_train_val.index
df_merg=pd.merge(dataset_train_val, temp, on=['IDPairs'])
df_merg=df_merg[['IDPairs','Edge']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [19]:
dataset=pd.merge(dataset,df_merg, on='IDPairs')
dataset.head(1)

Unnamed: 0,IDPairs,overlap_title,overlap_abstract,temp_diff,comm_auth,cossim_a_tfidf,cossim_t_tfidf,lsa_abstract,lsa_title,nb_cit_indiv,common_out_neighbors,common_in_neighbors,jaccard_sim_out,jaccard_sim_in,shortest_path,page_rank,node2vec,Edge
0,39605,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [35]:
dataset.head(5)

Unnamed: 0,IDPairs,overlap_title,overlap_abstract,temp_diff,comm_auth,cossim_a_tfidf,cossim_t_tfidf,lsa_abstract,lsa_title,nb_cit_indiv,common_out_neighbors,common_in_neighbors,jaccard_sim_out,jaccard_sim_in,shortest_path,page_rank,node2vec,Edge
0,39605,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,196908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,208886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,578957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,36851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


# Test data frame

In [20]:
my_list_test = list(zip(testing_set.NodeSrc, testing_set.NodeDest))
dict_pairs_test = pd.Series(my_list_test, index=testing_set.index).to_dict()

In [21]:
dataset_test= pd.DataFrame()
dataset_test['IDPairs']= dict_pairs_test.keys()
dataset_test['overlap_title'] = np.zeros(len(dataset_test))
dataset_test['overlap_abstract'] = np.zeros(len(dataset_test))
dataset_test['temp_diff'] = np.zeros(len(dataset_test))
dataset_test['overlap_title'] = np.zeros(len(dataset_test))
dataset_test['comm_auth'] = np.zeros(len(dataset_test))


dataset_test['overlap_title'] = np.zeros(len(dataset_test))
dataset_test['overlap_abstract'] = np.zeros(len(dataset_test))
dataset_test['temp_diff'] = np.zeros(len(dataset_test))
dataset_test['comm_auth'] = np.zeros(len(dataset_test))
dataset_test['cossim_a_tfidf'] = np.zeros(len(dataset_test))
dataset_test['cossim_t_tfidf'] = np.zeros(len(dataset_test))
dataset_test['lsa_abstract'] = np.zeros(len(dataset_test))
dataset_test['lsa_title'] = np.zeros(len(dataset_test))
dataset_test['nb_cit_indiv'] = np.zeros(len(dataset_test)) 

dataset_test['common_out_neighbors'] = np.zeros(len(dataset_test))
dataset_test['common_in_neighbors'] = np.zeros(len(dataset_test))
dataset_test['jaccard_sim_out'] = np.zeros(len(dataset_test))
dataset_test['jaccard_sim_in'] = np.zeros(len(dataset_test))
dataset_test['shortest_path'] = np.zeros(len(dataset_test))
dataset_test['page_rank'] = np.zeros(len(dataset_test))
dataset_test['node2vec']   = np.zeros(len(dataset_test))

dataset_test.head(2)

Unnamed: 0,IDPairs,overlap_title,overlap_abstract,temp_diff,comm_auth,cossim_a_tfidf,cossim_t_tfidf,lsa_abstract,lsa_title,nb_cit_indiv,common_out_neighbors,common_in_neighbors,jaccard_sim_out,jaccard_sim_in,shortest_path,page_rank,node2vec
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Feature engineering

## A. Text-based features

### 1. Number of overlapping words in the title

In [101]:
def overlap_title(idPair, dict_pairs):
    source = dict_pairs.get(idPair)[0]
    target = dict_pairs.get(idPair)[1] 
    
    source_info = node_info.loc [node_info['ID']==source]
    target_info = node_info.loc [node_info['ID']==target]
    
    #Title
    source_title = source_info.iloc[0,2].lower().split(" ")   
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]
      
    target_title = source_info.iloc[0,2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]
    
    return (len(set(source_title).intersection(set(target_title))))

In [102]:
time_beg = datetime.datetime.now()
dataset['overlap_title']=list(map(lambda i: overlap_title(i,dict_pairs), dataset['IDPairs']))
time_end = datetime.datetime.now()
print (time_end-time_beg)

0:04:31.985548


In [103]:
time_beg = datetime.datetime.now()
dataset_test['overlap_title']=list(map(lambda i: overlap_title(i,dict_pairs_test), dataset_test['IDPairs']))
time_end = datetime.datetime.now()
print (time_end-time_beg)

0:04:59.857006


### 2. Number of overlapping words in the abstract

In [115]:
def overlap_abstract(idPair,dict_pairs):
    source = dict_pairs.get(idPair)[0]
    target = dict_pairs.get(idPair)[1] 
    
    source_info = node_info.loc [node_info['ID']==source]
    target_info = node_info.loc [node_info['ID']==target]
    
    #Abstract
    source_abstract = source_info.iloc[0,5].lower().split(" ")   
    source_abstract = [token for token in source_abstract if token not in stpwds]
    source_abstract = [stemmer.stem(token) for token in source_abstract]
      
    target_abstract = source_info.iloc[0,5].lower().split(" ")
    target_abstract = [token for token in target_abstract if token not in stpwds]
    target_abstract = [stemmer.stem(token) for token in target_abstract]
    
    return (len(set(source_abstract).intersection(set(target_abstract))))

In [116]:
time_beg = datetime.datetime.now()
dataset['overlap_abstract']=list(map(lambda i: overlap_abstract(i,dict_pairs), dataset['IDPairs']))
time_end = datetime.datetime.now()
print (time_end-time_beg)

IndexError: string index out of range

In [108]:
time_beg = datetime.datetime.now()
dataset_test['overlap_abstract']=list(map(lambda i: overlap_abstract(i,dict_pairs_test), dataset_test['IDPairs']))
time_end = datetime.datetime.now()
print (time_end-time_beg)

0:07:15.901143


### 3. Temporal distance between the papers

In [109]:
def tmp_dist(idPair,dict_pairs):
    source = dict_pairs.get(idPair)[0]
    target = dict_pairs.get(idPair)[1] 
    
    source_info = node_info.loc [node_info['ID']==source]
    target_info = node_info.loc [node_info['ID']==target]
    
    #Year
    source_year = source_info.iloc[0,1]
    target_year = target_info.iloc[0,1]
 
    return (int(source_year) - int(target_year))

In [110]:
time_beg = datetime.datetime.now()
dataset['temp_diff']=list(map(lambda i: tmp_dist(i,dict_pairs), dataset['IDPairs']))
time_end = datetime.datetime.now()
print (time_end-time_beg)

0:04:20.158177


In [111]:
time_beg = datetime.datetime.now()
dataset_test['temp_diff']=list(map(lambda i: tmp_dist(i, dict_pairs_test), dataset_test['IDPairs']))
time_end = datetime.datetime.now()
print (time_end-time_beg)

0:04:38.409913


### 4. Number of common authors

In [112]:
def comm_auth(idPair, dict_pairs):
    source = dict_pairs.get(idPair)[0]
    target = dict_pairs.get(idPair)[1] 
    
    source_info = node_info.loc [node_info['ID']==source]
    target_info = node_info.loc [node_info['ID']==target]
    
    #Authors
    source_auth = source_info.iloc[0,3].split(",")
    target_auth = target_info.iloc[0,3].split(",")
 
    return (len(set(source_auth).intersection(set(target_auth))))

In [113]:
time_beg = datetime.datetime.now()
dataset['comm_auth']=list(map(lambda i: comm_auth(i,dict_pairs), dataset['IDPairs']))
time_end = datetime.datetime.now()
print (time_end-time_beg)

0:03:12.986838


In [114]:
time_beg = datetime.datetime.now()
dataset_test['comm_auth']=list(map(lambda i: comm_auth(i, dict_pairs_test), dataset_test['IDPairs']))
time_end = datetime.datetime.now()
print (time_end-time_beg)

0:03:17.099837


In [None]:
dataset.head(2)

### Test the classifiers

In [None]:
col= ['IDPairs','Edge']
colnames=[i for i in dataset.columns if i not in col]
X = dataset[colnames]

print("Features : %s"% list(X))

y = dataset['Edge']
y= list(map (lambda i: int(i), y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
test_classifier(X_train, X_test, y_train, y_test)

## B. Cosine distance - TF-IDF

### 1. Cosine distance between two TF-IDF abstracts

In [118]:
import gensim
from gensim import corpora
from gensim.models import TfidfModel
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

In [119]:
raw_documents = list(node_info['abstract'])

from nltk.tokenize import word_tokenize
gen_docs = [[w.lower() for w in word_tokenize(text)] 
            for text in raw_documents]

In [122]:
# Creation of dictionary, corpus and TF-IDF model
dictionary = gensim.corpora.Dictionary(gen_docs)
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
tf_idf = gensim.models.TfidfModel(corpus)

2019-01-06 22:20:39,334 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-01-06 22:20:41,305 : INFO : adding document #10000 to Dictionary(21076 unique tokens: ['2', 'a', 'and', 'are', 'as']...)
2019-01-06 22:20:42,373 : INFO : adding document #20000 to Dictionary(30553 unique tokens: ['2', 'a', 'and', 'are', 'as']...)
2019-01-06 22:20:43,514 : INFO : built Dictionary(35816 unique tokens: ['2', 'a', 'and', 'are', 'as']...) from 27770 documents (total 2819131 corpus positions)
2019-01-06 22:20:48,114 : INFO : collecting document frequencies
2019-01-06 22:20:48,115 : INFO : PROGRESS: processing document #0
2019-01-06 22:20:48,306 : INFO : PROGRESS: processing document #10000
2019-01-06 22:20:48,478 : INFO : PROGRESS: processing document #20000
2019-01-06 22:20:48,616 : INFO : calculating IDF weights for 27770 documents and 35815 features (1811062 matrix non-zeros)


In [123]:
def cossim_title(ID,dict_pairs,node_info,dictionary,tf_idf):
    
    doc1=node_info.loc[node_info['ID']==dict_pairs[ID][0],'title'].values[0]
    doc2=node_info.loc[node_info['ID']==dict_pairs[ID][1],'title'].values[0]

    idx_doc1=node_info.loc[node_info['ID']==dict_pairs[ID][0],'title'].index[0]
    idx_doc2=node_info.loc[node_info['ID']==dict_pairs[ID][1],'title'].index[0]
    
    vec_bow1 = dictionary.doc2bow(doc1.lower().split())
    vec_bow2 = dictionary.doc2bow(doc2.lower().split())

    return gensim.matutils.cossim(tf_idf[vec_bow1], tf_idf[vec_bow2])

In [124]:
start_time=time.time()
dataset['cossim_t_tfidf']=np.zeros(len(dataset))
dataset['cossim_t_tfidf']=list(map(lambda i: cossim_title(i,dict_pairs,node_info,dictionary,tf_idf), dataset['IDPairs']))
print("--- %s minutes ---" % (np.float(time.time() - start_time)/60))

--- 7.132623251279195 minutes ---


In [125]:
start_time=time.time()
dataset_test['cossim_t_tfidf']=np.zeros(len(dataset_test))
dataset_test['cossim_t_tfidf']=list(map(lambda i: cossim_title(i,dict_pairs_test,node_info,dictionary,tf_idf), dataset_test['IDPairs']))
print("--- %s minutes ---" % (np.float(time.time() - start_time)/60))

--- 7.667019053300222 minutes ---


### Test the classifiers

In [None]:
col= ['IDPairs','Edge']
colnames=[i for i in dataset.columns if i not in col]
X = dataset[colnames]

print("Features : %s"% list(X))

y = dataset['Edge']
y= list(map (lambda i: int(i), y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
test_classifier(X_train, X_test, y_train, y_test)

## C. Latent Semantic Analysis

In [127]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer

### 1. The cosine distance between two LSA abstracts

In [128]:
# compute TFIDF vector of each paper
corpus = set(node_info['abstract'])
vectorizer = TfidfVectorizer(stop_words="english")
features_TFIDF = vectorizer.fit_transform(corpus)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [129]:
# Fit LSA. Use algorithm = “randomized” for large datasets 
lsa = TruncatedSVD(100, algorithm = 'arpack')
dtm_lsa = lsa.fit_transform(features_TFIDF)
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)

In [None]:
# Compute document similarity using LSA components
similarity = np.asarray(np.asmatrix(dtm_lsa) * np.asmatrix(dtm_lsa).T) 
save= pd.DataFrame(similarity,index=corpus, columns=corpus)
save.head(1)

In [None]:
def cosdist_lsa_abstract(ID,dict_pairs ):    
    abstract1=node_info.loc[node_info['ID']==dict_pairs[ID][0],'abstract'].values[0]
    abstract2=node_info.loc[node_info['ID']==dict_pairs[ID][1],'abstract'].values[0]        
    return save.loc[abstract1][abstract2]
    ataset['lsa_abstract']=np.zeros(len(dataset))

In [None]:
time_beg = datetime.datetime.now()
dataset['lsa_abstract']=list(map(lambda i: cosdist_lsa_abstract(i, dict_pairs), dataset['IDPairs']))
time_end = datetime.datetime.now()
print (time_end-time_beg)

In [None]:
time_beg = datetime.datetime.now()
dataset_test['lsa_abstract']=list(map(lambda i: cosdist_lsa_abstract(i, dict_pairs_test), dataset_test['IDPairs']))
time_end = datetime.datetime.now()
print (time_end-time_beg)

### 2. The cosine distance between two LSA titles

In [None]:
# compute TFIDF vector of each paper
corpus = set(node_info['title'])
vectorizer = TfidfVectorizer(stop_words="english")
features_TFIDF = vectorizer.fit_transform(corpus)

In [None]:
# Fit LSA. Use algorithm = “randomized” for large datasets 
lsa = TruncatedSVD(100, algorithm = 'arpack')
dtm_lsa = lsa.fit_transform(features_TFIDF)
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)

In [None]:
# Compute document similarity using LSA components
similarity = np.asarray(np.asmatrix(dtm_lsa) * np.asmatrix(dtm_lsa).T) 
save= pd.DataFrame(similarity,index=corpus, columns=corpus)
save.head(2)

In [None]:
def cosdist_lsa_title(ID, dict_pairs):    
    title1=node_info.loc[node_info['ID']==dict_pairs[ID][0],'title'].values[0]
    title2=node_info.loc[node_info['ID']==dict_pairs[ID][1],'title'].values[0]        
    return save.loc[title1][title2]

In [None]:
time_beg = datetime.datetime.now()
dataset['lsa_title']=list(map(lambda i: cosdist_lsa_title(i, dict_pairs), dataset['IDPairs']))
time_end = datetime.datetime.now()
print (time_end-time_beg)

In [None]:
time_beg = datetime.datetime.now()
dataset_test['lsa_title']=list(map(lambda i: cosdist_lsa_title(i, dict_pairs_test), dataset_test['IDPairs']))
time_end = datetime.datetime.now()
print (time_end-time_beg)

### Test the classifiers

In [None]:
col= ['IDPairs','Edge']
colnames=[i for i in dataset.columns if i not in col]
X = dataset[colnames]

print("Features : %s"% list(X))

y = dataset['Edge']
y= list(map (lambda i: int(i), y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
test_classifier(X_train, X_test, y_train, y_test)

## D. Graph Based Features

### 1. Number of citations between authors

### 2. Common neighbours

### 3. Jaccard Index

### 4. Shortest Path

### 5. Page Rank

### Test the classifiers

## E. Node2Vec

In [130]:
import io
import os

In [131]:
#nodes embeddings output from https://github.com/aditya-grover/node2vec
#load node embedding

node2vec = {}
with io.open('./node2vec/emb/train_nodes_emb.txt', encoding='utf-8') as f:
    next(f)
    for i, line in enumerate(f):
        node, vec = line.split(' ', 1)
        node2vec[node] = np.fromstring(vec, sep=' ')
print('Loaded %s pretrained node vectors' % (len(node2vec)))

FileNotFoundError: [Errno 2] No such file or directory: './node2vec/emb/train_nodes_emb.txt'

In [None]:
def n2v(ID, dict_pairs):
    node_src= dict_pairs[ID][0]
    nodes_dest= dict_pairs[ID][1]
    if node_src not in node2vec or nodes_dest not in node2vec:
        return 0
        
    v1 = node2vec[node_src] #w1 vector embedding
    v2 = node2vec[nodes_dest] #w2 vector embedding
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

In [None]:
dataset['node2vec']=list(map(lambda i: n2v(i,dict_pairs), dataset['IDPairs']))

In [None]:
dataset_test['node2vec']=list(map(lambda i: n2v(i,dict_pairs_test), dataset_test['IDPairs']))

### Test the Classifiers

In [None]:
col= ['IDPairs','Edge']
colnames=[i for i in dataset.columns if i not in col]
X = dataset[colnames]

print("Features : %s"% list(X))

y = dataset['Edge']
y= list(map(lambda i: int(i), y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
test_classifier(X_train, X_test, y_train, y_test)

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# F. Word vectors

In [22]:
# imports needed and logging
import gzip
import gensim 
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [23]:
def abstract_generator():
    for abstract in node_info['abstract']:
        yield gensim.utils.simple_preprocess(abstract)


In [24]:
documents = list(abstract_generator())
model = gensim.models.Word2Vec(
        documents,
        size=150,
        window=10,
        min_count=2,
        workers=2)


2019-01-06 20:05:34,042 : INFO : collecting all words and their counts
2019-01-06 20:05:34,044 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-01-06 20:05:34,256 : INFO : PROGRESS: at sentence #10000, processed 1004920 words, keeping 14716 word types
2019-01-06 20:05:34,430 : INFO : PROGRESS: at sentence #20000, processed 1928403 words, keeping 20013 word types
2019-01-06 20:05:34,575 : INFO : collected 22592 word types from a corpus of 2669524 raw words and 27770 sentences
2019-01-06 20:05:34,576 : INFO : Loading a fresh vocabulary
2019-01-06 20:05:34,644 : INFO : effective_min_count=2 retains 14598 unique words (64% of original 22592, drops 7994)
2019-01-06 20:05:34,645 : INFO : effective_min_count=2 leaves 2661530 word corpus (99% of original 2669524, drops 7994)
2019-01-06 20:05:34,712 : INFO : deleting the raw counts dictionary of 22592 items
2019-01-06 20:05:34,714 : INFO : sample=0.001 downsamples 37 most-common words
2019-01-06 20:05:34,715 : INF

In [25]:
model.train(documents, total_examples=len(documents), epochs=10)

2019-01-06 20:06:29,656 : INFO : training model with 2 workers on 14598 vocabulary and 150 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2019-01-06 20:06:30,664 : INFO : EPOCH 1 - PROGRESS: at 26.70% examples, 548994 words/s, in_qsize 3, out_qsize 0
2019-01-06 20:06:31,665 : INFO : EPOCH 1 - PROGRESS: at 55.04% examples, 549526 words/s, in_qsize 3, out_qsize 0
2019-01-06 20:06:32,670 : INFO : EPOCH 1 - PROGRESS: at 88.40% examples, 577714 words/s, in_qsize 3, out_qsize 0
2019-01-06 20:06:33,019 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-01-06 20:06:33,027 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-01-06 20:06:33,028 : INFO : EPOCH - 1 : training on 2669524 raw words (1968529 effective words) took 3.4s, 584928 effective words/s
2019-01-06 20:06:34,059 : INFO : EPOCH 2 - PROGRESS: at 25.35% examples, 507249 words/s, in_qsize 3, out_qsize 0
2019-01-06 20:06:35,069 : INFO : EPOCH 2 - PROGRESS: at 58.47% examples, 57312

(19683453, 26695240)

In [26]:
word_vectors = model.wv

In [30]:
word_vectors.get_vector('solid').shape

(150,)

In [84]:
word_vectors.vector_size

150

In [91]:
word_vectors = model.wv


def centroid_distance_abstract(idPair, dict_pairs):
    source = dict_pairs.get(idPair)[0]
    target = dict_pairs.get(idPair)[1] 
    
    source_info = node_info.loc[node_info['ID']==source]
    target_info = node_info.loc[node_info['ID']==target]
    
    
#     print(source_info['abstract'])
#     #print(source_info['abstract'].to_string())
#     print(len(source_info['abstract'].tolist()))
#     print(source_info['abstract'].tolist()[0])
#     for w in source_info['abstract'].tolist()[0].split():
#         print(w)
    
    cen_source = np.zeros((word_vectors.vector_size,), dtype = 'float')
    counter_source = 0
    for w in source_info['abstract'].tolist()[0].split():
        try:
            cen_source += word_vectors.get_vector(w)
            counter_source += 1  # If a word not exists, then this line doesn't run
        except KeyError:
            pass
    if counter_source == 0:  # If the node doesn't have a single valid word vector, return -1 as a flag.
        return -1
    cen_source /= counter_source
    
    cen_target = np.zeros((word_vectors.vector_size,), dtype = 'float')
    counter_target = 0
    for w in target_info['abstract'].tolist()[0].split():
        try:
            cen_target += word_vectors.get_vector(w)
            counter_target += 1
        except KeyError:
                pass
    if counter_target == 0:
        return -1
    cen_target /= counter_target
    
    return np.linalg.norm(cen_source - cen_target)
        

In [92]:
centroid_distance_abstract(39605, dict_pairs)

3.424449203422787

In [93]:
time_beg = datetime.datetime.now()
dataset['center_distance_abstract']=list(map(lambda i: centroid_distance_abstract(i,dict_pairs), dataset['IDPairs']))
time_end = datetime.datetime.now()
print (time_end-time_beg)

NameError: name 'timefo_beg' is not defined

In [132]:
dataset.head(12)

Unnamed: 0,IDPairs,overlap_title,overlap_abstract,temp_diff,comm_auth,cossim_a_tfidf,cossim_t_tfidf,lsa_abstract,lsa_title,nb_cit_indiv,common_out_neighbors,common_in_neighbors,jaccard_sim_out,jaccard_sim_in,shortest_path,page_rank,node2vec,Edge,center_distance_abstract
0,39605,7,0.0,1,0,0.0,0.076979,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,3.424449
1,196908,3,0.0,-4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4.20775
2,208886,5,0.0,1,0,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,3.41931
3,578957,5,0.0,7,0,0.0,0.042357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4.174467
4,36851,6,0.0,0,0,0.0,0.204469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2.413794
5,160011,7,0.0,5,0,0.0,0.59599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2.620734
6,50043,6,0.0,-2,0,0.0,0.150132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3.801676
7,260639,3,0.0,1,0,0.0,0.616259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2.465783
8,460072,6,0.0,-2,0,0.0,1.3e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,6.058379
9,324093,2,0.0,1,0,0.0,0.191881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,4.218448


In [98]:
time_beg = datetime.datetime.now()
dataset_test['center_distance_abstract']=list(map(lambda i: centroid_distance_abstract(i,dict_pairs_test), dataset_test['IDPairs']))
time_end = datetime.datetime.now()
print (time_end-time_beg)

0:05:10.372880


### Test the classifiers

In [133]:
col= ['IDPairs','Edge']
colnames=[i for i in dataset.columns if i not in col]
X = dataset[colnames]

print("Features : %s"% list(X))

y = dataset['Edge']
y= list(map (lambda i: int(i), y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
test_classifier(X_train, X_test, y_train, y_test)

Features : ['overlap_title', 'overlap_abstract', 'temp_diff', 'comm_auth', 'cossim_a_tfidf', 'cossim_t_tfidf', 'lsa_abstract', 'lsa_title', 'nb_cit_indiv', 'common_out_neighbors', 'common_in_neighbors', 'jaccard_sim_out', 'jaccard_sim_in', 'shortest_path', 'page_rank', 'node2vec', 'center_distance_abstract']


  if diff:


Unnamed: 0,LogReg,NaiveBayes,SVM,RandomForest,GBM,XGBoost,NNET
F1,0.785,0.558,0.834,0.798,0.838,0.834,0.831
Accuracy,0.763,0.654,0.809,0.784,0.821,0.815,0.816


&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

In [134]:
col= ['IDPairs','Edge','center_distance_abstract']
colnames=[i for i in dataset.columns if i not in col]
X = dataset[colnames]

print("Features : %s"% list(X))

y = dataset['Edge']
y= list(map (lambda i: int(i), y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
test_classifier(X_train, X_test, y_train, y_test)

Features : ['overlap_title', 'overlap_abstract', 'temp_diff', 'comm_auth', 'cossim_a_tfidf', 'cossim_t_tfidf', 'lsa_abstract', 'lsa_title', 'nb_cit_indiv', 'common_out_neighbors', 'common_in_neighbors', 'jaccard_sim_out', 'jaccard_sim_in', 'shortest_path', 'page_rank', 'node2vec']


  if diff:


Unnamed: 0,LogReg,NaiveBayes,SVM,RandomForest,GBM,XGBoost,NNET
F1,0.667,0.549,0.811,0.779,0.81,0.811,0.81
Accuracy,0.654,0.65,0.765,0.748,0.775,0.776,0.775


# Features Importance

In [None]:
rf = RandomForestClassifier()
model_rf = rf.fit(X_train, y_train)

In [None]:
features = pd.DataFrame()
features['feature'] = X.columns
features['importance'] = model_rf.feature_importances_
features.sort_values(by=['importance'], ascending=False, inplace=True)
features.set_index('feature', inplace=True)

In [None]:
list(features.values.reshape(-1))

In [None]:
importance = list(features.values.reshape(-1))
bars = list(features.index.values)

y_pos = np.arange(len(bars))
 
fig, ax = plt.subplots(figsize=(16,9))

# Create horizontal bars
plt.barh(y_pos, importance)
 
# Create names on the y-axis
plt.yticks(y_pos, bars)

# Add title
plt.title('Feature Importance')

# Show graphic
plt.show()

# Model Comparison

In [None]:
tps0=time.clock()

# Classifiers
rf   = RandomForestClassifier()
gbm = GradientBoostingClassifier()
boost = xgb.XGBClassifier()
svm_  = SVC()
logi = LogisticRegression()
nnet=MLPClassifier()
nb = GaussianNB()

# Number of iterations
B=10 # to test the loop : use B=3 instead

# Parameters grids

listMethGrid=[logi,nb, svm_,rf,gbm,boost,nnet]
arrayErreur=np.empty((B,7))

In [None]:
for i in range(B):
    print(i)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    
    # Computation of test error
    for j,method in enumerate(listMethGrid):
        methFit=method.fit(X_train, y_train)
        predictions = methFit.predict(X_test)
        arrayErreur[i,j]=f1_score(y_test, predictions)
        
tps1=time.clock()
print("Execution time in mn :",(tps1 - tps0)/60)

dataframeErreur=pd.DataFrame(arrayErreur,columns=["Logit","NB","SVM","RF","GBM","XGB","NNET"])

In [None]:
import seaborn as sns
fig, ax = plt.subplots(figsize=(16,9))
x = list(dataframeErreur.columns.values) * 10
y = list(dataframeErreur.values.reshape(-1))
sns.boxplot(x=x, y=y, linewidth=2)
plt.title("F1 Score")

# Fine tuning

# Prediction

In [None]:
col= ['IDPairs','Edge']
colnames=[i for i in dataset.columns if i not in col]

In [None]:
model = GradientBoostingClassifier()
model = model.fit(X_train, y_train)
test=dataset_test[colnames]
predictions = model.predict(test)

In [None]:
final_pred = pd.concat([dataset_test['IDPairs'],pd.DataFrame(predictions)],axis=1)
final_pred.columns = ['id','category']
final_pred.to_csv('../Data/my_pred.csv',index=None)

# Save dataset

In [126]:
dataset.to_csv('Data/dataset.csv', sep=',')
dataset_test.to_csv('Data/dataset_test.csv', sep=',')