In [1]:
import random
import numpy as np
import igraph
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk
import csv

with open("testing_set.txt", "r") as f:
    reader = csv.reader(f)
    testing_set  = list(reader)

testing_set = [element[0].split(" ") for element in testing_set]

###################
# random baseline #
###################

random_predictions = np.random.choice([0, 1], size=len(testing_set))
random_predictions = zip(range(len(testing_set)),random_predictions)

def write(prediction, filename) :
    with open(filename,"w") as pred:
        csv_out = csv.writer(pred)
        csv_out.writerow(('id', 'category'))
        for row in prediction:
            csv_out.writerow(row)

## Chargement des fichiers et séparation train_test_split

In [2]:
with open("training_set.txt", "r") as f:
    reader = csv.reader(f)
    training_set  = list(reader)

training_set = [element[0].split(" ") for element in training_set]
y = [elt[-1] for elt in training_set]
y = np.array(y).astype(int)
with open("node_information.csv", "r") as f:
    reader = csv.reader(f)
    node_info  = list(reader)

IDs = [element[0] for element in node_info]

## Extraction des features

In [4]:
## idée rajouter titre à l'abstract
corpus = [element[5] + ' ' + element[2].lower() for element in node_info]

In [5]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk
#nltk.download('stopwords'); nltk.download('punkt'); nltk.download('wordnet')
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
def tokenize(text) :
    tokens = word_tokenize(text)
    result = [stemmer.stem(i) for i in tokens if( (len(i) > 2) &  (i.isalpha()) & (i not in stop_words)) ]
    return ' '.join(result)
## paralellisation de la lemmatisation des textes
from multiprocessing import Pool
with Pool(8) as p: 
    corpus_new = p.map(tokenize, corpus)

In [6]:
## création de dictionnaires pour stocker les informations des noeuds
dico_node_info = {elt[0]: [elt[1], elt[3].split(","), corpus_new[i].split()] for i, elt in enumerate(node_info)}

### Distribution des mots pour savoir où couper

In [7]:
import pandas as pd
tmp = pd.Series((' '.join(corpus_new)).split())
tmp = tmp.value_counts()

In [52]:
tmp.head(20)

theori       33359
field        21474
model        17099
gaug         14756
string       14042
gener        12328
solut        10049
quantum       9556
space         9010
use           8609
equat         8228
algebra       7783
function      7696
also          7675
show          7585
discuss       7472
result        7435
construct     7201
term          7141
effect        7037
dtype: int64

In [None]:
# compute TFIDF vector of each paper
vectorizer = TfidfVectorizer(decode_error='ignore', smooth_idf=False, stop_words=stop_words,
                      encoding='utf-8', min_df = 0, max_df=34000)
# each row is a node in the order of node_info
corpus_idf = vectorizer.fit_transform(corpus_new)

In [12]:
node_info[0]

['1001',
 '2000',
 'compactification geometry and duality',
 'Paul S. Aspinwall',
 '',
 'these are notes based on lectures given at tasi99 we review the geometry of the moduli space of n 2 theories in four dimensions from the point of view of superstring compactification the cases of a type iia or type iib string compactified on a calabi-yau threefold and the heterotic string compactified on k3xt2 are each considered in detail we pay specific attention to the differences between n 2 theories and n 2 theories the moduli spaces of vector multiplets and the moduli spaces of hypermultiplets are reviewed in the case of hypermultiplets this review is limited by the poor state of our current understanding some peculiarities such as mixed instantons and the non-existence of a universal hypermultiplet are discussed']

In [77]:
def get_features(tuple_to_process) : 
    ret = []
    source = tuple_to_process[0]
    target = tuple_to_process[1]
    
    source_info = dico_node_info[source]
    target_info = dico_node_info[target]
    
    # convert to lowercase and tokenize
    source_abstract = source_info[1]
    target_abstract = source_info[1]
    
    source_auth = source_info[2]
    target_auth = target_info[2]
    
    ret.append(len(set(source_abstract).intersection(set(target_abstract))))
    ret.append(int(source_info[0]) - int(target_info[0]))
    ret.append(len(set(source_auth).intersection(set(target_auth))))
    return np.array(ret)
import tqdm
train_tr = []
for elt in tqdm.tqdm(training_set) :
    train_tr.append(get_features(elt))
train_tr = np.asarray(train_tr)

In [107]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
id_train, id_val, y_train, y_val = train_test_split(list(range(len(training_set))), 
                                                    y, test_size=0.1, stratify = y, random_state=10)
train = train_tr[id_train]
val = train_tr[id_val]
train = scaler.fit_transform(train)
val = scaler.transform(val)
print(train.shape, val.shape)

(553960, 3) (61552, 3)


In [100]:
# initialize basic SVM
from sklearn.model_selection import GridSearchCV
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
classifier = svm.LinearSVC()
C = np.logspace(-1, 2, 20)
params = dict(C=C, class_weight=['balanced'])
clf = GridSearchCV(classifier, params, cv=3, verbose=10, n_jobs = -1,  scoring='f1')
clf.fit(train, y_train)
import sklearn
y_pred = clf.predict(val)
print(sklearn.metrics.f1_score(y_val, y_pred))

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   25.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done  52 out of  60 | elapsed: 17.0min remaining:  2.6min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 18.9min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='warn', n_jobs=-1,
             param_grid={'C': array([  0.1       ,   0.14384499,   0.20691381,   0.29763514,
         0.42813324,   0.61584821,   0.88586679,   1.27427499,
         1.83298071,   2.6366509 ,   3.79269019,   5.45559478,
         7.8475997 ,  11.28837892,  16.23776739,  23.35721469,
        33.59818286,  48.32930239,  69.51927962, 100.        ]),
                         'class_weight': ['balanced']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1', verbose=10)

In [108]:
from sklearn import linear_model
logistic = linear_model.LogisticRegression()
C = np.logspace(-1, 2, 20)
params = dict(C=C, class_weight=['balanced'], solver=['lbfgs'])
clf = GridSearchCV(logistic, params, cv=3, verbose=10, n_jobs = -1,  scoring='f1')
clf.fit(train, y_train)
y_pred = clf.predict(val)
print(sklearn.metrics.f1_score(y_val, y_pred))

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done  52 out of  60 | elapsed:    9.8s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   11.1s finished


0.7590520385516738


In [110]:
#!pip install node2vec

## Working with graph

In [3]:
import warnings
warnings.simplefilter('ignore')

In [4]:
import pandas as pd
data = pd.DataFrame({'node_1':[int(elt[0]) for elt in training_set], 
                     'node_2':[int(elt[1]) for elt in training_set],
                    'link' : [int(elt[2]) for elt in training_set]})

In [5]:
data_test = pd.DataFrame({'node_1':[int(elt[0]) for elt in testing_set], 
                     'node_2':[int(elt[1]) for elt in testing_set]})

In [6]:
import networkx as nx
import tqdm

In [7]:
fb_df_temp = data[data.link == 1]
initial_node_count = len(set(fb_df_temp.node_1.unique()).union(set(fb_df_temp.node_2.unique())))
# empty list to store removable links
omissible_links_index = []
for i in tqdm.tqdm(data[data.link == 1].index.values):
    break
    # remove a node pair and build a new graph
    tmp = fb_df_temp.drop(index = i) 
    all_nodes = set(tmp.node_1.unique()).union(set(tmp.node_2.unique()))
    if len(all_nodes) == initial_node_count :
        omissible_links_index.append(i)
        fb_df_temp = fb_df_temp.drop(index = i)        
    else :
        pass
    if len(omissible_links_index) == 30000 :
        break

  0%|          | 0/335130 [00:00<?, ?it/s]


In [8]:
#np.save('omissible.npy', omissible_links_index)

In [8]:
omissible_links_index = np.load('omissible.npy')

In [9]:
fb_df_partial = data[data.link == 1].drop(index = omissible_links_index).drop(columns=['link'])
G_data = nx.from_pandas_edgelist(fb_df_partial, "node_1", "node_2", create_using=nx.Graph())

In [None]:
from node2vec import Node2Vec
node2vec = Node2Vec(G_data, dimensions=128, walk_length=100, 
                    num_walks=50, workers=1, temp_folder='')
# train node2vec model
n2w_model = node2vec.fit(window=5, min_count=1)
n2w_model.wv.save_word2vec_format('node_embv2')

Computing transition probabilities: 100%|██████████| 27684/27684 [02:51<00:00, 161.78it/s] 
Generating walks (CPU: 1): 100%|██████████| 50/50 [1:49:18<00:00, 131.16s/it]


In [10]:
from gensim.models import KeyedVectors
n2w_model = KeyedVectors.load_word2vec_format('node_emb', binary=False)

In [65]:
def get_node_emb(data) :
    #node_emb_ = []
    node_emb = []
    for i, j in zip(data['node_1'], data['node_2']) :
        if i in G_data.nodes():
            emb_i = n2w_model.wv[str(i)]
        else :
            emb_i = np.zeros(128)
        if j in G_data.nodes():
            emb_j = n2w_model.wv[str(j)]
        else :
            emb_j = np.zeros(128)
        node_emb.append(np.vstack((emb_i, emb_j)))
        #node_emb_.append(np.hstack((emb_j, emb_i)))
        #node_emb.append(emb_i * emb_j)
    node_emb = np.asarray(node_emb)
    #node_emb_ = np.asarray(node_emb_)
    return node_emb #np.vstack((node_emb, node_emb_))
node_emb = get_node_emb(data.drop(index = omissible_links_index))

In [66]:
node_emb.shape

(585512, 2, 128)

In [106]:
#from sklearn.model_selection import GridSearchCV
#import sklearn
#from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import StandardScaler
#xtrain, xtest, ytrain, ytest = train_test_split(node_emb, data['link'], 
#                                                test_size = 0.2, 
#                                                random_state = 35, stratify = data.link)

In [107]:
#print(xtrain.shape,xtest.shape)

In [22]:
## neural network
import tensorflow.keras.backend as K
import sklearn
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall
def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision
def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [67]:
## Validation sur les noeuds omis !!
omitted_links = data[data.link == 1].iloc[omissible_links_index]
omitted_emb = get_node_emb(omitted_links)

In [68]:
omitted_emb.shape

(30000, 2, 128)

In [69]:
y_train = data.drop(index = omissible_links_index).link.values

In [70]:
y_train.shape

(585512,)

In [None]:
model.add(Bidirectional(LSTM(rnn_cell_size,
                              dropout=0.3,
                              return_sequences=False,
                              return_state=False,
                             recurrent_activation='relu',
                             name="bi_lstm_0")))

In [57]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional, Input
from tensorflow.core.protobuf import rewriter_config_pb2
import tensorflow.keras.backend as K
tf.keras.backend.clear_session()  # For easy reset of notebook state.
n_classes  = 2

model = Sequential()
#model.add(LSTM(32, dropout=0.1, recurrent_dropout=0.1, input_shape=(2, 128), activation='relu'))
#model.add(Bidirectional(LSTM(32, dropout=0.1,
#                             recurrent_activation='relu'), 
#                        input_shape=(2, 128)))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=[f1_m])
bs = 1024
n_epochs = 10
history = model.fit(node_emb, y_train,
                    batch_size=bs, epochs=n_epochs,
                    validation_data=(omitted_emb, np.ones(len(omitted_emb))))
y_pred = model.predict(omitted_emb)
y_pred = np.where(y_pred >0.5, 1, 0)
print(sklearn.metrics.f1_score(np.ones(len(omitted_emb)), y_pred))

Train on 1171024 samples, validate on 60000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.9805454082066094


In [62]:
y_pred = model.predict(omitted_emb)
y_pred = np.where(y_pred >0.5, 1, 0)
print(sklearn.metrics.f1_score(np.ones(len(omitted_emb)), y_pred))

0.9799214539519543


In [64]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional, Input

In [None]:
#Input image
input_img= Input(shape=(128,))#  encoded and decoded layer for the autoencoder
encoded = Dense(units=32, activation='relu')(input_img)
encoded = Dense(units=1, activation='sigmoid')(encoded)

decoded = Dense(units=32, activation='relu')(encoded)
decoded = Dense(units=128, activation='relu')(decoded)

autoencoder=Model(inputs = input_img, [encoded, decoded])#extracting encoder
encoder = Model(input_img, encoded)# compiling the autoencoder
autoencoder.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])# Fitting the noise trained data to the autoencoder 
autoencoder.fit(X_train_noisy, X_train_noisy,
                epochs=100,
                batch_size=256,
                shuffle=True,
                validation_data=(X_test_noisy, X_test_noisy))

In [70]:
## prédiction finale
## ré-entrainement sur le dataset total sans les omis 
## Flemme puisqu'il faut re-embedder les noeuds, je ferais ça plutard

In [79]:
emb_test = get_node_emb(data_test)

  """
  if __name__ == '__main__':


In [84]:
y_test = model.predict(emb_test)
y_test = np.where(y_test >0.5, 1, 0)[:, 0]

In [87]:
predictions_SVM = zip(range(len(testing_set)), y_test)
with open("improved_predictions.csv","w") as pred1:
    csv_out = csv.writer(pred1)
    csv_out.writerow(('id', 'category'))
    for row in predictions_SVM:
        csv_out.writerow(row)