## Import libraries

In [None]:
!pip install catboost

In [2]:
import re
import csv
import gzip
import pickle
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
from random import randint
from collections import Counter

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn import utils
from sklearn.utils import shuffle
from sklearn.metrics import log_loss
from sklearn.metrics.pairwise import cosine_similarity

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

import spacy
import gensim
from gensim.test.utils import common_texts
from gensim.models import Doc2Vec, Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

## Define functions and baselines

In [3]:
# Base learners for the first stacking level.
# Each entry holds:
#   'name' - short label used in the log lines and as the prediction column name
#   'pl'   - the estimator instance that is fitted in the cross-validation cell
model_dict = {
    'model_1': {
        'name': 'CB',
        'pl': CatBoostClassifier(random_seed=42, verbose=0),
    },
    'model_2': {
        'name': 'XGB',
        'pl': XGBClassifier(random_state=42, eval_metric='logloss'),
    },
    'model_3': {
        'name': 'LGB',
        'pl': LGBMClassifier(random_state=42),
    },
}

In [None]:
# Download the NLTK data used by the text helpers defined in the next cell:
# WordNet (for WordNetLemmatizer), the English stop-word list, and a POS
# tagger model - presumably the one nltk.pos_tag loads by default; verify
# against the installed NLTK version.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

In [5]:
# Remove punctuation
def remove_punctuation(text):
    """Return `text` with every character listed in `string.punctuation` dropped."""
    kept_chars = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)

# Tokenizer
# NOTE(review): the pattern 'W+' matches runs of the literal capital letter W.
# The regex class for "non-word characters" is \W (with a backslash), so this
# helper does not split on whitespace or punctuation. The pattern is kept
# as-is because callers that index the result (e.g. take element [0]) depend
# on the current behaviour - audit them before changing it.
def tokenization(text):
    """Split `text` with `re.split` on the pattern 'W+' and return the list of pieces."""
    return re.split('W+', text)

# Lemmatizer
lemmatizer = WordNetLemmatizer()
# Maps the first letter of a tag returned by nltk.pos_tag to the WordNet
# part-of-speech constant handed to the lemmatizer.
wordnet_map = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV,
}
def lemmatize_words(text):
    """Lemmatize every whitespace-separated word of `text`.

    Each word is POS-tagged with `nltk.pos_tag`; the first letter of its tag
    selects the WordNet part of speech (noun when the letter is not in
    `wordnet_map`). Returns the lemmas joined by single spaces.
    """
    lemmas = []
    for word, pos in nltk.pos_tag(text.split()):
        wn_pos = wordnet_map.get(pos[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, wn_pos))
    return " ".join(lemmas)

# Remove stopwords
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return `text` (coerced to str) without the words found in STOPWORDS.

    Words are the whitespace-separated pieces of the input; the survivors are
    joined back with single spaces. The membership test is case-sensitive.
    """
    kept = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept)

## Node2Vec embeddings

In [6]:
# Read the comma-separated edge list into an undirected graph with integer
# node ids.
G = nx.read_edgelist(
    'data/edgelist.txt',
    delimiter=',',
    nodetype=int,
    create_using=nx.Graph(),
)

nodes = list(G.nodes())
# n and m are reused by the feature-construction cells below.
n, m = G.number_of_nodes(), G.number_of_edges()
print('Number of nodes:', n)
print('Number of edges:', m)

Number of nodes: 138499
Number of edges: 1091955


In [7]:
# Load the pre-computed node embeddings from a gzip-compressed pickle.
# SECURITY: pickle.load can execute arbitrary code - only load embedding
# files that were produced by a trusted source.
# NOTE(review): node_emb is only referenced in commented-out feature lines
# further down in this notebook.
with gzip.open('embeddings/node_embeddings.emb', "rb") as f:
    node_emb = pickle.load(f)

## SciBERT embeddings

In [8]:
# Load the pre-computed abstract embeddings from a gzip-compressed pickle.
# SECURITY: pickle.load can execute arbitrary code - only load embedding
# files that were produced by a trusted source.
# The feature cells index bert_emb by node id and pass the entries straight
# to cosine_similarity.
with gzip.open('embeddings/abstract_embeddings.emb', "rb") as f:
    bert_emb = pickle.load(f)

## Text preprocessing

In [11]:
# Map node id -> cleaned abstract text: lower-cased, punctuation removed,
# lemmatized, stop words removed.
abstracts = dict()
with open('data/abstracts.txt', 'r') as f:
    for line in f:
        # Each line is "<node id>|--|<abstract>". Split on the first separator
        # only, so an abstract that itself contains "|--|" does not break the
        # two-value unpacking.
        node, abstract = line.rstrip('\n').split('|--|', 1)
        # The previous `tokenization(...)[0]` step has been dropped: on
        # lower-cased text it returned the whole string unchanged, and taking
        # element [0] of a real token list would keep only the first word.
        abstract = remove_punctuation(abstract.lower())
        abstract = remove_stopwords(lemmatize_words(abstract))
        abstracts[int(node)] = abstract

# One row per node, indexed by node id, in file order.
data = pd.DataFrame.from_dict(abstracts, orient='index', columns=['abstract'])
data = data.replace(r'\n',' ', regex=True)

In [12]:
# Map node id -> lower-cased author string, one "<node id>|--|<authors>" line
# per node.
authors = dict()
with open('data/authors.txt', 'r') as s:
    for line in s:
        fields = line.rstrip("\n").split('|--|')
        node, author = fields
        authors[int(node)] = author.lower()

# One row per node, indexed by node id, in file order.
data_author = pd.DataFrame.from_dict(authors, orient='index', columns=['author'])
data_author = data_author.replace(r'\n',' ', regex=True)

## TF-IDF

In [13]:
# TF-IDF over character n-grams (3 to 5 characters, 'char_wb' analyzer) of the
# cleaned abstracts; terms must occur in at least 3 documents and in at most
# half of them.
# The feature cells index the rows of X by node id - this assumes the row
# order of `data` matches the node ids 0..n-1; TODO confirm.
vec = TfidfVectorizer(min_df= 3, max_df=0.5, analyzer='char_wb', ngram_range=(3,5))
X = vec.fit_transform(data['abstract'])
X

<138499x196487 sparse matrix of type '<class 'numpy.float64'>'
	with 119630300 stored elements in Compressed Sparse Row format>

In [14]:
# TF-IDF over character n-grams (3 to 5 characters, 'char_wb' analyzer) of the
# lower-cased author strings, with the same document-frequency cut-offs as the
# abstract vectorizer. Note that `vec` is rebound here, so the abstract
# vectorizer object is no longer reachable after this cell.
# The feature cells index the rows of X_aut by node id - this assumes the row
# order of `data_author` matches the node ids 0..n-1; TODO confirm.
vec = TfidfVectorizer(min_df= 3, max_df=0.5, analyzer='char_wb', ngram_range=(3,5))
X_aut = vec.fit_transform(data_author['author'])
X_aut

<138499x238637 sparse matrix of type '<class 'numpy.float64'>'
	with 13987960 stored elements in Compressed Sparse Row format>

In [15]:
# Map node id -> set of author names. The file is re-read so the names keep
# their original casing (the earlier `authors` dict was lower-cased for TF-IDF).
authors = dict()
with open('data/authors.txt', 'r') as s:
    for line in s:
        node, author = line.rstrip("\n").split('|--|')
        authors[int(node)] = set(author.split(','))

# Replace each cleaned abstract string by its set of distinct tokens.
# The isinstance guard keeps this cell idempotent: on a re-run the values are
# already sets and are kept as they are instead of raising AttributeError on
# `.split()`.
abstracts = {
    node: text if isinstance(text, set) else set(text.split())
    for node, text in abstracts.items()
}

## Create feature set

In [19]:
def _pair_features(u, v):
    """Return the 15 link-prediction features for the candidate edge (u, v).

    Reads the notebook-level objects G, X, X_aut, abstracts, authors and
    bert_emb. Feature order (column index):
       0  cosine similarity of the abstract TF-IDF rows
       1  sum of the abstract token-set sizes
       2  absolute difference of the abstract token-set sizes
       3  number of shared abstract tokens
       4  sum of the author-set sizes
       5  absolute difference of the author-set sizes
       6  number of shared authors
       7  sum of the node degrees
       8  absolute difference of the node degrees
       9  cosine similarity of the author TF-IDF rows
      10  Adamic-Adar index
      11  Jaccard coefficient
      12  preferential-attachment score
      13  number of common neighbours
      14  cosine similarity of the bert_emb entries
    """
    pair = [(u, v)]
    deg_u, deg_v = G.degree(u), G.degree(v)
    abs_u, abs_v = abstracts[u], abstracts[v]
    aut_u, aut_v = authors[u], authors[v]
    return [
        cosine_similarity(X[u], X[v])[0, 0],
        len(abs_u) + len(abs_v),
        abs(len(abs_u) - len(abs_v)),
        len(abs_u.intersection(abs_v)),
        len(aut_u) + len(aut_v),
        abs(len(aut_u) - len(aut_v)),
        len(aut_u.intersection(aut_v)),
        deg_u + deg_v,
        abs(deg_u - deg_v),
        cosine_similarity(X_aut[u], X_aut[v])[0, 0],
        list(nx.adamic_adar_index(G, pair))[0][2],
        list(nx.jaccard_coefficient(G, pair))[0][2],
        list(nx.preferential_attachment(G, pair))[0][2],
        len(list(nx.common_neighbors(G, u=u, v=v))),
        cosine_similarity(bert_emb[u], bert_emb[v])[0, 0],
        # Disabled node2vec feature (would be column 15):
        # cosine_similarity(node_emb[u].reshape(1, -1), node_emb[v].reshape(1, -1))[0, 0],
    ]


# Rows 0..m-1 hold the positive pairs (existing edges), rows m..2m-1 the
# sampled negative pairs.
X_train = np.zeros((2*m, 15))
y_train = np.zeros(2*m)

# Seeded generator so the sampled negatives are identical on every run.
rng = np.random.default_rng(42)

for i, (u, v) in enumerate(G.edges()):
    # Positive example.
    # NOTE(review): the graph features of a positive pair are computed on G,
    # which still contains the edge being labelled - confirm this is intended.
    X_train[i] = _pair_features(u, v)
    y_train[i] = 1

    # Negative example: draw node ids in [0, n-1] until the pair is a real
    # non-edge. Previously a draw that hit an existing edge left row m+i as an
    # all-zero "negative", and a self-pair (n1 == n2) could be sampled.
    # The graph is sparse, so the loop ends after very few draws.
    while True:
        n1, n2 = (int(k) for k in rng.integers(0, n, size=2))
        if n1 != n2 and not G.has_edge(n1, n2):
            break
    X_train[m+i] = _pair_features(n1, n2)
    y_train[m+i] = 0

print('Size of training matrix:', X_train.shape)

Size of training matrix: (2183910, 15)


In [20]:
# Candidate pairs to score: one "<node>,<node>" line per pair.
node_pairs = list()
with open('data/test.txt', 'r') as f:
    for line in f:
        fields = line.split(',')
        node_pairs.append((int(fields[0]), int(fields[1])))

# Same 15 features, in the same column order, as the training matrix.
X_test = np.zeros((len(node_pairs), 15))
for i, (u, v) in enumerate(node_pairs):
    pair = [(u, v)]
    # Abstract text features
    X_test[i,0] = cosine_similarity(X[u], X[v])[0,0]
    X_test[i,1] = len(abstracts[u]) + len(abstracts[v])
    X_test[i,2] = abs(len(abstracts[u]) - len(abstracts[v]))
    X_test[i,3] = len(abstracts[u].intersection(abstracts[v]))
    # Author features
    X_test[i,4] = len(authors[u]) + len(authors[v])
    X_test[i,5] = abs(len(authors[u]) - len(authors[v]))
    X_test[i,6] = len(authors[u].intersection(authors[v]))
    # Degree features
    X_test[i,7] = G.degree(u) + G.degree(v)
    X_test[i,8] = abs(G.degree(u) - G.degree(v))
    X_test[i,9] = cosine_similarity(X_aut[u], X_aut[v])[0,0]
    # Neighbourhood-overlap scores from networkx
    X_test[i,10] = list(nx.adamic_adar_index(G, pair))[0][2]
    X_test[i,11] = list(nx.jaccard_coefficient(G, pair))[0][2]
    X_test[i,12] = list(nx.preferential_attachment(G, pair))[0][2]
    X_test[i,13] = len(list(nx.common_neighbors(G, u=u, v=v)))
    # Embedding similarity
    X_test[i,14] = cosine_similarity(bert_emb[u], bert_emb[v])[0,0]
    # X_test[i,15] = cosine_similarity(node_emb[u].reshape(1, -1), node_emb[v].reshape(1, -1))[0,0]

print('Size of test matrix:', X_test.shape)

Size of test matrix: (106692, 15)


## 10-fold CV

In [21]:
# Level-1 frames for stacking: l1_train collects the out-of-fold predictions
# of every base model next to the target; l1_test collects the fold-averaged
# test predictions next to a row id.
l1_train = pd.DataFrame()
l1_test = pd.DataFrame()

l1_train['target'] = y_train.copy()
l1_test['id'] = list(range(len(X_test)))

n_splits=10
# A fixed integer random_state makes every kf.split(...) call yield the same
# folds, so one splitter can serve all models.
kf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

for value in model_dict.values():

    scores = []
    train_preds = np.zeros(len(X_train))
    test_preds = np.zeros(len(X_test))

    for train_idx, test_idx in kf.split(X_train, y_train):

        # Integer-array indexing already returns fresh copies.
        x_train, x_val = X_train[train_idx], X_train[test_idx]
        Y_train, Y_val = y_train[train_idx], y_train[test_idx]

        model = value['pl']

        if value['name'] in ('CB', 'XGB', 'LGB'):
            # Early stopping monitored on the fold that is scored below.
            # NOTE(review): whether fit() accepts early_stopping_rounds /
            # verbose depends on the installed xgboost and lightgbm versions
            # - confirm against the environment.
            model.fit(x_train, Y_train,
                      eval_set=[(x_val, Y_val)],
                      early_stopping_rounds=200,
                      verbose=0)
        else:
            model.fit(x_train, Y_train)

        # Out-of-fold probabilities of the positive class.
        train_oof_preds = model.predict_proba(x_val)[:, 1]
        train_preds[test_idx] = train_oof_preds

        score = log_loss(Y_val, train_oof_preds)
        scores.append(score)

        print(f"{value['name']}: LogLoss = {score}")

        # Average the test predictions of the per-fold models.
        test_oof_preds = model.predict_proba(X_test)[:, 1]
        test_preds += test_oof_preds / n_splits

    print(f"\n--> Overall metrics for {value['name']}")
    print(f": LogLoss = {np.array(scores).mean()} +/- {np.array(scores).std()}\n")

    l1_train[f"{value['name']}"] = train_preds
    l1_test[f"{value['name']}"] = test_preds

CB: LogLoss = 0.14555364645955102
CB: LogLoss = 0.14538328607301673
CB: LogLoss = 0.144397953335743
CB: LogLoss = 0.14392480853270642
CB: LogLoss = 0.14505864545486005
CB: LogLoss = 0.14174465088218902
CB: LogLoss = 0.14588428508216925
CB: LogLoss = 0.14550709020746033
CB: LogLoss = 0.14421849928845323
CB: LogLoss = 0.14681128657527043

--> Overall metrics for CB
: LogLoss = 0.14484841518914193 +/- 0.0013142865029192776

XGB: LogLoss = 0.14897767476740134
XGB: LogLoss = 0.1487337970404187
XGB: LogLoss = 0.1477214528596524
XGB: LogLoss = 0.14724232826186617
XGB: LogLoss = 0.14829261465959784
XGB: LogLoss = 0.14496715689474546
XGB: LogLoss = 0.14931936320659614
XGB: LogLoss = 0.14905735461005948
XGB: LogLoss = 0.14760168971840315
XGB: LogLoss = 0.14980501416380088

--> Overall metrics for XGB
: LogLoss = 0.14817184461825414 +/- 0.0013194643301604133

LGB: LogLoss = 0.14644139344216175
LGB: LogLoss = 0.14611558983057726
LGB: LogLoss = 0.1453040989348506
LGB: LogLoss = 0.14479974041519345


## Meta-model

In [22]:
# Level-2 (meta) model: a linear blend of the base models' out-of-fold
# predictions, evaluated with the same 10-fold scheme.
scores = []
n_splits=10
kf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

train_preds = np.zeros(shape=(len(l1_train.index)))
test_preds = np.zeros(shape=(len(l1_test.index)))

# Every l1_test column except the leading 'id' is a base-model prediction.
features = l1_test.columns.to_list()[1:]
y = l1_train['target'].copy()

for i, (train_idx, test_idx) in enumerate(kf.split(l1_train, y)):

    # Fold-local names: the previous x_train / y_train overwrote the
    # notebook-level training labels, which broke a re-run of the CV cell.
    x_fit, x_val = l1_train[features].iloc[train_idx].copy(), l1_train[features].iloc[test_idx].copy()
    y_fit, y_val = y.iloc[train_idx].copy(), y.iloc[test_idx].copy()

    model =  LinearRegression()
    model.fit(x_fit, y_fit)

    # LinearRegression output is unbounded; clip it to [0, 1] so it is a valid
    # probability for log_loss and for the submission file. Some scikit-learn
    # versions may reject out-of-range values instead of clipping them.
    train_oof_preds = np.clip(model.predict(x_val), 0.0, 1.0)
    train_preds[test_idx] = train_oof_preds

    score = log_loss(y_val, train_oof_preds)
    scores.append(score)

    print(f"LR : LogLoss = {score}")

    # Average the test predictions of the per-fold blends.
    test_oof_preds = np.clip(model.predict(l1_test[features]), 0.0, 1.0)
    test_preds += test_oof_preds / n_splits

print(f"\n--> Overall metrics for LR")
print(f": LogLoss = {np.array(scores).mean()} +/- {np.array(scores).std()}\n")

LR : LogLoss = 0.14547154025271317
LR : LogLoss = 0.14529648419957247
LR : LogLoss = 0.14430923423082653
LR : LogLoss = 0.14382606197346118
LR : LogLoss = 0.1449224041456537
LR : LogLoss = 0.1416825930786438
LR : LogLoss = 0.1458792759771605
LR : LogLoss = 0.14545530169712725
LR : LogLoss = 0.14425735636904863
LR : LogLoss = 0.14681020646917525

--> Overall metrics for LR
: LogLoss = 0.14479104583933824 +/- 0.0013247235642763662



## Get submission

In [23]:
# Submission file: one row per test pair, in test-file order, with the
# meta-model's averaged prediction.
sub = pd.DataFrame({
    'id': list(range(len(X_test))),
    'predicted': test_preds,
})

sub.to_csv('submission.csv', index=False)

sub.head()

Unnamed: 0,id,predicted
0,0,0.996897
1,1,0.083376
2,2,0.738238
3,3,0.267565
4,4,0.079678
