# DeepWalk Embedding

In [1]:
#! pip install tensorflow==2.4.0 six~=1.15.0 joblib==0.14.0 numpy~=1.19.2
# Install ge from git@github.com:shenweichen/GraphEmbedding.git.

In [2]:
import numpy as np
import pandas as pd
import networkx as nx
from ge import DeepWalk
import datetime

In [3]:
# Directory holding both the input table and the output file of this notebook.
# NOTE(review): absolute, machine-specific path -- change it before running elsewhere.
inoutpath = '/home/xiaopengxu/Desktop/data-covid-review/2021-05-11/'
# Input CSV, read by load_data().
compdata_path = inoutpath + 'features.ori_doc2vec.csv'
# Output CSV, written by save_features().
feature_path = inoutpath + 'features.ori_doc2vec_deepwalk.csv'
# Forwarded to train_get_embeddings() as the DeepWalk `num_walks` argument.
num_walks = 2000

## Load data and remove duplicate papers

In [4]:
def load_data(compdata_path):
    """Read the paper table from a CSV file and report how many rows are published.

    Parameters
    ----------
    compdata_path : str
        Path of the CSV file to read. It must contain a ``published`` column.

    Returns
    -------
    pandas.DataFrame
        The table as read, without the ``Unnamed: 0`` column when the file has one.

    Side effects
    ------------
    Prints a timestamped progress line and the null / non-null counts of
    the ``published`` column.
    """
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ": Loading data ...")
    papers = pd.read_csv(compdata_path, index_col=False)
    # 'Unnamed: 0' is the header pandas gives to an index column saved by to_csv().
    # errors='ignore' keeps files that were saved without an index loadable
    # instead of failing with KeyError.
    papers = papers.drop(columns=['Unnamed: 0'], errors='ignore')
    print("Count number of published papers in archives: ")
    print(pd.notnull(papers['published']).value_counts())

    return papers

def redup_papers(papers):
    """Return ``papers`` without rows whose ``title`` repeats an earlier row.

    The first row of every title is kept and the original index labels are
    preserved. The input frame is not modified. Prints a timestamped progress
    line and the number of rows removed.
    """
    stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
    print(stamp + ": Remove duplicates ...")

    # True for every row whose title was already seen further up the table.
    repeated_title = papers.duplicated(subset=['title'])
    unique_papers = papers[~repeated_title]

    removed = len(papers) - len(unique_papers)
    print("Number of duplicated papers: {}".format(removed))
    return unique_papers

In [5]:
# Read the full table, then derive a copy without repeated titles.
papers = load_data(compdata_path)
papers_redup = redup_papers(papers)

2021-05-14 13:47:59.109182: Loading data ...
Count number of published papers in archives: 
False    14313
True       779
Name: published, dtype: int64
2021-05-14 13:47:59.664823: Remove duplicates ...
Number of duplicated papers: 329


  if (await self.run_code(code, result,  async_=asy)):


## Get referencing edges

Extract the citation edge list: one (citing DOI, referenced DOI) pair per reference.

In [6]:
def generate_edgelist(papers_redup):
    """Build the directed citation edge list from the paper table.

    Parameters
    ----------
    papers_redup : pandas.DataFrame
        Table with a ``p_doi`` column (the paper's DOI) and a ``ref`` column
        holding the referenced DOIs as one string separated by ``', '``.

    Returns
    -------
    list of tuple
        ``(p_doi, referenced_doi)`` pairs. Rows whose ``ref`` is not a string
        (NaN, None, ...) contribute nothing, and ``'NA'`` entries are skipped.
        If a ``p_doi`` occurs on several rows only the last row's references
        are used.

    Side effects
    ------------
    Prints a timestamped progress line.
    """
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ": Generating edgelist ...")

    # doi -> raw reference string. Zipping the two columns avoids a label lookup
    # per row and does not depend on the index being unique.
    doi2ref = dict(zip(papers_redup['p_doi'], papers_redup['ref']))

    edge_list = []  # (doi, referenced doi) pairs
    for doi, refs in doi2ref.items():
        # Missing references show up as float NaN, numpy.float64 NaN (when the
        # whole column is empty) or None; only real strings can be split.
        if not isinstance(refs, str):
            continue

        for ref_doi in refs.split(', '):
            if ref_doi == 'NA':  # placeholder for an unresolved reference
                continue
            edge_list.append((doi, ref_doi))

    return edge_list

In [7]:
# Edges are built from the title-de-duplicated table.
edge_list = generate_edgelist(papers_redup)

2021-05-14 13:47:59.758205: Generating edgelist ...


## DeepWalk Embedding

In [8]:
def train_get_embeddings(edge_list, papers, num_walks=2000, em_size=50):
    """Train DeepWalk on the citation graph and return one embedding row per paper.

    Parameters
    ----------
    edge_list : iterable of tuple
        ``(source, target)`` pairs added to a directed graph.
    papers : pandas.DataFrame
        Table with a ``p_doi`` column; the result has one row per row of
        ``papers``, in the same order.
    num_walks : int, default 2000
        Passed to ``DeepWalk`` as ``num_walks``.
    em_size : int, default 50
        Passed to ``DeepWalk.train`` as ``embed_size``; also the number of
        output columns.

    Returns
    -------
    pandas.DataFrame
        Shape ``(len(papers), em_size)`` with columns ``'dw 1'`` ... ``'dw <em_size>'``
        and a default integer index. A paper whose ``p_doi`` has no entry in the
        trained embeddings gets an all-zero row.

    Side effects
    ------------
    Prints timestamped progress lines.
    """
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ": Generating directed citation graph ...")

    G = nx.DiGraph()
    G.add_edges_from(edge_list)

    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ": Start training DeepWalk model ...")

    model = DeepWalk(G, walk_length=10, num_walks=num_walks, workers=10)
    model.train(window_size=5, iter=5, workers=10, embed_size=em_size)

    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ": Get DeepWalk embeddings ...")
    embeddings = model.get_embeddings()

    p_doi = papers['p_doi'].tolist()
    # np.zeros rather than np.empty: rows for papers that never appear in the graph
    # are not overwritten below, and np.empty would leave uninitialised memory there.
    DW_em = np.zeros([len(p_doi), em_size])
    for i, node in enumerate(p_doi):
        if node in embeddings:
            DW_em[i, :] = embeddings[node]

    columns = ['dw ' + str(i + 1) for i in range(em_size)]
    pd_deepwalk_features = pd.DataFrame(DW_em, columns=columns)

    return pd_deepwalk_features


In [9]:
# The full `papers` table (not `papers_redup`) is passed here: the result has one row
# per row of `papers`, the same frame that save_features() later concatenates it with.
pd_deepwalk_features = train_get_embeddings(edge_list, papers, num_walks=num_walks)

2021-05-14 13:48:00.243287: Generating directed citation graph ...
2021-05-14 13:48:01.318434: Start training DeepWalk model ...


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:  7.4min finished


Learning embedding vectors...
Learning embedding vectors done!
2021-05-14 15:48:53.543376: Get DeepWalk embeddings ...


## Combine features and save

In [10]:
def save_features(filepath, papers, pd_deepwalk_features):
    """Write the paper table and its DeepWalk features side by side to a CSV file.

    Parameters
    ----------
    filepath : str
        Destination CSV path.
    papers : pandas.DataFrame
        Paper table; its columns come first in the output.
    pd_deepwalk_features : pandas.DataFrame
        Feature table, matched to ``papers`` by row position.

    Notes
    -----
    Both frames are re-indexed 0..n-1 on copies before concatenation, so the
    caller's DataFrames keep their own index. The CSV includes the integer
    index as its first, unnamed column.

    Side effects
    ------------
    Prints a timestamped progress line and writes ``filepath``.
    """
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ": Save features ...")

    # Positional alignment: drop whatever index labels each frame carries,
    # without touching the objects the caller passed in.
    papers_aligned = papers.reset_index(drop=True)
    features_aligned = pd_deepwalk_features.reset_index(drop=True)

    pd.concat([papers_aligned, features_aligned], axis=1).to_csv(filepath)


In [11]:
# Write the paper columns plus the DeepWalk columns to feature_path.
save_features(feature_path, papers, pd_deepwalk_features)

2021-05-14 15:49:18.279460: Save features ...
