# Doc2vec embedding

- Extract document-embedding features from paper abstracts with a gensim Doc2Vec model

In [1]:
#! pip install gensim==3.6.0 scipy==1.6.3 numpy==1.20.2
#! pip install pandas==1.2.4 nltk==3.6.2 matplotlib==3.4.2 scikit-learn==0.24.2 fastdtw==0.3.2 networkx==2.1

In [2]:
import argparse
import datetime
import os
import re

import gensim
import nltk
import pandas as pd
from gensim.models.doc2vec import Doc2Vec
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Make sure the WordNet corpus is available locally; the WordNetLemmatizer
# used in pre_process needs it.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/xiaopengxu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Directory that holds the input table and receives the model / feature files.
# Set the COVID_REVIEW_DATA_DIR environment variable to run the notebook on
# another machine; the fallback is the originally hard-coded location.
inoutpath = os.environ.get(
    'COVID_REVIEW_DATA_DIR',
    '/home/xiaopengxu/Desktop/data-covid-review/2021-05-11/',
)

# os.path.join accepts the directory with or without a trailing separator.
compdata_path = os.path.join(inoutpath, 'compdata_ext_ref.csv')  # input table
model_path = os.path.join(inoutpath, 'model.doc2vec')  # trained Doc2Vec model
feature_path = os.path.join(inoutpath, 'features.ori_doc2vec.csv')  # output table

## Data loading & exploration

In [4]:
def load_data(compdata_path):
    """Read the paper table from a CSV file and report publication counts.

    Parameters
    ----------
    compdata_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The table without the 'Unnamed: 0' column (the column pandas creates
        when a CSV was written together with its row index).
    """
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ": Loading data ...")
    papers = pd.read_csv(compdata_path, index_col=False)
    # errors='ignore' keeps this working for CSVs saved without an index
    # column, where 'Unnamed: 0' does not exist and a plain drop would raise.
    papers = papers.drop(columns=['Unnamed: 0'], errors='ignore')
    print("Count number of published papers in archives: ")
    print(pd.notnull(papers.published).value_counts())

    return papers

In [5]:
# Load the paper table from compdata_path; load_data also prints how many
# rows have a non-null `published` value.
papers = load_data(compdata_path)

2021-05-14 13:42:24.140651: Loading data ...
Count number of published papers in archives: 
False    14313
True       779
Name: published, dtype: int64


## Preprocessing

In [6]:
def pre_process(papers):
    """Turn every abstract into a list of lower-cased, lemmatized tokens.

    Parameters
    ----------
    papers : pandas.DataFrame
        Table with an 'abstract' column.

    Returns
    -------
    list of list of str
        One token list per row of `papers`, in row order. A missing abstract
        (NaN/None) yields an empty token list so that the result stays
        aligned with the rows of `papers`.
    """
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ": Start preprocessing abstracts ...")

    # Missing abstracts would make re.sub fail on a float; treat them as empty text.
    abstracts = papers['abstract'].fillna('').astype(str)  # use only abstracts

    # Raw string: same character class as before, without invalid escape sequences.
    punctuation = re.compile(r'[,:.!?]')
    doc_words = abstracts.map(lambda text: punctuation.sub(' ', text))

    # Split the documents into tokens.
    tokenizer = RegexpTokenizer(r'\w+')
    doc_tokenizers = doc_words.apply(lambda text: tokenizer.tokenize(text.lower()))

    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in doc_tokenizers]

    return docs

In [7]:
# Tokenize and lemmatize the abstracts; `docs` holds one token list per row of `papers`.
docs = pre_process(papers)

2021-05-14 13:42:24.644210: Start preprocessing abstracts ...


## Doc2Vec feature extraction

In [8]:
def process_doc(docs):
    """Lazily wrap each document in a TaggedDocument tagged with its position.

    `docs` must support `len()` and integer indexing; the i-th document is
    yielded with the single tag `i`.
    """
    make_tagged = gensim.models.doc2vec.TaggedDocument
    yield from (make_tagged(docs[position], [position]) for position in range(len(docs)))

In [9]:
def train(model_path: str, docs: list, em_size=50, min_count=2, epochs=40):
    """Fit a Doc2Vec model on `docs` and save it to `model_path`.

    Parameters
    ----------
    model_path : str
        Destination passed to the model's `save`.
    docs : list
        Tokenized documents; they are tagged through `process_doc`.
    em_size : int
        Passed to Doc2Vec as `vector_size`.
    min_count : int
        Passed to Doc2Vec as `min_count`.
    epochs : int
        Passed to Doc2Vec as `epochs`.

    Returns nothing; the trained model is only persisted on disk.
    """
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
    print(timestamp + ": Start training Doc2Vec model ...")

    # Materialize the tagged documents once: the list is consumed twice below.
    tagged_docs = [tagged for tagged in process_doc(docs)]

    doc2vec = gensim.models.doc2vec.Doc2Vec(
        vector_size=em_size,
        min_count=min_count,
        epochs=epochs,
    )
    doc2vec.build_vocab(tagged_docs)
    doc2vec.train(
        tagged_docs,
        total_examples=doc2vec.corpus_count,
        epochs=doc2vec.epochs,
    )
    doc2vec.save(model_path)

In [10]:
def get_embeddings(model_path, docs, em_size=50):
    """Infer one Doc2Vec vector per document and return them as a DataFrame.

    Parameters
    ----------
    model_path : str
        Path of a saved Doc2Vec model; it is loaded with `Doc2Vec.load`.
    docs : iterable of list of str
        Tokenized documents to embed.
    em_size : int
        Number of columns to create when `docs` is empty. For non-empty input
        the column count follows the length of the inferred vectors, so a
        model trained with a different vector size no longer causes a
        column-count mismatch.

    Returns
    -------
    pandas.DataFrame
        One row per document, with columns named 'dv 1', 'dv 2', ...
    """
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ": Get doc2vec embeddings ...")

    model = Doc2Vec.load(model_path)
    doc2vec_features = [model.infer_vector(doc) for doc in docs]

    # Take the width from the data itself; fall back to em_size only when
    # there is no vector to measure.
    width = len(doc2vec_features[0]) if doc2vec_features else em_size
    columns = ['dv ' + str(i + 1) for i in range(width)]
    pd_doc2vec_features = pd.DataFrame(doc2vec_features, columns=columns)
    return pd_doc2vec_features

In [11]:
# train() saves the fitted model to model_path; get_embeddings() then reloads
# it from that file and infers one vector per document in `docs`.
train(model_path, docs)
pd_doc2vec_features = get_embeddings(model_path, docs)

2021-05-14 13:42:36.710772: Start training Doc2Vec model ...
2021-05-14 13:44:52.935062: Get doc2vec embeddings ...


## Combine features and save

In [12]:
def save_features(filepath, papers, pd_doc2vec_features):
    """Write the paper table and its embedding columns side by side to a CSV.

    Parameters
    ----------
    filepath : str
        Destination CSV path.
    papers : pandas.DataFrame
        Paper table; row i is paired with row i of `pd_doc2vec_features`.
    pd_doc2vec_features : pandas.DataFrame
        Embedding columns, one row per paper.

    Both inputs are left untouched: the positional re-indexing needed for the
    column-wise concat is done on copies rather than in place.
    """
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ": Save features ...")

    # Give both frames a fresh 0..n-1 index so concat pairs rows by position.
    combined = pd.concat(
        [papers.reset_index(drop=True), pd_doc2vec_features.reset_index(drop=True)],
        axis=1,
    )
    # The row index is written as the first, unnamed column (to_csv default).
    combined.to_csv(filepath)

In [13]:
# Write the paper table joined column-wise with the embedding columns to feature_path.
save_features(feature_path, papers, pd_doc2vec_features)

2021-05-14 13:47:39.575263: Save features ...
