In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import re

import sqlite3 as sql
from gensim.corpora import Dictionary
from datasets import load_dataset

tqdm.pandas()

In [2]:
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import pycld2 as cld2

stop_words = set(stopwords.words('english'))
lemmatize = WordNetLemmatizer().lemmatize
tokenizer = RegexpTokenizer(r'\w+')

printable_char_re = re.compile(f'[^{string.printable[:-2]}]')

def remove_nonprintable_chars(text:str):
    return printable_char_re.sub("", text)


def normalize_text_doc(doc:str):
    doc = remove_nonprintable_chars(doc)
    words = [lemmatize(w) for w in tokenizer.tokenize(doc.lower()) if len(w) > 2]
    return [w for w in words if w not in stop_words]


def detect_lang(text):
    is_rel,_,langs = cld2.detect(remove_nonprintable_chars(text))
    if not is_rel:
        return "unk"
    return langs[0][1]

## Read and filter documnets

### Bio

In [4]:
# db_path = '/home/vpetukhov/data_nlp/dbs/all.db'
db_path = '/home/vpetukhov/data_nlp/dbs/bio.db'
db = sql.connect(db_path)

In [5]:
%%time
# train_df = pd.read_sql("SELECT title,abstract FROM Data LIMIT 100000", db)
train_df = pd.read_sql("SELECT id,title,abstract FROM Data", db).set_index('id')
train_df = train_df[train_df.abstract.map(lambda x: len(x.split())) > 20]
train_df.shape

CPU times: user 3min 28s, sys: 1min 22s, total: 4min 50s
Wall time: 13min 6s


(9828896, 2)

In [98]:
train_df['lang'] = [detect_lang(t[:200]) for t in tqdm(train_df.abstract.values)];

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9828896.0), HTML(value='')))




In [100]:
train_df = train_df[train_df['lang'] == 'en']
train_df.shape

(9407480, 3)

In [101]:
train_df['text'] = train_df.title + "\n" + train_df.abstract
# train_df['text_norm'] = ["".join(re.findall(r'[\s\w\d]', t)).lower() for t in tqdm(train_df.text.values)]

In [110]:
with open("cache/bio_tokenized.txt", "w") as f:
    for k,t in tqdm(train_df['text'].items(), total=train_df.shape[0]):
        print(k + "," + " ".join(normalize_text_doc(t)), file=f)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9407480.0), HTML(value='')))




In [None]:
token_data = load_dataset("csv", data_files="./cache/bio_tokenized.txt", column_names=['uid', 'tokens'], streaming=True)['train']

Using custom data configuration default-47076e7fb39e64a5


In [None]:
%time token_dict = Dictionary(map(lambda x: x['tokens'].split(), tqdm(token_data, total=token_data.info.splits['train'].num_examples)))
token_dict.save("./cache/bio_tokens.dict")

  0%|          | 0/9407480 [00:00<?, ?it/s]

CPU times: user 1h 17min 30s, sys: 6min 36s, total: 1h 24min 6s
Wall time: 1h 24min 3s


### All

In [5]:
db_path = '/home/vpetukhov/data_nlp/dbs/all.db'
db = sql.connect(db_path)

In [11]:
cur = db.execute("SELECT id,title,abstract FROM Data")
tokenized_texts = {}
with open("cache/all_tokenized.txt", "w") as f:
    for pid,title,abstract in tqdm(cur):
        if len(abstract.split()) < 20:
            continue

        lang = detect_lang(abstract[:200])
        if lang != 'en':
            continue

        text = title + "\n" + abstract
        # print(pid + "," + " ".join(normalize_text_doc(text)), file=f)
        tokenized_texts[pid] = " ".join(normalize_text_doc(text))

0it [00:00, ?it/s]

In [12]:
# shuffle Series
tokenized_texts = Series(tokenized_texts).sample(frac=1.0, replace=False, random_state=42)

In [14]:
with open("cache/all_tokenized.txt", "w") as f:
    for k,t in tqdm(tokenized_texts.items(), total=tokenized_texts.size):
        print(k + "," + t, file=f)

  0%|          | 0/41908692 [00:00<?, ?it/s]

In [13]:
token_dict = Dictionary(map(str.split, tqdm(tokenized_texts.values)))
token_dict.save("./cache/all_tokens.dict")

  0%|          | 0/41908692 [00:00<?, ?it/s]

## Process tokens

In [3]:
DATA_PART = "all" # bio
token_dict = Dictionary.load(f"./cache/{DATA_PART}_tokens.dict")
len(token_dict)

2005581

In [4]:
doc_freqs = Series(token_dict.dfs)
# doc_freqs = doc_freqs[(doc_freqs / token_dict.num_docs) > 0.0005]
doc_freqs = doc_freqs[(doc_freqs / token_dict.num_docs) > 0.0001]
doc_freqs.size

29434

In [5]:
token_dict.filter_tokens(good_ids=doc_freqs.index)
token_dict.compactify()

## Word2vec

In [6]:
import gensim
from gensim.models import Word2Vec 
from gensim.models import KeyedVectors

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [7]:
WINDOW_SIZE = 1000
# WINDOW_SIZE = 5
model = Word2Vec(vector_size=200, min_count=2, workers=12, window=WINDOW_SIZE)
model.build_vocab_from_freq({token_dict[k]: f for k,f in token_dict.cfs.items()})

INFO - 23:37:53: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=200, alpha=0.025>', 'datetime': '2022-10-10T23:37:53.808479', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.15.71-gentoo-x86_64-x86_64-Intel-R-_Core-TM-_i7-8850H_CPU_@_2.60GHz-with-glibc2.10', 'event': 'created'}
INFO - 23:37:53: Processing provided word frequencies
INFO - 23:37:53: collected 29434 unique word types, with total frequency of 4718217881
INFO - 23:37:53: Creating a fresh vocabulary
INFO - 23:37:53: Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 29434 unique words (100.00% of original 29434, drops 0)', 'datetime': '2022-10-10T23:37:53.954048', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.15.71-gentoo-x86_64-x86_64-Intel-R-_Core-TM-_i7-8850H_CPU_@_2.60GHz-with-glibc2.10', 'event': 'prepare_vocab'}
INFO - 23:37:53: Word2Vec lifecycle event {'msg': 'effectiv

**TODO: make `window=5` and compare it on Bio before processing all**

In [8]:
model.wv.vectors_lockf = np.ones(len(model.wv.vectors))

In [9]:
%time model.wv.intersect_word2vec_format("/home/vpetukhov/data_nlp/embeddings/BioWordVec_PubMed_MIMICIII_d200.vec.bin", binary=True, lockf=1.0)

INFO - 23:38:01: loading projection weights from /home/vpetukhov/data_nlp/embeddings/BioWordVec_PubMed_MIMICIII_d200.vec.bin
INFO - 23:40:22: KeyedVectors lifecycle event {'msg': 'merged 29360 vectors into (29434, 200) matrix from /home/vpetukhov/data_nlp/embeddings/BioWordVec_PubMed_MIMICIII_d200.vec.bin', 'datetime': '2022-10-10T23:40:22.633958', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.15.71-gentoo-x86_64-x86_64-Intel-R-_Core-TM-_i7-8850H_CPU_@_2.60GHz-with-glibc2.10', 'event': 'intersect_word2vec_format'}


CPU times: user 2min 11s, sys: 6.55 s, total: 2min 17s
Wall time: 2min 20s


In [10]:
token_data = load_dataset("csv", data_files=f"./cache/{DATA_PART}_tokenized.txt", column_names=['uid', 'tokens'], streaming=True)['train']



In [11]:
class StringSplitterCorpus:
    def __init__(self, token_data, token_dict):
        self.token_data = token_data
        self.dict_words = set(token_dict.token2id.keys())

    def __iter__(self):
        for ts in self.token_data:
            if pd.isna(ts['tokens']):
                continue

            yield [w for w in ts['tokens'].split() if w in self.dict_words]

In [12]:
token_corpus = StringSplitterCorpus(token_data, token_dict)

In [13]:
model.train(token_corpus, total_examples=token_dict.num_docs, epochs=3, report_delay=60) # epochs=5

INFO - 23:40:23: Word2Vec lifecycle event {'msg': 'training model with 12 workers on 29434 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=1000 shrink_windows=True', 'datetime': '2022-10-10T23:40:23.748487', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.15.71-gentoo-x86_64-x86_64-Intel-R-_Core-TM-_i7-8850H_CPU_@_2.60GHz-with-glibc2.10', 'event': 'train'}
INFO - 23:40:24: EPOCH 0 - PROGRESS: at 0.00% examples, 163008 words/s, in_qsize 23, out_qsize 0
INFO - 23:41:24: EPOCH 0 - PROGRESS: at 0.34% examples, 259087 words/s, in_qsize 23, out_qsize 0
INFO - 23:42:24: EPOCH 0 - PROGRESS: at 0.68% examples, 260752 words/s, in_qsize 21, out_qsize 0
INFO - 23:43:24: EPOCH 0 - PROGRESS: at 1.08% examples, 275579 words/s, in_qsize 23, out_qsize 0
INFO - 23:44:24: EPOCH 0 - PROGRESS: at 1.48% examples, 282855 words/s, in_qsize 23, out_qsize 0
INFO - 23:45:25: EPOCH 0 - PROGRESS: at 1.89% examples, 288971 words/s

(13840728780, 14154653643)

In [22]:
# model.train(train_iter, total_examples=token_dict.num_docs, epochs=3, report_delay=60) # epochs=5

INFO - 00:51:54: Word2Vec lifecycle event {'msg': 'training model with 12 workers on 29434 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=1000 shrink_windows=True', 'datetime': '2022-10-08T00:51:54.414614', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.15.71-gentoo-x86_64-x86_64-with-glibc2.10', 'event': 'train'}
INFO - 00:51:55: EPOCH 0 - PROGRESS: at 0.00% examples, 188793 words/s, in_qsize 22, out_qsize 1
INFO - 00:52:55: EPOCH 0 - PROGRESS: at 0.39% examples, 296946 words/s, in_qsize 23, out_qsize 0
INFO - 00:53:55: EPOCH 0 - PROGRESS: at 0.77% examples, 293849 words/s, in_qsize 23, out_qsize 0
INFO - 00:54:55: EPOCH 0 - PROGRESS: at 1.14% examples, 290546 words/s, in_qsize 23, out_qsize 0
INFO - 00:55:55: EPOCH 0 - PROGRESS: at 1.51% examples, 288523 words/s, in_qsize 24, out_qsize 0
INFO - 00:56:55: EPOCH 0 - PROGRESS: at 1.88% examples, 287159 words/s, in_qsize 23, out_qsize 0
INFO - 00:57:5

KeyboardInterrupt: 

In [14]:
%time model.save(f"/home/vpetukhov/data_nlp/embeddings/w2v_pubmed_200_finetuned_{DATA_PART}_w{WINDOW_SIZE}.model")

INFO - 11:50:21: Word2Vec lifecycle event {'fname_or_handle': '/home/vpetukhov/data_nlp/embeddings/w2v_pubmed_200_finetuned_all_w1000.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-10-11T11:50:21.707258', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.15.71-gentoo-x86_64-x86_64-Intel-R-_Core-TM-_i7-8850H_CPU_@_2.60GHz-with-glibc2.10', 'event': 'saving'}
INFO - 11:50:21: not storing attribute cum_table
INFO - 11:50:21: saved /home/vpetukhov/data_nlp/embeddings/w2v_pubmed_200_finetuned_all_w1000.model


CPU times: user 22.8 ms, sys: 46.6 ms, total: 69.4 ms
Wall time: 73.3 ms
