# Word Embedding

In [1]:
!pip install gensim

Looking in indexes: http://mirrors.tencentyun.com/pypi/simple
Collecting gensim
  Downloading http://mirrors.tencentyun.com/pypi/packages/2b/e0/fa6326251692056dc880a64eb22117e03269906ba55a6864864d24ec8b4e/gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 11.6 MB/s eta 0:00:01
[?25hCollecting smart-open>=1.8.1
  Downloading http://mirrors.tencentyun.com/pypi/packages/0b/8e/464b06f5efd26f2dc16ce7bd1662c2f31cadf9104fdbcbf5994674cc3a51/smart_open-2.1.0.tar.gz (116 kB)
[K     |████████████████████████████████| 116 kB 74.9 MB/s eta 0:00:01
Collecting boto
  Downloading http://mirrors.tencentyun.com/pypi/packages/23/10/c0b78c27298029e4454a472a1919bde20cb182dab1662cec7f2ca1dcc523/boto-2.49.0-py2.py3-none-any.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 17.7 MB/s eta 0:00:01
[?25hCollecting boto3
  Downloading http://mirrors.tencentyun.com/pypi/packages/c9/a5/e06492d12da34135728559aa18ba6bc841a82cea5a5b3bcbb643ea2dbe0

In [1]:
import os
import random
import time
import warnings
warnings.filterwarnings('ignore')
import gc
import pandas as pd
import numpy as np
from tqdm import tqdm
import gensim
from gensim.models.callbacks import CallbackAny2Vec
# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(2020)
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so assigning it here
# probably does not change string hashing inside this already-running kernel (it would only
# be inherited by child processes) — confirm whether that is the intent.
os.environ['PYTHONHASHSEED'] = '0'

In [2]:
# Output directories, one per embedding family.
save_path_word2vec = './embedding/word2vec'
save_path_glove    = './embedding/glove'
save_path_fasttext = './embedding/fasttext'
for path in [save_path_word2vec, save_path_glove, save_path_fasttext]:
    # exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
    # of testing os.path.exists() before calling makedirs().
    os.makedirs(path, exist_ok=True)

In [3]:
import logging
# Write log records to a file inside the word2vec output directory (created in the cell above).
# The threshold is CRITICAL, so only records emitted at that level reach the file; the
# per-epoch messages of EpochLogger are logged with logging.critical for that reason.
logging.basicConfig(filename='./embedding/word2vec/train.log', format='%(asctime)s:%(message)s', level=logging.CRITICAL)

In [4]:
# Load the preprocessed dataset. The training cell below reads one column per ID type
# (creative_id, ad_id, ...) and calls .astype(str) on every row value, so each cell is
# presumably an array of integer ids — TODO confirm against the preprocessing notebook.
# NOTE: unpickling can execute arbitrary code; only load files produced by a trusted pipeline.
df = pd.read_pickle('./processed_data/processed_data_numerical.pkl')

# Word2Vec

In [5]:
class EpochLogger(CallbackAny2Vec):
    """Training callback that reports the loss once per finished epoch.

    Every report is printed and also sent to the ``logging`` module at CRITICAL
    level. After reporting, the model's running loss counter is reset so that the
    next report covers a single epoch instead of the cumulative total.

    A "save the vectors to ``path`` whenever the loss improves" checkpoint used to
    live in ``on_epoch_end`` but is disabled; ``path`` and ``best_loss`` are kept
    as attributes only.
    """

    def __init__(self, name, path):
        # name:      label placed at the front of every report
        # path:      vector output file; stored but not used while checkpointing is disabled
        # best_loss: lowest loss seen so far; never updated while checkpointing is disabled
        # epoch:     zero-based index of the epoch about to be reported
        self.name = name
        self.path = path
        self.best_loss = None
        self.epoch = 0

    def on_epoch_end(self, model):
        """Report the loss of the epoch that just finished and reset the counter."""
        epoch_loss = float(model.get_latest_training_loss())
        report = "[{}] Epoch #{} {:.2f}".format(self.name, self.epoch, epoch_loss)
        print(report)
        logging.critical(report)
        # word2vec accumulates the loss across epochs by default, which overflows;
        # zero it so each epoch starts from scratch.
        # NOTE(review): the output recorded below shows the same value (134217728.00)
        # for every epoch, so the counter appears to saturate within one epoch
        # despite this reset — confirm before relying on the reported numbers.
        model.running_training_loss = 0.0
        self.epoch += 1

In [None]:
# Train one skip-gram Word2Vec model (sg=1, negative sampling, window 20, 300 dimensions)
# per ID column and export its vectors in word2vec text format.
# Each entry is (column name, number of training epochs).
w2v_epoch_plan = [
    ('creative_id', 80),
    ('ad_id', 80),
    ('product_id', 20),
    ('product_category', 20),
    ('advertiser_id', 20),
    ('industry', 20),
]
for name, epochs in w2v_epoch_plan:
    path = os.path.join(save_path_word2vec, '{}_word2vec_sg1_hs0_win20_mc1_size300.txt'.format(name))
    # Convert every row's ids to strings so they serve as the tokens of one "sentence".
    input_docs = [list(seq.astype(str)) for seq in df[name]]
    # `size` and `iter` are the gensim 3.x keyword names (the install log above shows 3.8.3).
    # NOTE(review): with workers=32 the run is presumably not bit-for-bit reproducible even
    # though a seed is passed — confirm against the gensim documentation if that matters.
    w2v = gensim.models.Word2Vec(
        input_docs,
        size=300,
        sg=1,
        hs=0,
        alpha=0.025,
        min_alpha=0,
        window=20,
        seed=2020,
        workers=32,
        min_count=1,
        iter=epochs,
        compute_loss=True,
        callbacks=[EpochLogger(name, path)],
    )
    w2v.wv.save_word2vec_format(path)
    # Release the corpus and the model before the next (possibly larger) column.
    del input_docs, w2v
    gc.collect()

# The six get_word_embedding(...) calls that used to follow this loop were removed:
# get_word_embedding is only defined in a later cell, so they raised NameError on a fresh
# "Restart & Run All", and the identical loads are performed by the later cell that also
# writes the .npz archive.

[creative_id] Epoch #0 134217728.00
[creative_id] Epoch #1 134217728.00
[creative_id] Epoch #2 134217728.00
[creative_id] Epoch #3 134217728.00
[creative_id] Epoch #4 134217728.00
[creative_id] Epoch #5 134217728.00
[creative_id] Epoch #6 134217728.00
[creative_id] Epoch #7 134217728.00


In [14]:
# Show which embedding files are currently present in the word2vec output directory.
os.listdir('./embedding/word2vec')

['product_category_word2vec_sg1_hs0_win100_mc1_size128.txt',
 'creative_id_word2vec_sg1_hs0_win20_mc1_size300.txt',
 'product_category_word2vec_sg1_hs0_win20_mc1_size300.txt',
 'creative_id_word2vec_sg1_hs0_win100_mc1_size128.txt',
 'advertiser_id_word2vec_sg1_hs0_win20_mc1_size300.txt',
 'train.log',
 'product_id_word2vec_sg1_hs0_win20_mc1_size300.txt',
 '.ipynb_checkpoints',
 'ad_id_word2vec_sg1_hs0_win10_mc1_size512.txt',
 'product_category_word2vec_sg1_hs0_win10_mc1_size512.txt',
 'ad_id_word2vec_sg1_hs0_win100_mc1_size128.txt',
 'advertiser_id_word2vec_sg1_hs0_win10_mc1_size512.txt',
 'product_id_word2vec_sg1_hs0_win100_mc1_size128.txt',
 'embedding_w2v_sg1_hs0_win100_size128.npz',
 'embedding_w2v_sg1_hs0_win10_size512.npz',
 'creative_id_word2vec_sg1_hs0_win10_mc1_size512.txt',
 'industry_word2vec_sg1_hs0_win100_mc1_size128.txt',
 'industry_word2vec_sg1_hs0_win20_mc1_size300.txt',
 'advertiser_id_word2vec_sg1_hs0_win100_mc1_size128.txt',
 'industry_word2vec_sg1_hs0_win10_mc1_size

In [None]:
def get_word_embedding(embed_path, vocab_size, glove=False):
    """Load a text-format embedding file into a dense ``(vocab_size, embed_size)`` matrix.

    The first line of the file must contain two integers, ``<word count> <vector size>``.
    Every following line has the form ``<token> <v1> <v2> ...``; the token is parsed as
    an integer and used directly as the row index of the result. Rows whose id never
    appears in the file stay all-zero. Lines whose token is ``<unk>`` and blank lines
    are skipped.

    Args:
        embed_path: Path of the UTF-8 text file to read.
        vocab_size: Number of rows to allocate; must be larger than every id in the file.
        glove: When true, the header's vector size is doubled and its word count is
            reduced by one (the count is only used for the progress bar). The original
            note says a GloVe row is a context vector concatenated with a bias vector —
            TODO confirm against the file actually produced by the GloVe tooling.

    Returns:
        A float64 ``numpy.ndarray`` of shape ``(vocab_size, embed_size)``.

    Raises:
        IndexError: If a token id is outside ``[-vocab_size, vocab_size)``.
        ValueError: If a token is not an integer, a value is not a number, or a line
            holds fewer values than the expected vector size.
    """
    # Stream the file line by line instead of reading it whole, to keep memory use low.
    with open(embed_path, encoding='utf8') as f:
        header = next(f).split()
        word_num, embed_size = int(header[0]), int(header[1])
        if glove:
            word_num -= 1
            embed_size = 2 * embed_size
        embedding_matrix = np.zeros((vocab_size, embed_size))
        for line in tqdm(f, total=word_num):
            fields = line.split()
            if not fields or fields[0] == '<unk>':
                continue
            # np.float64 replaces the removed `np.float` alias (same dtype).
            embedding_matrix[int(fields[0]), :] = np.array(fields[1:embed_size + 1]).astype(np.float64)
    return embedding_matrix

In [11]:
# Build one dense matrix per ID column from the exported word2vec text files, then cache all
# six matrices in a single .npz archive so later runs can load them directly.
embedding_path = './embedding/word2vec'

# Hyper-parameters that appear in the file names of the variant being loaded. Earlier runs of
# this cell used (window=10, size=300) and (window=10, size=128); change these two values to
# rebuild the archive for another variant.
w2v_window, w2v_size = 20, 300
w2v_txt_template = '{}_word2vec_sg1_hs0_win' + str(w2v_window) + '_mc1_size' + str(w2v_size) + '.txt'

# (column name used in the .txt file name, number of rows to allocate).
# The progress output recorded for this cell shows one vector fewer per file than rows
# allocated (e.g. 4445720 vs 4445721) — presumably one id is reserved; TODO confirm.
w2v_tables = [
    ('creative_id', 4445721),
    ('ad_id', 3812203),
    ('advertiser_id', 62966),
    ('product_id', 44316),
    ('industry', 337),
    ('product_category', 19),
]

creative_w2v, ad_w2v, advertiser_w2v, product_w2v, industry_w2v, product_cate_w2v = [
    get_word_embedding(
        embed_path=os.path.join(embedding_path, w2v_txt_template.format(column)),
        vocab_size=rows,
        glove=False,
    )
    for column, rows in w2v_tables
]

for matrix in (creative_w2v, ad_w2v, advertiser_w2v, product_w2v, industry_w2v, product_cate_w2v):
    print(matrix.shape)

# Save the embeddings (down-cast to float16) so they can be loaded directly next time.
np.savez(
    os.path.join(embedding_path, 'embedding_w2v_sg1_hs0_win{}_size{}'.format(w2v_window, w2v_size)),
    creative_w2v=creative_w2v.astype(np.float16),
    ad_w2v=ad_w2v.astype(np.float16),
    advertiser_w2v=advertiser_w2v.astype(np.float16),
    product_w2v=product_w2v.astype(np.float16),
    industry_w2v=industry_w2v.astype(np.float16),
    product_cate_w2v=product_cate_w2v.astype(np.float16),
)

100%|██████████| 4445720/4445720 [15:53<00:00, 4662.31it/s]
100%|██████████| 3812202/3812202 [13:42<00:00, 4636.61it/s]
100%|██████████| 62965/62965 [00:13<00:00, 4691.80it/s]
100%|██████████| 44315/44315 [00:09<00:00, 4674.42it/s]
100%|██████████| 336/336 [00:00<00:00, 4759.77it/s]
100%|██████████| 18/18 [00:00<00:00, 4094.45it/s]


(4445721, 300)
(3812203, 300)
(62966, 300)
(44316, 300)
(337, 300)
(19, 300)
