# Get word embeddings from training data

- `sacremoses`: Moses tokenizer to tokenize English titles
- `jieba`: Jieba tokenizer to tokenize Traditional Chinese titles
- `fasttext`: Train word embeddings

In [17]:
import io

import pandas as pd
from sacremoses import MosesTokenizer
import jieba
import fasttext

## English training set

Outline of the steps:
- Load data into a dataframe `pd.DataFrame`
- Use Moses tokenizer to tokenize each title into tokens, then combine them again `return_str=True`
- Save these tokenized titles into a `.csv` file for `fasttext` to use later
- Train English word embeddings with `fasttext`

In [2]:
en_df = pd.read_csv('train_en.csv')
en_df

Unnamed: 0,product_title,category
0,Recollections Color Splash Clear Stamps & Stencil,Hobbies & Stationery
1,"soap,lotion scrub set 400",Health & Personal Care
2,Spigen Galaxy S10e Case Tough Armor Gunmetal,Mobile Accessories
3,Acrylic Lanalon Bright Red,Hobbies & Stationery
4,303 FLAT SHEET/Blanket 100% cotton,Home & Living
...,...,...
499995,rocker arm roller racing mio,Motors
499996,Secosana (preloved bag),Women's Bags
499997,jag bag,Women's Bags
499998,Baby wipes 15 sheets (Alcohol and Paraben Free...,Babies & Kids


In [4]:
mtk = MosesTokenizer()
en_titles = en_df['product_title'].apply(mtk.tokenize, return_str=True)
en_titles

0         Recollections Color Splash Clear Stamps &amp; ...
1                               soap , lotion scrub set 400
2              Spigen Galaxy S10e Case Tough Armor Gunmetal
3                                Acrylic Lanalon Bright Red
4                     303 FLAT SHEET / Blanket 100 % cotton
                                ...                        
499995                         rocker arm roller racing mio
499996                            Secosana ( preloved bag )
499997                                              jag bag
499998    Baby wipes 15 sheets ( Alcohol and Paraben Fre...
499999                    PRE-LOVED ORIGINAL GREEN FINO BAG
Name: product_title, Length: 500000, dtype: object

In [5]:
en_titles.to_csv('titles_en.csv', index=False, header=False)

**NOTE**: `fasttext.train_unsupervised` does not accept Python list and will use white space tokenizer by default. Thus, after tokenizing the titles, we have to combine the tokens back, separated by white spaces

In [6]:
model_en = fasttext.train_unsupervised('titles_en.csv')

In [7]:
model_en.words[:10]

['</s>', '/', '(', ')', 'for', ',', '-', 'with', 'and', '&amp;']

In [8]:
model_en.save_model("en_vect.bin")

## Traditional Chinese training set

Repeat the same steps as the English dataset. Use `jieba` instead

In [9]:
tcn_df = pd.read_csv('train_tcn.csv')
tcn_df

Unnamed: 0,product_title,category
0,Gucci Gucci Guilty Pour Femme Stud Edition 罪愛女...,Health & Beauty
1,（二手）PS4 GTA 5 俠盜獵車手5 Grand Theif Auto V繁體 中文版,Game Kingdom
2,百獸卡,Life & Entertainment
3,nac nac活氧全效柔衣素,Mother & Baby
4,#Nike耐吉官方F.C. 男子足球長褲新款標準型 拒水 拉鏈褲腳\nCD0557,Men's Apparel
...,...,...
499995,Dress,Women's Apparel
499996,Lilian Lin,Food & Beverages
499997,77 抹茶杏仁乳加 77乳加 減甜 大人味 大人的77 宇治抹茶 杏仁 宇治抹茶杏仁 抹茶 ...,Food & Beverages
499998,Panasonic 國際牌 電動 牙刷頭 (EW-DM81 專用刷頭) WEW0974-W,Home Electronic


There is an empty row in the dataset. This will throw an error in `jieba` code later, thus we have to remove it

In [10]:
tcn_df = tcn_df.dropna(subset=['product_title']).reset_index()
tcn_df

Unnamed: 0,index,product_title,category
0,0,Gucci Gucci Guilty Pour Femme Stud Edition 罪愛女...,Health & Beauty
1,1,（二手）PS4 GTA 5 俠盜獵車手5 Grand Theif Auto V繁體 中文版,Game Kingdom
2,2,百獸卡,Life & Entertainment
3,3,nac nac活氧全效柔衣素,Mother & Baby
4,4,#Nike耐吉官方F.C. 男子足球長褲新款標準型 拒水 拉鏈褲腳\nCD0557,Men's Apparel
...,...,...,...
499994,499995,Dress,Women's Apparel
499995,499996,Lilian Lin,Food & Beverages
499996,499997,77 抹茶杏仁乳加 77乳加 減甜 大人味 大人的77 宇治抹茶 杏仁 宇治抹茶杏仁 抹茶 ...,Food & Beverages
499997,499998,Panasonic 國際牌 電動 牙刷頭 (EW-DM81 專用刷頭) WEW0974-W,Home Electronic


In [11]:
tcn_titles = tcn_df['product_title'].apply(jieba.lcut)
tcn_titles

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Thien\AppData\Local\Temp\jieba.cache
Loading model cost 0.760 seconds.
Prefix dict has been built successfully.


0         [Gucci,  , Gucci,  , Guilty,  , Pour,  , Femme...
1         [（, 二手, ）, PS4,  , GTA,  , 5,  , 俠盜, 獵車, 手, 5,...
2                                                   [百獸, 卡]
3                                [nac,  , nac, 活氧全, 效柔, 衣素]
4         [#, Nike, 耐吉, 官方, F, ., C, .,  , 男子, 足球, 長, 褲,...
                                ...                        
499994                                              [Dress]
499995                                     [Lilian,  , Lin]
499996    [77,  , 抹, 茶, 杏仁, 乳加,  , 77, 乳加,  , 減甜,  , 大人,...
499997    [Panasonic,  , 國際牌,  ,  , 電動,  , 牙刷, 頭,  ,  , ...
499998    [正品,  , 新款, 紅眼,  , 戰神哈, 奴曼, 手鐲,  , 哈魯曼, 手環,  ,...
Name: product_title, Length: 499999, dtype: object

In [12]:
tcn_titles = tcn_titles.str.join(" ")
tcn_titles

0         Gucci   Gucci   Guilty   Pour   Femme   Stud  ...
1         （ 二手 ） PS4   GTA   5   俠盜 獵車 手 5   Grand   The...
2                                                      百獸 卡
3                                       nac   nac 活氧全 效柔 衣素
4         # Nike 耐吉 官方 F . C .   男子 足球 長 褲 新款 標準 型   拒水 ...
                                ...                        
499994                                                Dress
499995                                         Lilian   Lin
499996    77   抹 茶 杏仁 乳加   77 乳加   減甜   大人 味   大人 的 77  ...
499997    Panasonic   國際牌     電動   牙刷 頭     ( EW - DM81 ...
499998    正品   新款 紅眼   戰神哈 奴曼 手鐲   哈魯曼 手環   戰神哈 努曼   龍波 ...
Name: product_title, Length: 499999, dtype: object

In [13]:
tcn_titles.to_csv('titles_tcn.csv', index=False, header=False)

In [14]:
model_tcn = fasttext.train_unsupervised('titles_tcn.csv')

In [15]:
model_tcn.words[:10]

['</s>', '-', '】', '【', '/', '現貨', '(', ')', '新款', '包']

In [16]:
model_tcn.save_model('tcn_vect.bin')

## Check the result

Take the 100th words from the Traditional Chinese model and print out its word embedding (a vector)

In [26]:
test = model_tcn.words[100]
print(test)
model_en.get_word_vector(test)

袖


array([ 0.00553463,  0.0036858 ,  0.00300124,  0.0094137 , -0.00154002,
        0.00702213,  0.00068921,  0.00608398, -0.00794067, -0.00300166,
        0.00525145,  0.00560152,  0.00236384,  0.00936521, -0.00615536,
        0.00224061, -0.00104833,  0.0045906 , -0.00735329, -0.00687921,
        0.0088753 , -0.00404528, -0.00283604, -0.00886293,  0.0066917 ,
       -0.00832281, -0.00899456,  0.00772121, -0.00037216,  0.00466333,
       -0.00831519,  0.00913491,  0.00076379, -0.00256103, -0.00429249,
       -0.00187946, -0.00889406,  0.00978339, -0.00472016,  0.00415304,
        0.00806945,  0.00885123,  0.00335726,  0.00842476, -0.00500071,
       -0.00867984, -0.00670121,  0.00719426,  0.00156702,  0.00123291,
       -0.00028147, -0.00549498, -0.00742083, -0.00077295, -0.00895753,
       -0.00676985,  0.00911156, -0.00085633,  0.00904469,  0.00869599,
        0.00490582,  0.00579019,  0.00533494,  0.00788952,  0.0034216 ,
        0.00702486, -0.00135054, -0.00152523, -0.00382493,  0.00

## Visualization with TensorFlow Projector

Export the words and word embeddings to `.tsv` files for visualization in TensorFlow Projector (https://projector.tensorflow.org/)

In [28]:
def export_to_tf_projector(model, name):
    out_v = io.open('{}_vecs.tsv'.format(name), 'w', encoding='utf-8')
    out_m = io.open('{}_meta.tsv'.format(name), 'w', encoding='utf-8')

    for word in model.words:
        vec = model.get_word_vector(word)
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    
    out_v.close()
    out_m.close()

In [None]:
export_to_tf_projector(model_en, 'en')
export_to_tf_projector(model_tcn, 'tcn')