# Get word embeddings from training data

- `sacremoses`: Moses tokenizer to tokenize English titles
- `jieba`: Jieba tokenizer to tokenize Traditional Chinese titles
- `fasttext`: Train word embeddings

In [10]:
import io

import pandas as pd
from sacremoses import MosesTokenizer
import jieba
import fasttext
import re

## English training set

Outline of the steps:
- Load data into a dataframe `pd.DataFrame`
- Use Moses tokenizer to tokenize each title into tokens, then combine them again `return_str=True`
- Save these tokenized titles into a `.csv` file for `fasttext` to use later
- Train English word embeddings with `fasttext`

In [2]:
en_df = pd.read_csv('train_en.csv')
en_df

Unnamed: 0,product_title,category
0,Recollections Color Splash Clear Stamps & Stencil,Hobbies & Stationery
1,"soap,lotion scrub set 400",Health & Personal Care
2,Spigen Galaxy S10e Case Tough Armor Gunmetal,Mobile Accessories
3,Acrylic Lanalon Bright Red,Hobbies & Stationery
4,303 FLAT SHEET/Blanket 100% cotton,Home & Living
...,...,...
499995,rocker arm roller racing mio,Motors
499996,Secosana (preloved bag),Women's Bags
499997,jag bag,Women's Bags
499998,Baby wipes 15 sheets (Alcohol and Paraben Free...,Babies & Kids


In [5]:
mtk = MosesTokenizer()
en_titles = en_df['product_title'].apply(mtk.tokenize, return_str=True)
en_titles

0         Recollections Color Splash Clear Stamps &amp; ...
1                               soap , lotion scrub set 400
2              Spigen Galaxy S10e Case Tough Armor Gunmetal
3                                Acrylic Lanalon Bright Red
4                     303 FLAT SHEET / Blanket 100 % cotton
                                ...                        
499995                         rocker arm roller racing mio
499996                            Secosana ( preloved bag )
499997                                              jag bag
499998    Baby wipes 15 sheets ( Alcohol and Paraben Fre...
499999                    PRE-LOVED ORIGINAL GREEN FINO BAG
Name: product_title, Length: 500000, dtype: object

In [6]:
en_titles.to_csv('titles_en.csv', index=False, header=False)

**NOTE**: `fasttext.train_unsupervised` does not accept Python list and will use white space tokenizer by default. Thus, after tokenizing the titles, we have to combine the tokens back, separated by white spaces
- Using `dim=300` to generate embeddings of dimension 300

In [None]:
model_en = fasttext.train_unsupervised('titles_en.csv', dim=300)

In [6]:
model_en.words[:10]

['</s>', '/', '(', ')', 'for', ',', '-', 'with', 'and', '&amp;']

In [7]:
model_en.save_model("en_vect.bin")

## Traditional Chinese training set

Repeat the same steps as the English dataset. Use `jieba` instead

In [3]:
tcn_df = pd.read_csv('train_tcn.csv')
tcn_df

Unnamed: 0,product_title,category
0,Gucci Gucci Guilty Pour Femme Stud Edition 罪愛女...,Health & Beauty
1,（二手）PS4 GTA 5 俠盜獵車手5 Grand Theif Auto V繁體 中文版,Game Kingdom
2,百獸卡,Life & Entertainment
3,nac nac活氧全效柔衣素,Mother & Baby
4,#Nike耐吉官方F.C. 男子足球長褲新款標準型 拒水 拉鏈褲腳\nCD0557,Men's Apparel
...,...,...
499995,Dress,Women's Apparel
499996,Lilian Lin,Food & Beverages
499997,77 抹茶杏仁乳加 77乳加 減甜 大人味 大人的77 宇治抹茶 杏仁 宇治抹茶杏仁 抹茶 ...,Food & Beverages
499998,Panasonic 國際牌 電動 牙刷頭 (EW-DM81 專用刷頭) WEW0974-W,Home Electronic


There is an empty row in the dataset. This will throw an error in `jieba` code later, thus we have to remove it

In [4]:
tcn_df = tcn_df.dropna(subset=['product_title']).reset_index(drop=True)
tcn_df

Unnamed: 0,product_title,category
0,Gucci Gucci Guilty Pour Femme Stud Edition 罪愛女...,Health & Beauty
1,（二手）PS4 GTA 5 俠盜獵車手5 Grand Theif Auto V繁體 中文版,Game Kingdom
2,百獸卡,Life & Entertainment
3,nac nac活氧全效柔衣素,Mother & Baby
4,#Nike耐吉官方F.C. 男子足球長褲新款標準型 拒水 拉鏈褲腳\nCD0557,Men's Apparel
...,...,...
499994,Dress,Women's Apparel
499995,Lilian Lin,Food & Beverages
499996,77 抹茶杏仁乳加 77乳加 減甜 大人味 大人的77 宇治抹茶 杏仁 宇治抹茶杏仁 抹茶 ...,Food & Beverages
499997,Panasonic 國際牌 電動 牙刷頭 (EW-DM81 專用刷頭) WEW0974-W,Home Electronic


In [5]:
tcn_titles = tcn_df['product_title'].apply(jieba.lcut, cut_all=True)
tcn_titles

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\Luca\AppData\Local\Temp\jieba.cache
Loading model cost 1.488 seconds.
Prefix dict has been built successfully.


0         [Gucci, ,  , , Gucci, ,  , , Guilty, ,  , , Po...
1         [（, 二手, ）, PS4, ,  , , GTA, ,  , , 5, ,  , , 俠...
2                                                 [百, 獸, 卡]
3                    [nac, ,  , , nac, 活, 氧, 全, 效, 柔, 衣, 素]
4         [#, Nike, 耐, 吉, 官方, F, ., C, ., ,  , , 男子, 足球,...
                                ...                        
499994                                              [Dress]
499995                                 [Lilian, ,  , , Lin]
499996    [77, ,  , , 抹, 茶, 杏仁, 乳, 加, ,  , , 77, 乳, 加, ,...
499997    [Panasonic, ,  , , 國, 際, 牌, ,  , ,  , , 電, 動, ...
499998    [正品, ,  , , 新款, 紅, 眼, ,  , , 戰, 神, 哈, 奴, 曼, 手,...
Name: product_title, Length: 499999, dtype: object

In [7]:
tcn_df.product_title[0].split(' ')

['Gucci',
 'Gucci',
 'Guilty',
 'Pour',
 'Femme',
 'Stud',
 'Edition',
 '罪愛女性淡香水限量版',
 '50ml',
 'T']

In [9]:
tcn_titles = tcn_titles.str.join(" ")
tcn_titles

0         Gucci     Gucci     Guilty     Pour     Femme ...
1         （ 二手 ） PS4     GTA     5     俠 盜 獵 車 手 5     G...
2                                                     百 獸 卡
3                                 nac     nac 活 氧 全 效 柔 衣 素
4         # Nike 耐 吉 官方 F . C .     男子 足球 長 褲 新款 標 準 型  ...
                                ...                        
499994                                                Dress
499995                                       Lilian     Lin
499996    77     抹 茶 杏仁 乳 加     77 乳 加     減 甜     大人 人味...
499997    Panasonic     國 際 牌        電 動     牙刷 頭       ...
499998    正品     新款 紅 眼     戰 神 哈 奴 曼 手 鐲     哈 魯 曼 手 環 ...
Name: product_title, Length: 499999, dtype: object

In [12]:
tcn_titles.to_csv('titles_tcn.csv', index=False, header=False)

- Use `minn=1` because each Chinese character is a word, so this will capture each Chinese character in a word also.

In [13]:
model_tcn = fasttext.train_unsupervised('titles_tcn.csv', minn=1, dim=300)

In [14]:
model_tcn.words[:10]

['</s>', '-', '/', '【', '】', '貨', '包', '現', '裝', '機']

In [15]:
model_tcn.save_model('tcn_vect.bin')

## Check the result

Take the 100th words from the Traditional Chinese model and print out its word embedding (a vector)

In [16]:
test = model_tcn.words[100]
print(test)
model_en.get_word_vector(test)

日本


array([-1.21177849e-03, -1.51913933e-04,  1.02480676e-03, -2.42322967e-05,
        1.14398473e-03,  7.73033767e-04, -7.90226215e-04,  5.68792282e-04,
       -1.62327942e-03,  2.55770661e-04, -1.32358936e-03, -2.42995244e-04,
       -3.88128217e-04,  1.44166313e-03, -3.36985744e-04,  3.68652138e-04,
        7.94189051e-04, -1.95515552e-03, -8.36003092e-05, -1.36525184e-03,
       -2.94883903e-05, -1.96821988e-03,  8.68611911e-04,  6.25004875e-04,
        3.23192595e-04, -4.75835608e-04, -7.70022743e-04,  1.42968562e-03,
       -5.34988241e-04,  2.60288798e-04,  3.45395500e-04,  9.54298303e-04,
       -1.35630253e-03,  1.83041993e-04, -3.43519700e-04, -2.60008895e-03,
        6.09521230e-04,  1.64041948e-03, -8.18782020e-04,  9.43286344e-04,
       -1.38947857e-03,  1.31055294e-03,  6.47053123e-04, -1.08342734e-03,
       -1.67794060e-03,  9.05688386e-04, -1.62743781e-05,  2.08744430e-03,
       -1.07317080e-03,  4.62193595e-04,  6.84143219e-04, -2.45806994e-03,
        1.04830181e-03,  

## Visualization with TensorFlow Projector

Export the words and word embeddings to `.tsv` files for visualization in TensorFlow Projector (https://projector.tensorflow.org/)

In [17]:
def export_to_tf_projector(model, name):
    out_v = io.open('{}_vecs.tsv'.format(name), 'w', encoding='utf-8')
    out_m = io.open('{}_meta.tsv'.format(name), 'w', encoding='utf-8')

    for word in model.words:
        vec = model.get_word_vector(word)
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    
    out_v.close()
    out_m.close()

In [18]:
export_to_tf_projector(model_en, 'en')
export_to_tf_projector(model_tcn, 'tcn')

In [70]:
import emoji

ModuleNotFoundError: No module named 'emoji'

In [26]:
tcn_titles.sample(10)

28230                ［ 客 訂 中 ］ 暖 心 艾 浴 包 11 包 ( 月子 擦澡 好 物 )
176936    【 優 惠 多 】    北 歐 實 木 餐桌 輕 奢 家用 小 戶 型 長 方形 吃 飯 ...
348766    懶 人 沙 發 小 戶 型 雙 人 臥 室 客 廳 沙 發 簡 易 布 藝 多功能 功能 可...
313035    泰 國 佛 牌 聖 物 - 虎 頭 魯 士 立 尊 / 智慧     冥想     控 靈 ...
481144    📣 此 商品 48 小 時 內 快速 出 貨 🚀 美 國 Greenies 新 健 綠 》 ...
289644    2018 新 發 売     日立     8L     快速 乾 衣 除 濕 機     ...
333498                          < 二手 > 皮 膚 測 試 奈 米 噴 霧 補 水器
197837                                          緞 面 綁 帶 短 褲
170633                           可 拆 式 衣 領 （ 蕾 絲 珠珠 優 雅 款 ）
125279    ★ king 車 貼 ★\ n 反光 三角 三角形 角形 貼 紙 安全 警示 貼 汽 車 貼...
Name: product_title, dtype: object

- emojis
- escape characters `\n`
- uppercase and lowercase
- number is not important?
- punctuations can be replaced? `/` can be replaced with space `' '`
- remove all types of backets `() {} <> "" ''` (maybe replace with space first)
- multiple spaces replaced with single space

tokenize first, then remove characters
- only retrain words (.isalpha())
- remove emojis

In [124]:
'、'.isalpha()

False

In [None]:
def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

def escape_ansi(line):
    ansi_escape =re.compile(r'(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]')
    return ansi_escape.sub('', line)

# use text.replace('\n', '') to remove \n

# use ' '.join(mystring.split()) to substitle multiple whitespaces with single whitespace