# Train

1. **Requires gensim==3.6.0** (the training cells use the pre-4.0 `Word2Vec` API and do not run on newer releases)
2. final_perfume_data_clean.csv
3. GoogleNews-vectors-negative300.bin
4. ~~perfume_w2v~~ (pretrained model)

In [29]:
import pandas as pd

# Load the cleaned perfume catalogue. The file is '#'-delimited
# (presumably because the description and notes fields contain commas).
data = pd.read_csv(
    '../src/database/final_perfume_data_clean.csv',
    sep='#',
)
data.head()

Unnamed: 0,Name,Brand,Description,Notes,Image URL
0,Tihota Eau de Parfum,Indult,"Rapa Nui for sugar, Tihota is, quite simply, ...","Vanilla bean, musks",https://static.luckyscent.com/images/products/...
1,Sola Parfum,Di Ser,A tribute to the expanse of space extending f...,"Lavender, Yuzu, Lemongrass, Magnolia, Geraniu...",https://static.luckyscent.com/images/products/...
2,Kagiroi Parfum,Di Ser,An aromatic ode to the ancient beauty of Japa...,"Green yuzu, green shikuwasa, sansho seed, cor...",https://static.luckyscent.com/images/products/...
3,Velvet Fantasy Eau de Parfum,Montale,Velvet Fantasy is a solar fragrance where cit...,"tangerine, pink pepper, black coffee, leat...",https://static.luckyscent.com/images/products/...
4,A Blvd. Called Sunset Eau de Parfum,A Lab on Fire,There's no way A Lab On Fire could relocate t...,"Bergamot, almond, violet, jasmine, leather, s...",https://static.luckyscent.com/images/products/...
...,...,...,...,...,...
2106,Perfect Veil Eau de Parfum,Sarah Horowitz Parfums,"This was created to smell like clean, naked s...","top: lemon, bergamot base: musk, vanilla, san...",https://static.luckyscent.com/images/products/...
2107,Scent Eau de Parfum,Costume National,Scent is Costume Nationals debut fragrance. W...,"amber, jasmine tea, mother of pearl hibiscus ...",https://static.luckyscent.com/images/products/...
2108,Bronze Eau de Parfum,Nanadebary,"Bronze is a naughty little vixen of a scent, ...","bergamot, mandarine, cinnamon, jasmine, iris,...",https://static.luckyscent.com/images/products/...
2109,Monyette Paris Fragrance Oil,Monyette Paris,"This isn't just sexy, it's big night out sexy...","Tahitian gardenia, French muguet du bois, hin...",https://static.luckyscent.com/images/products/...


In [27]:
# preprocessing
#
# Build a normalised text column from the raw descriptions:
#   1. replace every character that is not an ASCII letter or apostrophe with a space
#   2. drop "'<letter>" suffixes except "'t", so contractions such as "isn't"
#      survive while "there's" becomes "there" (longer suffixes like "'ll"
#      leave a stray single letter behind)
#   3. collapse runs of spaces
#   4. lowercase
#
# regex=True is passed explicitly: the patterns are regular expressions, and
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently leave the text uncleaned.
data['clean_docs'] = (
    data['Description']
    .str.replace("[^a-zA-Z']", ' ', regex=True)
    .str.replace("'[a-su-zA-SU-Z]", ' ', regex=True)
    .str.replace(' +', ' ', regex=True)
    .str.lower()
)

data['clean_docs']

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


0        rapa nui for sugar tihota is quite simply the...
1        a tribute to the expanse of space extending f...
2        an aromatic ode to the ancient beauty of japa...
3        velvet fantasy is a solar fragrance where cit...
4        there no way a lab on fire could relocate to ...
                              ...                        
2106     this was created to smell like clean naked sk...
2107     scent is costume nationals debut fragrance wh...
2108     bronze is a naughty little vixen of a scent s...
2109     this isn't just sexy it big night out sexy it...
2110     named for the year the frapin family establis...
Name: clean_docs, Length: 2111, dtype: object

In [11]:
import nltk
from nltk.corpus import stopwords

# Load NLTK's English stop-word list, downloading the required resources on
# first use. NLTK raises LookupError when a corpus is not installed locally;
# catching only that keeps unrelated failures (and KeyboardInterrupt) visible
# instead of masking them behind a download attempt.
try:
    stopword = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    nltk.download('punkt')
    stopword = stopwords.words('english')

In [12]:
import re

# Collect the negated contractions ("don't", "isn't", ...) present in the
# stop-word list. The pattern is a raw string so that "\w" is not parsed as an
# (invalid) string escape sequence.
x = ' '.join(stopword)
nt = re.findall(r"\w+'t", x)
print(nt)

["don't", "aren't", "couldn't", "didn't", "doesn't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't", "needn't", "shan't", "shouldn't", "wasn't", "weren't", "won't", "wouldn't"]


In [13]:
# Remove the negation/emphasis words and the "...n't" contractions from the
# stop-word list so that they are kept in the tokenised documents.
stopword = list(
    set(stopword).difference({'no', 'nor', 'not', 'only', 'too'}, nt)
)

In [14]:
# Imported from the public Keras namespace; `tensorflow.python.keras` is a
# private legacy path that is not available in newer TensorFlow releases.
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# Set membership is O(1); the stop-word list would be scanned for every token.
stopword_set = set(stopword)

# Tokenise each cleaned description, dropping stop words and single-character
# tokens. NOTE: this overwrites 'clean_docs' (string -> list of tokens), so the
# cell is meant to run exactly once after the preprocessing cell.
data['clean_docs'] = data['clean_docs'].apply(
    lambda doc: [tok for tok in text_to_word_sequence(doc)
                 if len(tok) > 1 and tok not in stopword_set]
)

data['clean_docs']

0       [rapa, nui, sugar, tihota, quite, simply, one,...
1       [tribute, expanse, space, extending, sky, flow...
2       [aromatic, ode, ancient, beauty, japan, kagiro...
3       [velvet, fantasy, solar, fragrance, citrus, ve...
4       [no, way, lab, fire, could, relocate, los, ang...
                              ...                        
2106    [created, smell, like, clean, naked, skin, hon...
2107    [scent, costume, nationals, debut, fragrance, ...
2108    [bronze, naughty, little, vixen, scent, slinki...
2109    [isn't, sexy, big, night, sexy, reel, sexy, am...
2110    [named, year, frapin, family, established, cog...
Name: clean_docs, Length: 2111, dtype: object

In [None]:
# training
import os

from gensim.models import Word2Vec

# One token list per perfume description.
corpus = data['clean_docs'].tolist()

# The code below targets the gensim 3.6.0 API (e.g. the `size` argument) and
# does not work on later gensim versions.
#
# `workers` must be a positive thread count. With workers=-1 gensim starts no
# worker threads at all, so train() returns immediately and the vectors are
# never updated.
w2v_model = Word2Vec(size=300, window=5, min_count=2, workers=os.cpu_count() or 1)
w2v_model.build_vocab(corpus)

# Initialise the vectors of in-vocabulary words from the pretrained GoogleNews
# embeddings; lockf=1.0 leaves the imported vectors trainable.
# NOTE(review): this path starts with '../../src' while the CSV is read from
# '../src', and the header lists the '.bin' file rather than '.bin.gz' --
# confirm which location/file is correct.
w2v_model.intersect_word2vec_format('../../src/pretrained_models/GoogleNews-vectors-negative300.bin.gz',
                                    lockf=1.0, binary=True)

In [None]:
# Train on the perfume descriptions for 15 epochs; the example count comes
# from the earlier build_vocab() pass over the same corpus.
w2v_model.train(
    corpus,
    total_examples=w2v_model.corpus_count,
    epochs=15,
)

In [None]:
# Persist the trained model as 'perfume_w2v' (relative path, so it is written
# to the notebook's current working directory).
w2v_model.save('perfume_w2v')