In [346]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import word2vec
import re
import json
from elasticsearch import Elasticsearch

In [347]:
es = Elasticsearch(hosts=['163.172.173.89:53837'])

In [348]:
products = pd.read_csv('data/products.csv', index_col='id')
nutrients = pd.read_csv('data/nutrients.csv', index_col='id')
product_nutrients = pd.read_csv('data/product_nutrients.csv', index_col=['product_id', 'nutrient_id'])
categories = pd.read_csv('data/categories.csv', index_col='id')
categorisations = pd.read_csv('data/categorisations.csv', index_col=['category_id', 'categorisable_id'])
images = pd.read_csv('data/images.csv', index_col='id')

In [349]:
def parse_fr(value):
    if pd.isnull(value):
        return np.nan
    langs = {lang: name for lang, name in re.findall(r'"([^"]+)" ?=> ?"([^"]+)"', value)}
    return langs.get('fr', np.nan)

In [350]:
def join(i1, k1, i2, k2):
    return i1.reset_index().join(i2.reset_index().set_index(k2), on=k1, lsuffix='_')

In [351]:
products['fr'] = products.name_translations.apply(parse_fr)
products.fr.count()

2028

In [352]:
nutrients['fr'] = nutrients.name_translations.apply(parse_fr)
nutrients.fr.count()

40

In [353]:
fld_products = ['fr', 'barcode', 'unit', 'quantity', 'portion_quantity', 'alcool_by_volume']
sel_products = products[pd.notnull(products.fr)][fld_products].fillna(value=-1)
sel_products.head()

Unnamed: 0_level_0,fr,barcode,unit,quantity,portion_quantity,alcool_by_volume
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1192,Saint-Omer,3162330052641,ml,500.0,-1.0,5.0
1194,Heineken,7610055020750,g,500.0,-1.0,5.0
1125,granini : orange-mangue,7610235004884,ml,1000.0,-1.0,0.0
1168,AARBERG: Sucre en morceaux,7610278041105,g,1000.0,-1.0,0.0
1153,"Belle France : Lentilles, cuisinées",3258561211534,g,410.0,200.0,0.0


In [354]:
fld_nutrients = ['product_id', 'fr', 'per_portion', 'per_day', 'per_hundred', 'unit']
sel_nutrients = join(product_nutrients, 'nutrient_id', nutrients, 'id')[fld_nutrients].fillna(value=-1)
sel_nutrients.head()

Unnamed: 0,product_id,fr,per_portion,per_day,per_hundred,unit
0,971,Énergie,117.0,6.0,2430.0,kJ
1,971,Énergie (kCal),-1.0,-1.0,581.0,kCal
2,971,Protéines,2.0,4.0,10.0,g
3,971,Matières grasses,8.0,2.0,42.0,g
4,971,Glucides,7.0,3.0,38.0,g


In [355]:
config = { 
    'index': {
        '_index': 'kyf',
        '_type': 'default'
    }
}
prods = [config]
for idx, p in sel_products.iterrows():
    obj = p.to_dict()
    obj['nutrients'] = {}
    for _, nut in sel_nutrients[sel_nutrients.product_id == idx].iterrows():
        label = nut['fr']
        del nut['fr']
        obj['nutrients'][label] = nut.to_dict()
    prods.append(obj)

In [356]:
len(prods)

2029

In [357]:
es.bulk(prods)

POST http://163.172.173.89:53837/_bulk [status:400 request:3.780s]


RequestError: TransportError(400, 'illegal_argument_exception', 'Malformed action/metadata line [5], expected a simple value for field [Matières grasses] but found [START_OBJECT]')

In [29]:
data = re.sub('\s+', ' ', re.sub('[:\.\(\)0123456789%,–\?\\\&\']', '', ' '.join(product_fr.values).replace('-', ' ').lower()))

In [35]:
with open('data/names.txt', 'w') as f:
    f.write(data)
    f.close()

In [36]:
word2vec.word2phrase('data/names.txt', 'data/names-phrases.txt', verbose=True)

b'Starting training using file data/names.txt\n'b'\n'b'Vocab size (unigrams + bigrams): 4479\n'b'Words in train file: 9084\n'

In [37]:
word2vec.word2vec('data/names.txt', 'data/names-model.bin', size=100, verbose=True)

b'Starting training using file data/names.txt\n'b'Vocab size: 355\n'b'Words in train file: 6245\n'

In [38]:
word2vec.word2clusters('data/names.txt', 'data/names-clusters.txt', 100, verbose=True)

b'Starting training using file data/names.txt\n'b'Vocab size: 355\n'b'Words in train file: 6245\n'

In [39]:
model = word2vec.load('data/names-model.bin')

In [40]:
clusters = word2vec.load_clusters('data/names-clusters.txt')

In [41]:
model.vocab

array(['</s>', 'de', 'au', 'et', 'à', 'coop', 'naturaplan', 'aux', 'la',
       'avec', 'bio', 'chocolat', 'betty', 'bossi', 'légumes', 'crème',
       'lait', 'prix', 'hipp', 'pommes', 'vanille', 'poulet',
       'qualitéprix', 'alnatura', 'mélange', 'sauce', 'pomme', 'migros',
       'qualité', 'fruits', 'en', 'glace', 'glacée', 'fromage', 'pour',
       'sucre', 'oeufs', 'droetker', 'thé', 'terre', 'riz', 'beurre',
       'petits', 'classic', 'm', 'viande', 'fraise', 'du', 'tomate',
       'nature', 'salade', 'mini', 'mix', 'tomates', 'des', 'pâte', 'pois',
       'nestle', 'carottes', 'müesli', 'belle', 'knorr', 'france', 'pain',
       'barilla', 'pizza', 'suisses', 'cacao', 'citron', 'extra', 'noir',
       'blévita', 'farine', 'boeuf', 'jambon', 'graines', 'jus', 'les',
       'herbes', 'sandwich', 'soup', 'sorbet', 'noisettes', 'miel',
       'garantie', 'non', 'le', 'suisse', 'sans', 'cake', 'amandes',
       'banane', 'huile', 'orange', 'choco', 'tranches', 'ice', 'veau',
   

In [42]:
model.vectors.shape

(355, 100)

In [43]:
indexes, metrics = model.analogy(pos=['snack'], neg=[], n=10)
model.generate_response(indexes, metrics).tolist()

[('thé', 0.24156979215359647),
 ('brut', 0.23303440575248086),
 ('burger', 0.22113394505038694),
 ('budget', 0.2152204918120521),
 ('morceaux', 0.21465580162318393),
 ('cornatur', 0.21333251990601387),
 ('le', 0.20786125768686398),
 ('lhuile', 0.2033170147972994),
 ('barilla', 0.19151669514687542),
 ('blé', 0.18079573320276193)]