In [1]:
import json
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import word2vec
from elasticsearch import Elasticsearch

In [2]:
# Elasticsearch endpoint. Set the ES_HOST environment variable ("host:port") to
# point the notebook at another cluster without editing code; the fallback is
# the address this notebook was originally written against.
es = Elasticsearch(hosts=[os.environ.get('ES_HOST', '163.172.173.89:53837')])

In [3]:
# Tables indexed by their single `id` column.
products = pd.read_csv('data/products.csv', index_col='id')
nutrients = pd.read_csv('data/nutrients.csv', index_col='id')
categories = pd.read_csv('data/categories.csv', index_col='id')
images = pd.read_csv('data/images.csv', index_col='id')

# Tables indexed by a pair of id columns (two-level index).
product_nutrients = pd.read_csv(
    'data/product_nutrients.csv',
    index_col=['product_id', 'nutrient_id'],
)
categorisations = pd.read_csv(
    'data/categorisations.csv',
    index_col=['category_id', 'categorisable_id'],
)

In [4]:
def parse_fr(value):
    """Return the French entry of a `"lang"=>"text"` translations string.

    `value` is scanned for every `"key"=>"text"` pair (one optional space is
    allowed on either side of `=>`). The text stored under the key `fr` is
    returned; if the same key occurs several times the last one wins.

    Returns `np.nan` when `value` is null or contains no `fr` pair.
    """
    if not pd.isnull(value):
        pairs = re.findall(r'"([^"]+)" ?=> ?"([^"]+)"', value)
        return dict(pairs).get('fr', np.nan)
    return np.nan

In [5]:
def join(i1, k1, i2, k2):
    """Left-join `i2` onto `i1`, matching `i1[k1]` against `i2[k2]`.

    Both frames first have their index moved back into ordinary columns, so
    `k1` and `k2` may name either a column or an index level. Columns of `i1`
    whose names also occur in `i2` get a trailing `_`; the `i2` columns keep
    their names. The result has a fresh positional index.
    """
    left = i1.reset_index()
    right = i2.reset_index().set_index(k2)
    return left.join(right, on=k1, lsuffix='_')

In [6]:
# Extract the French name of every product; parse_fr yields NaN where the
# translations are null or have no `fr` entry.
products['fr'] = products['name_translations'].map(parse_fr)
# Number of products that have a French name.
products['fr'].count()

2028

In [7]:
# Extract the French name of every nutrient; parse_fr yields NaN where the
# translations are null or have no `fr` entry.
nutrients['fr'] = nutrients['name_translations'].map(parse_fr)
# Number of nutrients that have a French name.
nutrients['fr'].count()

40

In [8]:
# Columns kept for each indexed product.
fld_products = ['fr', 'barcode', 'unit', 'quantity', 'portion_quantity', 'alcool_by_volume']
# Keep only products that have a French name; any remaining missing value is
# replaced by the sentinel -1.
sel_products = products.loc[products['fr'].notnull(), fld_products].fillna(-1)
sel_products.head()

Unnamed: 0_level_0,fr,barcode,unit,quantity,portion_quantity,alcool_by_volume
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1192,Saint-Omer,3162330052641,ml,500.0,-1.0,5.0
1194,Heineken,7610055020750,g,500.0,-1.0,5.0
1125,granini : orange-mangue,7610235004884,ml,1000.0,-1.0,0.0
1168,AARBERG: Sucre en morceaux,7610278041105,g,1000.0,-1.0,0.0
1153,"Belle France : Lentilles, cuisinées",3258561211534,g,410.0,200.0,0.0


In [9]:
# Columns kept for each nutrient row attached to a product.
fld_nutrients = ['product_id', 'fr', 'per_portion', 'per_day', 'per_hundred', 'unit']
# Attach the nutrient table to every product/nutrient row, keep the selected
# columns, and replace missing values by the sentinel -1.
sel_nutrients = (
    join(product_nutrients, 'nutrient_id', nutrients, 'id')
    .loc[:, fld_nutrients]
    .fillna(-1)
)
sel_nutrients.head()

Unnamed: 0,product_id,fr,per_portion,per_day,per_hundred,unit
0,971,Énergie,117.0,6.0,2430.0,kJ
1,971,Énergie (kCal),-1.0,-1.0,581.0,kCal
2,971,Protéines,2.0,4.0,10.0,g
3,971,Matières grasses,8.0,2.0,42.0,g
4,971,Glucides,7.0,3.0,38.0,g


In [22]:
# Build the body for the Elasticsearch bulk API: for every selected product,
# one `index` action entry followed by the document to store.

# Collect each product's nutrient rows once, instead of re-filtering the whole
# `sel_nutrients` frame for every product. Rows keep their original order
# within a product.
nutrients_by_product = {
    product_id: rows.to_dict('records')
    for product_id, rows in sel_nutrients.groupby('product_id')
}

prods = []
for idx, p in sel_products.iterrows():
    obj = p.to_dict()
    # Products without any nutrient row get an empty list.
    obj['nutrients'] = nutrients_by_product.get(idx, [])
    action = {
        'index': {
            '_index': 'kyf',
            '_type': 'default',
            # NOTE(review): `sel_products` is built with fillna(-1), so every
            # product lacking a barcode would share `_id` -1 and overwrite the
            # previous one in the index -- confirm no barcode is missing.
            '_id': obj['barcode']
        }
    }
    prods.append(action)
    prods.append(obj)

In [23]:
# Each product contributed two entries (action + document), so this is twice
# the number of selected products.
len(prods)

4056

In [24]:
# Entries alternate action, document; index 3 is therefore the document of the
# second product.
prods[3]

{'alcool_by_volume': 5.0,
 'barcode': '7610055020750',
 'fr': 'Heineken',
 'nutrients': [],
 'portion_quantity': -1.0,
 'quantity': 500.0,
 'unit': 'g'}

In [26]:
# Send every action/document pair to Elasticsearch in a single bulk request.
es.bulk(prods)

{'errors': False,
 'items': [{'index': {'_id': '3162330052641',
    '_index': 'kyf',
    '_shards': {'failed': 0, 'successful': 1, 'total': 2},
    '_type': 'default',
    '_version': 2,
    'status': 200}},
  {'index': {'_id': '7610055020750',
    '_index': 'kyf',
    '_shards': {'failed': 0, 'successful': 1, 'total': 2},
    '_type': 'default',
    '_version': 1,
    'status': 201}},
  {'index': {'_id': '7610235004884',
    '_index': 'kyf',
    '_shards': {'failed': 0, 'successful': 1, 'total': 2},
    '_type': 'default',
    '_version': 1,
    'status': 201}},
  {'index': {'_id': '7610278041105',
    '_index': 'kyf',
    '_shards': {'failed': 0, 'successful': 1, 'total': 2},
    '_type': 'default',
    '_version': 1,
    'status': 201}},
  {'index': {'_id': '3258561211534',
    '_index': 'kyf',
    '_shards': {'failed': 0, 'successful': 1, 'total': 2},
    '_type': 'default',
    '_version': 1,
    'status': 201}},
  {'index': {'_id': '3258561011431',
    '_index': 'kyf',
    '_shar

In [None]:
# French product names. `sel_products` only keeps rows whose `fr` is not null,
# so every value is a string and can be joined.
product_fr = sel_products['fr']

# One lowercase, space-separated corpus; hyphens are treated as word breaks.
data = ' '.join(product_fr.values).replace('-', ' ').lower()
# Drop punctuation and digits: : . ( ) 0-9 % , en dash ? backslash & '
data = re.sub(r"[:.()0-9%,–?\\&']", '', data)
# Collapse every run of whitespace into a single space.
data = re.sub(r'\s+', ' ', data)

In [None]:
# Write the corpus for the word2vec tools. The names contain accented
# characters, so the encoding is fixed to UTF-8 instead of the platform
# default. The `with` block closes the file on exit.
with open('data/names.txt', 'w', encoding='utf-8') as f:
    f.write(data)

In [None]:
# Run word2phrase over the names corpus, writing to names-phrases.txt.
# NOTE(review): names-phrases.txt is not read by any later cell shown here; the
# word2vec training cell reads names.txt -- confirm that is intended.
word2vec.word2phrase('data/names.txt', 'data/names-phrases.txt', verbose=True)

In [None]:
# Train word vectors on the names corpus and store them in names-model.bin
# (size=100 presumably sets the vector dimensionality -- see word2vec docs).
word2vec.word2vec('data/names.txt', 'data/names-model.bin', size=100, verbose=True)

In [None]:
# Cluster the names corpus and write the result to names-clusters.txt
# (the positional 100 is presumably the number of clusters -- TODO confirm).
word2vec.word2clusters('data/names.txt', 'data/names-clusters.txt', 100, verbose=True)

In [None]:
# Load the vectors from the file written by the word2vec training cell.
model = word2vec.load('data/names-model.bin')

In [None]:
# Load the cluster assignments from the file written by word2clusters.
clusters = word2vec.load_clusters('data/names-clusters.txt')

In [None]:
# Inspect the vocabulary of the loaded model.
model.vocab

In [None]:
# Shape of the loaded vector matrix
# (presumably (vocabulary size, vector size) -- TODO confirm).
model.vectors.shape

In [None]:
# Query the model with 'snack' as the only positive term and no negative terms,
# asking for 10 results.
indexes, metrics = model.analogy(pos=['snack'], neg=[], n=10)
# Combine the returned indexes and metrics into a response and show it as a
# plain Python list.
model.generate_response(indexes, metrics).tolist()