In [90]:
import collections
import itertools
import json
import os

import numpy as np
import pandas
import random

from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC

MIN_NUMBER_PRODUCTS_PER_CATEGORY = 3000
TRAINING_DATASET_SIZE = 25 / 100

## Load data from CSV dump

In [91]:
csv = pandas.read_csv(
    'en.openfoodfacts.org.products.csv',
    sep='\t',
    usecols=[0, 7, 15],
    dtype={'code': 'str', 'product_name': 'str'},
    converters={'categories_tags': lambda x: x.split(',') if x else np.NaN}
)
# Filter products with and without categories in two different DataFrames
products_with_categories = csv[pandas.notnull(csv['categories_tags'])]
products_without_categories = csv[pandas.isnull(csv['categories_tags'])]

## Fitting on dataset

In [92]:
# Let's build vectors of products and categories, for training purpose.
categories = [
    category
    for category, count in collections.Counter(
        category for category_list in products_with_categories['categories_tags'] for category in category_list
    ).items()
    # Filter out categories without enough products
    if count > MIN_NUMBER_PRODUCTS_PER_CATEGORY and category != ''
]
print('Keeping %d categories.' % len(categories))
# Filter out empty lists of categories
XY = products_with_categories.copy()
XY['categories_tags'] = XY['categories_tags'].map(lambda c_list: [c for c in c_list if c in categories])
mask = XY['categories_tags'].str.len() > 0
XY = XY[mask]

Keeping 33 categories.


In [93]:
# Select training and testing sample
X = XY['product_name'].values.astype('U')
Y = [np.array(c).astype('U') for c in XY['categories_tags'].values]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, train_size=TRAINING_DATASET_SIZE)

# Check each category is sufficiently represented
min([
    count
    for _, count in collections.Counter(
        category for category_list in Y_train for category in category_list
    ).items()
])



741

In [94]:
mlb = MultiLabelBinarizer()
Y_train_transformed = mlb.fit_transform(Y_train)

In [95]:
# Fit our classifier
classifier = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC()))
])

classifier.fit(X_train, Y_train_transformed)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))])

In [96]:
# Compute predictions for testing set
predicted = classifier.predict(X_test)
all_labels = mlb.inverse_transform(predicted)

accuracy_score(mlb.fit_transform(Y_test), predicted)

0.52062784115275751

In [97]:
testing_dataframe = pandas.DataFrame({
    'product_name': X_test,
    'original_labels': Y_test,
    'guessed_labels': all_labels
}, columns=['product_name', 'original_labels', 'guessed_labels'])
testing_dataframe.to_csv('testing.csv', index=False)
testing_dataframe

Unnamed: 0,product_name,original_labels,guessed_labels
0,Fernandes Cherry Soda - Ga Mee Naar Suruname &...,"[en:beverages, en:sugared-beverages]","(en:beverages, en:plant-based-foods-and-bevera..."
1,Compotes allégée en sucres Pommes Carrefour,"[en:plant-based-foods-and-beverages, en:plant-...","(en:desserts, en:fruits-and-vegetables-based-f..."
2,"Tortellini 4 Fromages, LunchBox","[en:plant-based-foods-and-beverages, en:plant-...","(en:cereals-and-potatoes, en:cereals-and-their..."
3,Orange and mango squash,[en:beverages],"(en:beverages, en:plant-based-foods-and-bevera..."
4,Crackers toast nature,[en:salty-snacks],"(en:cereals-and-potatoes, en:salty-snacks)"
5,Miellats du Maquis cru d'été 2016,"[en:spreads, en:breakfasts]",()
6,Cebollas &quot;Juan de Dios&quot;,"[en:plant-based-foods-and-beverages, en:plant-...","(en:fresh-foods, en:fruits-and-vegetables-base..."
7,Moules fraîches de Hollande,[en:seafood],"(en:seafood,)"
8,Thé citron,"[en:plant-based-foods-and-beverages, en:bevera...","(en:beverages, en:non-sugared-beverages, en:pl..."
9,Rêves de chocolat Assortiment de chocolats fin...,"[en:sugary-snacks, en:confectioneries, en:dess...","(en:chocolates, en:desserts, en:sugary-snacks)"


In [98]:
testing_diffs = []
for _, row in testing_dataframe.iterrows():
    diff = set(row.original_labels) - set(row.guessed_labels)
    if len(diff) > 0:
        testing_diffs.append({
            'product_name': row.product_name,
            'good_guessed_labels': set(row.original_labels) - diff,
            'extra_guessed_labels': [label for label in row.guessed_labels if label not in row.original_labels],
            'missing_guessed_labels': [label for label in row.original_labels if label not in row.guessed_labels]
        })
pandas.DataFrame(testing_diffs,
                 columns=['product_name', 'good_guessed_labels', 'extra_guessed_labels', 'missing_guessed_labels'])

Unnamed: 0,product_name,good_guessed_labels,extra_guessed_labels,missing_guessed_labels
0,Fernandes Cherry Soda - Ga Mee Naar Suruname &...,{en:beverages},[en:plant-based-foods-and-beverages],[en:sugared-beverages]
1,Miellats du Maquis cru d'été 2016,{},[],"[en:spreads, en:breakfasts]"
2,Thé citron,"{en:non-sugared-beverages, en:plant-based-food...",[],[en:plant-based-foods]
3,Rêves de chocolat Assortiment de chocolats fin...,"{en:chocolates, en:desserts, en:sugary-snacks}",[],[en:confectioneries]
4,Couscous royal poulet merguez,{en:meals},[],[en:canned-foods]
5,Baguette Céréales Carrefour,"{en:plant-based-foods, en:cereals-and-potatoes...",[],[en:cereals-and-their-products]
6,Angeliter Zitronenlimonade,{},[],"[en:beverages, en:sugared-beverages]"
7,Agar-Agar,{},[],"[en:plant-based-foods-and-beverages, en:plant-..."
8,Velamints,{},[],"[en:sugary-snacks, en:confectioneries]"
9,Vegemil Black Bean Soymilk,{},[en:plant-based-foods-and-beverages],[en:dairies]


In [87]:
# Dump the classifier
joblib.dump((mlb, classifier), 'offClassifier.pkl')

['offClassifier.pkl']

## Predict!

In [99]:
def batch(iterable, size):
    """
    Get items from a sequence a batch at a time.

    :param iterable: The iterable to get the items from.
    :param size: The size of the batches.
    :return: A new iterable.
    """
    sourceiter = iter(iterable)
    while True:
        batchiter = itertools.islice(sourceiter, size)
        yield itertools.chain([next(batchiter)], batchiter)

In [100]:
# Load the classifier
mlb, classifier = joblib.load('offClassifier.pkl')

In [101]:
X_predicted = products_without_categories['product_name'].values.astype('U')

all_labels = []

for i in batch(X_predicted, 30000):
    predicted = classifier.predict(list(i))
    all_labels.extend(mlb.inverse_transform(predicted))

  """


In [102]:
prediction_dataframe = pandas.DataFrame({
    'product_name': products_without_categories['product_name'].values.astype('U'),
    'guessed_labels': all_labels
}, columns=['product_name', 'guessed_labels'])
prediction_dataframe.to_csv('prediction.csv', index=False)
prediction_dataframe

Unnamed: 0,product_name,guessed_labels
0,Farine de blé noir,"(en:cereals-and-potatoes, en:cereals-and-their..."
1,Banana Chips Sweetened (Whole),"(en:appetizers, en:chips-and-fries, en:crisps,..."
2,Peanuts,"(en:legumes, en:legumes-and-their-products, en..."
3,Organic Salted Nut Mix,()
4,Organic Polenta,"(en:cereals-and-potatoes, en:cereals-and-their..."
5,Breadshop Honey Gone Nuts Granola,"(en:breakfasts, en:cereals-and-potatoes, en:ce..."
6,Organic Long Grain White Rice,"(en:cereal-grains, en:cereals-and-potatoes, en..."
7,Organic Muesli,"(en:breakfast-cereals, en:breakfasts, en:cerea..."
8,Organic Dark Chocolate Minis,"(en:chocolates, en:dark-chocolates, en:sugary-..."
9,Organic Sunflower Oil,"(en:fats, en:plant-based-foods, en:plant-based..."


In [103]:
prediction_dataframe['guessed_labels'].str.len() > 0

0          True
1          True
2          True
3         False
4          True
5          True
6          True
7          True
8          True
9          True
10         True
11         True
12        False
13         True
14        False
15         True
16         True
17         True
18         True
19        False
20         True
21         True
22         True
23         True
24         True
25         True
26         True
27         True
28         True
29        False
          ...  
254702    False
254703     True
254704     True
254705    False
254706     True
254707     True
254708    False
254709    False
254710     True
254711     True
254712    False
254713    False
254714    False
254715    False
254716    False
254717    False
254718    False
254719     True
254720    False
254721     True
254722    False
254723     True
254724     True
254725     True
254726     True
254727     True
254728     True
254729     True
254730     True
254731     True
Name: guessed_labels, Le