In [161]:
import pandas as pd
import numpy as np 
import re
from datasketch import MinHash, MinHashLSHForest
import time


We concentrate first on the REWE dataset from Bavaria, as it is the largest dataset that we currently have available.

In [162]:
# Load the product catalogue from CSV and count the distinct values in each column.
df_bav = pd.read_csv('data/bavaria.csv')
df_bav.nunique()

name        19438
brand        1810
ean         19498
price         420
grammage     5153
category       19
sale            2
image       19499
dtype: int64

In [163]:
# (rows, columns) of the loaded frame, for comparison with the unique counts above.
df_bav.shape

(22040, 8)

We see that there are 2602 duplicated names (22040 rows vs. 19438 unique names), which indicates that some products occur multiple times in the dataset. This is because products can be classified into more than one category. Also, there are more unique EAN numbers than product names, which indicates that some products are listed under the same name although they might differ (e.g. by taste or color). For example, "Barilla Pesto alla Genovese 190g" occurs 4 times: there are two different EAN numbers, and the product appears in the two categories "International" and "Öle, Soßen & Gewürze".

In [164]:
# Show every row whose name matches this product exactly.
df_bav[df_bav.name == 'Barilla Pesto alla Genovese 190g']

Unnamed: 0,name,brand,ean,price,grammage,category,sale,image
45,Barilla Pesto alla Genovese 190g,Barilla,8076809513746,3.59,"190g (1 kg = 18,89 €)",International,False,https://img.rewe-static.de/4840313/1435860_dig...
711,Barilla Pesto alla Genovese 190g,Barilla,8076809580724,3.59,"190g (1 kg = 18,89 €)",International,False,https://img.rewe-static.de/7718084/38308979_di...
15942,Barilla Pesto alla Genovese 190g,Barilla,8076809513746,3.59,"190g (1 kg = 18,89 €)","Öle, Soßen & Gewürze",False,https://img.rewe-static.de/4840313/1435860_dig...
16488,Barilla Pesto alla Genovese 190g,Barilla,8076809580724,3.59,"190g (1 kg = 18,89 €)","Öle, Soßen & Gewürze",False,https://img.rewe-static.de/7718084/38308979_di...


We will try LSH (**L**ocality-**s**ensitive **h**ashing) on the dataset to determine whether it can be used to predict the real product names from the abbreviated ones on the scanned receipts. For that we need to preprocess the data (lowercase all text and split the strings into overlapping character n-gram shingles) in order to improve the matching performance. Note that the preprocessing below does not remove punctuation; punctuation, digits and spaces remain part of the shingles.

In [165]:
# Preprocess splits a string into lowercased, overlapping character shingles of length k
def preprocess(text:str,k:int) -> set:
    """Return the set of distinct length-``k`` character shingles of ``text``.

    The text is lowercased first; nothing else is stripped, so punctuation,
    digits and whitespace remain part of the shingles.

    Parameters
    ----------
    text : str
        String to shingle.
    k : int
        Shingle length in characters.

    Returns
    -------
    set
        Every substring of length ``k`` of the lowercased text. Empty when
        the lowercased text is shorter than ``k``.
    """
    # Lowercase once, and take the length from the lowercased string: case
    # mapping can change the number of code points (e.g. U+0130 becomes two),
    # so using len(text) for the range would drop the trailing shingles.
    lowered = text.lower()
    return {lowered[i:i+k] for i in range(len(lowered)-k+1)}

In [166]:
# Number of permutations used for every MinHash and for the forest
permutations = 256

# Number of recommendations to return
num_recommendations = 6
# Misspelled original name, kept as an alias so cells that still use it keep working
num_recommendatuions = num_recommendations

In [180]:
# Function for creating MinHash forest for queries
def get_forest(data, perms, column='combi', k=4):
    """Build a searchable MinHash LSH forest over one text column of ``data``.

    Every value of ``data[column]`` is shingled with ``preprocess`` and hashed
    into a MinHash, which is stored in the forest under the value's position
    in the column (0, 1, 2, ...). Query hits are therefore positional and can
    be looked up with ``data.iloc``.

    Parameters
    ----------
    data : DataFrame-like
        Object supporting ``data[column]``; iterating the result must yield
        one text per entry to index.
    perms : int
        Number of permutations for each MinHash and for the forest.
    column : str, default 'combi'
        Name of the column holding the text to index.
    k : int, default 4
        Shingle length passed to ``preprocess``. Queries against the returned
        forest should be shingled with the same length.

    Returns
    -------
    MinHashLSHForest
        The forest, already indexed and ready for ``query``.

    Side effects
    ------------
    Prints the elapsed build time.
    """
    start_time = time.time()

    forest = MinHashLSHForest(num_perm = perms)

    for i, text in enumerate(data[column]):
        # A missing value (NaN) is not a string and would make preprocess fail.
        # Index it as empty text instead of skipping it, so that the forest
        # keys stay aligned with the positions in the column.
        if not isinstance(text, str):
            text = '' if pd.isna(text) else str(text)

        m = MinHash(num_perm = perms) # One MinHash per entry, same permutation count as the forest
        for s in preprocess(text, k):
            m.update(s.encode('utf8')) # Feed each shingle into the MinHash as UTF-8 bytes
        forest.add(i, m) # Key is the position of the entry in the column

    # Index the forest to make it searchable
    forest.index()

    print('It took %s second to build the forest.' %round(time.time()-start_time,3))

    return forest

In [168]:
# Function that evaluates queries
def predict(text, database, perms, num_results, forest, k=4):
    """Look up the catalogue names most similar to ``text`` in an LSH forest.

    Parameters
    ----------
    text : str
        Query string, e.g. an abbreviated product name.
    database : DataFrame-like
        Frame the forest was built from; must support ``.iloc`` and have a
        ``'name'`` column. Forest keys are used as positions into it.
    perms : int
        Number of MinHash permutations; passed to ``MinHash`` for the query.
    num_results : int
        Number of results requested from ``forest.query``.
    forest : MinHashLSHForest
        Indexed forest to query.
    k : int, default 4
        Shingle length passed to ``preprocess``; should equal the length used
        when the forest was built.

    Returns
    -------
    The ``'name'`` values of the matching rows, or ``None`` when the query
    yields no shingles or the forest returns no keys.
    NOTE(review): the rows come back in whatever order ``forest.query``
    returns them; they are not re-ranked by similarity here.

    Side effects
    ------------
    Prints the elapsed query time when at least one match is found.
    """
    start_time = time.time()

    tokens = preprocess(text, k) # Preprocess the query text into shingles
    if not tokens:
        # A query shorter than k characters produces no shingles; querying the
        # forest with an empty MinHash is meaningless, so report "no match".
        return None

    m = MinHash(num_perm = perms) # Same permutation count as passed in for the forest
    for s in tokens:
        m.update(s.encode('utf8')) # Feed each shingle into the MinHash as UTF-8 bytes

    # Query the forest with the MinHash and collect the requested number of keys
    idx_array = np.array(forest.query(m,num_results))
    if len(idx_array) == 0:
        return None  # the forest returned no keys

    result = database.iloc[idx_array]['name'] # Keys are row positions, hence iloc
    print('It took %s seconds to query forest.' %round(time.time()-start_time,3))

    return result

In [169]:
# Preview the first five rows of the loaded frame.
df_bav.head()

Unnamed: 0,name,brand,ean,price,grammage,category,sale,image
0,Cherry Romatomaten 250g,REWE,8714467001556,0.99,"250g (1 kg = 3,96 €)",International,False,https://img.rewe-static.de/1033906/23292477_di...
1,Bio Ingwer ca. 100g,,22865830,0.79,"1 Stück ca. 100 g (1 kg = 7,90 €)",International,False,https://img.rewe-static.de/1218820/24569461_di...
2,Süßkartoffel ca. 300g,,20475963,0.69,"1 Stück ca. 300 g (1 kg = 2,29 €)",International,True,https://img.rewe-static.de/0475963/24569668_di...
3,REWE Bio Rispentomaten 500g,REWE Bio,22610171,2.29,"500g (1 kg = 4,58 €)",International,False,https://img.rewe-static.de/1041181/21289418_di...
4,"Orangen 1,5kg im Netz",REWE Beste Wahl,22634276,2.49,"1,50kg (1 kg = 1,66 €)",International,True,https://img.rewe-static.de/1057913/21306094_di...


In [170]:
# Replace missing brands with an empty string so string concatenation does not produce NaN.
# Assign the result back instead of calling fillna(inplace=True) on the accessed column:
# under pandas Copy-on-Write the in-place call works on a temporary and leaves df_bav unchanged.
df_bav['brand'] = df_bav['brand'].fillna('')

In [171]:
# Convert every price to its string form, element by element.
df_bav['price'] = df_bav['price'].map(str)

In [172]:
# Text to index for each product: name, brand and price joined by single spaces.
df_bav['combi'] = df_bav['name'] + ' ' + df_bav['brand'] + ' ' + df_bav['price']

In [173]:
# Preview the first five rows again, now including the 'combi' column.
df_bav.head()

Unnamed: 0,name,brand,ean,price,grammage,category,sale,image,combi
0,Cherry Romatomaten 250g,REWE,8714467001556,0.99,"250g (1 kg = 3,96 €)",International,False,https://img.rewe-static.de/1033906/23292477_di...,Cherry Romatomaten 250g REWE 0.99
1,Bio Ingwer ca. 100g,,22865830,0.79,"1 Stück ca. 100 g (1 kg = 7,90 €)",International,False,https://img.rewe-static.de/1218820/24569461_di...,Bio Ingwer ca. 100g 0.79
2,Süßkartoffel ca. 300g,,20475963,0.69,"1 Stück ca. 300 g (1 kg = 2,29 €)",International,True,https://img.rewe-static.de/0475963/24569668_di...,Süßkartoffel ca. 300g 0.69
3,REWE Bio Rispentomaten 500g,REWE Bio,22610171,2.29,"500g (1 kg = 4,58 €)",International,False,https://img.rewe-static.de/1041181/21289418_di...,REWE Bio Rispentomaten 500g REWE Bio 2.29
4,"Orangen 1,5kg im Netz",REWE Beste Wahl,22634276,2.49,"1,50kg (1 kg = 1,66 €)",International,True,https://img.rewe-static.de/1057913/21306094_di...,"Orangen 1,5kg im Netz REWE Beste Wahl 2.49"


In [174]:
# NOTE(review): `price` is not assigned in any cell visible here; it looks like a leftover
# from a removed cell and would raise NameError on a fresh kernel. TODO confirm, then drop or define it.
price

'[0.99 0.79 0.69 ... 5.49 2.49 2.49]'

In [181]:
# Build the LSH forest over the 'combi' text of every row in df_bav.
forest = get_forest(df_bav,permutations)

It took 23.425 second to build the forest.


In [195]:
# Example query: a shortened, differently-cased product name.
name = 'ja! H-milch 1.5'
# result is a Series of catalogue names, or None when nothing was found.
result = predict(name,df_bav,permutations,num_recommendatuions,forest)
print('\n Top Recommendation(s) is(are) \n', result)

It took 0.004 seconds to query forest.

 Top Recommendation(s) is(are) 
 12359    Sternenfair Fettarme H-Milch 1,5% 1l
13096      Allgäuer Hof-Milch H-Milch 1,5% 1l
12681         Weihenstephan H-Milch 1,5% 0,5l
13615                     ja! H-Milch 1,5% 1l
17426    Sternenfair Fettarme H-Milch 1,5% 1l
12221       REWE Bio Fettarme H-Milch 1,5% 1l
Name: name, dtype: object
