Импорты

In [1]:
import json
import pandas as pd
from transliterate  import translit
from yargy.tokenizer import MorphTokenizer
import pymorphy2
from yargy import (
    Parser,
    rule, or_, and_, not_,
)

from yargy.predicates import (
    caseless, type, gram, normalized,
    in_, in_caseless, dictionary
)

from yargy.pipelines import (
    caseless_pipeline,
    morph_pipeline
)
from yargy.interpretation import (
    fact,
    attribute
)
from yargy import interpretation as interp
INT = type('INT')

Загрузка файлов

In [2]:
bread = pd.read_json('bread-probe.json')
milk = pd.read_json('milk-normal.json')

Функция получения производителей

In [3]:
def getVendors(data):
    vendorlist = []
    for i in range(0,len(data['manufacturer'])):
        item = data['manufacturer'][i]['name'];
        if(item not in vendorlist):
            vendorlist.append(item)
            alt = translit(item, 'ru');
            if(alt != item):
                vendorlist.append(alt)
    return vendorlist

Функция поиска вхождений слов

In [4]:
def findWordOccurance(data):
    occ = {}
    for name in data['name'].values:
        words = name.split()
        for word in words:
            word = word.upper().strip()
            if not word in occ:
                occ[word] = 0
            occ[word] = occ[word]+1
    sorted_dict = {}
    sorted_keys = sorted(occ, key=occ.get,reverse=True)

    for key in sorted_keys:
        sorted_dict[key] = occ[key]
    return sorted_dict

Нормализация слов

In [5]:
def normalizeKeys(orgKeys,vendorlist=[]):
    morph = pymorphy2.MorphAnalyzer()
    tokenizer = MorphTokenizer()
    for key in list(orgKeys):
        keyData = list(tokenizer(key))
        p = morph.parse(key)[0]
    
        if(key in vendorlist):
            orgKeys.remove(key)
            continue
        if(len(key) <= 1):
            orgKeys.remove(key)
            continue
        if(len(keyData)>1):
            orgKeys.remove(key)
            continue
        if({'UNKN'} in p.tag):
            orgKeys.remove(key)
            continue
        if({'NUMB'} in p.tag):
            orgKeys.remove(key)
            continue
    return orgKeys

Перевод слова в транслит

In [6]:
def translitNames(names):
    for name in names:
        alt = translit(name, 'ru');
        if(alt != name):
            names.append(alt)
    return names

**Дальше алгоритм**

*Определяем категории*

In [7]:
def getCategoriesAndStuff(data):
    vendorlist = getVendors(data)
    words = findWordOccurance(data)
    keys = list((words.keys()))
    keysNormalized = normalizeKeys(keys,vendorlist)
    categoryName = list(keysNormalized)[0]
    keysNormalized.remove(categoryName)
    return {"name":categoryName,"keywords":keysNormalized,"vendorlist":vendorlist}

In [8]:
milkCategory = getCategoriesAndStuff(milk)
breadCategory = getCategoriesAndStuff(bread)

Непосредстваенно алгоритм поиска

In [9]:
allCategories = {}
allCategories[milkCategory['name']] = milkCategory
allCategories[breadCategory['name']] = breadCategory

Этап 1.Определение категории

In [10]:
text = ''' 3 буханки Еврохлеба, 4 хлеба Гербер  3 молока Простоквашино'''

In [11]:
Item = fact(
    'item',
    ['amount','type', 'vendor']
)

In [12]:
TYPE = morph_pipeline(list(allCategories.keys()))
ITEM_Category = rule(
    TYPE.interpretation(
        Item.type
    )
).interpretation(
    Item
)

In [13]:
parser = Parser(ITEM_Category)
match = parser.find(text)
for match in parser.findall(text):
    category = [_.value for _ in match.tokens][0]

Этап 2. Определение производителя

In [14]:
morph = pymorphy2.MorphAnalyzer()
categoryName = morph.parse(category)[0].normal_form.upper()
categoryName

'МОЛОКО'

In [15]:
allCategories[categoryName]["vendorlist"]

['VALIO',
 'ВАЛИО',
 'RIOBA',
 'РИОБА',
 'ARO',
 'АРО',
 'PARMALAT',
 'ПАРМАЛАТ',
 'G-BALANCE',
 'Г-БАЛАНЦЕ',
 'ДОМИК В ДЕРЕВНЕ',
 'ПРОСТОКВАШИНО',
 'БЕЛЫЙ ГОРОД',
 'ПЕТМОЛ',
 'ТЕМА',
 'ЭКОНИВА',
 'АСЕНЬЕВСКАЯ ФЕРМА',
 'ПИСКАРЕВСКОЕ',
 'ФРУТОНЯНЯ',
 'PROMILKER',
 'ПРОМИЛКЕР',
 'СВЕЖЕЕ ЗАВТРА',
 'КЛЕВЕР',
 'БОЛЬШАЯ КРУЖКА',
 'ВЕСЕЛЫЙ МОЛОЧНИК',
 'ВКУСНОТЕЕВО',
 'УГЛЕЧЕ ПОЛЕ',
 'FASOL',
 'ФАСОЛ',
 'FINE LIFE',
 'ФИНЕ ЛИФЕ',
 'АИСФЕР',
 'БРЯНСКИЙ МОЛОЧНЫЙ КОМБИНАТ',
 'СЕЛО ЗЕЛЕНОЕ',
 'BETTAMILK',
 'БЕТТАМИЛК',
 'ХОЗЯЙСТВО ВАСИЛЬЕВОЙ АВ',
 'MOLOKO GROUP',
 'МОЛОКО ГРОУП']

In [16]:
VENDOR = morph_pipeline(allCategories[categoryName]["vendorlist"])

In [17]:
ITEM_Vendor = rule(
    VENDOR.interpretation(
        Item.vendor
    )
).interpretation(
    Item
)

In [18]:
parser = Parser(ITEM_Vendor)
match = parser.find(text)
display(match)
for match in parser.findall(text):
    vendor = [_.value for _ in match.tokens][0]

Match(
    tokens=[MorphToken(
         value='Простоквашино',
         span=[47, 60),
         type='RU',
         forms=[Form('простоквашин', Grams(ADJF,Poss,neut,nomn,sing)),
          Form('простоквашин', Grams(ADJF,Poss,accs,neut,sing)),
          Form('простоквашино', Grams(Geox,NOUN,Sgtm,inan,neut,nomn,sing)),
          Form('простоквашино', Grams(Geox,NOUN,Sgtm,accs,inan,neut,sing))]
     )],
    span=[47, 60)
)

Этап 3 определение кол-ва и прочего

In [19]:
TYPE = morph_pipeline([categoryName])

In [20]:
AMOUNT =  INT.interpretation(
    interp.custom(int)
)

In [21]:
ITEM_MISC = rule(
    AMOUNT.interpretation(
       Item.amount
    ).optional(),
    TYPE.interpretation(
        Item.type
    ),
    VENDOR.interpretation(
        Item.vendor
    ).optional()
).interpretation(
    Item
)

In [22]:
parser = Parser(ITEM_MISC)
match = parser.find(text)
display(match)
for match in parser.findall(text):
    vendor = [_.value for _ in match.tokens][0]

Match(
    tokens=[Token(
         value='3',
         span=[38, 39),
         type='INT'
     ),
     MorphToken(
         value='молока',
         span=[40, 46),
         type='RU',
         forms=[Form('молоко', Grams(NOUN,Sgtm,gent,inan,neut,sing)),
          Form('молока', Grams(NOUN,femn,inan,nomn,sing))]
     ),
     MorphToken(
         value='Простоквашино',
         span=[47, 60),
         type='RU',
         forms=[Form('простоквашин', Grams(ADJF,Poss,neut,nomn,sing)),
          Form('простоквашин', Grams(ADJF,Poss,accs,neut,sing)),
          Form('простоквашино', Grams(Geox,NOUN,Sgtm,inan,neut,nomn,sing)),
          Form('простоквашино', Grams(Geox,NOUN,Sgtm,accs,inan,neut,sing))]
     )],
    span=[38, 60)
)

In [25]:
match.fact.vendor
match.fact.type
match.fact.amount

3

Заключительный этап. Поиск по БД

In [34]:
milk['manufacturer']

0           {'id': 1783, 'name': 'VALIO', 'image': None}
1           {'id': 1710, 'name': 'RIOBA', 'image': None}
2           {'id': 1710, 'name': 'RIOBA', 'image': None}
3             {'id': 8553, 'name': 'ARO', 'image': None}
4             {'id': 8553, 'name': 'ARO', 'image': None}
                             ...                        
105    {'id': 22039, 'name': 'MOLOKO GROUP', 'image':...
106    {'id': 2775, 'name': 'БОЛЬШАЯ КРУЖКА', 'image'...
107    {'id': 2775, 'name': 'БОЛЬШАЯ КРУЖКА', 'image'...
108    {'id': 1825, 'name': 'БЕЛЫЙ ГОРОД', 'image': N...
109    {'id': 1825, 'name': 'БЕЛЫЙ ГОРОД', 'image': N...
Name: manufacturer, Length: 110, dtype: object

In [41]:
milk[milk['manufacturer'].item()['name'] == "VALIO"]

  """Entry point for launching an IPython kernel.


ValueError: can only convert an array of size 1 to a Python scalar

In [27]:
milk[milk['manufacturer'].name.upper() ==  match.fact.vendor]

KeyError: False