In [211]:
import pandas as pd
import re
from re import compile, search
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the sample JSON file into a DataFrame. Later cells read its
# 'description', 'title' and 'id' columns.
file_name = 'sample_100_pages.json'
df_sample = pd.read_json(file_name)

In [None]:
# (field, pattern) pairs. Each pattern lazily captures the text between the
# field's own label and the label that follows it; the capture group carries
# the same name as the field so callers can read it back by key.
_FIELD_PATTERNS = (
    ('name', 'име обект: (?P<name>.*?) вид обект'),
    ('category', 'вид обект: (?P<category>.*?) град'),
    ('city', 'град: (?P<city>.*?) адрес'),
    ('address', 'адрес: (?P<address>.*?) описание'),
)

regexps = {field: compile(pattern) for field, pattern in _FIELD_PATTERNS}

def match(prop, text):
    """Extract one labelled field from a description string.

    Args:
        prop: Key into the module-level ``regexps`` table; also the name of
            the capture group read from the match.
        text: Description text to search.

    Returns:
        The captured value with surrounding whitespace stripped, or ``None``
        when ``text`` is not a string or the pattern does not match.

    Raises:
        KeyError: If ``prop`` is not a key of ``regexps``.
    """
    pattern = regexps[prop]
    # A missing description may arrive as None or NaN; searching a non-string
    # raises TypeError, so treat it as "no match" instead.
    if not isinstance(text, str):
        return None
    m = pattern.search(text)
    return m.group(prop).strip() if m is not None else None

def match_name(text):
    """Shortcut for ``match`` with the property fixed to ``'name'``."""
    return match(prop='name', text=text)

In [None]:
# Add one 'matched_<field>' column per field extracted from the description.
# The field is bound as a lambda default so each column uses its own value.
for _field in ('name', 'category', 'city', 'address'):
    df_sample['matched_' + _field] = df_sample['description'].apply(
        lambda x, _field=_field: match(_field, x)
    )

In [None]:
# Number of rows with an extracted name (count() skips missing values).
df_sample['matched_name'].count()

In [None]:
# Number of rows with an extracted city (count() skips missing values).
df_sample['matched_city'].count()

In [None]:
# Number of rows with an extracted address (count() skips missing values).
df_sample['matched_address'].count()

In [None]:
# Number of rows with an extracted category (count() skips missing values).
df_sample['matched_category'].count()

In [None]:
# Distinct, non-empty category strings extracted from the descriptions.
# dropna() removes None/NaN so the join below only ever sees strings, and the
# truthiness filter drops '' (an empty alternative would match everywhere).
categories = [c for c in df_sample['matched_category'].dropna().unique() if c]

# Categories are free text and may contain regex metacharacters such as
# "(", ")" or "+", so every alternative is escaped before being joined.
# With no categories at all, fall back to "(?!)", which never matches.
cat_group = "|".join(re.escape(c) for c in categories) or "(?!)"
# A name is a category word followed by everything up to the next comma.
regex = "(?P<name>({}).*?),".format(cat_group)

r = compile(regex, flags=re.IGNORECASE)

def match_title(title):
    """Return the venue name found in a title, or ``None``.

    Searches ``title`` with the module-level compiled pattern ``r`` and
    returns its ``name`` group.

    Args:
        title: Title text to search.

    Returns:
        The text of the ``name`` group, or ``None`` when ``title`` is not a
        string or the pattern does not match.
    """
    # A missing title may arrive as None or NaN; searching a non-string
    # raises TypeError, so treat it as "no match".
    if not isinstance(title, str):
        return None
    m = r.search(title)
    if m is None:
        return None
    return m.group('name')

# Print every distinct result of match_title over the titles; titles with no
# match contribute a single None entry.
for row in df_sample['title'].apply(match_title).unique():
    print(row)
    
    


In [None]:
# Store the regex-based name extracted from each title.
df_sample['matched_title'] = df_sample['title'].apply(match_title)

In [None]:
# Show id, title-derived name and description-derived name side by side;
# dropna(how='all') only removes rows where all three columns are missing.
df_sample[['id', 'matched_title', 'matched_name']].dropna(how='all')

In [183]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import re

def parse_category(category):
    """Split a raw category string into normalised category names.

    The string is split on ``-``, ``\\``, ``/`` and ``,``; each piece is
    stripped of surrounding whitespace and lower-cased.

    Args:
        category: Raw category text.

    Returns:
        A list of non-empty, lower-cased category names, in order of
        appearance. An empty or separator-only input yields ``[]``.
    """
    pieces = (part.strip().lower() for part in re.split(r'[-\\/,]', category))
    # Adjacent or trailing separators produce empty fragments; an empty
    # category is meaningless and would become an empty regex alternative
    # when the categories are joined into a pattern later on.
    return [piece for piece in pieces if piece]

def flatten(l):
    """Concatenate an iterable of iterables into a single flat list.

    Args:
        l: Iterable whose items are themselves iterables (e.g. a list of
            lists).

    Returns:
        A new list with the items of every inner iterable, in order.
    """
    from itertools import chain

    # sum(l, []) copies the accumulated list on every addition (quadratic);
    # chain.from_iterable walks each inner iterable exactly once.
    return list(chain.from_iterable(l))

# Split every raw category into its parts, pool the parts, then deduplicate.
all_cats = flatten(list(map(parse_category, categories)))
unique_cats = set(all_cats)
unique_cats

{'bar',
 'bar and dinner',
 'bar and food',
 'bar and grill',
 'bar&dinner',
 'bar&grill',
 'cafeteria & gelateria & pasticeria',
 'club',
 'diner & bar',
 'kafe bar',
 'kafene',
 'pizza & restaurant',
 'playground',
 'автобус',
 'автобусна линия 83',
 'административна сграда',
 'аператив',
 'аператив  център',
 'аперитив',
 'аперитив и магазин',
 'арабски ресторант',
 'бaр',
 'бар',
 'бар билярд',
 'бар енд динър',
 'бар и грил',
 'бар и кафе',
 'бар и клуб',
 'бар и ресторант',
 'бар и хапване',
 'бар кафе',
 'бар клуб',
 'бар ресторант',
 'барбекю',
 'барове',
 'бензиностанция',
 'билярд клуб',
 'бирария',
 'бирария и механа',
 'бистро',
 'бистро (кръчма)',
 'бистро и бензиностанция',
 'бистро+бензиностанция',
 'болница',
 'боулинг',
 'боулинг бар',
 'висше учебно зaведение',
 'вход',
 'входа на блока',
 'детска градина',
 'детска площадка',
 'детската площадка',
 'дискотекa',
 'дискотека',
 'една от големите зали на приземен етаж',
 'електронно казино',
 'жилищен блок',
 'заведение

In [232]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag, RegexpTagger
from nltk.chunk.regexp import RegexpChunkParser, RegexpChunkRule
from nltk.tree import Tree

# One alternation of all known category words. Each alternative is escaped
# (categories contain characters such as "(", ")" and "+"), empty entries are
# dropped, and the alternatives are sorted so the pattern is the same on every
# run even though ``unique_cats`` is an unordered set.
cat_group = "|".join(sorted(re.escape(cat) for cat in unique_cats if cat))

# Patterns are listed from most to least specific.
regexp_tagger = RegexpTagger([
    # The non-capturing group is required: in "^a|b|c$" the anchors bind only
    # to the first and last alternative, so every other category used to match
    # as a mere prefix of the token.
    # NOTE(review): tokens that only *start* with a category (e.g. inflected
    # forms) are no longer tagged PLACE_MARKER; add such forms to the category
    # list if that recall is wanted.
    (r"^(?:{})$".format(cat_group), 'PLACE_MARKER'),
    (r'^[,]$', 'DT'),
    (r'^в$', 'BEG'),
    (r'^[A-Za-z0-9 &]+$', 'LATIN_WORD'),
    (r'^[а-яА-Я0-9 &]+$', 'CYRILLIC_WORD')
])

def tokenize(title):
    """Split a title into words and tag them, dropping untagged words.

    The title is tokenized with ``word_tokenize`` and tagged with the
    module-level ``regexp_tagger``. Returns the list of (word, tag) pairs
    whose tag is not ``None``.
    """
    tagged = regexp_tagger.tag(word_tokenize(title))
    kept = []
    for pair in tagged:
        if pair[1] is None:
            # No tagger pattern matched this word.
            continue
        kept.append(pair)
    return kept

# Chunk rules, one per pattern. Every rule ends with a lookahead, so a chunk
# must be followed by a <DT> or <BEG> tag that is not itself part of the chunk.
_NAME_RULES = (
    # <BEG>, one or more words, one or more place markers, optional words.
    r'{<BEG> (<LATIN_WORD>|<CYRILLIC_WORD>)+ <PLACE_MARKER>+ (<LATIN_WORD>|<CYRILLIC_WORD>)* (?=<DT>|<BEG>)}',
    # One or more (place markers + words) groups, optional trailing place markers.
    r'{(<PLACE_MARKER>+ (<LATIN_WORD>|<CYRILLIC_WORD>)+)+ <PLACE_MARKER>* (?=<DT>|<BEG>)}',
    # One or more place markers followed by one or more words.
    r'{<PLACE_MARKER>+ (<LATIN_WORD>|<CYRILLIC_WORD>)+ (?=<DT>|<BEG>)}',
)

chuncker = RegexpChunkParser(
    [RegexpChunkRule.fromstring(rule) for rule in _NAME_RULES],
    chunk_label='Name',
)

def get_words(tree):
    """Return the first element of every leaf of ``tree``, in leaf order."""
    return list(map(lambda leaf: leaf[0], tree.leaves()))

def find_name(title):
    """Extract a venue name from a title using the tagger and chunk grammar.

    The title is lower-cased, tokenized and tagged with ``tokenize``, and
    chunked with ``chuncker``. The words of every chunk are joined with
    single spaces, leaving out the word "в".

    Args:
        title: Title text.

    Returns:
        ``None`` when ``title`` is empty or not a string; otherwise the joined
        chunk words, which is ``""`` when no chunk was found.
    """
    # `title is ""` compared object identity with a literal, which is not a
    # reliable emptiness test; non-string titles (None/NaN) would also fail
    # on .lower() below.
    if not isinstance(title, str) or not title:
        return None

    tokens = tokenize(title.lower())
    tree = chuncker.parse(tokens)
    # Only subtrees are chunks; bare (word, tag) leaves were left unchunked.
    words = [
        word
        for subtree in tree
        if isinstance(subtree, Tree)
        for word in get_words(subtree)
        if word != "в"
    ]
    return " ".join(words)

# Store the tagger/chunker-based name extracted from each title.
df_sample['matched_title_2'] = df_sample['title'].apply(find_name)

In [233]:
# Compare the regex-based and chunker-based names on rows 100-149.
df_sample[['id', 'matched_title', 'matched_title_2']][100:150]

Unnamed: 0,id,matched_title,matched_title_2
100,9931,заведение Маки,заведение маки
101,9932,ресторант SASA Paradise Mall,ресторант sasa paradise mall
102,9933,бар Cheers,бар cheers
103,9938,"механа ""Одаята""",механа одаята
104,10034,"бирария ""Маки """,бирария маки
105,10036,"ресторант ""Mr. Pizza""",ресторант pizza
106,10035,"бар ""Градини""",ретро бар градини
107,10032,жилищен блок,
108,10041,пицария Amaro,пицария amaro
109,10039,ресторант Ариана,ресторант ариана


In [228]:
# Scratch cell: replays the steps of find_name on a single title so the
# intermediate token tags can be inspected.
title = 'Нарушение в PM Club, град София'.lower()
tokens = tokenize(title)
tree = chuncker.parse(tokens)
# Keep only the chunked subtrees and collect their words.
chunk_tree = [get_words(t) for t in tree if type(t) is Tree]
flattened = sum(chunk_tree, [])
filtered = [w for w in flattened if w != "в"]

# Only the tagged tokens are displayed; `filtered` is computed but not shown.
tokens

[('нарушение', 'CYRILLIC_WORD'),
 ('в', 'BEG'),
 ('pm', 'LATIN_WORD'),
 ('club', 'PLACE_MARKER'),
 (',', 'DT'),
 ('град', 'CYRILLIC_WORD'),
 ('софия', 'CYRILLIC_WORD')]