In [1]:
import pandas as pd
import nltk
from nltk.tokenize import TweetTokenizer

In [2]:
# Load the Bulgarian stop-word list and drop its first 259 rows.
# NOTE(review): 259 is a magic offset; the reason for skipping those rows is
# not visible here — confirm and give it a name.
stopwords_sample = pd.read_json('stopwords-bg.json')[259:]

In [3]:
# Keep only the stop words that are at most three characters long
# (column 0 of stopwords_sample holds the words).
short_stop_word = stopwords_sample.loc[
    stopwords_sample[0].map(lambda word: len(word) <= 3), 0
]

In [4]:
full_sample = pd.read_json('sample_100_pages_names.json')

In [5]:
full_sample.columns

Index(['categories', 'date', 'description', 'files', 'id', 'location', 'title',
       'matched_name', 'matched_category', 'matched_city', 'matched_address',
       'matched_title', 'matched_title_2'],
      dtype='object')

In [6]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# every column added to `sample` below is also visible on `full_sample`.
sample = full_sample

In [7]:
# Text columns that feed the inverted index.
columns_for_index = ['matched_name', 'matched_title', 'matched_title_2']

# preserve_case=False lower-cases the tokens, strip_handles=True removes @handles.
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)

# Missing values become empty strings so every cell can be tokenized.
sample[columns_for_index] = sample[columns_for_index].fillna("")

# Add a "<column>_tokens" column holding the token list of each indexed column.
for col in columns_for_index:
    sample['{}_tokens'.format(col)] = sample[col].map(tokenizer.tokenize)
    

In [8]:
# `+` on columns of lists concatenates the lists row by row, so a token that
# appears in several of the three columns is repeated in `tokens`.
sample['tokens'] = sample['matched_name_tokens'] + sample['matched_title_tokens'] + sample['matched_title_2_tokens']

In [9]:
sample[['id', 'tokens']]

Unnamed: 0,id,tokens
0,10244,"[сладкарница, малинка, сладкарница, малинка]"
1,10249,[]
10,10207,"[заведение, златна, белка, заведение, златна, ..."
100,9931,"[заведение, маки, заведение, маки]"
1000,9848,"[пицария, ветрило, пицария, ветрило]"
1001,9847,"[ресторант, хасиенда, ресторант, хасиенда]"
1002,9852,"[corner, bar, corner, bar]"
1003,9842,[]
1004,9845,[]
1005,9844,[]


### Get (token, id) pairs

In [10]:
def get_report(row):
    """Return the second element of ``row``, i.e. the row data of an ``(index, row)`` pair."""
    report = row[1]
    return report

def get_token_report_id_pairs(reports):
    """Flatten reports into a list of ``(token, report id)`` pairs.

    Every report must expose a ``tokens`` iterable and an ``id`` attribute.
    One pair is produced per token occurrence, in report order, so a token
    repeated inside a report yields repeated pairs.
    """
    return [
        (token, report.id)
        for report in reports
        for token in report.tokens
    ]
    
    
# iterrows() yields (index, row) pairs; get_report keeps only the row, whose
# `id` and `tokens` fields are then flattened into (token, id) pairs.
token_id_pairs = get_token_report_id_pairs(map(get_report, sample[['id', 'tokens']].iterrows()))
# Peek at the first 20 pairs.
token_id_pairs[:20]

[('сладкарница', 10244),
 ('малинка', 10244),
 ('сладкарница', 10244),
 ('малинка', 10244),
 ('заведение', 10207),
 ('златна', 10207),
 ('белка', 10207),
 ('заведение', 10207),
 ('златна', 10207),
 ('белка', 10207),
 ('заведение', 9931),
 ('маки', 9931),
 ('заведение', 9931),
 ('маки', 9931),
 ('пицария', 9848),
 ('ветрило', 9848),
 ('пицария', 9848),
 ('ветрило', 9848),
 ('ресторант', 9847),
 ('хасиенда', 9847)]

In [11]:
from operator import itemgetter
# Order the pairs by token only. sorted() is stable, so pairs that share a
# token keep their original relative order (ids are not sorted here).
sorted_token_id = sorted(token_id_pairs, key=itemgetter(0))
# Peek at the last 10 pairs.
sorted_token_id[-10:]

[('ягода', 7653),
 ('ямас', 9277),
 ('ямас', 9277),
 ('янка', 8891),
 ('янка', 8891),
 ('янка', 8472),
 ('янка', 8472),
 ('янтра', 10562),
 ('янтра', 10562),
 ('янтра', 10562)]

### Merge token occurrences for each report

In [12]:
def merge_token_in_report(sorted_token_id):
    """Collapse runs of identical ``(token, id)`` pairs into counted triples.

    Only *adjacent* equal pairs are merged, so the input has to be ordered in
    a way that puts equal pairs next to each other.

    Returns a list of ``(token, id, frequency)`` tuples in input order.
    """
    merged = []
    for token, report_id in sorted_token_id:
        same_as_last = False
        if merged:
            last_token, last_id, _ = merged[-1]
            same_as_last = last_token == token and last_id == report_id
        if same_as_last:
            # Extend the current run instead of starting a new entry.
            merged[-1] = (token, report_id, merged[-1][2] + 1)
        else:
            merged.append((token, report_id, 1))
    return merged

In [13]:
token_id_freq = merge_token_in_report(sorted_token_id)
token_id_freq[-10:]

[('южен', 10425, 3),
 ('южния', 7923, 2),
 ('юнион', 6859, 2),
 ('ябълка', 7840, 2),
 ('ягода', 7651, 4),
 ('ягода', 7653, 2),
 ('ямас', 9277, 2),
 ('янка', 8891, 2),
 ('янка', 8472, 2),
 ('янтра', 10562, 3)]

### Create Dictionary and Postings

In [14]:
from collections import defaultdict
# dictionary: token -> (number of (token, id) entries for the token,
#                       total occurrences of the token over those entries)
dictionary = defaultdict(lambda: (0, 0))
# postings: token -> list of (id, frequency) tuples
postings = defaultdict(list)

# Single pass over the (token, id, freq) triples: each triple adds one entry
# and `freq` occurrences to its token's statistics and one posting.
# The loop variable is called doc_id (not id) so that the builtin id() is not
# rebound for the rest of the notebook.
for token, doc_id, freq in token_id_freq:
    doc_count, total_freq = dictionary[token]
    dictionary[token] = (doc_count + 1, total_freq + freq)
    postings[token].append((doc_id, freq))

In [15]:
dictionary['пица']

(9, 23)

In [16]:
postings['pizza']

[(10036, 3),
 (9096, 3),
 (9236, 3),
 (9282, 3),
 (8435, 2),
 (8278, 3),
 (8388, 3),
 (8422, 3),
 (8328, 3),
 (8346, 2),
 (8501, 2),
 (8465, 3),
 (8662, 3),
 (10151, 3),
 (10355, 3),
 (7682, 3),
 (6896, 2),
 (9473, 1)]

Sort the postings

In [17]:
# Order every posting list by id (the first element of each tuple).
for key, values in postings.items():
    values.sort(key=itemgetter(0))

In [18]:
import numpy as np
from functools import reduce

def and_query(words):
    """
    Find all the documents that contain every word in ``words``.

    Returns a dict mapping document id to the sum of the frequencies of the
    queried words in that document. A word that is not in the index matches
    no document, so the result is then empty. An empty ``words`` also yields
    an empty dict.

    ``postings`` is read with ``.get`` so that looking up an unknown word does
    not insert an empty posting list into the (defaultdict) index.
    """
    words = list(words)
    if not words:
        return {}
    # One {id: freq} mapping per queried word.
    occurences = [dict(postings.get(word, ())) for word in words]
    # Ids present in every mapping (iterating a dict yields its keys).
    common = set(occurences[0]).intersection(*occurences[1:])
    return {doc_id: sum(occ[doc_id] for occ in occurences) for doc_id in common}

In [19]:
and_query(['mr', 'pizza'])

{9282: 6,
 8388: 6,
 8422: 5,
 9096: 6,
 6896: 3,
 8465: 5,
 8435: 3,
 10036: 5,
 10355: 5,
 8662: 5,
 8346: 3}

### TF-IDF

In [20]:
columns_for_search = ['matched_name', 'matched_title', 'matched_title_2', 'short_desc']

In [21]:
# NOTE: no copy is made — search_sample is another name for the full_sample
# DataFrame, so the columns added below also appear on full_sample.
search_sample = full_sample

In [22]:
# Keep only the text before the first '*' of each description.
# NOTE(review): assumes every description is a string; a missing value would
# make .split fail — confirm against the data.
search_sample['short_desc'] = search_sample['description'].apply(lambda x: x.split('*')[0])

In [23]:
# search_sample and full_sample are the same object, so this repeats the
# fillna("") that was already applied to these columns when the index was built.
search_sample[columns_for_index] = full_sample[columns_for_index].fillna("")

# Tokenize every search column (now including short_desc) into "<column>_tokens".
for col in columns_for_search:
    search_sample[col + '_tokens'] = search_sample[col].apply(tokenizer.tokenize)

In [24]:
# Field weighting by repetition: multiplying a token list by n repeats it n
# times, so the name counts 5x, the title 4x, the second title 2x and the
# short description once in the combined token list.
search_sample['search_tokens'] = (
    search_sample['matched_name_tokens'] * 5 # most trustworthy field
    + search_sample['matched_title_tokens'] * 4
    + search_sample['matched_title_2_tokens'] * 2
    + search_sample['short_desc_tokens']
)

In [25]:
search_sample['search_tokens'][:3]

0     [сладкарница, малинка, сладкарница, малинка, с...
1     [в, глори, бар, и, грил, се, пуши, дори, през,...
10    [заведение, златна, белка, заведение, златна, ...
Name: search_tokens, dtype: object

In [26]:
# Join each token list back into one space-separated string for the vectorizers.
search_sample['search_tokens_text'] = search_sample['search_tokens'].map(' '.join)

In [27]:
search_sample['search_tokens_text'][:3]

0     сладкарница малинка сладкарница малинка сладка...
1           в глори бар и грил се пуши дори през деня .
10    заведение златна белка заведение златна белка ...
Name: search_tokens_text, dtype: object

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
count_vectorizer = CountVectorizer(stop_words=short_stop_word.tolist())

In [30]:
count_vectorizer.fit(search_sample['search_tokens_text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['а', 'аз', 'ако', 'ала', 'бе', 'без', 'би', 'бил', 'в', 'вас', 'ваш', 'ви', 'вие', 'все', 'във', 'г', 'ги', 'го', 'д', 'да', 'два', 'две', 'ден', 'дни', 'до', 'е', 'ето', 'за', 'зад', 'и', 'из', 'или', 'им', 'има', 'й', 'как', 'кой', 'към', 'ли', 'лош', 'м', 'май', 'ме', 'мек', 'мен', 'м..., 'те', 'ти', 'то', 'той', 'три', 'тук', 'тъй', 'тя', 'тях', 'у', 'ч', 'че', 'ще', 'щом', 'я', 'як'],
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [31]:
# print(count_vectorizer.vocabulary_)

In [32]:
count_vectorizer.transform(search_sample['search_tokens_text']).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=2 drops terms that occur in fewer than two documents.
tfidf_vectorizer = TfidfVectorizer(min_df=2, stop_words=short_stop_word.tolist())
# Only the fitted vocabulary/idf weights are needed here. The previous
# fit_transform(...).toarray() also built a dense documents-by-terms array
# that was thrown away immediately.
tfidf_vectorizer.fit(search_sample['search_tokens_text'])
0  # placeholder last expression: keeps the cell output to a single 0

0

In [34]:
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
cosine_similarity([[1.1,2,2]], [[1,2,2]])

array([[0.99951732]])

In [36]:
from fuzzywuzzy import fuzz

In [37]:
fuzz.ratio('лаплаза', 'плаза')

83

In [38]:
def get_closest_documents(query, vectorizer, train_corpus_vectors, top_n=10):
    """Rank the corpus rows by cosine similarity to ``query``.

    ``vectorizer`` must already be fitted on the documents; it is only used to
    transform the query. Returns at most ``top_n`` tuples of
    ``(similarity, row position)``, most similar first, where the position is
    the row index into ``train_corpus_vectors``.
    """
    query_vector = vectorizer.transform([query])
    # One similarity score per corpus row.
    similarities = cosine_similarity(query_vector, train_corpus_vectors).flatten()
    # argsort is ascending: reverse it and keep the first top_n positions.
    best_first = similarities.argsort()[::-1][:top_n]
    closest = []
    for idx in best_first:
        closest.append((similarities[idx], idx))
    return closest

In [39]:
tfidf_vectorizer.vocabulary_

{'сладкарница': 4105,
 'пушат': 3658,
 'навсякъде': 2458,
 'глори': 1065,
 'бар': 570,
 'грил': 1130,
 'пуши': 3675,
 'дори': 1335,
 'през': 3489,
 'деня': 1204,
 'заведение': 1524,
 'златна': 1651,
 'залата': 1579,
 'заведението': 1525,
 'поднасят': 3229,
 'пластмасови': 3168,
 'чаши': 4806,
 'вода': 912,
 'вместо': 906,
 'пепелници': 3104,
 'гости': 1101,
 'своите': 3981,
 'деца': 1221,
 'въпреки': 997,
 'това': 4477,
 'маки': 2235,
 '24': 88,
 '11': 22,
 '2017': 69,
 '19': 55,
 '30': 99,
 'часа': 4793,
 '20': 60,
 'маси': 2272,
 'спокойно': 4223,
 'пицария': 3150,
 'ветрило': 808,
 'необезпокоявано': 2644,
 'съвсем': 4362,
 'нагло': 2463,
 'ресторант': 3854,
 'хасиенда': 4699,
 'съобщавам': 4383,
 'неспазване': 2672,
 'закона': 1566,
 'цигарите': 4773,
 'съответното': 4389,
 'corner': 229,
 'bar': 187,
 'искам': 1825,
 'подам': 3221,
 'сигнал': 4052,
 'системно': 4082,
 'нарушаване': 2567,
 'забраната': 1515,
 'тютюнопушене': 4562,
 'намиращ': 2503,
 'гр': 1112,
 'пловдив': 3180,
 '

In [40]:
train_corpus_vectors = tfidf_vectorizer.transform(search_sample['search_tokens_text'])

In [41]:
train_corpus_vectors

<3000x4912 sparse matrix of type '<class 'numpy.float64'>'
	with 67120 stored elements in Compressed Sparse Row format>

In [42]:
query = 'плаза варна'

In [43]:
# Quick check of the query vector: transform() returns a 1-row sparse matrix
# and sorted() just wraps that single row in a list.
# NOTE(review): sorted() does not seem to serve a purpose here — confirm.
sorted(tfidf_vectorizer.transform([query]))

[<1x4912 sparse matrix of type '<class 'numpy.float64'>'
 	with 2 stored elements in Compressed Sparse Row format>]

In [44]:
closest_documents = get_closest_documents(query, tfidf_vectorizer, train_corpus_vectors)

In [45]:
closest_documents

[(0.7711498216065449, 2864),
 (0.7596328477906219, 1732),
 (0.7529446438772612, 1969),
 (0.714017970072695, 2417),
 (0.6971656188536786, 2987),
 (0.2985200223861522, 2458),
 (0.2607980646007543, 2234),
 (0.23579959521817004, 1688),
 (0.23003808925793026, 2893),
 (0.18353721896845113, 1497)]

In [46]:
# Print the titles of the hits whose cosine similarity exceeds 0.70.
# `prob` is the similarity score (not a probability); `doc` is a row position,
# hence the positional .iloc lookup.
# NOTE(review): 0.70 is a hard-coded cut-off — confirm how it was chosen.
for prob, doc in closest_documents:
    if prob > 0.70:
        print(search_sample.iloc[doc]['title'])

Пушене в заведение "Плаза", град София
пушене в Нощен Клуб "Плаза", гр. София, Студентски Град (мобилен сигнал)
Пушене в ресторант "Плаза", град Монтана
Тютюнопушене в дискотека Плаза, град Варна


In [47]:
# Vocabulary of every token that occurs in a matched title.
# (full_sample has this column because `sample` is an alias of it.)
# set().union(*lists) builds the set in one pass, whereas sum(lists, [])
# re-copies the growing list for every row.
title_words = set().union(*full_sample['matched_title_tokens'])
title_words

{'kitchen',
 'тилилей',
 'ракова',
 'секвоя',
 'шопа',
 'калабрия',
 'bee',
 'palace',
 'библиотека',
 'бъфало',
 'счетоводство-каса',
 'превръща',
 'дик',
 'маймунарника',
 'мукатини',
 'pasha',
 'тенис',
 'snake',
 'proya',
 'плаза',
 'крез',
 '60',
 'градски',
 'велико',
 'майчин',
 'nerra',
 'варна',
 'кристал',
 'рафи',
 'chill',
 'calgary',
 'новооткрития',
 'аракс',
 'marionette',
 'южния',
 'fun',
 'етажни',
 'biad',
 'тюлбето',
 'маестро',
 'димитровград',
 'мохито',
 'като',
 'кофеин',
 'перфект',
 'т',
 'липите',
 'ресторент',
 'за',
 'лейбъл',
 'vino',
 'иван',
 'кос',
 'ескада',
 'крос',
 'ресторанта',
 'кольо',
 'maze',
 'dirty',
 '8',
 'спортен',
 'eisha',
 'нестинарка',
 'go',
 'traffic',
 'ибър',
 'be',
 'envy',
 'арена',
 'time',
 'тютюнопушене',
 'даяна',
 'от',
 'таралежат',
 'хелена',
 'уиски',
 'mia',
 'шопите',
 'барове',
 'колйнс',
 'бонини',
 'генгер',
 'бирария',
 'марина',
 'mamma',
 'светулки',
 '101',
 'festivalna',
 'зона',
 'diner',
 'caffe',
 'антик',
 '

In [48]:
def find_closest_word(word, set_of_words):
    """Return ``word`` if it is a known word, otherwise the closest known word.

    Closeness is measured with ``fuzz.ratio``; the candidate with the highest
    ratio wins. Ties are broken alphabetically so the result does not depend
    on set iteration order. There is no minimum ratio: the best candidate is
    returned even when it is a poor match.

    When ``set_of_words`` is empty there is nothing to suggest, so ``word`` is
    returned unchanged (rather than an empty string that would erase it).
    """
    if word in set_of_words:
        return word
    return min(
        set_of_words,
        # Highest ratio first; the candidate itself decides ties.
        key=lambda candidate: (-fuzz.ratio(candidate, word), candidate),
        default=word,
    )

In [49]:
def edit_query(query):
    """Offer a corrected version of ``query`` and return the query to search for.

    Every whitespace-separated word is replaced by its closest word from
    ``title_words``. If at least one word changed, the user is asked (via
    ``input``) whether to use the corrected query; answering ``y`` (any case,
    surrounding spaces ignored) returns it. In every other case the original
    ``query`` is returned unchanged.
    """
    words = query.split()
    corrected = [find_closest_word(w, title_words) for w in words]

    # Compare word by word: comparing the joined string with the raw query
    # would also prompt when only the whitespace differs.
    if corrected != words:
        new_query = ' '.join(corrected)
        ans = input('Do you want to search for: {} instead?'.format(new_query))
        if ans.strip().lower() == 'y':
            return new_query

    return query

In [50]:
query_2 = 'бирария чиърс'

In [51]:
# edit_query calls input() when it has a correction to suggest, so this cell
# waits for an answer in that case.
query_2 = edit_query(query_2)

In [52]:
closest_documents_2 = get_closest_documents(query_2, tfidf_vectorizer, train_corpus_vectors)

In [53]:
# Print the titles of the hits whose cosine similarity exceeds 0.70.
# `prob` is the similarity score (not a probability); `doc` is a row position,
# hence the positional .iloc lookup.
for prob, doc in closest_documents_2:
    if prob > 0.70:
        print(search_sample.iloc[doc]['title'])

Пушене в бирария Талпа, град София  (мобилен сигнал)
Тютюнопушене на закрито в бирария 1516, град София
Пушене в бирария Камбаната, град Велико Търново (мобилен сигнал)
Пушене в бирария "6-ти май", град Русе
Пушене в БИРАРИЯ ЧИЪРС, град ПЛОВДИВ (мобилен сигнал)
Тютюнопушене в ресторант бирария Даикс, град Пловдив
Нарушение на забраната за пушене в бирария "Кръста", град София
Пушене в бирария/ресторант  "Бирария 43", град София (мобилен сигнал)
Пушене в бирария и механа Мока и Генгер, град Айтос (мобилен сигнал)
