In [1]:
from os.path import join
import pandas as pd
import nltk
from nltk.tokenize import TweetTokenizer
from operator import itemgetter
from collections import defaultdict
from functools import reduce
import operator

In [2]:
sample = pd.read_json(join('..', 'data', 'ner', 'sample_100_pages_names_tokens.json'))
sample = sample.reset_index(drop=True)

In [3]:
columns_for_index = ['matched_name', 'matched_title_str', 'matched_title_2_str', 'location']

tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)

sample[columns_for_index] = sample[columns_for_index].fillna("")

for col in columns_for_index:
    sample[col + '_tokens'] = sample[col].apply(tokenizer.tokenize)
    

Combine tokens from index columns

In [4]:
sample['tokens'] = (sample['matched_name_tokens'] +
                sample['matched_title_str_tokens'] +
                sample['matched_title_2_str_tokens'] +
                sample['location_tokens'])

In [5]:
sample[['id', 'tokens']][:20]

Unnamed: 0,id,tokens
0,10244,"[сладкарница, малинка, сладкарница, малинка, г..."
1,10249,"[град, софия, ,, ул, ., пирински, проход, 24, а]"
2,10207,"[заведение, златна, белка, заведение, златна, ..."
3,9931,"[заведение, маки, заведение, маки, град, сливе..."
4,9848,"[пицария, ветрило, пицария, ветрило, град, соф..."
5,9847,"[ресторант, хасиенда, ресторант, хасиенда, гра..."
6,9852,"[corner, bar, corner, bar, град, пловдив, ,, у..."
7,9842,"[град, софия, ,, ул, битоля, 4а]"
8,9845,"[град, софия, ,, пл, ., „, народно, събрание, ..."
9,9844,"[град, софия, ,, бул, ., иван, гешов, №, 15]"


### Get (token, id) pairs

In [6]:
def get_report(row):
    return row[1]

def get_token_report_id_pairs(reports):
    pairs = []
    for report in reports:
        pairs += [(token, report.id) for token in report.tokens]
            
    return pairs
    
    
token_id_pairs = get_token_report_id_pairs(map(get_report, sample[['id', 'tokens']].iterrows()))
token_id_pairs[:20]

[('сладкарница', 10244),
 ('малинка', 10244),
 ('сладкарница', 10244),
 ('малинка', 10244),
 ('град', 10244),
 ('софия', 10244),
 (',', 10244),
 ('кв', 10244),
 ('.', 10244),
 ('младост', 10244),
 (',', 10244),
 ('сп', 10244),
 ('.', 10244),
 ('окръжна', 10244),
 ('болница', 10244),
 (',', 10244),
 ('до', 10244),
 ('детската', 10244),
 ('градина', 10244),
 ('град', 10249)]

In [7]:
from operator import itemgetter
sorted_token_id = sorted(token_id_pairs, key=itemgetter(0))

### Merge token occurences for each report

In [8]:
def merge_token_in_report(sorted_token_id):
    token_id_freq = []
    for token, id in sorted_token_id:
        if token_id_freq:
            prev_tok, prev_id, prev_freq = token_id_freq[-1]
            if prev_tok == token and prev_id == id:     
                token_id_freq[-1] = (token, id, prev_freq+1)
            else:
                token_id_freq.append((token, id, 1))
        else:
            token_id_freq.append((token, id, 1))
    return token_id_freq

In [9]:
token_id_freq = merge_token_in_report(sorted_token_id)
token_id_freq[30700:30720]

[('участък', 8678, 1),
 ('учебни', 9890, 2),
 ('училище', 9349, 1),
 ('училище', 9692, 2),
 ('уши', 9595, 3),
 ('ф', 6884, 1),
 ('фaкултет', 7541, 1),
 ('фабрика', 8653, 1),
 ('фабрика', 6869, 1),
 ('фабриката', 7995, 2),
 ('фаворит', 6818, 2),
 ('фамилия', 9503, 3),
 ('фантазия', 8178, 2),
 ('фара', 10371, 1),
 ('фарът', 10371, 3),
 ('федора', 8904, 2),
 ('фейсис', 8807, 1),
 ('фейсис', 9066, 3),
 ('фейсис', 7707, 1),
 ('фейсис', 6795, 2)]

### Create Dictionary and Postings

In [10]:
from collections import defaultdict
dictionary = defaultdict(lambda: (0, 0))
postings = defaultdict(lambda: [])

for token, id, freq in token_id_freq:
    dictionary[token] = (dictionary[token][0] + 1, dictionary[token][1] + freq)

#postings
for token, id, freq in token_id_freq:
    postings[token].append((id, freq))

In [11]:
dictionary['pizza']

(18, 48)

In [12]:
postings['pizza']

[(10036, 3),
 (9096, 3),
 (9236, 3),
 (9282, 3),
 (8435, 2),
 (8278, 3),
 (8388, 3),
 (8422, 3),
 (8328, 3),
 (8346, 2),
 (8501, 2),
 (8465, 3),
 (8662, 3),
 (10151, 3),
 (10355, 3),
 (7682, 3),
 (6896, 2),
 (9473, 1)]

Sort the postings

In [13]:
for key, values in postings.items():
    postings[key] = sorted(values, key=itemgetter(0))

### "And" query for the postings

In [14]:
import numpy as np
from functools import reduce

def and_query(words):
    """
    Finds all the documents that contain all the words with the frequescies summed
    """
    occurences = [{id: freq for id, freq in postings[word]} for word in words]
    common = reduce(
        set.intersection,
        [{id for id, freq in occ.items()} for occ in occurences])
    return {id: sum([occ[id] for occ in occurences]) for id in common}

In [20]:
id_dict = and_query(['mr', 'pizza'])
sorted(id_dict.items(), key=operator.itemgetter(1), reverse=True)

[(9282, 6),
 (8388, 6),
 (9096, 6),
 (8422, 5),
 (8465, 5),
 (10036, 5),
 (10355, 5),
 (8662, 5),
 (6896, 3),
 (8435, 3),
 (8346, 3)]

In [21]:
def parse_query(query_string):
    return tokenizer.tokenize(query_string)

def ids_to_rows(ids):
    return sample[sample.apply(lambda x: x.id in ids, axis=1)]

def find_matches(words):
    id_dict = and_query(words)
    sorted_by_freq = sorted(id_dict.items(), key=operator.itemgetter(1), reverse=True)
    ids = [id for id, freq in sorted_by_freq]
    return ids_to_rows(ids)

In [24]:
words = parse_query('Mr Pizza')
find_matches(words)[['id', 'title']]

Unnamed: 0,id,title
58,10036,"Пушене в ресторант ""Mr. Pizza"", град София (м)"
283,9096,"Пушене в ресторант Mr Pizza, град София (моб..."
470,9282,"Пушене в ресторант Mr Pizza, град София (моб..."
711,8435,"Пуши се в ресторант Mr. Pizza, град София"
840,8388,"Пушене в ресторант Mr Pizza, град София (мобил..."
867,8422,"Пушене в пицария Mr. Pizza, град София (мобиле..."
894,8346,"Пушене в ресторант Mr. Pizza, град София"
938,8465,"Пушене в ресторант Mr. Pizza, град София (моби..."
1074,8662,"Пушене в ресторант Mr. Pizza - Младост, град С..."
1112,10355,"Пушене в ресторант ""Mr. Pizza"", град София (м)"
