In [1]:
import pandas as pd
import nltk
from nltk.tokenize import TweetTokenizer

In [2]:
# Load the stop-word list (presumably Bulgarian, judging by the file name) and
# keep only the part of it from slice position 259 onward.
# NOTE(review): 259 is an unexplained magic number, and stopwords_sample is not
# referenced by any of the cells visible here — confirm the intended slice and
# whether stop words were meant to be filtered out of the tokens.
stopwords_sample = pd.read_json('stopwords-bg.json')[259:]

In [3]:
# Load the sample of reports to be indexed from a local JSON file
# (path is relative to the notebook's working directory).
sample = pd.read_json('sample_100_pages_names.json')

In [4]:
# List the columns available in the loaded sample.
sample.columns

Index(['categories', 'date', 'description', 'files', 'id', 'location', 'title',
       'matched_name', 'matched_category', 'matched_city', 'matched_address',
       'matched_title', 'matched_title_2'],
      dtype='object')

In [5]:
# Text columns whose contents are tokenized for the index.
columns_for_index = ['matched_name', 'matched_title', 'matched_title_2']

# NOTE(review): per the NLTK docs, preserve_case=False lower-cases tokens and
# strip_handles=True drops @handles — confirm against the installed version.
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)

# Replace missing values with empty strings so the tokenizer always gets text.
sample[columns_for_index] = sample[columns_for_index].fillna("")

# Store each column's token lists next to it as '<column>_tokens'.
for source_column in columns_for_index:
    token_lists = sample[source_column].apply(tokenizer.tokenize)
    sample['{}_tokens'.format(source_column)] = token_lists
    

In [6]:
# Row by row, concatenate the three token lists into one combined list.
# Tokens that occur in several source columns are kept once per occurrence.
sample['tokens'] = (
    sample['matched_name_tokens']
    + sample['matched_title_tokens']
    + sample['matched_title_2_tokens']
)

In [7]:
# Show each report id alongside its combined token list.
sample[['id', 'tokens']]

Unnamed: 0,id,tokens
0,10244,"[сладкарница, малинка, сладкарница, малинка]"
1,10249,[]
10,10207,"[заведение, златна, белка, заведение, златна, ..."
100,9931,"[заведение, маки, заведение, маки]"
1000,9848,"[пицария, ветрило, пицария, ветрило]"
1001,9847,"[ресторант, хасиенда, ресторант, хасиенда]"
1002,9852,"[corner, bar, corner, bar]"
1003,9842,[]
1004,9845,[]
1005,9844,[]


### Get (token, id) pairs

In [8]:
def get_report(row):
    """Return the second element of ``row``.

    Intended for the ``(index, row)`` pairs produced by
    ``DataFrame.iterrows()``: the index is discarded and the row is kept.
    """
    report = row[1]
    return report

def get_token_report_id_pairs(reports):
    """Flatten reports into a list of ``(token, report id)`` pairs.

    Args:
        reports: iterable of objects exposing ``.id`` and ``.tokens``
            attributes; it is consumed in a single pass.

    Returns:
        A list with one ``(token, report.id)`` tuple per token occurrence,
        in report order and then token order. Repeated tokens are kept;
        reports without tokens contribute nothing.
    """
    return [
        (token, report.id)
        for report in reports
        for token in report.tokens
    ]
    
    
# iterrows() yields (index, row) pairs; get_report keeps only the row, so the
# pair builder sees objects with .id and .tokens.
report_rows = sample[['id', 'tokens']].iterrows()
token_id_pairs = get_token_report_id_pairs(map(get_report, report_rows))

# Peek at the first twenty pairs.
token_id_pairs[:20]

[('сладкарница', 10244),
 ('малинка', 10244),
 ('сладкарница', 10244),
 ('малинка', 10244),
 ('заведение', 10207),
 ('златна', 10207),
 ('белка', 10207),
 ('заведение', 10207),
 ('златна', 10207),
 ('белка', 10207),
 ('заведение', 9931),
 ('маки', 9931),
 ('заведение', 9931),
 ('маки', 9931),
 ('пицария', 9848),
 ('ветрило', 9848),
 ('пицария', 9848),
 ('ветрило', 9848),
 ('ресторант', 9847),
 ('хасиенда', 9847)]

In [9]:
from operator import itemgetter

# Order the pairs by token only. The sort is stable, so pairs sharing a token
# keep their original relative order (report ids are NOT sorted), which keeps
# repeated (token, id) pairs from one report adjacent for the merge step.
sorted_token_id = list(token_id_pairs)
sorted_token_id.sort(key=itemgetter(0))

# Peek at the last ten pairs.
sorted_token_id[-10:]

[('ягода', 7653),
 ('ямас', 9277),
 ('ямас', 9277),
 ('янка', 8891),
 ('янка', 8891),
 ('янка', 8472),
 ('янка', 8472),
 ('янтра', 10562),
 ('янтра', 10562),
 ('янтра', 10562)]

### Merge token occurrences for each report

In [10]:
def merge_token_in_report(sorted_token_id):
    """Collapse runs of identical ``(token, id)`` pairs into counted triples.

    Args:
        sorted_token_id: iterable of ``(token, id)`` pairs. Only *adjacent*
            equal pairs are merged, so equal pairs must already be next to
            each other (e.g. after sorting).

    Returns:
        A list of ``(token, id, frequency)`` tuples in input order, where
        ``frequency`` is the length of each run of equal pairs.
    """
    merged = []
    for token, report_id in sorted_token_id:
        # Extend the current run only when both token and id match the last entry.
        if merged and merged[-1][0] == token and merged[-1][1] == report_id:
            merged[-1] = (token, report_id, merged[-1][2] + 1)
        else:
            merged.append((token, report_id, 1))
    return merged

In [11]:
# Collapse adjacent repeated (token, id) pairs into (token, id, frequency)
# triples, then peek at the last ten.
token_id_freq = merge_token_in_report(sorted_token_id)
token_id_freq[-10:]

[('южен', 10425, 3),
 ('южния', 7923, 2),
 ('юнион', 6859, 2),
 ('ябълка', 7840, 2),
 ('ягода', 7651, 4),
 ('ягода', 7653, 2),
 ('ямас', 9277, 2),
 ('янка', 8891, 2),
 ('янка', 8472, 2),
 ('янтра', 10562, 3)]

### Create Dictionary and Postings

In [19]:
from collections import defaultdict

# token -> (number of reports containing the token,
#           total occurrences of the token across all reports)
dictionary = defaultdict(lambda: (0, 0))
# token -> list of (report id, occurrences of the token in that report)
postings = defaultdict(list)

for token, report_id, freq in token_id_freq:
    report_count, total_freq = dictionary[token]
    # The running total must extend the previous total (element 1 of the
    # tuple). Adding freq to the report count (element 0) gives wrong totals
    # for every token that appears in more than one report.
    dictionary[token] = (report_count + 1, total_freq + freq)
    postings[token].append((report_id, freq))

In [23]:
# Spot-check the dictionary entry for a single token.
# NOTE: dictionary is a defaultdict, so looking up a token that was never
# indexed silently inserts a (0, 0) entry for it.
dictionary['малинка']

(1, 2)

In [21]:
# Spot-check the postings list, i.e. (report id, frequency) pairs, for the token.
postings['малинка']

[(10244, 2)]