In [325]:
import pandas as pd
import numpy as np
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import OrderedDict
import unicodedata

In [326]:
STOPWORDS = set(stopwords.words("portuguese"))
stemmer = PorterStemmer()

In [327]:
attributes = ["title", "price", "colors", "sizes", "description", "url", "html"]
df = pd.read_csv("../Results/wrapper_docs.csv").drop(["Unnamed: 0"], axis=1)
df["price"] = df["price"].replace('[\/R$,]', '', regex=True).apply(lambda x: float(x) / 100)
prices = df["price"]
qrts = [prices.quantile(.25), prices.quantile(.5), prices.quantile(.75), prices.quantile(1)]
columns = df.columns

In [328]:
def quartile(value):
    qrt = ""
    if value >= qrts[3]:
        qrt = "4q"
    elif value >= qrts[2]:
        qrt = "3q"
    elif value >= qrts[1]:
        qrt = "2q"
    elif value > 0:
        qrt = "1q"
    else:
        qrt = ""
    return qrt

In [329]:
df["price"]  = df["price"].apply(lambda x: quartile(x))

In [330]:
def filter_lossy(words):
    words = [w for w in words if w not in STOPWORDS]
    return [stemmer.stem(w) for w in words]

In [331]:
def tokenize(word):
    nfkd = unicodedata.normalize('NFKD', str(word))
    doc = nfkd.encode('ASCII', 'ignore').decode('ASCII')
    d = re.sub("[^a-zA-Z0-9]", " ", doc)
    words = d.lower().split()
    return filter_lossy(words)

In [332]:
# Extrai os termos dos documents, ordena os termos e retorna a ED: termo.campo -> (Id do document, frequencia)
def extract_terms(documents):
    terms = []
    for idx, doc in enumerate(documents):
        for field, info in enumerate(doc[:-2]):
            if not pd.isnull(info):
                tokens = list(map(lambda x: x.lower(), info)) if isinstance(info, list) else tokenize(info)
                for token in tokens:
                    field_index = token + '.' + columns[field]
                    pair = (field_index, (idx, tokens.count(token)))
                    if pair not in terms:
                        terms.append(pair)
    return sorted(terms)

In [333]:
# Comprime os postings
def compress_postings(postings):
    compressed = [postings[0]]
    last = postings[0][0]
    for posting in postings[1:]:
        doc_id, freq = posting
        compressed.append((doc_id - last, freq))
        last = doc_id
    return compressed

In [334]:
def decompress_postings(postings):
    decompress = [postings[0]]
    last = postings[0][0]
    for posting in postings[1:]:
        doc_id, freq = posting
        decompress.append((doc_id + last, freq))
        last += doc_id
    return decompress

In [335]:
# Cria o Íncide como uma ED termo -> postings
def create_index(documents, use_compression=True):
    terms = extract_terms(documents)
    index = OrderedDict()
    for term, doc_id in terms:
        index.setdefault(term, []).append(doc_id)
    if use_compression:
        for term, postings in index.items():
            index[term] = compress_postings(postings)
    return index

In [341]:
with open('compress.json', 'w') as f:
    f.write(json.dumps(create_index(df.values)))

In [337]:
df

Unnamed: 0,title,price,colors,sizes,description,url,html
0,,,,,,https://www.damyller.com.br/moda-feminina/blus...,www.damyller.com.br_0.html
1,,,,,,https://www.damyller.com.br/moda-feminina/blus...,www.damyller.com.br_1.html
2,,,,,,https://www.damyller.com.br/moda-masculina/cam...,www.damyller.com.br_2.html
3,,,,,,https://www.damyller.com.br/moda-masculina/cam...,www.damyller.com.br_3.html
4,,,,,,https://www.damyller.com.br/moda-masculina/cam...,www.damyller.com.br_4.html
5,,,,,,https://www.damyller.com.br/moda-masculina/cam...,www.damyller.com.br_5.html
6,Blusa Decote V com Estampa Feminina,2q,"[""LARANJA/MARINHO""]","[""PP"",""P"",""M"",""G"",""GG""]",Com o shape soltinho e uma estampa super moder...,https://www.damyller.com.br/blusa-decote-v-com...,www.damyller.com.br_6.html
7,Body Canelado com Estampa Floral,2q,"[""OFF-WHITE/VERDE""]","[""PP"",""P"",""M"",""G""]",O body é aquela peça prática e indispensável d...,https://www.damyller.com.br/body-canelado-com-...,www.damyller.com.br_7.html
8,Blusa com Abotoamento Frontal,2q,"[""Azul/Preto""]","[""PP"",""P"",""M"",""G""]","Com shape soltinho e abotoamentos frontais, a ...",https://www.damyller.com.br/blusa-com-abotoame...,www.damyller.com.br_8.html
9,Top Cropped com Estampa Floral,2q,"[""OFF-WHITE/VERDE""]","[""PP"",""P"",""M""]",O shape de ciganinha aliado ao tecido vazado d...,https://www.damyller.com.br/top-cropped-com-es...,www.damyller.com.br_9.html
