In [497]:
import pandas as pd
import numpy as np
import json
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import OrderedDict
import unicodedata

In [498]:
STOPWORDS = set(stopwords.words("portuguese"))
stemmer = PorterStemmer()

In [499]:
attributes = ["title", "price", "colors", "sizes", "description", "url", "html"]
df = pd.read_csv("../Results/wrapper_docs.csv").drop(["Unnamed: 0"], axis=1)
df["price"] = df["price"].replace('[\/R$,]', '', regex=True).apply(lambda x: float(x) / 100)
prices = df["price"]
qrts = [prices.quantile(.25), prices.quantile(.5), prices.quantile(.75), prices.quantile(1)]
columns = df.columns

In [500]:
def quartile(value):
    qrt = ""
    if value >= qrts[3]:
        qrt = "4q"
    elif value >= qrts[2]:
        qrt = "3q"
    elif value >= qrts[1]:
        qrt = "2q"
    elif value > 0:
        qrt = "1q"
    else:
        qrt = ""
    return qrt
df["price"]  = df["price"].apply(lambda x: quartile(x))

In [501]:
def colors(color):
    color = ",".join(color.split("/"))
    if "white" in color:
        color = color.replace("white", "branco")
    elif "blue" in color:
        color = color.replace("blue", "azul")
    elif "tamanho único" in color:
        color = color.replace(",tamanho único", "")
    return color

In [502]:
df["colors"] = df["colors"].apply(lambda x: x.lower() if not pd.isnull(x) else x)
df["sizes"] = df["sizes"].apply(lambda x: x.lower() if not pd.isnull(x) else x)
df.fillna("", inplace=True)
df["colors"] = df["colors"].apply(lambda x: colors(x))

In [476]:
def filter_lossy(words):
    words = [w for w in words if w not in STOPWORDS]
    return [stemmer.stem(w) for w in words]

In [477]:
def tokenize(word):
    nfkd = unicodedata.normalize('NFKD', str(word))
    doc = nfkd.encode('ASCII', 'ignore').decode('ASCII')
    d = re.sub("[^a-zA-Z0-9]", " ", doc)
    words = d.lower().split()
    return filter_lossy(words)

In [478]:
# Extrai os termos dos documents, ordena os termos e retorna a ED: termo.campo -> (Id do document, frequencia)
def extract_terms(documents):
    terms = []
    for idx, doc in enumerate(documents):
        for field, info in enumerate(doc[:-2]):
            if not pd.isnull(info):
                tokens = list(map(lambda x: x.lower(), info)) if isinstance(info, list) else tokenize(info)
                for token in tokens:
                    field_index = token + '.' + columns[field]
                    pair = (field_index, (idx, tokens.count(token)))
                    if pair not in terms:
                        terms.append(pair)
    return sorted(terms)

In [479]:
# Comprime os postings
def compress_postings(postings):
    compressed = [postings[0]]
    last = postings[0][0]
    for posting in postings[1:]:
        doc_id, freq = posting
        compressed.append((doc_id - last, freq))
        last = doc_id
    return compressed

In [480]:
def decompress_postings(postings):
    decompress = [postings[0]]
    last = postings[0][0]
    for posting in postings[1:]:
        doc_id, freq = posting
        decompress.append((doc_id + last, freq))
        last += doc_id
    return decompress

In [481]:
# Cria o Íncide como uma ED termo -> postings
def create_index(documents, use_compression=True):
    terms = extract_terms(documents)
    index = OrderedDict()
    for term, doc_id in terms:
        index.setdefault(term, []).append(doc_id)
    if use_compression:
        for term, postings in index.items():
            index[term] = compress_postings(postings)
    return index

In [504]:
with open('no-compress.json', 'w') as f:
    f.write(json.dumps(create_index(df.values, False)))

In [505]:
obj = {'content' : 'something goes here'}
json_obj = json.load(open('./compress.json'), object_pairs_hook=OrderedDict)
json_size = len(json_obj)
print(json_size)

1708


In [510]:
rows = []

statinfo = os.stat("../Results/wrapper_docs.csv")
size = int(statinfo.st_size / 1024)
rows.append(["List Documents", size])

statinfo = os.stat("./compress.json")
size = int(statinfo.st_size / 1024)
rows.append(["Index Compressed", size])

statinfo = os.stat("./no-compress.json")
size = int(statinfo.st_size / 1024)
rows.append(["Index Uncompressed", size])

In [512]:
pd.DataFrame(rows, columns=["Data", "Size (KB)"]).to_csv(path_or_buf=r"index_results.csv")