## Importação das Bibliotecas

In [155]:
import os
import json
import pandas as pd
from elasticsearch import Elasticsearch, helpers
import multiprocessing
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('portuguese'))

[nltk_data] Downloading package stopwords to C:\Users\Tuby
[nltk_data]     Neto\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Tuby
[nltk_data]     Neto\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Funções de Indexação e Busca

In [156]:
def request_create_index():
    """Build the request body used to create the product index.

    Returns:
        dict: index definition with a single shard, a single replica and
        the three product fields (``product_uid``, ``product_title``,
        ``product_description``), each mapped as ``text``.
    """
    field_names = ("product_uid", "product_title", "product_description")

    settings = {"number_of_shards": 1, "number_of_replicas": 1}
    # Every field gets its own mapping dict so none of them share state.
    properties = {name: {"type": "text"} for name in field_names}

    return {"settings": settings, "mappings": {"properties": properties}}


def create_request_document(json_text, index):
    """Wrap one product record as a bulk ``create`` action for ``index``.

    Args:
        json_text: mapping that provides ``product_uid``, ``product_title``
            and ``product_description``; any other keys are ignored.
        index: name of the target index.

    Returns:
        dict: action with ``_op_type``, ``_index`` and ``_source`` keys.

    Raises:
        KeyError: if one of the three product fields is missing.
    """
    description = json_text["product_description"]
    source = dict(
        product_uid=json_text["product_uid"],
        product_title=json_text["product_title"],
        product_description=description,
    )

    action = {"_op_type": "create", "_index": index}
    action["_source"] = source
    return action

def request_search(query, size=1000000, from_=0):
    """Build a ``multi_match`` search body over product title and description.

    The title is boosted (``product_title^2``) so a title match weighs more
    than a description match.

    Args:
        query: free-text search string.
        size: maximum number of hits to request. The default keeps the
            previous hard-coded value.
            NOTE(review): Elasticsearch rejects requests where
            ``from + size`` exceeds the index's ``index.max_result_window``
            (10000 unless the setting was raised) — confirm the target index
            allows the default, or pass a smaller ``size``.
        from_: offset of the first hit to return (sent as ``"from"``).

    Returns:
        dict: request body ready to be passed to the search API.
    """
    return {
        "from": from_,
        "size": size,
        "_source": ["product_uid", "product_title", "product_description"],
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["product_title^2", "product_description"],
            }
        },
    }

def request_match_all():
    """Return a search body whose query matches every document."""
    return {"query": {"match_all": {}}}

## Classe do Elasticsearch

In [157]:
class Elasticsearch_service:
    """Small wrapper around an Elasticsearch client bound to a single index.

    Attributes are set in ``__init__``: ``ip`` (host given to the client),
    ``es`` (the client), ``timeout`` (forwarded as ``request_timeout`` on
    every call) and ``index_name``.
    """

    def __init__(self, index_name, ip="localhost", timeout=1000000):
        """Create the client for ``ip`` and remember the index to work on."""
        self.ip = ip
        self.es = Elasticsearch(hosts=ip)
        self.timeout = timeout
        self.index_name = index_name

    def create_index(self):
        """Create the index using the body from ``request_create_index``.

        Returns:
            The ``acknowledged`` flag of the response on success. On failure
            the caught exception object is *returned* (not raised), so
            callers must check the type of the result.
        """
        request_body = request_create_index()
        try:
            ret = self.es.indices.create(
                index=self.index_name, body=request_body, request_timeout=self.timeout
            )
        except Exception as e:
            return e
        return ret["acknowledged"]

    def indexing(self, list_json_text):
        """Bulk-index the given product records.

        Args:
            list_json_text: iterable of records accepted by
                ``create_request_document``.

        Returns:
            bool: ``True`` when the bulk call completed, ``False`` when it
            raised an error.
        """
        actions = [create_request_document(doc, self.index_name) for doc in list_json_text]
        try:
            helpers.bulk(self.es, actions, request_timeout=self.timeout)
        except Exception:
            # Best-effort contract kept, but KeyboardInterrupt/SystemExit are
            # no longer swallowed the way a bare ``except:`` did.
            return False
        return True

    def index_exists(self):
        """Return the client's answer to whether the index exists."""
        return self.es.indices.exists(index=self.index_name)

    def delete_index(self):
        """Delete the index; client errors propagate to the caller."""
        self.es.indices.delete(index=self.index_name)

    def search(self, query):
        """Run ``request_search(query)`` against the index.

        Returns:
            list[dict]: one dict per hit, in the order returned, holding
            ``product_uid``, ``product_title`` and ``product_description``.

        Raises:
            ValueError: if the search call fails; the original error is
                attached as ``__cause__``.
        """
        request_body = request_search(query)
        try:
            text_return = self.es.search(
                body=request_body, index=self.index_name, request_timeout=self.timeout
            )
        except Exception as exc:
            # Same exception type and message as before, now chained so the
            # real failure (connection, bad request, ...) stays visible.
            raise ValueError("Search Error!") from exc

        wanted = ("product_uid", "product_title", "product_description")
        return [
            {field: hit["_source"][field] for field in wanted}
            for hit in text_return["hits"]["hits"]
        ]

## Leitura do Dataset a ser indexado

In [158]:
# Load the product catalogue to be indexed; the path is relative to the
# notebook's working directory.
df = pd.read_csv('products_crowdedflower.csv')

## Renomeando as colunas

In [159]:
# Only `_unit_id` changes name (it becomes `product_uid`); the other two
# columns already carry the names used by the indexing helpers.
df = df.rename(columns={'_unit_id': 'product_uid'})

In [160]:
# Show the frame to confirm the renamed columns.
df

Unnamed: 0,product_uid,product_title,product_description
0,711158459,Sony PlayStation 4 (PS4) (Latest Model)- 500 G...,The PlayStation 4 system opens the door to an ...
1,711158460,Sony PlayStation 4 (Latest Model)- 500 GB Jet ...,The PlayStation 4 system opens the door to an ...
2,711158461,Sony PlayStation 4 PS4 500 GB Jet Black Console,The PlayStation 4 system opens the door to an ...
3,711158462,Sony - PlayStation 4 500GB The Last of Us Rema...,
4,711158465,BRAND NEW Sony PlayStation 4 BUNDLE 500gb,
...,...,...,...
18889,711179068,NewMetro Design KA-6LR KitchenAid 6-Quart Bowl...,Features - An Enticing Elixir Eau De Toilette ...
18890,711179069,NewMetro Design KA-THPRO KitchenAid Tilt-Head ...,The ultimate hands-free mixing blade endures t...
18891,711179070,KitchenAid Flex Edge Beater,Upgrade your kitchen accessory collection with...
18892,711179071,KitchenAid Sausage Stuffer Kit,If you love to create mouth-watering food at h...


## Substituição de float('NaN') por string vazia

In [161]:
# Missing descriptions become empty strings so the tokenizer always
# receives text.
df['product_description'] = df['product_description'].fillna('')

In [162]:
# Show the frame again after the missing descriptions were replaced.
df

Unnamed: 0,product_uid,product_title,product_description
0,711158459,Sony PlayStation 4 (PS4) (Latest Model)- 500 G...,The PlayStation 4 system opens the door to an ...
1,711158460,Sony PlayStation 4 (Latest Model)- 500 GB Jet ...,The PlayStation 4 system opens the door to an ...
2,711158461,Sony PlayStation 4 PS4 500 GB Jet Black Console,The PlayStation 4 system opens the door to an ...
3,711158462,Sony - PlayStation 4 500GB The Last of Us Rema...,
4,711158465,BRAND NEW Sony PlayStation 4 BUNDLE 500gb,
...,...,...,...
18889,711179068,NewMetro Design KA-6LR KitchenAid 6-Quart Bowl...,Features - An Enticing Elixir Eau De Toilette ...
18890,711179069,NewMetro Design KA-THPRO KitchenAid Tilt-Head ...,The ultimate hands-free mixing blade endures t...
18891,711179070,KitchenAid Flex Edge Beater,Upgrade your kitchen accessory collection with...
18892,711179071,KitchenAid Sausage Stuffer Kit,If you love to create mouth-watering food at h...


## Tokenização 

In [163]:
# Tokenize each title into a list of word tokens (column-wise apply).
df['tokenized_title'] = df['product_title'].apply(nltk.word_tokenize)

In [164]:
# Tokenize each description into a list of word tokens (column-wise apply).
df['tokenized_description'] = df['product_description'].apply(nltk.word_tokenize)

## Função de Remoção de StopWords

In [165]:
def remove_stopwords(palavras):
    """Lower-case every token and drop the ones present in ``stop_words``.

    Args:
        palavras: iterable of string tokens.

    Returns:
        list[str]: the remaining tokens, lower-cased, in their original order.
    """
    # NOTE(review): `stop_words` is loaded from NLTK's 'portuguese' list while
    # the product texts shown in this notebook look English — confirm.
    lowered = [palavra.lower() for palavra in palavras]
    return [palavra for palavra in lowered if palavra not in stop_words]

## Função de Stemming

In [166]:
# Shared stemmer instance used by `stemming` below.
# NOTE(review): RSLP is NLTK's Portuguese stemmer, yet the product texts shown
# in this notebook look English — confirm this is the intended stemmer. The
# import cell downloads only 'stopwords' and 'punkt'; presumably the 'rslp'
# resource must also be available — verify on a fresh environment.
stemmer = nltk.stem.RSLPStemmer()
def stemming(palavras):
    """Stem every token with the module-level ``stemmer``.

    Args:
        palavras: iterable of string tokens.

    Returns:
        list: one stemmed token per input token, in the same order.
    """
    return [stemmer.stem(palavra) for palavra in palavras]

## Função de Criação das Bases

In [167]:
def create_base(df_prod, processings):
    """Build an indexable base from the tokenized product frame.

    The requested processing steps are applied to BOTH the tokenized title
    and the tokenized description (previously only the title was processed,
    so descriptions stayed raw in every base). Tokens are then joined back
    into single space-separated strings.

    Args:
        df_prod: frame with ``product_uid``, ``tokenized_title`` and
            ``tokenized_description`` columns (the last two holding lists of
            tokens).
        processings: collection of step names; ``"stopwords"`` and
            ``"stemming"`` are recognised, anything else is ignored. When
            both are present, stop-word removal runs before stemming.

    Returns:
        pandas.DataFrame: new frame with ``product_uid``, ``product_title``
        and ``product_description`` columns, sharing ``df_prod``'s index.
        ``df_prod`` itself is not modified.
    """
    d_title = df_prod['tokenized_title']
    d_desc = df_prod['tokenized_description']

    if "stopwords" in processings:
        d_title = d_title.apply(remove_stopwords)
        d_desc = d_desc.apply(remove_stopwords)

    if "stemming" in processings:
        d_title = d_title.apply(stemming)
        d_desc = d_desc.apply(stemming)

    return pd.DataFrame({
        'product_uid': df_prod['product_uid'],
        'product_title': d_title.str.join(" "),
        'product_description': d_desc.str.join(" "),
    })

## Instanciação das Bases

In [168]:
# Base1 applies no extra processing: tokens are only re-joined with spaces.
# The remaining variants (stop-word removal and/or stemming) are disabled.
Base1 = create_base(df, [])
#Base2 = create_base(df, ['stopwords'])
#Base3 = create_base(df, ['stemming'])
#Base4 = create_base(df, ['stopwords','stemming'])

In [169]:
# Show the resulting base.
Base1

Unnamed: 0,product_uid,product_title,product_description
0,711158459,Sony PlayStation 4 ( PS4 ) ( Latest Model ) - ...,The PlayStation 4 system opens the door to an ...
1,711158460,Sony PlayStation 4 ( Latest Model ) - 500 GB J...,The PlayStation 4 system opens the door to an ...
2,711158461,Sony PlayStation 4 PS4 500 GB Jet Black Console,The PlayStation 4 system opens the door to an ...
3,711158462,Sony - PlayStation 4 500GB The Last of Us Rema...,
4,711158465,BRAND NEW Sony PlayStation 4 BUNDLE 500gb,
...,...,...,...
18889,711179068,NewMetro Design KA-6LR KitchenAid 6-Quart Bowl...,Features - An Enticing Elixir Eau De Toilette ...
18890,711179069,NewMetro Design KA-THPRO KitchenAid Tilt-Head ...,The ultimate hands-free mixing blade endures t...
18891,711179070,KitchenAid Flex Edge Beater,Upgrade your kitchen accessory collection with...
18892,711179071,KitchenAid Sausage Stuffer Kit,If you love to create mouth-watering food at h...


# Base 1

In [170]:
# One dict per product row — the record shape `create_request_document` reads.
b1 = Base1.to_dict('records')
# Displays the whole list of records; the output is very large.
b1

[{'product_uid': 711158459,
  'product_title': 'Sony PlayStation 4 ( PS4 ) ( Latest Model ) - 500 GB Jet Black Console',
  'product_description': 'The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community . Step into living , breathing worlds where you are hero of your epic journey . Explore gritty urban environments , vast galactic landscapes , and fantastic historical settings brought to life on an epic scale , without limits . With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console . The PS4 system is developer inspired , gamer focused . The PS4 system learns how you play and intuitively curates the content you use most often . Fire it up , and your PS4 system points the way to new , amazing experiences you can jump into alone or with friends . Create your own legend using a sophisticat

## Criação do Índice para a Base 1

In [171]:
# Service bound to the "database6" index, using the class defaults for host
# and timeout. This only builds the client; it does not create the index.
es = Elasticsearch_service("database6")

In [145]:
#es.indexing(b1)

True

## Instanciando Buscas

In [172]:
# Load the evaluation set; its `query` column supplies the search terms used
# below. The path is relative to the notebook's working directory.
df_test = pd.read_csv('df_test_crowdedflower2.csv')

In [173]:
# Show the evaluation frame.
df_test

Unnamed: 0,_unit_id,product_title,product_description,query,relevance
0,711158634,MEN'S NWT VINTAGE REEBOK BRETT FAVRE NEW YORK ...,,Brett Favre NY Titans jersey blue,3.33
1,711158635,Reebok NFL Equipment NY Jets Brett Favre Green...,,Brett Favre NY Titans jersey blue,2.20
2,711158636,Brett Favre New York Titans NFL Equipment Reeb...,,Brett Favre NY Titans jersey blue,3.00
3,711158637,Brett Favre autographed signed NY Titans Throw...,<strong>triplecrownauthentics store</strong>,Brett Favre NY Titans jersey blue,2.67
4,711158638,Brett Favre NY New York Jets Wholesale NFL Foo...,,Brett Favre NY Titans jersey blue,2.67
...,...,...,...,...,...
4032,711178872,Panasonic 10-Cup Rice Cooker / Steamer,,rice cooker,4.00
4033,711178873,Aroma 8-Cup Digital Rice Cooker and Food Steamer,Easily make restaurant-quality white and brown...,rice cooker,3.67
4034,711178874,Tiger 10-Cup Electric Rice Cooker,The Tiger 10-Cup Electric Rice Cooker is capab...,rice cooker,3.67
4035,711178875,Aroma 12-Cup Cool-Touch Digital Egg-Shaped Ric...,About this item\nEnjoy easy home cooking with ...,rice cooker,4.00


In [174]:
# Distinct queries, in order of first appearance in the evaluation set.
search_term_list = df_test['query'].unique()
# Number of distinct queries.
len(search_term_list)

50

In [175]:
# Show the distinct queries.
search_term_list

array(['Brett Favre NY Titans jersey blue', 'yankee candle',
       'storage ottoman', 'coffee for nespresso', 'stick vacuum',
       'Single serve coffee maker', 'Phillips coffee maker',
       'playstation vita system', 'aveeno shampoo', 'notebook paper',
       'an extremely goofy movie', 'seiko monster', 'lego star wars',
       'candle lantern', 'plantronics corded headset',
       'dual headset splitter', 'flea and tick control for dogs',
       'batman', 'rice cooker', 'tote bag', 'nike womens',
       'toddler sandals', 'cowboy boots', 'portable hard drive',
       'bike lock', 'Watch women fossil', 'skechers womens shoes',
       'khaki pants', 'reusable straws', 'high heels shoes',
       'flannel sheets', 'macbook case 13 case', 'Cocoa Butter',
       'yankees', 'car window sticker', 'harley-davidson',
       "leather men's briefcase", 'glitter vials', 'long prom dress',
       'iphone 4 case', 'coffee cup', 'flower bulbs',
       'outdoor table cover', 'barbie', 'infinity s

In [187]:
import re
# Maps each query string to the list of product ids returned for it;
# filled by the search loop below.
query_id = {}

def escape_elasticsearch_query(query):
    """Backslash-escape the characters this helper treats as reserved.

    The escaped tokens are ``&&``, ``||`` and each of
    ``+ - = > < ! ( ) { } [ ] ^ " ~ * ? : \\ /``. A lone ``&`` or ``|`` is
    left untouched.

    Args:
        query: raw query string.

    Returns:
        str: ``query`` with a backslash inserted before every reserved token.
    """
    # Raw strings avoid the invalid-escape-sequence warnings that the
    # previous non-raw pattern ('\+', '\(' ...) produced; matches are the same.
    return re.sub(r'(&&|\|\||[+\-=><!(){}\[\]^"~*?:\\/])', r'\\\1', query)


for query in search_term_list:
    # The query is sent as-is; `escape_elasticsearch_query` is deliberately
    # not applied here.
    returned_products = es.search(query)

    # Keep only the product ids, in the order the hits were returned.
    returned_ids = [product['product_uid'] for product in returned_products]

    # Every query must map to exactly 500 ids so the dict can become a
    # rectangular DataFrame: truncate long result lists, pad short ones with
    # -1 (a non-positive multiplier yields an empty padding list).
    query_id[str(query)] = returned_ids[:500] + [-1] * (500 - len(returned_ids))


# Displays the full query -> ids mapping; the output is very large.
query_id

{'Brett Favre NY Titans jersey blue': [711158642,
  711158640,
  711158643,
  711158653,
  711158637,
  711158636,
  711158639,
  711158641,
  711158635,
  711158649,
  711158656,
  711158634,
  711158638,
  711158644,
  711158646,
  711158654,
  711158648,
  711158650,
  711158657,
  711158658,
  711158645,
  711165729,
  711158647,
  711158651,
  711165728,
  711158652,
  711158655,
  711165732,
  711165741,
  711165734,
  711165731,
  711165736,
  711165737,
  711174643,
  711174645,
  711172674,
  711174639,
  711176617,
  711174638,
  711174642,
  711165733,
  711171337,
  711174641,
  711165730,
  711168255,
  711174637,
  711174640,
  711174644,
  711165735,
  711168872,
  711168256,
  711161448,
  711164760,
  711165011,
  711170743,
  711174653,
  711166447,
  711173930,
  711174368,
  711174651,
  711177761,
  711165289,
  711161437,
  711166463,
  711170568,
  711170746,
  711177771,
  711161989,
  711169104,
  711174646,
  711174648,
  711174650,
  711174654,
  711177764,
 

In [188]:
# One column per query; row i holds the id placed at position i for that
# query (-1 marks padding added by the search loop).
result_query = pd.DataFrame(query_id)
result_query

Unnamed: 0,Brett Favre NY Titans jersey blue,yankee candle,storage ottoman,coffee for nespresso,stick vacuum,Single serve coffee maker,Phillips coffee maker,playstation vita system,aveeno shampoo,notebook paper,...,coffee cup,flower bulbs,outdoor table cover,barbie,infinity scarf,nike flip flops,fuji bike shirt,oakley polarized radar,pencil skirt,victoria secret pink shorts
0,711158642,711166826,711167102,711167487,711167524,711168017,711159763,711173315,711168639,711168796,...,711164126,711178091,711170517,711178629,711164779,711167743,711177582,711170440,711173826,711173535
1,711158640,711166806,711167118,711167493,711159463,711176416,711163039,711159822,711168643,711163606,...,711164127,711178086,711170515,711173868,711164780,711167747,711177584,711170441,711173831,711173534
2,711158643,711166817,711162306,711159433,711176034,711176276,711168017,711159829,711168645,711163607,...,711169783,711178077,711170516,711164647,711170750,711167744,711177585,711170442,711170574,711173533
3,711158653,711166821,711162311,711159439,711159467,711167996,711172725,711159837,711168646,711163625,...,711169788,711178081,711170518,711178646,711170752,711167753,711177586,711170450,711173829,711158777
4,711158637,711166809,711162312,711159449,711159464,711168003,711166921,711159821,711168647,711163618,...,711172716,711178083,711170520,711178639,711164771,711172370,711177587,711170451,711170572,711173640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,711176618,-1,711175646,711159102,-1,711160492,711171095,711165507,-1,-1,...,711162569,-1,711165883,-1,-1,-1,711173326,-1,-1,711174862
496,711158730,-1,711175647,711159182,-1,711160496,711171096,711169070,-1,-1,...,711165506,-1,711167560,-1,-1,-1,711173877,-1,-1,711176076
497,711158732,-1,711175648,711159186,-1,711169005,711171105,711162331,-1,-1,...,711167476,-1,711167785,-1,-1,-1,711174488,-1,-1,711176078
498,711158799,-1,711175651,711159188,-1,711173466,711171107,711162575,-1,-1,...,711167477,-1,711170315,-1,-1,-1,711175991,-1,-1,711176330


In [189]:
# reset_index() turns the row position into an explicit `index` column so it
# is written to the CSV; index=False then avoids writing a second index.
result_query.reset_index().to_csv("result_search_crowdedflower.csv", index=False)