## Importação das Bibliotecas

In [1]:
import os
import json
import pandas as pd
from elasticsearch import Elasticsearch, helpers
import multiprocessing
import numpy as np
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources used further down: the stop-word lists and the
# 'punkt' models that nltk.word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')

# The product catalogue and the test queries are written in English, so the
# English stop-word list is the one that matches the data being filtered.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\Tuby
[nltk_data]     Neto\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Tuby
[nltk_data]     Neto\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Funções de Indexação e Busca

In [2]:
def request_create_index():
    """Build the request body used to create the product index.

    Returns:
        dict: ``settings`` with one shard and one replica, and ``mappings``
        declaring ``product_uid``, ``product_title`` and
        ``product_description`` as ``text`` fields. A new dict is built on
        every call.
    """
    text_fields = ("product_uid", "product_title", "product_description")
    settings = {"number_of_shards": 1, "number_of_replicas": 1}
    mappings = {"properties": {name: {"type": "text"} for name in text_fields}}
    return {"settings": settings, "mappings": mappings}


def create_request_document(json_text, index):
    """Turn one product record into a bulk ``create`` action.

    Args:
        json_text: Mapping holding at least the keys ``product_uid``,
            ``product_title`` and ``product_description``; other keys are
            ignored.
        index: Name of the target index.

    Returns:
        dict: bulk action with ``_op_type`` ``"create"``, the target
        ``_index`` and a ``_source`` containing the three product fields.

    Raises:
        KeyError: if one of the three keys is missing from ``json_text``.
    """
    source = {
        field: json_text[field]
        for field in ("product_uid", "product_title", "product_description")
    }
    return {"_op_type": "create", "_index": index, "_source": source}

def request_search(query, size=500):
    """Build the body of a ``multi_match`` search over the product fields.

    Args:
        query: Text to search for.
        size: Maximum number of hits to request. Defaults to 500, the value
            that was previously hard-coded.

    Returns:
        dict: request body that starts at offset 0, asks for at most ``size``
        hits, returns only the three product fields in ``_source`` and
        matches ``query`` against ``product_title`` (boosted with ``^2``)
        and ``product_description``.
    """
    return {
        "from": 0,
        "size": size,
        "_source": ["product_uid", "product_title", "product_description"],
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["product_title^2", "product_description"],
            }
        },
    }

def request_match_all():
    """Return a request body whose query is ``match_all`` with no options."""
    return {"query": {"match_all": {}}}

## Classe do Elasticsearch

In [3]:
class Elasticsearch_service:
    """Thin wrapper around an Elasticsearch client bound to a single index.

    Instances keep the host (``ip``), the client (``es``), the value sent as
    ``request_timeout`` on each call (``timeout``) and the index name
    (``index_name``).
    """

    def __init__(self, index_name, ip="localhost", timeout=1000000):
        """Create the client.

        Args:
            index_name: Index that every method of this object operates on.
            ip: Value passed as ``hosts`` to the ``Elasticsearch`` client.
            timeout: Value forwarded as ``request_timeout`` on each request.
        """
        self.ip = ip
        self.es = Elasticsearch(hosts=ip)
        self.timeout = timeout
        self.index_name = index_name

    def create_index(self):
        """Create the index with the body built by ``request_create_index``.

        Returns:
            The ``acknowledged`` value of the response on success. When the
            client call raises, the caught exception object is *returned*
            rather than raised (kept for backward compatibility). An
            exception instance is truthy, so compare the result with
            ``is True`` instead of relying on truthiness.
        """
        request_body = request_create_index()
        try:
            ret = self.es.indices.create(
                index=self.index_name, body=request_body, request_timeout=self.timeout
            )
        except Exception as e:
            return e
        return ret["acknowledged"]

    def indexing(self, list_json_text):
        """Bulk-index a list of product records into the index.

        Args:
            list_json_text: Iterable of records accepted by
                ``create_request_document``.

        Returns:
            bool: ``True`` when ``helpers.bulk`` completes, ``False`` when it
            raises an ``Exception``. ``KeyboardInterrupt`` and ``SystemExit``
            are no longer swallowed, so a long upload can be interrupted.
        """
        request_body = [create_request_document(doc, self.index_name) for doc in list_json_text]
        try:
            helpers.bulk(self.es, request_body, request_timeout=self.timeout)
        except Exception:
            return False
        return True

    def index_exists(self):
        """Return the client's answer to whether the index exists."""
        return self.es.indices.exists(index=self.index_name)

    def delete_index(self):
        """Delete the index; client errors propagate to the caller."""
        self.es.indices.delete(index=self.index_name)

    def search(self, query):
        """Run the ``request_search`` query and return the matching products.

        Args:
            query: Text handed to ``request_search``.

        Returns:
            list[dict]: one dict per hit, in the order returned by the
            client, with the keys ``product_uid``, ``product_title`` and
            ``product_description`` taken from the hit's ``_source``.

        Raises:
            ValueError: with the message ``"Search Error!"`` when the client
                call raises an ``Exception``; the original error is attached
                as ``__cause__`` so the real reason stays visible.
        """
        request_body = request_search(query)
        try:
            text_return = self.es.search(
                body=request_body, index=self.index_name, request_timeout=self.timeout
            )
        except Exception as exc:
            raise ValueError("Search Error!") from exc

        fields = ("product_uid", "product_title", "product_description")
        return [
            {field: hit["_source"][field] for field in fields}
            for hit in text_return["hits"]["hits"]
        ]

## Leitura do Dataset a ser indexado

In [4]:
# Load the product catalogue that will be indexed (path is relative to the
# notebook's working directory).
df = pd.read_csv('products_wands.csv')

## Renomeando as colunas

In [5]:
# Align the column names with the field names used by the index mapping and
# the request builders. 'product_description' already has the right name, so
# it needs no entry; rebinding instead of inplace=True keeps the step explicit.
df = df.rename(columns={'product_name': 'product_title', 'product_id': 'product_uid'})

In [6]:
# Inspect the frame after the rename.
df

Unnamed: 0,product_uid,product_title,product_description
0,38855,abheer floor shelf coffee table,when it comes to rounding out your living ense...
1,9929,addaly abstract coffee table,anchor your living room in mid-century style w...
2,5235,adoni coffee table,
3,26772,ahart frame coffee table,your coffee table is a style centerpiece for y...
4,3510,ahern coffee table,the ahern 4 legs coffee table with storage is ...
...,...,...,...
42564,40245,podgorni hanging wine glass rack,display and protect your delicate wine or marg...
42565,40244,kena hanging wine glass rack,spruce up your farmhouse kitchen decor with th...
42566,13019,garrow hanging wine glass rack,save cabinet space and display favorite glasse...
42567,42846,esquire 2 5/8 '' length bar knob,


## Substituição de float('NaN') por string vazia

In [7]:
# Products without a description are read as NaN; the tokenizer below needs
# strings, so missing descriptions become empty strings.
df['product_description'] = df['product_description'].fillna('')

In [8]:
# Inspect the frame again: rows without a description should now show an
# empty string instead of NaN.
df

Unnamed: 0,product_uid,product_title,product_description
0,38855,abheer floor shelf coffee table,when it comes to rounding out your living ense...
1,9929,addaly abstract coffee table,anchor your living room in mid-century style w...
2,5235,adoni coffee table,
3,26772,ahart frame coffee table,your coffee table is a style centerpiece for y...
4,3510,ahern coffee table,the ahern 4 legs coffee table with storage is ...
...,...,...,...
42564,40245,podgorni hanging wine glass rack,display and protect your delicate wine or marg...
42565,40244,kena hanging wine glass rack,spruce up your farmhouse kitchen decor with th...
42566,13019,garrow hanging wine glass rack,save cabinet space and display favorite glasse...
42567,42846,esquire 2 5/8 '' length bar knob,


## Tokenização 

In [9]:
# Tokenise each title. Applying the tokenizer to the column directly gives the
# same list-per-row result as a row-wise DataFrame.apply(axis=1) without
# building a Series object for every row.
df['tokenized_title'] = df['product_title'].apply(nltk.word_tokenize)

In [10]:
# Tokenise each description (empty descriptions yield an empty token list).
# Column-wise apply avoids the per-row overhead of DataFrame.apply(axis=1).
df['tokenized_description'] = df['product_description'].apply(nltk.word_tokenize)

## Função de Remoção de StopWords

In [11]:
def remove_stopwords(palavras):
    """Lower-case the tokens and drop the ones present in ``stop_words``.

    Args:
        palavras: Iterable of string tokens.

    Returns:
        list: the lower-cased tokens, in their original order, that are not
        members of the module-level ``stop_words`` collection.
    """
    lowered = (palavra.lower() for palavra in palavras)
    return [palavra for palavra in lowered if palavra not in stop_words]

## Função de Stemming

In [12]:
# The catalogue is in English, so an English stemmer is used. RSLPStemmer is
# a Portuguese stemmer and depends on the 'rslp' NLTK resource, which this
# notebook never downloads.
stemmer = nltk.stem.SnowballStemmer("english")


def stemming(palavras):
    """Return the stem of every token, preserving order.

    Args:
        palavras: Iterable of string tokens.

    Returns:
        list: ``stemmer.stem(token)`` for each token in ``palavras``.
    """
    return [stemmer.stem(w) for w in palavras]

## Função de Criação das Bases

In [13]:
def create_base(df_prod, processings):
    """Build an indexable frame from the tokenised product frame.

    The requested processings are applied to both the title and the
    description tokens. Previously only the title was processed, so the two
    indexed text fields ended up normalised differently.

    Args:
        df_prod: Frame with the columns ``product_uid``, ``tokenized_title``
            and ``tokenized_description``; the last two hold token lists.
        processings: Collection of step names. ``"stopwords"`` applies
            ``remove_stopwords`` and ``"stemming"`` applies ``stemming``;
            when both are present stop-word removal runs first. Other names
            are ignored.

    Returns:
        pandas.DataFrame: columns ``product_uid``, ``product_title`` and
        ``product_description``, where each token list has been joined with
        single spaces. The index of ``df_prod`` is preserved.
    """
    steps = []
    if "stopwords" in processings:
        steps.append(remove_stopwords)
    if "stemming" in processings:
        steps.append(stemming)

    def _process(tokens_column):
        # Run every requested step over the token lists, then rebuild text.
        for step in steps:
            tokens_column = tokens_column.apply(step)
        return tokens_column.str.join(" ")

    return pd.DataFrame({
        'product_uid': df_prod['product_uid'],
        'product_title': _process(df_prod['tokenized_title']),
        'product_description': _process(df_prod['tokenized_description']),
    })

## Instanciação das Bases

In [14]:
# Build the baseline base: tokens are re-joined with no stop-word removal and
# no stemming (empty processing list).
Base1 = create_base(df, [])
# Other variants, currently disabled:
#Base2 = create_base(df, ['stopwords'])
#Base3 = create_base(df, ['stemming'])
#Base4 = create_base(df, ['stopwords','stemming'])

In [15]:
# Inspect the baseline base. Titles and descriptions were rebuilt from their
# tokens, so punctuation may be spaced differently from the raw text.
Base1

Unnamed: 0,product_uid,product_title,product_description
0,38855,abheer floor shelf coffee table,when it comes to rounding out your living ense...
1,9929,addaly abstract coffee table,anchor your living room in mid-century style w...
2,5235,adoni coffee table,
3,26772,ahart frame coffee table,your coffee table is a style centerpiece for y...
4,3510,ahern coffee table,the ahern 4 legs coffee table with storage is ...
...,...,...,...
42564,40245,podgorni hanging wine glass rack,display and protect your delicate wine or marg...
42565,40244,kena hanging wine glass rack,spruce up your farmhouse kitchen decor with th...
42566,13019,garrow hanging wine glass rack,save cabinet space and display favorite glasse...
42567,42846,esquire 2 5/8 `` length bar knob,


# Base 1

In [16]:
# One dict per product, the record shape consumed by
# Elasticsearch_service.indexing.
b1 = Base1.to_dict('records')
# Preview a few records only; echoing the whole list would print every
# product in the catalogue into the notebook output.
b1[:3]

[{'product_uid': 38855,
  'product_title': 'abheer floor shelf coffee table',
  'product_description': "when it comes to rounding out your living ensemble , nothing does the job quite like a coffee table ! perfect for gathering your room 's style while keeping the remote in easy reach , this coffee table is a great option for adding a modern touch to your home . crafted from manufactured wood , this piece features asymmetric metal legs for an on-trend mixed material look . and with the open lower shelf , it 's a great option for lending your space a touch of extra storage ."},
 {'product_uid': 9929,
  'product_title': 'addaly abstract coffee table',
  'product_description': 'anchor your living room in mid-century style with this coffee table . founded on gleaming legs , this coffee table has two cylindrical ottomans with padded seats out from under the table when you need an extra seat .'},
 {'product_uid': 5235,
  'product_title': 'adoni coffee table',
  'product_description': ''},
 {

## Criação do Índice para a Base 1

In [17]:
# Service bound to the "database11" index, using the class defaults for host
# and request timeout.
es = Elasticsearch_service("database11")

In [18]:
# Load the records only when the index does not exist yet. The bulk actions
# carry no explicit _id, so indexing the same records again would store a
# second copy of every product and duplicate the search hits.
# To rebuild from scratch, call es.delete_index() before running this cell.
if es.index_exists():
    indexed = True
else:
    # Create the index with the explicit mapping before loading documents;
    # if creation fails, indexing still proceeds as it did before.
    es.create_index()
    indexed = es.indexing(b1)
indexed

True

## Instanciando Buscas

In [19]:
# Load the test set; its 'query' column supplies the search terms used below.
df_test = pd.read_csv('df_test_wands.csv')

In [20]:
# Inspect the test set.
df_test

Unnamed: 0,product_id,product_name,product_description,query,label
0,25434,21.7 '' w waiting room chair with wood frame,"this is a salon chair , barber chair for a hai...",salon chair,1.0
1,12088,22.5 '' wide polyester side chair,add a beautiful accent to any room with this m...,salon chair,0.0
2,42931,24.4 '' w metal lounge chair with metal frame,the heavy duty barber chair is built to last ....,salon chair,1.0
3,2636,25 '' wide faux leather manual swivel standard...,this is a chair designed for your barbershop ....,salon chair,1.0
4,42923,27.6 '' w antimicrobial leather seat waiting r...,,salon chair,1.0
...,...,...,...,...,...
62143,11181,henrika upholstered arm chair dining chair in ...,looking for comfortable dining seating with th...,bubble guppies chair,0.5
62144,15439,fellsburg linen upholstered parsons chair,,bubble guppies chair,0.5
62145,451,olin upholstered side chair,if you are looking for a simple yet sleek dini...,bubble guppies chair,0.5
62146,30764,barbay lounge chair cushion,,bubble guppies chair,0.0


In [21]:
# Distinct query strings, in order of first appearance in the test set.
search_term_list = df_test['query'].unique()
# Number of distinct queries to run.
len(search_term_list)

96

In [22]:
# Show the distinct queries.
search_term_list

array(['salon chair', 'dinosaur', 'turquoise pillows',
       'sofa with ottoman', 'ombre rug', 'outdoor privacy wall',
       'beds that have leds', 'chrome bathroom 4 light vanity light',
       'gurney  slade 56', 'sunflower', 'enclosed shoe rack',
       '70s inspired furniture', 'beach blue headboard',
       'porcelain loaf pan', 'kohen 5 drawer dresser', 'broadway lever',
       '7qt slow cooker', 'gnome fairy garden', 'novara pergola',
       'osgood mirror', 'wall sconce with usb port', '3/4 size mattress',
       'rose gold lounge', 'antique silver entry table',
       'entertainment stand end table', 'bar room wall decor',
       'gracie oaks 62 oller 14 ceiling fan', 'kitchen anti fatigue mats',
       'mila task chair', 'welcome sign', 'king size bed',
       'boho bed frame', 'wine bar', 'kitchen wooden stand', 'rug plum',
       'industrial', 'wishbone chair', 'bohemian', 'velvet chaise',
       'almost heaven sauna', 'promo codes or discounts',
       'bedroom wall deco

In [24]:
import re

# Maps each query string to the list of product ids returned for it; filled
# by the search loop that follows.
query_id = {}

# The two-character operators && and ||, plus the single characters
# + - = > < ! ( ) { } [ ] ^ " ~ * ? : \ /
# Written as raw strings: the previous non-raw literals relied on invalid
# escape sequences such as '\+', which Python warns about.
_RESERVED_QUERY_CHARS = re.compile(r'(&&|\|\||[+\-=><!(){}\[\]^"~*?:\\/])')


def escape_elasticsearch_query(query):
    """Prefix reserved query-syntax characters in ``query`` with a backslash.

    Args:
        query: Query text.

    Returns:
        str: ``query`` where every occurrence of ``&&``, ``||`` or one of
        ``+ - = > < ! ( ) { } [ ] ^ " ~ * ? : \\ /`` is preceded by a single
        backslash. A lone ``&`` or ``|`` is left untouched.
    """
    return _RESERVED_QUERY_CHARS.sub(r'\\\1', query)

# Run every distinct query and store exactly 500 product ids per query: the
# ids of the hits in ranking order, followed by -1 as filler when the search
# returned fewer than 500 products.
counter = 0
for counter, query in enumerate(search_term_list, start=1):
    # Escaping stays disabled; the raw query text is sent to the service.
    #query = escape_elasticsearch_query(query)
    returned_products = es.search(query)
    returned_ids = [product['product_uid'] for product in returned_products]

    top_ids = returned_ids[:500]
    query_id[str(query)] = top_ids + [-1] * (500 - len(top_ids))
    print(counter)

query_id

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96


{'salon chair': [7465,
  24010,
  9234,
  25431,
  39428,
  39461,
  24008,
  19456,
  24007,
  24009,
  18270,
  27443,
  24006,
  36910,
  20026,
  22130,
  33690,
  33689,
  33691,
  39429,
  18273,
  42329,
  18272,
  25132,
  2187,
  7468,
  22390,
  18276,
  19905,
  26070,
  7506,
  40996,
  42928,
  18271,
  31557,
  18277,
  31556,
  26069,
  18275,
  42330,
  26068,
  42929,
  209,
  1616,
  5450,
  6167,
  6168,
  28058,
  28059,
  31555,
  40997,
  18274,
  4034,
  4329,
  4330,
  25432,
  27534,
  26071,
  36869,
  7467,
  39414,
  36895,
  36894,
  36893,
  19170,
  29744,
  25434,
  36892,
  37072,
  37071,
  35793,
  21360,
  25433,
  22394,
  2636,
  11184,
  27541,
  41156,
  31311,
  29132,
  22462,
  36887,
  5938,
  39407,
  36868,
  3720,
  36867,
  31157,
  31159,
  1198,
  251,
  18203,
  39413,
  29746,
  39409,
  22391,
  4410,
  39408,
  34536,
  39415,
  2795,
  22994,
  19498,
  42637,
  39411,
  36870,
  39425,
  39423,
  8137,
  21190,
  31310,
  29585,
 

In [26]:
# One column per query; each column holds the 500 ids stored for that query
# (row position = rank, -1 = filler for queries with fewer hits).
result_query = pd.DataFrame(query_id)
result_query

Unnamed: 0,salon chair,dinosaur,turquoise pillows,sofa with ottoman,ombre rug,outdoor privacy wall,beds that have leds,chrome bathroom 4 light vanity light,gurney slade 56,sunflower,...,kitchen storage cabinet,trundle daybed,shoe closet,parsons chairs,wood bar stools,card table,bubble guppies chair,town & country living curtains,garage sports storage rack,hardwood beds
0,7465,34735,21028,38543,8536,8633,40178,32459,5123,37203,...,4765,1752,41969,21597,7157,40273,32872,34913,29141,775
1,24010,34737,11452,20619,9145,36851,25156,17491,5155,40761,...,6536,18848,22446,23847,35772,32808,6676,30353,27361,23849
2,9234,24094,5998,9687,8537,30538,28494,19632,16504,27500,...,38548,41699,39391,34816,3719,6980,38549,33434,33624,24024
3,25431,195,25658,9909,28172,7098,13599,15892,17994,35283,...,39061,842,22445,18161,14255,27022,38550,28697,39474,20641
4,39428,11840,30279,33252,25752,40883,13182,10259,9058,11848,...,35410,36264,39396,18329,21958,14186,14492,6212,31989,23848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,8994,-1,21238,15299,22295,13328,12438,29653,-1,-1,...,17545,-1,5605,37085,4446,3095,38522,24695,38228,4108
496,29410,-1,21409,15285,24400,13319,40220,5818,-1,-1,...,19282,-1,37008,27426,26972,16685,13799,9738,39152,4100
497,24159,-1,42798,15301,20093,13313,32705,24136,-1,-1,...,19217,-1,37009,39742,21442,16691,17405,18359,20213,6008
498,23696,-1,12226,15288,42194,13339,42915,10612,-1,-1,...,14509,-1,1623,19788,38993,29097,13790,11743,7911,9451


In [27]:
# reset_index() turns the row position into a regular column, so the rank is
# written to the CSV even though the (new) index itself is not (index=False).
result_query.reset_index().to_csv("result_search_wands.csv", index=False)