In [1]:
import numpy as np
from PyDictionary import PyDictionary
import string
import eventlet
import socketio
import csv
from math import log

In [2]:
import gensim
from gensim.models import KeyedVectors

# Announce start-up, then read the pre-trained word vectors from disk.
# The file is in the plain-text word2vec layout, hence binary=False.
print("Running")
model = KeyedVectors.load_word2vec_format(
    "enwiki_20180420_100d.txt",
    binary=False,
)


Running


In [3]:
class Product:
    """A single catalogue entry.

    The three numeric fields (``price``, ``review_score``, ``num_reviews``)
    may be supplied as strings (as read from a CSV row), as numbers, or as
    ``None``. Missing or blank values are stored as zero.
    """

    def __init__(self, name, url, price, review_score, num_reviews, description, category):
        """Store the fields, coercing the numeric ones to ``float``/``int``.

        Raises:
            ValueError: if a non-blank numeric field cannot be parsed.
        """
        self.name = name
        self.url = url
        self.price = float(self._blank_to_zero(price))
        self.review_score = float(self._blank_to_zero(review_score))
        self.num_reviews = int(self._blank_to_zero(num_reviews))
        self.description = description
        self.category = category

    @staticmethod
    def _blank_to_zero(value):
        """Return ``value`` unchanged unless it is None or a blank string, then ``'0'``."""
        if value is None:
            return '0'
        # Whitespace-only strings would otherwise make float()/int() raise.
        if isinstance(value, str) and not value.strip():
            return '0'
        return value

    def __lt__(self, other):
        """Order products alphabetically by name (used as a sort tie-breaker)."""
        if not isinstance(other, Product):
            # Let Python try the reflected operation / raise TypeError itself.
            return NotImplemented
        return self.name < other.name

    def to_dict(self):
        """Return the product as a plain dict of built-in types."""
        return {
            'name': self.name,
            'url': self.url,
            'price': float(self.price),
            'review_score': float(self.review_score),
            'num_reviews': int(self.num_reviews),
            'description': self.description,
            'category': self.category,
        }

In [4]:
# Build the in-memory catalogue: one Product per CSV data row, keyed by the
# value in column 1 (passed to Product as its url).
products = dict()
# newline='' is what the csv module expects so that newlines embedded in
# quoted fields are parsed correctly; the with-block guarantees the file is
# closed even if a row fails to parse.
with open('products.csv', 'r', encoding="utf-8", newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    next(reader, None)  # skip the header row (no-op on an empty file)
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; there is nothing to load.
            continue
        # Columns 5 and 6 are joined with a space to form the description.
        prod = Product(row[0], row[1], row[2], row[3], row[4], row[5] + " " + row[6], row[7])
        products[row[1]] = prod
    


In [5]:
# Quick sanity check of the loaded embeddings: print the similarity score
# the model reports for two sample words.
word1 = "scott"
word2 = "alex"
try:
    print(model.similarity(word1, word2))
except KeyError:
    # Only the "unknown word" lookup failure is handled here; any other
    # error (e.g. the model not being loaded) should surface normally.
    print("One of the words not in the dictionary")

0.76212156


In [6]:
# Words that are skipped when phrase vectors are built (see the
# `word in bad_words` checks). Repeated entries are harmless in a set.
bad_words = {
    "to", "i", "am", "the", "you", "me", "is", "it", "this", "and", "or",
    "in", "who", "whom", "their", "there", "they're", "Donald Trump",
    "a", "about", "all", "also", "and", "as", "at",
    "be", "because", "but", "by",
    "can", "come", "could",
    "day", "do",
    "even",
    "find", "first", "for", "from",
    "get", "give", "go",
    "have", "he", "her", "here", "him", "his", "how",
    "I", "if", "in", "into", "it", "its",
    "just",
    "know",
    "like", "look",
    "make", "man", "many", "me", "more", "my",
    "new", "no", "not", "now",
    "of", "on", "one", "only", "or", "other", "our", "out",
    "people",
    "say", "see", "she", "so", "some",
    "take", "tell", "than", "that", "the", "their", "them", "then",
    "there", "these", "they", "thing", "think", "this", "those", "time",
    "to", "two",
    "up", "use",
    "very",
    "want", "way", "we", "well", "what", "when", "which", "who", "will",
    "with", "would",
    "year", "you", "your",
}

# Translation table that deletes every ASCII punctuation character.
# (Passing string.punctuation itself to str.translate does NOT do this: the
# string is then used as a lookup table indexed by code point.)
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _phraseVector(phrase):
    """Return the unit-length sum of the word vectors in ``phrase``.

    The phrase is lower-cased, stripped of punctuation and split on
    whitespace. Words in ``bad_words`` and words unknown to ``model`` are
    skipped.

    Returns:
        A 1-D numpy array of length ``model.vector_size``, or ``None`` when
        no usable word contributed (the sum has zero length and cannot be
        normalised).
    """
    tokens = phrase.lower().translate(_PUNCTUATION_TABLE).split()
    total = np.zeros(shape=(model.vector_size,))
    for word in tokens:
        if word in bad_words or word not in model:
            continue
        total += model[word]

    norm = np.linalg.norm(total)
    if norm == 0:
        return None
    return total / norm


def comparePhrases(phrase1, phrase2):
    """Return the cosine similarity between two phrases.

    Each phrase is turned into a normalised sum of its word vectors and the
    result is the dot product of the two.

    Args:
        phrase1: First phrase (free text).
        phrase2: Second phrase (free text).

    Returns:
        A float similarity score, or ``0.0`` when either phrase contains no
        word that is both outside ``bad_words`` and known to ``model``
        (previously this case produced NaN via a division by zero).
    """
    vec1 = _phraseVector(phrase1)
    vec2 = _phraseVector(phrase2)
    if vec1 is None or vec2 is None:
        return 0.0
    return float(np.dot(vec1, vec2))

def relevanceHeuristic(searchTerm, product, threshold=0.6):
    """Score how relevant ``product`` is to ``searchTerm``.

    Only the product's name is compared against the search term.

    Args:
        searchTerm: The user's query text.
        product: Object exposing a ``name`` attribute.
        threshold: Minimum similarity for the product to count as relevant.

    Returns:
        The similarity from ``comparePhrases`` when it is at least
        ``threshold``; otherwise ``0``. An undefined (NaN) similarity is
        also reported as ``0``.
    """
    heuristic = comparePhrases(searchTerm, product.name)
    # Written as `not >=` rather than `<` so that NaN (for which every
    # comparison is False) is rejected instead of being passed through.
    if not heuristic >= threshold:
        return 0
    return heuristic

def popularityHeuristic(product, relevanceScore):
    """Combine a relevance score with a review-based popularity bonus.

    Args:
        product: Object exposing ``num_reviews`` and ``review_score``.
        relevanceScore: Relevance already computed for this product.

    Returns:
        ``relevanceScore`` unchanged when the product has no reviews;
        otherwise ``relevanceScore`` plus a bonus built from the review
        count (log base 100), the review score, and the tenth power of
        ``relevanceScore - 0.1``.
    """
    review_count = product.num_reviews
    if review_count <= 0:
        # Nothing to base a popularity bonus on.
        return relevanceScore

    volume_factor = log(review_count, 100) / 10 + 0.5
    rated_volume = volume_factor * product.review_score
    relevance_gate = 3 * (relevanceScore - 0.1)**10
    bonus = rated_volume * relevance_gate

    return relevanceScore + bonus
    

In [None]:
# Socket.IO server plus the WSGI app that wraps it. The app also serves
# index.html as text/html at the root path.
# NOTE(review): cors_allowed_origins='*' looks like it admits every origin;
# confirm that is intended outside local development.
sio = socketio.Server(cors_allowed_origins='*')
app = socketio.WSGIApp(
    sio,
    static_files={
        '/': {'filename': 'index.html', 'content_type': 'text/html'},
    },
)

@sio.event
def connect(sid, environ):
    """Log the session id of a newly connected client.

    ``environ`` is accepted but not used here.
    """
    log_fields = ('connect ', sid)
    print(*log_fields)

@sio.event
def my_message(sid, data):
    """Log the payload of a ``my_message`` event; ``sid`` is not used."""
    log_fields = ('message ', data)
    print(*log_fields)

@sio.event
def disconnect(sid):
    """Log the session id of a client that has disconnected."""
    log_fields = ('disconnect ', sid)
    print(*log_fields)

def _rankProducts(query, catalog, limit=200):
    """Return up to ``limit`` relevant products from ``catalog``, best first.

    A product is kept only when ``relevanceHeuristic`` gives it a score
    above zero. Results are ordered by popularity score, then relevance
    score, then product name, all descending.
    """
    scored = []
    for product in catalog:
        relevanceScore = relevanceHeuristic(query, product)
        if relevanceScore > 0:
            popularityScore = popularityHeuristic(product, relevanceScore)
            scored.append((popularityScore, relevanceScore, product))

    # Explicit key so ties never depend on comparing Product objects.
    scored.sort(key=lambda entry: (entry[0], entry[1], entry[2].name), reverse=True)
    return [entry[2] for entry in scored[:limit]]


@sio.event
def execute_search(sid, data):
    """Handle an ``execute_search`` event and reply with ranked products.

    Args:
        sid: Session id of the requesting client; the reply is sent to it.
        data: The search query (converted with ``str``). The literal
            ``"exit"`` asks the server to stop instead of searching.

    Emits:
        ``'reply'`` with ``{'response': [...]}`` — at most 200 product
        dicts, best match first.
    """
    print(sid, data)
    query = str(data)
    if query == "exit":
        # NOTE(review): confirm this Server object actually exposes stop().
        sio.stop()
        return  # a shutdown request is not a search

    # The previous max(200, len(...)) bound walked past the start of the
    # list (IndexError / duplicated entries); slicing caps the result safely.
    returnList = [product.to_dict() for product in _rankProducts(query, products.values())]
    sio.emit('reply', {'response': returnList}, room=sid)

if __name__ == '__main__':
    # Listen on port 12345 on all interfaces and serve the app until stopped.
    listener = eventlet.listen(('', 12345))
    eventlet.wsgi.server(listener, app)

(13404) wsgi starting up on http://0.0.0.0:12345
(13404) accepted ('127.0.0.1', 49204)
127.0.0.1 - - [18/Oct/2020 07:03:37] "GET /socket.io/?EIO=3&transport=polling&t=NKxb-zE HTTP/1.1" 200 439 0.000998


connect  e0df20fa050f4140a7e7b791c545fb54
connect  79b1d8c79c694ae59094436e5c2b552d


127.0.0.1 - - [18/Oct/2020 07:03:37] "GET /socket.io/?EIO=3&transport=polling&t=NKxb_Oi HTTP/1.1" 200 423 0.000996
(13404) accepted ('127.0.0.1', 49202)
(13404) accepted ('127.0.0.1', 49212)
(13404) accepted ('127.0.0.1', 49213)
(13404) accepted ('127.0.0.1', 49203)
127.0.0.1 - - [18/Oct/2020 07:03:38] "GET /socket.io/?EIO=3&transport=polling&t=NKxb_mE&sid=79b1d8c79c694ae59094436e5c2b552d HTTP/1.1" 200 235 0.421180
(13404) accepted ('127.0.0.1', 49215)
127.0.0.1 - - [18/Oct/2020 07:03:38] "GET /socket.io/?EIO=3&transport=polling&t=NKxb_mE&sid=e0df20fa050f4140a7e7b791c545fb54 HTTP/1.1" 200 251 0.165239


e0df20fa050f4140a7e7b791c545fb54 computer




e0df20fa050f4140a7e7b791c545fb54 laptop
e0df20fa050f4140a7e7b791c545fb54 cookie jar
e0df20fa050f4140a7e7b791c545fb54 cookie dough
e0df20fa050f4140a7e7b791c545fb54 cookie dough
e0df20fa050f4140a7e7b791c545fb54 cheese
79b1d8c79c694ae59094436e5c2b552d pet food
