In [1]:
import spacy

import numpy as np

import pandas as pd

In [2]:
# Load the spaCy 'en_core_web_lg' pipeline; `nlp(text).vector` is used below to embed product text.
nlp = spacy.load('en_core_web_lg')

In [5]:
# Read only the 'id' and 'all' columns from the CSV and use 'id' as the DataFrame index.
processed_data = pd.read_csv('./processed_product_data.csv', index_col='id', usecols=['id','all'])

In [8]:
# Preview the first few entries of the 'all' text column.
processed_data['all'].head()

id
e7a780c3-502c-405a-9d2c-9e9eb0b638f5    Fresh flowers Mum Disbud H30 Regular Wide Xmas...
ea3ed24e-8912-4424-a517-88202697b094    Collateral materials Curly Minuet Medium Narro...
85fe47e7-a44d-498e-82cf-36d654668360    Premade bouquets Wreath Magical Bluebells Doub...
04f206e1-3198-4fdb-975a-2138aba547b4    Collateral materials Flax White Needle Mega Re...
b4ac0ef3-1a7f-4549-b6f8-80fcd8b7d3ed    Cut Greens Rose Cream Cascade 1 Bl SPX Yellow ...
Name: all, dtype: object

In [14]:
# Plain array of the 'all' strings. The vectors below are built from it in order,
# so a neighbor index i returned by the model refers to all_text[i].
all_text = processed_data['all'].values

In [16]:
# Convert our text to vectors

def get_word_vectors(words):
    """Return one vector per entry of ``words``, in input order.

    Each entry is passed on its own through the module-level ``nlp``
    callable and the ``.vector`` attribute of the result is collected.
    """
    vectors = []
    for text in words:
        doc = nlp(text)
        vectors.append(doc.vector)
    return vectors

# Embed every entry of all_text (one nlp() call per entry); the result is a list aligned with all_text.
all_text_vectors = get_word_vectors(all_text)

In [25]:
# Build a Nearest Neighbors model that will do all of the similarity searching stuff for us 😱

# Import the NearestNeighbors class from sklearn

from sklearn.neighbors import NearestNeighbors

# How many neighbors each query returns by default.
NUMBER_OF_NEIGHBORS = 5

# Only n_neighbors is set here; every other setting keeps the library default.
neig_model = NearestNeighbors(n_neighbors=NUMBER_OF_NEIGHBORS)

In [26]:
# Fit the model on the text vectors; row i of the fitted data corresponds to all_text[i].
neig_model.fit(all_text_vectors)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [28]:
# Query with a product string that appears in the dataset, so it should come back as its own nearest neighbor.
dist, ind = neig_model.kneighbors([nlp('Fresh flowers Mum Disbud H30 Regular Wide Xmas Green 110 Procona').vector])

# The query is a one-element list, so the indices come back wrapped in an outer list;
# ind[0] holds the neighbor positions for our single query.
np.take(all_text, ind[0]) 

array(['Fresh flowers Mum Disbud H30 Regular Wide Xmas Green 110 Procona',
       'Fresh flowers Centaurea Manon Bundle Xmas Green 35 Procona',
       'Fresh flowers Germini Pacific Bernary 250 Grms Orange Garden 12192 FB',
       'Fresh flowers Eryngium Itsaparty Regular Bicolor Creme Pink 45 Tray',
       'Fresh flowers Aethiopica Diletta Cream 7 plus BL Purple Bicolor 48 Box Type'],
      dtype=object)

In [98]:
# Same query with the first word changed ('Fresher' instead of 'Fresh') to see how the neighbors shift.
dist, ind = neig_model.kneighbors([nlp('Fresher flowers Mum Disbud H30 Regular Wide Xmas Green 110 Procona').vector])
# Map the neighbor positions of the single query back to their product strings.
np.take(all_text, ind[0])

array(['Fresh flowers Mum Disbud H30 Regular Wide Xmas Green 110 Procona',
       'Fresh flowers Centaurea Manon Bundle Xmas Green 35 Procona',
       'Fresh flowers Germini Pacific Bernary 250 Grms Orange Garden 12192 FB',
       'Fresh flowers Amaranthus Hanging Cote DAzur 2022 cm Green Bicolor 80 Procona Large',
       'Fresh flowers Eryngium Itsaparty Regular Bicolor Creme Pink 45 Tray'],
      dtype=object)

In [96]:
def get_product_id(products, df, col='all'):
    """Look up the index label of each product string in ``df``.

    Parameters
    ----------
    products : iterable
        Values to search for in ``df[col]`` (exact equality).
    df : pandas.DataFrame
        Frame whose index holds the ids to return.
    col : str, default 'all'
        Name of the column the products are matched against.

    Returns
    -------
    list
        One index label per entry of ``products``, in input order. When a
        product matches several rows, the label of the first match is used.

    Raises
    ------
    IndexError
        If a product has no matching row in ``df[col]``.
    """
    # Select the column once instead of on every iteration.
    column = df[col]
    product_ids = []
    for product in products:
        matches = column[column == product].index
        if len(matches) == 0:
            # Same exception type as a bare ``.index[0]`` on an empty result,
            # but the message says which product could not be found.
            raise IndexError(
                "product {!r} not found in column {!r}".format(product, col)
            )
        product_ids.append(matches[0])
    return product_ids