In [1]:
import json
import os

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util


In [2]:
# Instantiate the SentenceTransformer identified by 'all-MiniLM-L6-v2'. `model` is the
# encoder behind every later `model.encode(...)` call in this notebook.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:
# Load the labelled training examples. Later cells embed the dict's keys
# (`train_words`) and read `data[word]` as the label of the matched key.
# JSON is UTF-8 by specification, so the encoding is pinned instead of relying
# on the platform's locale default.
with open('data/train_for_transformer.json', encoding='utf-8') as f:
    data = json.load(f)

In [4]:
# Per-site product listings. Later cells read the `root_domain` and `products` columns.
products_data = pd.read_parquet('data/company-data-products.snappy.parquet', engine='pyarrow')

In [5]:
# Per-site service listings. Later cells read the `root_domain` and `services` columns.
services_data = pd.read_parquet('data/company-data-services.snappy.parquet', engine='pyarrow')

In [10]:
# Preview the first rows of the products table to check its columns.
products_data.head()

Unnamed: 0,root_domain,products
0,1-877-quikdry.com,"[Tile Grout Clean, Thermal Imaging, Carpets Cl..."
1,10torsions.com,"[Messagerie, Escape Game, Nan, Alimentations, ..."
2,13xlradio.com,"[Soul Music, After Dark Grooves, Music Related..."
3,1801-za.all.biz,"[Electrical Heating, Pedicure Treatment, Metal..."
4,1st-caliber-bail-bonds-llc.business.site,"[1st Caliber Bail Bonds, Bail Bonds, Bail Bond..."


In [5]:
# The training texts are the keys of `data`, kept in the dict's own order so
# that position i in `train_words` lines up with row i of the embeddings.
train_words = [*data.keys()]

In [25]:
# Embed every training text using encode()'s default batch size and output format.
# NOTE(review): no later cell visible here reads `emb_train`; the classifiers use
# `emb_train_2` instead, so this looks like a duplicate (and slow) encoding pass —
# confirm before removing.
emb_train = model.encode(train_words, show_progress_bar=True)

Batches:   0%|          | 0/1655 [00:00<?, ?it/s]

In [6]:
# Embed the training texts as a tensor (convert_to_tensor=True) in batches of 64.
# `classify` and `classify_bad` compare product embeddings against this matrix.
emb_train_2 = model.encode(train_words, batch_size=64, show_progress_bar=True, convert_to_tensor=True)

Batches:   0%|          | 0/828 [00:00<?, ?it/s]

In [164]:
# Pick one random row to inspect by hand. The same index is applied to both
# `products_data` and `services_data` in the cells below, so it must be valid
# for the shorter of the two frames; bounding it by `services_data` alone can
# produce an index that does not exist in `products_data`.
site_index = np.random.randint(0, min(len(products_data), len(services_data)))

In [165]:
# Domain of the randomly chosen row in the products table.
# NOTE(review): Series[...] with an int looks the value up by index label, which
# equals the row position only with a default RangeIndex — confirm.
site = products_data['root_domain'][site_index]
site

'getprintbox.com'

In [108]:
# Product list of the first row whose `root_domain` equals `site`.
products = products_data[products_data['root_domain'] == site]['products'].values[0]
products

array(['Wood Pellets A1 Enplus Wood Pellet',
       'Wood Pellets Acacia Wood Pellet',
       'Wood Pellets Beech Wood Pellet For Sale',
       'Wood Pellets Ruf Bark Briquettes For Sale', 'Lignetics Pellets',
       'Bio Fuels', 'Wood Pellets', 'Enplus Wood', 'Heating System',
       'Wood Pellets Ruf', 'Mixed Wood', 'Pure Wood Pellets',
       'Pine Firewood', 'Pine Wood Pellet', 'Beech Wood Pellets',
       'Din Plus Wood Pellets', 'Wood Pellets A1', 'Driving Suits',
       'Biomass Pellets Fuel', 'Wood Pellets Beech Wood Pellet',
       'Charcoal Hexagonal Charcoal For Sale',
       'Charcoal Hardwood Charcoal', 'Charcoal Hexagonal Charcoal',
       'Fireplace Inserts', 'Pellet Stoves', 'Energex Pellets',
       'Wood Pellets Pine Wood Pellet', 'Products Tagged',
       'Birch Firewood', 'Pine Firewood Birch Firewood',
       'Lignetics Pellets Pine Firewood',
       'Wood Pellets Spruce Wood Pellet', 'Polar Firewood', 'Fire Wood',
       'Fire Wood Polar Firewood', 'A1 Enplus Wood

In [167]:
# Domain found at the same `site_index` in the services table; this overwrites the
# `site` value taken from the products table earlier.
site = services_data['root_domain'][site_index]
site

'haciendarenacer.org'

In [168]:
# Service list of the first row whose `root_domain` equals `site`.
services = services_data[services_data['root_domain'] == site]['services'].values[0]
# NOTE(review): from here on `products` holds services, not products — presumably so
# the product-oriented cells below can be reused unchanged; confirm this is intended.
products = services

In [7]:
def classify(product_names):
    """Label each product name with the category of its nearest training text.

    Every name is embedded with `model` and compared by cosine similarity
    against the pre-computed training embeddings `emb_train_2`. The winner for
    a name is the training text with the highest similarity.

    Args:
        product_names: a sequence (list or NumPy array) of strings. A single
            bare string is also accepted and treated as a one-element list.

    Returns:
        A list with one `(category, score)` tuple per input name, in input
        order. `category` is `data[train_words[best_index]]` and `score` is
        the winning cosine similarity as a Python float. The score is a
        similarity, not a calibrated accuracy. An empty input yields `[]`.
    """
    if isinstance(product_names, str):
        product_names = [product_names]
    if len(product_names) == 0:
        return []

    # Ask for a tensor so the queries stay in the same form as `emb_train_2`,
    # which was produced by the same model with convert_to_tensor=True.
    query_emb = model.encode(product_names, convert_to_tensor=True)
    # One (n_queries, n_train) similarity matrix instead of one call per name.
    scores = util.cos_sim(query_emb, emb_train_2)
    # Row-wise max done by the tensor library: Python's built-in max() would
    # iterate over every training entry one element at a time, and np.argmax
    # cannot read a tensor that is not in CPU memory.
    best_scores, best_idx = scores.max(dim=1)
    return [
        (data[train_words[idx]], score)
        for idx, score in zip(best_idx.tolist(), best_scores.tolist())
    ]

In [75]:
def classify_bad(product_names, k=3):
    """Label each product name by majority vote among its k nearest training texts.

    Every name is embedded with `model` and compared by cosine similarity
    against `emb_train_2`. The k most similar training texts each vote for
    their own label (`data[train_words[index]]`); the label with the most
    votes wins, and a tie goes to the label of the most similar neighbour.

    Args:
        product_names: a sequence (list or NumPy array) of strings. A single
            bare string is also accepted and treated as a one-element list.
        k: number of neighbours that vote. Values below 1 are treated as 1 and
            values above the number of training texts are clamped to it.

    Returns:
        A list with one `(category, score)` tuple per input name, in input
        order, where `score` is the highest cosine similarity found for that
        name as a Python float. An empty input yields `[]`.
    """
    if isinstance(product_names, str):
        product_names = [product_names]
    if len(product_names) == 0:
        return []

    query_emb = model.encode(product_names, convert_to_tensor=True)
    scores = util.cos_sim(query_emb, emb_train_2)

    k = max(1, min(k, scores.shape[1]))
    # topk returns the k LARGEST similarities per row, best first. Taking the
    # first k entries of an ascending argsort would select the k least similar
    # training texts instead.
    top_scores, top_idx = scores.topk(k, dim=1)

    results = []
    for row_scores, row_idx in zip(top_scores.tolist(), top_idx.tolist()):
        # Vote over labels, not over training-row indices: distinct indices
        # never repeat, so counting them cannot express a majority.
        votes = {}
        for idx in row_idx:
            label = data[train_words[idx]]
            votes[label] = votes.get(label, 0) + 1
        # dicts keep insertion order and max() returns the first maximum, so a
        # tie is resolved in favour of the most similar neighbour's label.
        winner = max(votes, key=votes.get)
        results.append((winner, row_scores[0]))
    return results

In [8]:
# Mapping from a predicted label to its parent category; the grouping cells
# look labels up as `categories_correspondance[label]`.
# JSON is UTF-8 by specification, so the encoding is pinned instead of relying
# on the platform's locale default.
# NOTE(review): the extension is spelled '.josn' — confirm it matches the file on disk.
with open('data/categories_correspondance.josn', encoding='utf-8') as f:
    categories_correspondance = json.load(f)

In [None]:
# Similarity cut-offs applied to the score returned by classify().
MIN_SIMILARITY = 0.5         # at or below this, the product is dropped entirely
CONFIDENT_SIMILARITY = 0.7   # at or above this, the predicted label is trusted
N_SITES = 1000               # number of leading rows of products_data to process
PREDICTIONS_DIR = 'data/predictions'


def _group_by_subcategory(titles, predictions):
    """Bucket product titles by predicted label.

    Args:
        titles: product titles, aligned by position with `predictions`.
        predictions: `(label, score)` tuples as returned by `classify`.

    Returns:
        A dict mapping each label to a list of `{'title': ...}` entries. The
        'other' key is always present and collects products whose score lies
        strictly between the two thresholds; products scoring at or below
        MIN_SIMILARITY are left out.
    """
    grouped = {'other': []}
    for title, (label, score) in zip(titles, predictions):
        if score <= MIN_SIMILARITY:
            continue
        bucket = label if score >= CONFIDENT_SIMILARITY else 'other'
        grouped.setdefault(bucket, []).append({'title': title})
    return grouped


def _group_by_category(subcategories, parent_of):
    """Nest label buckets under their parent category.

    Args:
        subcategories: output of `_group_by_subcategory`.
        parent_of: mapping from a label to its parent category name.

    Returns:
        A dict mapping each parent category to a list of
        `{'title': label, 'products': [...]}` entries. The 'other' bucket is
        filed under an 'other' category, which is always present.

    Raises:
        KeyError: if a label has no entry in `parent_of`.
    """
    grouped = {'other': []}
    for name, items in subcategories.items():
        parent = 'other' if name == 'other' else parent_of[name]
        grouped.setdefault(parent, []).append({'title': name, 'products': items})
    return grouped


# Create the output directory up front so the first write cannot fail on a fresh checkout.
os.makedirs(PREDICTIONS_DIR, exist_ok=True)

# Never run past the end of the table when it has fewer than N_SITES rows.
for site_index in range(min(N_SITES, len(products_data))):
    # Read the row by position; scanning the whole root_domain column again to
    # find a row that is already known is wasted work.
    row = products_data.iloc[site_index]
    site = row['root_domain']
    products = row['products']

    sim_words = classify(products)
    subcategories = _group_by_subcategory(products, sim_words)
    categories_dict = _group_by_category(subcategories, categories_correspondance)
    final_json = {
        'categories': [
            {'title': name, 'subcategories': subs}
            for name, subs in categories_dict.items()
        ]
    }

    # One file per processed row, named after the row position. Per-product
    # progress is not printed here: across N_SITES sites it floods the output.
    with open(f'{PREDICTIONS_DIR}/categ_{site_index}.json', 'w', encoding='utf-8') as f:
        json.dump(final_json, f, indent=4)

# sim_words = classify(['eating healthy'])

In [170]:
# Number of (label, score) entries currently held in `sim_words`.
len(sim_words)


11

In [171]:
# Show the titles that the grouping cell below pairs with `sim_words` by position.
products

array(['Internet Service Provider', 'Statistics Statistics',
       'Marketing Marketing', 'Servicios De Capacitación',
       'Reiki Y Aromaterapia', 'Galería', 'Misión', 'Visión',
       'De Servicios', 'Eventos', 'Marketing'], dtype=object)

In [172]:
# Bucket every title in `products` by its predicted label from `sim_words`:
#   score <= 0.5        -> dropped (nothing printed)
#   0.5 < score < 0.7   -> filed under 'other'
#   score >= 0.7        -> filed under the predicted label
# Each kept entry is echoed as "index label score".
subcategories = {'other': []}
for i, (label, score) in enumerate(sim_words):
    if score <= 0.5:
        continue
    bucket = label if score >= 0.7 else 'other'
    subcategories.setdefault(bucket, []).append({'title': products[i]})
    print(i, label, score)


0 computer services 0.8463644981384277
1 statistics 0.9407881498336792
2 marketing and distribution 0.7853572964668274
3 oil and gas restoration and reclamation services 0.5646058320999146
7 patient exam and monitoring products 0.6296977996826172
8 livestock services 0.5326584577560425
9 fresh vegetables 0.6064268350601196
10 advertising 0.7617133259773254


In [143]:
# Inspect the products filed under the 'fuels' label.
# NOTE(review): this raises KeyError unless `subcategories` currently has a 'fuels'
# key. The labels printed by the grouping cell above do not include it, so the
# output below appears to come from an earlier run on a different site — confirm.
subcategories['fuels']

[{'title': 'Bio Fuels'},
 {'title': 'Charcoal Hardwood Charcoal'},
 {'title': 'Charcoal Hexagonal Charcoal'},
 {'title': 'Hexagonal Charcoal'}]

In [144]:
import json

# Reload the label -> parent-category mapping used by the grouping cell below.
# JSON is UTF-8 by specification, so the encoding is pinned instead of relying
# on the platform's locale default.
# NOTE(review): the extension is spelled '.josn' — confirm it matches the file on disk.
with open('data/categories_correspondance.josn', encoding='utf-8') as f:
    categories_correspondance = json.load(f)

In [145]:
# Nest each label bucket under its parent category. The 'other' bucket has no
# entry in the mapping and is filed under an 'other' category, which always exists.
categories_dict = {'other': []}
for key, items in subcategories.items():
    parent = 'other' if key == 'other' else categories_correspondance[key]
    categories_dict.setdefault(parent, []).append({'title': key, 'products': items})

In [146]:
# Show the parent-category -> subcategory entries built in the previous cell.
categories_dict

{'other': [{'title': 'other',
   'products': [{'title': 'Wood Pellets A1 Enplus Wood Pellet'},
    {'title': 'Wood Pellets Acacia Wood Pellet'},
    {'title': 'Wood Pellets Beech Wood Pellet For Sale'},
    {'title': 'Wood Pellets Ruf Bark Briquettes For Sale'},
    {'title': 'Lignetics Pellets'},
    {'title': 'Enplus Wood'},
    {'title': 'Wood Pellets Ruf'},
    {'title': 'Pure Wood Pellets'},
    {'title': 'Beech Wood Pellets'},
    {'title': 'Din Plus Wood Pellets'},
    {'title': 'Wood Pellets A1'},
    {'title': 'Biomass Pellets Fuel'},
    {'title': 'Wood Pellets Beech Wood Pellet'},
    {'title': 'Charcoal Hexagonal Charcoal For Sale'},
    {'title': 'Pellet Stoves'},
    {'title': 'Energex Pellets'},
    {'title': 'Wood Pellets Pine Wood Pellet'},
    {'title': 'Products Tagged'},
    {'title': 'Lignetics Pellets Pine Firewood'},
    {'title': 'Wood Pellets Spruce Wood Pellet'},
    {'title': 'Polar Firewood'},
    {'title': 'A1 Enplus Wood Pellet'},
    {'title': 'Enplus Woo

In [147]:
# Flatten the category dict into the output schema: a list of
# {'title': category, 'subcategories': [...]} records under 'categories'.
category_entries = []
for key, subs in categories_dict.items():
    category_entries.append({'title': key, 'subcategories': subs})
final_json = {'categories': category_entries}

In [148]:
# Show the structure that the next cell writes to disk.
final_json

{'categories': [{'title': 'other',
   'subcategories': [{'title': 'other',
     'products': [{'title': 'Wood Pellets A1 Enplus Wood Pellet'},
      {'title': 'Wood Pellets Acacia Wood Pellet'},
      {'title': 'Wood Pellets Beech Wood Pellet For Sale'},
      {'title': 'Wood Pellets Ruf Bark Briquettes For Sale'},
      {'title': 'Lignetics Pellets'},
      {'title': 'Enplus Wood'},
      {'title': 'Wood Pellets Ruf'},
      {'title': 'Pure Wood Pellets'},
      {'title': 'Beech Wood Pellets'},
      {'title': 'Din Plus Wood Pellets'},
      {'title': 'Wood Pellets A1'},
      {'title': 'Biomass Pellets Fuel'},
      {'title': 'Wood Pellets Beech Wood Pellet'},
      {'title': 'Charcoal Hexagonal Charcoal For Sale'},
      {'title': 'Pellet Stoves'},
      {'title': 'Energex Pellets'},
      {'title': 'Wood Pellets Pine Wood Pellet'},
      {'title': 'Products Tagged'},
      {'title': 'Lignetics Pellets Pine Firewood'},
      {'title': 'Wood Pellets Spruce Wood Pellet'},
      {'title

In [149]:
# Write `final_json` to data/categories.json, indented by 4 spaces; an existing
# file at that path is overwritten ('w' mode).
with open('data/categories.json', 'w') as f:
    json.dump(final_json, f, indent=4)