In [1]:
import os
import re
import json
import gzip
import typing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [1]:
import sys

# Put the parent directory on the import path; presumably this is what lets
# the project-local `shared` package be imported from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# stacks duplicate '../' entries onto sys.path.
if '../' not in sys.path:
    sys.path.append('../')

# Folder handed to the data helpers used in the rest of the notebook.
SHARED_DATA_FOLDER = '../shared/data'


**TODO:** check whether the shared data folder exists and create it if it does not.

In [None]:
from shared.model.data.utils import (
    pre_process_initial_file, 
    get_shared_data_folder_children,
    get_map_engineering_features, 
    EngineeringFeatures,
    EmbeddingFeatures
)

### Loading data

**TODO:** check whether the input file has already been pre-processed or is a new one.

In [4]:
# Build the working dataframe by running the project's pre-processing helper
# against the shared data folder.
# NOTE(review): whether the helper reuses an already pre-processed file or
# always starts from the raw one is not visible here — confirm.
df = pre_process_initial_file(SHARED_DATA_FOLDER)

## Extracting Features

### Engineering Features

For the engineering features, we first have to compute the price and brand maps.

In [5]:
# Unpack the three values returned for the shared data folder's children,
# in the order images, brand, price.
(
    shared_data_folder_images,
    shared_data_folder_brand,
    shared_data_folder_price,
) = get_shared_data_folder_children(SHARED_DATA_FOLDER)

In [6]:
# Compute the price and brand maps from the dataframe and their folders.
# NOTE(review): the two trailing positional booleans are not explained in this
# notebook — check the helper's signature before changing them.
price_df, brand_df = get_map_engineering_features(
    df, shared_data_folder_price, shared_data_folder_brand, False, False
)

After this, we are already able to compute the features we need from the dataframe.

In [7]:
# Engineering features are built on top of the price and brand maps.
engineering_features = EngineeringFeatures(price_df, brand_df)
# Embedding features are given the images folder. The ViT checkpoint messages
# in this cell's output presumably come from this constructor — confirm.
embedding_features = EmbeddingFeatures(shared_data_folder_images)

Some weights of the model checkpoint at nateraw/vit-base-beans were not used when initializing ViTModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTModel were not initialized from the model checkpoint at nateraw/vit-base-beans and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Preview the engineering features of the first row only: iterating over a
# one-row frame visits exactly the row the original loop reached before `break`.
for idx, row in df.head(1).iterrows():
    print(engineering_features.get_features(
        *(row[column] for column in ('price', 'brand', 'also_buy', 'also_view'))
    ))

(0.17834236430869266, dict_values([0.0, 0.0, 0.0, 0.0, 0.0, 0.02, 0.04, 0.0, 0.07, 0.0, 0.02, 0.05, 0.0, 0.0, 0.76, 0.0, 0.0, 0.0, 0.02, 0.01, 0.0, 0.0]), -0.5, -0.5)


In [9]:
# Show the first three rows to inspect the raw columns that feed the features.
df.head(3)

Unnamed: 0,also_buy,also_view,asin,brand,description,feature,image,price,title,main_cat
0,,,B00ADZ3WUM,NSI,('Bumpersticker: A day without sunshine is lik...,('Official Licensed Die-Cut Sticker Designed b...,,4.68,"NSI - A Day Without Sunshine is Like, Well, Ni...",Automotive
1,,,B005VII5IU,General Motors,('This is the official Genuine General Motors ...,('This is the official Genuine General Motors ...,,213.16,Genuine GM Parts 10341533 Rear Bumper Valance ...,Automotive
2,,,B001QTEKVO,JLM,('HID Xenon lights are designed to be at least...,"('Will run for approx 2500 hours', 'Produces 2...",('https://images-na.ssl-images-amazon.com/imag...,,JLM HID Conversion Kit H13 (9008) Dual Tube B...,Automotive


### Embedding Features

In [10]:
for idx, row in df.iterrows():
    print(await embedding_features.get_features(
        row['image'],
        row['description'],
        row['feature'],
        row['title'],
    ))
    break

(None, tensor([ 2.8307e-01, -6.5253e-02,  4.2729e-01,  3.6580e-01, -3.6476e-01,
        -5.0739e-01, -1.2076e-01,  1.7040e-01, -1.8815e-01, -2.8772e-01,
        -6.8744e-02, -4.1055e-02, -7.6253e-03, -4.8804e-02, -1.9893e-01,
         1.5596e-01,  1.1401e-01, -2.0138e-02, -2.7762e-01, -1.8110e-01,
         2.9588e-02,  2.7988e-02, -1.3970e-01, -9.2100e-02,  2.0511e-01,
         2.5154e-02,  5.1148e-02,  3.0217e-01, -1.5906e-01,  1.3345e-01,
         1.9112e-01,  6.2872e-02,  1.4071e-01, -1.3780e-01, -3.0413e-01,
         1.1887e-01, -3.1925e-01,  6.9954e-02, -6.5650e-03, -1.9932e-01,
         1.8792e-01,  1.6417e-02, -1.1066e-01, -7.3005e-02,  2.2577e-01,
        -3.8447e-01, -2.1897e-01,  5.2561e-01,  4.7725e-01,  1.5984e-01,
         2.5258e-01, -6.0068e-02,  1.2148e-01, -3.4540e-02,  1.6923e-01,
         3.8092e-01,  8.9622e-02, -2.4234e-01,  9.3073e-02,  5.6356e-02,
         5.7953e-01, -1.5897e-01, -5.8894e-02, -2.2424e-01, -4.7628e-01,
        -1.6453e-01, -1.5320e-01,  2.6317e-0

# All Features

We can check the final length of the joined numeric representation (at most 2329 values per row).

In [14]:
# We would join everything that we have
for idx, row in df[3:].iterrows():

    price, brand, also_buy, also_view = engineering_features.get_features(
        row['price'],
        row['brand'],
        row['also_buy'],
        row['also_view'],
    )
    image, description, feature, title = await embedding_features.get_features(
        row['image'],
        row['description'],
        row['feature'],
        row['title'],
    )

    print(1 + len(brand) + 2 + len(image) + len(description) + len(feature) + len(title))
    break
    # total amount of features: 2329

2329
