In [None]:
from clean_tabular_data import *
import pandas as pd
# Read the raw product listings from disk; rows are delimited by '\n' only.
raw_products = pd.read_csv("data/Products.csv", lineterminator="\n")
# Hand the raw frame to the project's cleaning helper (per the original note,
# it drops nulls and casts pricing to numeric).
products_df = clean_product_data(raw_products)

In [None]:
import json
def get_encoder_and_decoder(products_df):
    """Build lookup tables between category labels and integer class ids.

    Each distinct value of ``products_df["label"]`` receives an id equal to
    its position in the list of unique labels.

    Args:
        products_df: DataFrame that contains a ``label`` column.

    Returns:
        tuple: ``(encoder, decoder)`` where ``encoder`` maps label -> id and
        ``decoder`` maps id -> label.
    """
    labels = list(products_df["label"].unique())
    ids = range(len(labels))
    encoder = dict(zip(labels, ids))
    decoder = dict(zip(ids, labels))
    return encoder, decoder

# Derive the label from the text before the first "/" in the category string,
# with surrounding whitespace removed.
category_parts = products_df["category"].str.split(r"\/", expand=True)
products_df["label"] = category_parts[0].str.strip()

# Build the label -> id and id -> label lookup tables from the new column.
encoder, decoder = get_encoder_and_decoder(products_df)

# Persist the id -> label table for later use.
# NOTE(review): the content is JSON text even though the file name ends in
# ".pkl"; the name is left untouched because other code may open this path.
with open("image_decoder.pkl", "w") as decoder_file:
    json.dump(decoder, decoder_file)

In [None]:
images_df = pd.read_csv('data/Images.csv', lineterminator='\n')

# Left-join every image row to its product's label via product_id -> id.
# The product-side "id" column comes back as "id_y" because of the suffixes.
label_lookup = products_df[['id', 'label']]
merged_df = images_df.merge(
    label_lookup,
    how='left',
    left_on='product_id',
    right_on='id',
    suffixes=('', '_y'),
)
training_df = merged_df.drop(columns=['id_y', 'product_id'])

# Discard the leading column by position
# (presumably a saved index column from Images.csv — TODO confirm).
training_df = training_df.iloc[:, 1:]

# Swap text labels for their integer ids, then write the training table.
# NOTE(review): because this is a left join, images whose product is absent
# from products_df keep a missing label.
training_df['label'] = training_df['label'].replace(encoder)
training_df.to_csv('data/training_data.csv')

In [None]:
from clean_images import *
# Folder holding the raw images. This absolute path is specific to the
# original author's local machine; adjust it for your environment.
raw_images_dir = 'D:/Documents/AICore/images_fb/images'
clean_image_data(raw_images_dir)

In [None]:
from FBMTrainer import FBMTrainer
from ray import tune
from ray.air import session,Checkpoint

trainer = FBMTrainer()

# Parameter space passed to each trial: a log-uniform learning rate between
# 1e-2 and 1e-1, a single batch-size choice of 8, and two boolean flags that
# are forwarded as-is to the training function.
config = dict(
    lr=tune.loguniform(1e-2, 1e-1),
    batch_size=tune.choice([8]),
    hyperparameter_tuning_on=False,
    is_feature_extraction_model=True,
)

# Wrap the training method and request 2 CPUs plus 1 GPU per trial.
trainable = tune.with_resources(
    tune.with_parameters(trainer.train_fbm),
    resources=tune.PlacementGroupFactory([{"CPU": 2, "GPU": 1}]),
)

# Run a single sample and rank trials by the lowest reported "loss".
run_settings = tune.TuneConfig(metric="loss", mode="min", num_samples=1)
tuner = tune.Tuner(trainable, tune_config=run_settings, param_space=config)

results = tuner.fit()

# Report the configuration and final metrics of the best trial.
best_result = results.get_best_result("loss", "min")
best_metrics = best_result.metrics
print("Best trial config: {}".format(best_result.config))
print("Best trial final validation loss: {}".format(best_metrics["loss"]))
print("Best trial final validation accuracy: {}".format(best_metrics["accuracy"]))


In [None]:
from image_processor import ImageProcessor
from FBMClassifier import FBMClassifier
import pandas as pd
import os
from PIL import Image
import torch
import json
from json_numpy import default
    
# Project root on the original author's machine; adjust for your environment.
base_dir = "D:/Documents/AICore/facebook-marketplaces-recommendation-ranking-system"
images = pd.read_csv(os.path.join(base_dir, "data/training_data.csv"))

# get image_model.pt, load into FBMClassifier
feature_extractor = FBMClassifier(True)
image_processor = ImageProcessor()
# map_location="cpu" lets a checkpoint saved from a GPU run load on a
# CPU-only host; load_state_dict then copies the weights into the model.
state = torch.load("final_model/image_model.pt", map_location="cpu")
feature_extractor.load_state_dict(state)
# Switch to inference mode so that any dropout / batch-norm layers behave
# deterministically and running statistics are not updated while embedding.
feature_extractor.eval()

# run through all data and create new json
result = {}
# Gradients are never needed here; disabling autograd avoids building a graph
# for every image.
with torch.no_grad():
    for image_uid in images["id"]:
        # Each image is stored as "<id>.jpg" inside the cleaned_images folder.
        img_path = os.path.join(base_dir, "cleaned_images", image_uid + ".jpg")
        # The context manager releases the file handle once the image has been
        # converted, instead of leaving one open handle per image.
        with Image.open(img_path) as image:
            feature = image_processor.process(image)
        # get prediction from feature extraction model, using unsqueeze to add
        # a placeholder for batch size
        image_embedding = feature_extractor(feature.unsqueeze(0))
        # convert the tensor prediction to a numpy array
        result[image_uid] = image_embedding.cpu().numpy()

# save as json file (json_numpy's `default` hook serializes the numpy arrays)
with open("model_information/image_embeddings.json", "w") as outfile:
    json.dump(result, outfile, default=default)

In [None]:
import json
import pickle

import faiss                   # make faiss available
import numpy as np
from json_numpy import object_hook

# Load the image-id -> embedding mapping; the context manager closes the file
# as soon as parsing is done.
with open("model_information/image_embeddings.json") as f:
    data = json.load(f, object_hook=object_hook)

index = faiss.IndexFlatL2(1000)   # build the index
print(index.is_trained)

for k, v in data.items():
    # Work on a writable, C-contiguous float32 copy with a leading batch axis
    # (assumes each decoded value is a numpy array holding one 1000-d
    # embedding — TODO confirm against the export step).
    vector = np.array(v, dtype=np.float32, ndmin=2, order="C")
    # faiss.normalize_L2 rescales its argument in place and returns None, so
    # its return value must not be passed to index.add().
    faiss.normalize_L2(vector)
    index.add(vector)                  # add vectors to the index

# Serialize the index to a byte array and pickle it for later reloading.
chunk = faiss.serialize_index(index)
with open("model_information/index.pkl", "wb") as f:
    pickle.dump(chunk, f)
print(index.ntotal)