In [None]:
from clean_tabular_data import *
import pandas as pd
# Read the products table from CSV and pass it straight through the cleaning
# step (drops nulls and casts pricing to numeric).
products_df = clean_product_data(
    pd.read_csv('Products.csv', lineterminator='\n')
)

In [None]:
def get_encoder_and_decoder(products_df):
    """Build lookup tables between category labels and integer codes.

    Codes are assigned by enumerating the unique values of
    ``products_df["label"]`` in the order ``unique()`` returns them.

    Args:
        products_df: DataFrame that has a ``"label"`` column.

    Returns:
        tuple: ``(encoder, decoder)`` where ``encoder`` maps label -> code and
        ``decoder`` maps code -> label.
    """
    distinct_labels = list(products_df["label"].unique())
    # code -> label comes straight from enumerate; invert it for label -> code.
    decoder = dict(enumerate(distinct_labels))
    encoder = {label: code for code, label in decoder.items()}
    return (encoder, decoder)

# Create the new label column: the text before the first "/" in "category",
# with surrounding whitespace stripped.
products_df["label"] = products_df["category"].str.split(r"\/", expand=True)[0].str.strip()
# Build the label -> code (encoder) and code -> label (decoder) lookup tables.
encoder, decoder = get_encoder_and_decoder(products_df)
# Save the decoder to file. The context manager guarantees the handle is closed
# even if the write raises.
# NOTE(review): despite the .pkl extension this file holds the text repr of the
# dict, not a pickle - confirm what the consumer of this file expects.
with open("image_decoder.pkl", "w") as decoder_file:
    decoder_file.write(str(decoder))

In [None]:
images_df = pd.read_csv('Images.csv', lineterminator='\n')

# Left-join the product labels onto the images so every image row gets a label
# column; the right-hand "id" column is suffixed to "id_y" on a name clash.
product_labels = products_df[['id', 'label']]
merged_df = pd.merge(
    images_df,
    product_labels,
    left_on='product_id',
    right_on='id',
    how='left',
    suffixes=('', '_y'),
)

# Remove both join keys, then discard the first remaining column by position.
training_df = merged_df.drop(['id_y', 'product_id'], axis=1).iloc[:, 1:]

# Swap each label value for its entry in the encoder mapping and persist.
training_df['label'] = training_df['label'].replace(encoder)
training_df.to_csv('training_data.csv')

In [None]:
from clean_images import *
# Directory handed to clean_image_data; this path is specific to the author's
# local machine.
source_images_dir = 'D:/Documents/AICore/images_fb/images'
clean_image_data(source_images_dir)

In [None]:
from torchvision import transforms
from filelock import FileLock
from torch.utils.data import DataLoader, random_split
from FBMDataset import FBMDataset

def get_datasets(training_data_dir, cleaned_images_dir):
    """Build the image dataset and split it into three random subsets.

    Args:
        training_data_dir: First argument forwarded to ``FBMDataset`` (the
            caller in this notebook passes the path of ``training_data.csv``).
        cleaned_images_dir: Second argument forwarded to ``FBMDataset`` (the
            caller passes the ``cleaned_images`` directory).

    Returns:
        tuple: ``(train_dataset, test_dataset, val_dataset)`` produced by
        ``random_split`` with fractions 0.7, 0.15 and 0.15.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]
    preprocessing_steps = [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]
    transform_list = transforms.Compose(preprocessing_steps)

    lock_path = os.path.expanduser("~/.data.lock")
    split_fractions = [0.7, 0.15, 0.15]
    # Hold the file lock while the dataset is constructed and split.
    with FileLock(lock_path):
        dataset = FBMDataset(training_data_dir, cleaned_images_dir, transform=transform_list)
        subsets = random_split(dataset, split_fractions)

    train_dataset, test_dataset, val_dataset = subsets
    return (train_dataset, test_dataset, val_dataset)

In [None]:
import torch
import os
from datetime import datetime
import torch.nn.functional as F
from torch.optim import SGD
from FBMClassifier import FBMClassifier
from ray import tune
from ray.air import session,Checkpoint
#from torch.utils.tensorboard import SummaryWriter

def create_model_dir_path():
    """Return ``<cwd>/model_evaluation/<yymmddHHMMSS>/weights`` for the current time.

    Only the path string is built; no directory is created here.
    """
    timestamp = datetime.now().strftime('%y%m%d%H%M%S')
    return os.path.join(os.getcwd(), 'model_evaluation', timestamp, 'weights')

def training_loop(model, optimiser, train_loader, epoch_num, device=None):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Network to train; moved to ``device`` and put in training mode.
        optimiser: Optimiser whose ``step`` is called once per batch.
        train_loader: Sized iterable yielding ``(features, labels)`` batches.
        epoch_num: Epoch number, used only in the progress message.
        device: Device the model and every batch are moved to.

    Returns:
        None. Parameters are updated in place and progress is printed.
    """
    model = model.to(device)
    model.train(True)
    print(f'Beginning Batches for epoch {epoch_num}')
    print(len(train_loader))
    for batch_id, (features, labels) in enumerate(train_loader, start=1):
        features = features.to(device)
        labels = labels.to(device, non_blocking=True)
        # backward() accumulates into .grad, so clear the gradients first.
        optimiser.zero_grad()
        # Forward pass and cross-entropy loss for this batch.
        loss = F.cross_entropy(model(features), labels)
        # Back-propagate, then let the optimiser update the parameters.
        loss.backward()
        optimiser.step()
        print(f"completed: {batch_id}")
    print("Completed")
    

def validate(model, val_loader, device):
    """Evaluate ``model`` on ``val_loader`` with gradient tracking disabled.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        val_loader: Iterable yielding ``(inputs, labels)`` batches.
        device: Device every batch is moved to.

    Returns:
        tuple: ``(mean_loss, accuracy)`` - the mean of the per-batch
        cross-entropy losses, and the fraction of samples whose highest-scoring
        class equals the label.
    """
    model.eval()
    loss_sum = 0.0
    batch_count = 0
    seen = 0
    hits = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            # Index of the largest score along the class dimension.
            predicted = torch.max(outputs.data, 1)[1]
            seen += labels.size(0)
            hits += (predicted == labels).sum().item()

            loss_sum += F.cross_entropy(outputs, labels).cpu().numpy()
            batch_count += 1
    return (loss_sum / batch_count), (hits / seen)


In [None]:
def train_fbm(config):
    """Train an ``FBMClassifier`` and validate it after every epoch.

    Args:
        config: Dict of settings. Required keys:
            ``"lr"`` - learning rate passed to SGD;
            ``"batch_size"`` - batch size for the data loaders;
            ``"hyperparameter_tuning_on"`` - when truthy, each epoch writes
            ``my_model/checkpoint.pt`` and reports loss/accuracy through
            ``session.report`` instead of saving weights in the model dir;
            ``"is_feature_extraction_model"`` - forwarded to ``FBMClassifier``;
            when truthy (and tuning is off) weights are saved as
            ``image_model.pt``, otherwise as ``epoch_<n>.pt``.
            Optional keys: ``"base_dir"`` (directory holding
            ``training_data.csv`` and ``cleaned_images``; defaults to the
            original hard-coded location) and ``"epochs"`` (default ``1``).

    Returns:
        None. Results are printed and written to disk / reported to Ray.
    """
    hyperparameter_tuning_on = config["hyperparameter_tuning_on"]
    is_feature_extraction_model = config["is_feature_extraction_model"]
    # Optional overrides; the defaults reproduce the previous hard-coded values.
    base_dir = config.get(
        "base_dir",
        "D:/Documents/AICore/facebook-marketplaces-recommendation-ranking-system",
    )
    num_epochs = config.get("epochs", 1)

    model = FBMClassifier(is_feature_extraction_model)
    # Only the parameters of resnet50.fc are handed to the optimiser.
    optimiser = SGD(model.resnet50.fc.parameters(), lr=config["lr"])
    # The test split is returned by get_datasets but not used in this function.
    train_dataset, _test_dataset, val_dataset = get_datasets(
        os.path.join(base_dir, "training_data.csv"),
        os.path.join(base_dir, "cleaned_images"),
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    # Set the model to run on the device
    model = model.to(device)

    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
    # Evaluation does not need a random order; a fixed order keeps the reported
    # metrics deterministic for a given split.
    val_loader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=False)

    path = create_model_dir_path()
    # The directory name only has one-second resolution, so tolerate the case
    # where it already exists instead of raising FileExistsError.
    os.makedirs(path, exist_ok=True)

    for epoch in range(num_epochs):
        print(f"Beginning {epoch} ...")
        training_loop(model, optimiser, train_loader, epoch, device=device)
        print('Training complete ...')
        loss, accuracy = validate(model, val_loader, device=device)
        print('Validation complete ...')
        print(f'Epoch {epoch} - Average Loss: {loss}')
        print(f'Epoch {epoch} - Accuracy: {accuracy}')

        if hyperparameter_tuning_on:
            # Hand the model/optimiser state and the metrics back to Ray Tune.
            os.makedirs("my_model", exist_ok=True)
            torch.save(
                (model.state_dict(), optimiser.state_dict()), "my_model/checkpoint.pt")
            checkpoint = Checkpoint.from_directory("my_model")
            session.report({"loss": loss, "accuracy": accuracy}, checkpoint=checkpoint)
        elif is_feature_extraction_model:
            torch.save(model.state_dict(), os.path.join(path, 'image_model.pt'))
        else:
            torch.save(model.state_dict(), os.path.join(path, f'epoch_{epoch}.pt'))
        print(f"Ending {epoch} ...")

# Search space (lr, batch_size) plus the fixed flags passed to every trial.
config = {
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([2, 4, 8, 16]),
    "hyperparameter_tuning_on": True,
    "is_feature_extraction_model": False,
}

# Resource request attached to the trainable: one bundle of 2 CPUs and 1 GPU.
trial_resources = tune.PlacementGroupFactory([{"CPU": 2, "GPU": 1}])
trainable = tune.with_resources(
    tune.with_parameters(train_fbm),
    resources=trial_resources,
)
# Two sampled configurations, compared on the minimum reported "loss".
search_settings = tune.TuneConfig(metric="loss", mode="min", num_samples=2)
tuner = tune.Tuner(trainable, tune_config=search_settings, param_space=config)

results = tuner.fit()
best_result = results.get_best_result("loss", "min")
print("Best trial config: {}".format(best_result.config))
best_metrics = best_result.metrics
for template, metric_name in (
    ("Best trial final validation loss: {}", "loss"),
    ("Best trial final validation accuracy: {}", "accuracy"),
):
    print(template.format(best_metrics[metric_name]))


In [29]:
from image_processor import ImageProcessor
from FBMClassifier import FBMClassifier
import pandas as pd
import os
from PIL import Image
import torch
import json
from json_numpy import default
    
base_dir = "D:/Documents/AICore/facebook-marketplaces-recommendation-ranking-system"
images = pd.read_csv(os.path.join(base_dir, "training_data.csv"))

# Build the feature-extraction variant of FBMClassifier and load the saved
# image_model.pt weights into it.
feature_extractor = FBMClassifier(True)
image_processor = ImageProcessor()
# map_location="cpu" lets weights that were saved from a GPU load on a
# CPU-only machine; the embeddings below are converted with .numpy(), which
# needs CPU tensors anyway.
state = torch.load("final_model/image_model.pt", map_location="cpu")
feature_extractor.load_state_dict(state)
# Switch to evaluation mode before inference.
# NOTE(review): assumes FBMClassifier is a torch nn.Module containing layers
# (e.g. batch norm / dropout) that behave differently in training mode; if so,
# embeddings generated before this change may differ - confirm and regenerate.
feature_extractor.eval()

# Run through all images and collect one embedding per image id.
result = {}
with torch.no_grad():  # inference only: skip building the autograd graph
    for image_uid in images['id']:
        img_path = os.path.join(base_dir, "cleaned_images", image_uid + '.jpg')
        # Close the image file as soon as it has been turned into a tensor.
        with Image.open(img_path) as image:
            feature = image_processor.process(image)
        # unsqueeze(0) adds a placeholder for the batch dimension.
        image_embedding = feature_extractor(feature.unsqueeze(0))
        # Convert the tensor prediction to a numpy array.
        result[image_uid] = image_embedding.detach().numpy()

# Save as a json file; `default` (from json_numpy) is used to serialise the
# numpy arrays.
with open("image_embeddings.json", "w") as outfile:
    json.dump(result, outfile, default=default)

Using cache found in C:\Users\user/.cache\torch\hub\NVIDIA_DeepLearningExamples_torchhub


Feature extraction turned on


In [4]:
import faiss                   # make faiss available
from json_numpy import object_hook
import json
import pickle

# Read the embeddings back, decoding with json_numpy's object_hook. The context
# manager closes the file; previously the handle was left open and then lost
# when `f` was rebound further down.
with open("facebook-marketplaces-recommendation-ranking-system/image_embeddings.json") as embeddings_file:
    data = json.load(embeddings_file, object_hook=object_hook)

index = faiss.IndexFlatL2(1000)   # build the index
print(index.is_trained)

# Vectors are added in the iteration order of `data`.
# NOTE(review): the keys (image ids) are not stored with the index here, so
# presumably position i is mapped back to the i-th key by the consumer - confirm.
for embedding in data.values():
    index.add(embedding)          # add vectors to the index

# Serialise the index and pickle the resulting buffer to disk.
chunk = faiss.serialize_index(index)
with open("index.pkl", "wb") as f:
    pickle.dump(chunk, f)
print(index.ntotal)

True
12604
