In [8]:
from clean_tabular_data import *
import pandas as pd
# Read the products table from CSV and pass it straight through the project's
# cleaning helper (per the original author's note, that helper drops nulls and
# casts pricing to numeric -- its implementation is not visible in this notebook).
products_df = clean_product_data(
    pd.read_csv('Products.csv', lineterminator='\n')
)

In [2]:
def get_encoder_and_decoder(products_df):
    """Build integer lookup tables for the distinct values of the "label" column.

    Args:
        products_df: DataFrame that has a "label" column.

    Returns:
        A tuple ``(encoder, decoder)`` where ``encoder`` maps each distinct
        label to its position in order of first appearance, and ``decoder``
        is the inverse mapping (position -> label).
    """
    categories = list(products_df["label"].unique())
    # Position -> label, numbered in order of first appearance.
    decoder = dict(enumerate(categories))
    # Label -> position: simply the decoder flipped.
    encoder = {category: index for index, category in decoder.items()}
    return (encoder, decoder)

# Derive a "label" column from "category": keep the text before the first "/"
# separator and strip surrounding whitespace.
products_df["label"] = products_df["category"].str.split(r"\/", expand=True)[0].str.strip()
# Build the label -> index (encoder) and index -> label (decoder) lookup tables.
encoder,decoder = get_encoder_and_decoder(products_df)
# Persist the decoder. A context manager guarantees the handle is closed even
# if the write raises.
# NOTE(review): despite the ".pkl" extension this file holds the plain-text
# repr of the dict, not a pickle -- the format is kept as-is because the code
# that reads it back is not visible here; confirm before switching to pickle.
with open("image_decoder.pkl","w") as f:
    f.write(str(decoder))

NameError: name 'products_df' is not defined

In [10]:
# Read the image metadata table.
images_df = pd.read_csv('Images.csv', lineterminator='\n')
# Left-join each image to its product's label, then discard the join keys and
# the first remaining column (selected by position).
training_df = (
    images_df
    .merge(
        products_df[['id', 'label']],
        left_on='product_id',
        right_on='id',
        how='left',
        suffixes=('', '_y'),
    )
    .drop(columns=['id_y', 'product_id'])
    .iloc[:, 1:]
)
# Swap the textual labels for their integer codes and write the result to disk.
training_df['label'] = training_df['label'].replace(encoder)
training_df.to_csv('training_data.csv')

In [11]:
from clean_images import *
# Run the project's image-cleaning helper over the raw image directory
# (clean_image_data is presumably supplied by the star import from clean_images).
# NOTE: this absolute path is specific to the original author's local machine;
# point it at your own copy of the images before running this cell.
clean_image_data('D:/Documents/AICore/images_fb/images')

KeyboardInterrupt: 

In [1]:
from torchvision import transforms
from filelock import FileLock
from torch.utils.data import DataLoader, random_split
from FBMDataset import FBMDataset

def get_datasets(training_data_dir, cleaned_images_dir, seed=None):
    """Build the FBM image dataset and split it 70/15/15.

    Args:
        training_data_dir: Passed to ``FBMDataset`` as its first argument
            (callers in this notebook pass the path of ``training_data.csv``).
        cleaned_images_dir: Passed to ``FBMDataset`` as its second argument
            (callers pass the cleaned-images directory).
        seed: Optional integer. When given, the split uses a dedicated
            ``torch.Generator`` seeded with it, so every call (for example
            every tuning trial) gets the same partition. When ``None``
            (default) the global RNG is used, as before.

    Returns:
        A tuple ``(train_dataset, test_dataset, val_dataset)`` of
        ``random_split`` subsets holding 70% / 15% / 15% of the data.
    """
    # Imported here because the cell that defines this function does not import
    # os itself; previously it only worked if a later cell had already run.
    import os

    transform_list = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        # Per-channel mean / std (presumably the ImageNet statistics expected
        # by the pretrained backbone -- confirm against FBMClassifier).
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ])

    split_kwargs = {}
    if seed is not None:
        import torch
        split_kwargs["generator"] = torch.Generator().manual_seed(seed)

    # File lock held while the dataset is built and split (presumably so that
    # concurrent trials do not construct the dataset at the same time).
    with FileLock(os.path.expanduser("~/.data.lock")):
        dataset = FBMDataset(training_data_dir, cleaned_images_dir, transform=transform_list)
        train_dataset, test_dataset, val_dataset = random_split(
            dataset, [0.7, 0.15, 0.15], **split_kwargs
        )
    return (train_dataset, test_dataset, val_dataset)

In [2]:
import torch
import os
from datetime import datetime
import torch.nn.functional as F
from torch.optim import SGD
from FBMClassifier import FBMClassifier
from ray import tune
from ray.air import session,Checkpoint
#from torch.utils.tensorboard import SummaryWriter

def create_model_dir_path():
    """Return ``<cwd>/model_evaluation/<timestamp>/weights`` as a path string.

    The timestamp is the current local time formatted as ``yymmddHHMMSS``.
    Nothing is created on disk; the caller is responsible for making the
    directory.
    """
    timestamp = datetime.now().strftime('%y%m%d%H%M%S')
    return os.path.join(os.getcwd(), 'model_evaluation', timestamp, 'weights')

def training_loop(model, optimiser, train_loader, epoch_num,device=None):
    """Run one pass over ``train_loader``, updating ``model`` via ``optimiser``.

    Args:
        model: Module to train; it is moved to ``device`` and put in train mode.
        optimiser: Optimiser whose ``zero_grad``/``step`` are called per batch.
        train_loader: Sized iterable yielding ``(features, labels)`` batches.
        epoch_num: Epoch index, used only in the progress message.
        device: Target device for the model and each batch.

    Returns:
        None. Progress is reported with ``print``.
    """
    model = model.to(device)
    model.train()
    print(f'Beginning Batches for epoch {epoch_num}')
    print(len(train_loader))
    for step, (inputs, targets) in enumerate(train_loader, start=1):
        inputs = inputs.to(device)
        targets = targets.to(device, non_blocking=True)
        # Gradients accumulate across backward() calls, so clear them first.
        optimiser.zero_grad()
        logits = model(inputs)
        loss = F.cross_entropy(logits, targets)
        # Back-propagate, then take one optimiser step along the gradients.
        loss.backward()
        optimiser.step()
        print(f"completed: {step}")
    print("Completed")
    

def validate(model, val_loader, device):
    """Evaluate ``model`` on ``val_loader`` and return ``(loss, accuracy)``.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        val_loader: Iterable yielding ``(inputs, labels)`` batches.
        device: Device each batch is moved to before the forward pass.

    Returns:
        A tuple ``(mean_loss, accuracy)`` of plain Python floats:
        ``mean_loss`` is the average of the per-batch cross-entropy losses and
        ``accuracy`` is the fraction of samples whose arg-max prediction
        equals the label.

    Raises:
        ValueError: If ``val_loader`` yields no samples, since neither metric
            is defined in that case (this used to surface as a bare
            ZeroDivisionError).
    """
    model.eval()
    val_loss = 0.0
    val_steps = 0
    total = 0
    correct = 0
    # One no_grad context for the whole pass; no gradients are needed here.
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            # .item() gives a Python float, so the returned loss is a plain
            # float rather than a NumPy scalar.
            val_loss += F.cross_entropy(outputs, labels).item()
            val_steps += 1

    if val_steps == 0 or total == 0:
        raise ValueError("val_loader yielded no samples; cannot compute validation metrics")
    return (val_loss / val_steps), (correct / total)


In [3]:
def train_fbm(config):
    """Train and validate an ``FBMClassifier`` using the settings in ``config``.

    Required keys in ``config``:
        lr: Learning rate for the SGD optimiser.
        batch_size: Batch size for the training and validation loaders.
        hyperparameter_tuning_on: If truthy, each epoch saves a checkpoint to
            ``my_model/checkpoint.pt`` and reports loss/accuracy through
            ``session.report``.
        is_feature_extraction_model: Passed to ``FBMClassifier``; when tuning
            is off it also selects the saved file name (``image_model.pt``
            instead of ``epoch_<n>.pt``).

    Optional keys (defaults reproduce the previous hard-coded behaviour):
        epochs: Number of epochs to run (default 1).
        base_dir: Directory containing ``training_data.csv`` and
            ``cleaned_images`` (default: the original author's local path).

    Returns:
        None. Results are printed, saved to disk and/or reported to the tuner.
    """
    hyperparameter_tuning_on = config["hyperparameter_tuning_on"]
    is_feature_extraction_model = config["is_feature_extraction_model"]
    epochs = config.get("epochs", 1)
    base_dir = config.get(
        "base_dir",
        "D:/Documents/AICore/facebook-marketplaces-recommendation-ranking-system",
    )

    model = FBMClassifier(is_feature_extraction_model)
    # Only the parameters of resnet50.fc are handed to the optimiser.
    optimiser = SGD(model.resnet50.fc.parameters(), lr=config["lr"])
    # The test split is returned but not used in this function.
    train_dataset, _test_dataset, val_dataset = get_datasets(
        os.path.join(base_dir, "training_data.csv"),
        os.path.join(base_dir, "cleaned_images"),
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    # Set the model to run on the device
    model = model.to(device)

    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
    # Evaluation does not benefit from shuffling, so keep the order fixed.
    val_loader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=False)

    # The timestamped weights directory is only needed when weights are written
    # outside tuning; exist_ok avoids a crash if two runs start within the same
    # second and produce the same timestamp.
    path = None
    if not hyperparameter_tuning_on:
        path = create_model_dir_path()
        os.makedirs(path, exist_ok=True)

    for epoch in range(epochs):
        print(f"Beginning {epoch} ...")
        training_loop(model, optimiser, train_loader, epoch, device=device)
        print('Training complete ...')
        loss, accuracy = validate(model, val_loader, device=device)
        print('Validation complete ...')
        print(f'Epoch {epoch} - Average Loss: {loss}')
        print(f'Epoch {epoch} - Accuracy: {accuracy}')

        if hyperparameter_tuning_on:
            # Save model + optimiser state and hand the directory to the tuner
            # together with this epoch's metrics.
            os.makedirs("my_model", exist_ok=True)
            torch.save(
                (model.state_dict(), optimiser.state_dict()), "my_model/checkpoint.pt")
            checkpoint = Checkpoint.from_directory("my_model")
            session.report({"loss": loss, "accuracy": accuracy}, checkpoint=checkpoint)
        elif is_feature_extraction_model:
            # Same file every epoch: later epochs overwrite earlier ones.
            torch.save(model.state_dict(), os.path.join(path, 'image_model.pt'))
        else:
            torch.save(model.state_dict(), os.path.join(path, f'epoch_{epoch}.pt'))
        print(f"Ending {epoch} ...")

# Search space handed to every trial: lr and batch_size are sampled per trial,
# the two flags are fixed.
config = {
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16]),
        "hyperparameter_tuning_on": True,
        "is_feature_extraction_model": False
    }

# Per-trial resources. train_fbm falls back to the CPU when CUDA is missing,
# so only reserve a GPU when one exists; requesting one unconditionally would
# leave trials unschedulable on a CPU-only machine.
trial_resources = {"CPU": 2}
if torch.cuda.is_available():
    trial_resources["GPU"] = 1

tuner = tune.Tuner(
        tune.with_resources(
            tune.with_parameters(train_fbm),
            resources=tune.PlacementGroupFactory([trial_resources])
        ),
        tune_config=tune.TuneConfig(
            metric="loss",
            mode="min",
            num_samples=2,  # number of configurations sampled from the search space
        ),
        param_space=config
    )

# Run all trials, then report the one with the lowest validation loss.
results = tuner.fit()
best_result = results.get_best_result("loss", "min")
print("Best trial config: {}".format(best_result.config))
print("Best trial final validation loss: {}".format(
    best_result.metrics["loss"]))
print("Best trial final validation accuracy: {}".format(
    best_result.metrics["accuracy"]))


2023-07-27 14:43:23,292	INFO worker.py:1636 -- Started a local Ray instance.
2023-07-27 14:43:25,601	INFO tune.py:226 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `Tuner(...)`.


0,1
Current time:,2023-07-27 14:47:13
Running for:,00:03:47.95
Memory:,11.8/15.9 GiB

Trial name,status,loc,batch_size,lr,iter,total time (s),loss,accuracy
train_fbm_8f926_00000,TERMINATED,127.0.0.1:7580,8,0.000680138,1,85.7141,2.50453,0.192063
train_fbm_8f926_00001,TERMINATED,127.0.0.1:7580,2,0.00183746,1,122.53,2.25018,0.316402


[2m[36m(train_fbm pid=7580)[0m Using cache found in C:\Users\user/.cache\torch\hub\NVIDIA_DeepLearningExamples_torchhub


[2m[36m(train_fbm pid=7580)[0m D:/Documents/AICore/facebook-marketplaces-recommendation-ranking-system\training_data.csv
[2m[36m(train_fbm pid=7580)[0m cuda
[2m[36m(train_fbm pid=7580)[0m Beginning 0 ...
[2m[36m(train_fbm pid=7580)[0m Beginning Batches for epoch 0
[2m[36m(train_fbm pid=7580)[0m 1103
[2m[36m(train_fbm pid=7580)[0m completed: 1
[2m[36m(train_fbm pid=7580)[0m completed: 2
[2m[36m(train_fbm pid=7580)[0m completed: 3
[2m[36m(train_fbm pid=7580)[0m completed: 4
[2m[36m(train_fbm pid=7580)[0m completed: 5
[2m[36m(train_fbm pid=7580)[0m completed: 6
[2m[36m(train_fbm pid=7580)[0m completed: 7
[2m[36m(train_fbm pid=7580)[0m completed: 8
[2m[36m(train_fbm pid=7580)[0m completed: 9
[2m[36m(train_fbm pid=7580)[0m completed: 10
[2m[36m(train_fbm pid=7580)[0m completed: 11
[2m[36m(train_fbm pid=7580)[0m completed: 12
[2m[36m(train_fbm pid=7580)[0m completed: 13
[2m[36m(train_fbm pid=7580)[0m completed: 14
[2m[36m(train_fbm p

Trial name,accuracy,date,done,experiment_tag,hostname,iterations_since_restore,loss,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,training_iteration,trial_id
train_fbm_8f926_00000,0.192063,2023-07-27_14-44-58,True,"0_batch_size=8,lr=0.0007",mesh-pc,1,2.50453,127.0.0.1,7580,True,85.7141,85.7141,85.7141,1690465498,1,8f926_00000
train_fbm_8f926_00001,0.316402,2023-07-27_14-47-07,True,"1_batch_size=2,lr=0.0018",mesh-pc,1,2.25018,127.0.0.1,7580,True,122.53,122.53,122.53,1690465627,1,8f926_00001


[2m[36m(train_fbm pid=7580)[0m 2023-07-27 14:44:58,850	ERROR syncer.py:466 -- Caught sync error: Sync process failed: [WinError 32] Failed copying 'C:/Users/user/ray_results/train_fbm_2023-07-27_14-43-19/train_fbm_8f926_00000_0_batch_size=8,lr=0.0007_2023-07-27_14-43-25/checkpoint_000000/.is_checkpoint' to 'c:///Users/user/ray_results/train_fbm_2023-07-27_14-43-19/train_fbm_8f926_00000_0_batch_size=8,lr=0.0007_2023-07-27_14-43-25/checkpoint_000000/.is_checkpoint'. Detail: [Windows error 32] The process cannot access the file because it is being used by another process.
[2m[36m(train_fbm pid=7580)[0m . Retrying after sleeping for 1.0 seconds...
[2m[36m(train_fbm pid=7580)[0m 2023-07-27 14:44:59,896	ERROR syncer.py:466 -- Caught sync error: Sync process failed: [WinError 32] Failed copying 'C:/Users/user/ray_results/train_fbm_2023-07-27_14-43-19/train_fbm_8f926_00000_0_batch_size=8,lr=0.0007_2023-07-27_14-43-25/checkpoint_000000/.is_checkpoint' to '/Users/user/ray_results/train_

[2m[36m(train_fbm pid=7580)[0m Ending 0 ...
[2m[36m(train_fbm pid=7580)[0m D:/Documents/AICore/facebook-marketplaces-recommendation-ranking-system\training_data.csv
[2m[36m(train_fbm pid=7580)[0m cuda
[2m[36m(train_fbm pid=7580)[0m Beginning 0 ...
[2m[36m(train_fbm pid=7580)[0m Beginning Batches for epoch 0
[2m[36m(train_fbm pid=7580)[0m 4412
[2m[36m(train_fbm pid=7580)[0m completed: 1
[2m[36m(train_fbm pid=7580)[0m completed: 2
[2m[36m(train_fbm pid=7580)[0m completed: 3
[2m[36m(train_fbm pid=7580)[0m completed: 4
[2m[36m(train_fbm pid=7580)[0m completed: 5
[2m[36m(train_fbm pid=7580)[0m completed: 6
[2m[36m(train_fbm pid=7580)[0m completed: 7
[2m[36m(train_fbm pid=7580)[0m completed: 8
[2m[36m(train_fbm pid=7580)[0m completed: 9
[2m[36m(train_fbm pid=7580)[0m completed: 10
[2m[36m(train_fbm pid=7580)[0m completed: 11
[2m[36m(train_fbm pid=7580)[0m completed: 12
[2m[36m(train_fbm pid=7580)[0m completed: 13
[2m[36m(train_fbm pi

[2m[36m(train_fbm pid=7580)[0m 
[2m[36m(train_fbm pid=7580)[0m 2023-07-27 14:47:07,510	ERROR syncer.py:466 -- Caught sync error: Sync process failed: [WinError 32] Failed copying 'C:/Users/user/ray_results/train_fbm_2023-07-27_14-43-19/train_fbm_8f926_00001_1_batch_size=2,lr=0.0018_2023-07-27_14-43-25/checkpoint_000000/.is_checkpoint' to 'c:///Users/user/ray_results/train_fbm_2023-07-27_14-43-19/train_fbm_8f926_00001_1_batch_size=2,lr=0.0018_2023-07-27_14-43-25/checkpoint_000000/.is_checkpoint'. Detail: [Windows error 32] The process cannot access the file because it is being used by another process.
[2m[36m(train_fbm pid=7580)[0m . Retrying after sleeping for 1.0 seconds...
[2m[36m(train_fbm pid=7580)[0m 2023-07-27 14:47:08,526	ERROR syncer.py:466 -- Caught sync error: Sync process failed: [WinError 32] Failed copying 'C:/Users/user/ray_results/train_fbm_2023-07-27_14-43-19/train_fbm_8f926_00001_1_batch_size=2,lr=0.0018_2023-07-27_14-43-25/checkpoint_000000/.is_checkpoint'

[2m[36m(train_fbm pid=7580)[0m Ending 0 ...


[2m[36m(train_fbm pid=7580)[0m 2023-07-27 14:47:13,576	ERROR trainable.py:671 -- Could not upload checkpoint to c://\Users\user\ray_results\train_fbm_2023-07-27_14-43-19\train_fbm_8f926_00001_1_batch_size=2,lr=0.0018_2023-07-27_14-43-25 even after 3 retries.Please check if the credentials expired and that the remote filesystem is supported. For large checkpoints or artifacts, consider increasing `SyncConfig(sync_timeout)` (current value: 1800 seconds).
2023-07-27 14:47:13,604	INFO tune.py:1111 -- Total run time: 228.00 seconds (227.94 seconds for the tuning loop).


Best trial config: {'lr': 0.0018374559964463406, 'batch_size': 2, 'hyperparameter_tuning_on': True, 'is_feature_extraction_model': False}
Best trial final validation loss: 2.250175235889576
Best trial final validation accuracy: 0.3164021164021164
