In [8]:
from clean_tabular_data import *
import pandas as pd
#load dataframe with products from csv
products_df = pd.read_csv('Products.csv',lineterminator ='\n')  
# drop nulls and cast pricing to numeric  
products_df = clean_product_data(products_df)

In [2]:
def get_encoder_and_decoder(products_df):
        unique_categories = list(products_df["label"].unique())
        encoder = {k: v for v,k in enumerate(unique_categories)}
        decoder = {v: k for v,k in enumerate(unique_categories)}
        return (encoder,decoder)

# create new label column
products_df["label"] = products_df["category"].str.split(r"\/", expand=True)[0].str.strip()
#get encoder and decoder
encoder,decoder = get_encoder_and_decoder(products_df)
#save decoder to file
f = open("image_decoder.pkl","w")
f.write(str(decoder))
f.close()

NameError: name 'products_df' is not defined

In [10]:
images_df = pd.read_csv('Images.csv',lineterminator ='\n')
# merge images and products to get new label column for images
training_df = pd.merge(images_df, products_df[['id', 'label']], left_on='product_id', right_on='id', how='left', suffixes=('','_y')).drop('id_y', axis=1).drop('product_id', axis=1)
training_df = training_df.iloc[: , 1:]
training_df['label'] = training_df['label'].replace(encoder)
training_df.to_csv('training_data.csv')

In [11]:
from clean_images import *
# path specific to my local machine
clean_image_data('D:/Documents/AICore/images_fb/images')

KeyboardInterrupt: 

In [1]:
from torchvision import transforms
from filelock import FileLock
from torch.utils.data import DataLoader, random_split
from FBMDataset import FBMDataset

def get_datasets(training_data_dir, cleaned_images_dir):    
    transform_list = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], 
                                [0.229, 0.224, 0.225])
        ])
    with FileLock(os.path.expanduser("~/.data.lock")):
        dataset = FBMDataset(training_data_dir,cleaned_images_dir,transform=transform_list)
        #obtain the list of targets
        train_dataset,test_dataset,val_dataset = random_split(dataset, [0.7,0.15,0.15])
    return (train_dataset, test_dataset, val_dataset)

# base_dir = "D:/Documents/AICore/facebook-marketplaces-recommendation-ranking-system"
# training_dir, img_dir = (os.path.join(base_dir,"training_data.csv"),os.path.join(base_dir,"cleaned_images"))
# tr,t,v = get_datasets(training_dir, img_dir)
# tr[0]

D:/Documents/AICore/facebook-marketplaces-recommendation-ranking-system\training_data.csv
<class 'PIL.JpegImagePlugin.JpegImageFile'>


In [2]:
import torch
import os
from datetime import datetime
import torch.nn.functional as F
from torch.optim import SGD
from FBMClassifier import FBMClassifier
from ray import tune
from ray.air import session
#from torch.utils.tensorboard import SummaryWriter

def create_model_dir_path():
    parent_dir = 'model_evaluation'
    current_datetime = datetime.now().strftime('%y%m%d%H%M%S')
    child_dir = 'weights'
    path = os.path.join(os.getcwd(), parent_dir, current_datetime, child_dir)
    return path

def training_loop(model, optimiser, train_loader, epoch_num,device=None):  
    # writer = SummaryWriter()
    batch_id = 0     
    # Set the model to run on the device
    model = model.to(device)
    model.train(True)     
    print(f'Beginning Batches for epoch {epoch_num}')
    print(len(train_loader))
    for batch in train_loader:
        # get features and labels from the batch
        features,labels = batch
        if(epoch_num >= 1):
            print(f"features + labels extracted: {batch_id}")
        features = features.to(device)
        labels = labels.to(device, non_blocking=True)
        if(epoch_num >= 1):
            print(f"features + labels added to device: {batch_id}")
        # loss.backward does not overwrite, it adds. To stop this, we set the gradients back to zero. sets the .grad of all optimized tensors to zero
        optimiser.zero_grad()
        if(epoch_num >= 1):
            print(f"zero grad: {batch_id}")
        # make a prediction
        prediction = model(features)
        if(epoch_num >= 1):
            print(f"prediction: {batch_id}")
        # calculate loss
        criterion = F.cross_entropy(prediction,labels)
        if(epoch_num >= 1):
            print(f"cross entropy: {batch_id}")
        # backward function calculates the gradient of the current tensor w.r.t graph leaves
        criterion.backward()
        if(epoch_num >= 1):
            print(f"criterion backward: {batch_id}")
        
        # print(criterion.item())
        # moves each parameter in the opposite direction of the gradient, proportional to the learning rate
        optimiser.step()
        if(epoch_num >= 1):
            print(f"optimiser: {batch_id}")
        # writer.add_scalar('Loss', criterion.item(), batch_id)
        batch_id += 1
        print(f"completed: {batch_id}")
    print("Completed")
    

def validate(model, val_loader, device):
    # Set the model to evaluation mode
    model.eval()
    running_vloss = 0.0
    with torch.no_grad():
        for i, vdata in enumerate(val_loader):
            vinputs, vlabels = vdata                
            vinputs = vinputs.to(device)
            vlabels = vlabels.to(device, non_blocking=True)
            
            voutputs = model(vinputs)
            vloss = F.cross_entropy(voutputs, vlabels)
            running_vloss += vloss

    avg_vloss = running_vloss / (i + 1)
    return avg_vloss

#Steps for transfer learning: get pre trained model, change architecture to fit our problem, fine tune for our problem

# model = FBMClassifier()
# optimiser = SGD(model.parameters(), lr = 0.0001)    
# train_loader, test_loader, val_loader = get_data_loaders("training_data.csv","cleaned_images")
# training_loop(model,optimiser, 3)


In [3]:
def train_fbm(config, device=None):    
    model = FBMClassifier()
    optimiser = SGD(model.resnet50.fc.parameters(), lr = config["lr"])    
    base_dir = "D:/Documents/AICore/facebook-marketplaces-recommendation-ranking-system"
    train_dataset, test_dataset, val_dataset = get_datasets(os.path.join(base_dir,"training_data.csv"),os.path.join(base_dir,"cleaned_images"))
    
    #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = torch.device("cpu")
    print(device)
    # Set the model to run on the device
    model = model.to(device)

    train_loader = DataLoader(train_dataset,batch_size=config["batch_size"],shuffle=True)
    test_loader = DataLoader(test_dataset,batch_size=config["batch_size"],shuffle=True)
    val_loader = DataLoader(val_dataset,batch_size=config["batch_size"],shuffle=True)
    
    path = create_model_dir_path()
    os.makedirs(path)
    
    for epoch in range(10):
        print(f"Beginning {epoch} ...")
        # tune.utils.wait_for_gpu()
        training_loop(model,optimiser,train_loader, epoch, device=device)
        print('Training complete ...')
        loss = validate(model, val_loader, device=device)
        print('Validation complete ...')        
        print(f'Epoch {epoch} - Average Loss: {loss}')
        session.report(metrics={"loss": loss})#tune.report(mean_accuracy=loss)
        torch.save(model.state_dict(), path + f'/epoch_{epoch}.pt')
        #torch.save(model.state_dict(), f'/image_model.pt')
        print(f"Ending {epoch} ...")

# analysis = tune.run(train_fbm, config={"lr": tune.loguniform(1e-4, 1e-1),"batch_size": tune.choice([2, 4, 8, 16])})
# print("Best config: ", analysis.get_best_config(metric="mean_accuracy"))

from ray.tune.schedulers import ASHAScheduler
from ray.air import RunConfig
config = {        
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }
scheduler = ASHAScheduler(
        max_t=10,
        grace_period=1,
        reduction_factor=2)
tuner = tune.Tuner(
        tune.with_resources(
            tune.with_parameters(train_fbm),
             resources=tune.PlacementGroupFactory([{"CPU": 10}])
        ),
        tune_config=tune.TuneConfig(
            metric="loss",
            mode="min",
            scheduler=scheduler,
            num_samples=3,
        ),
        param_space=config,
        run_config= RunConfig(verbose=3)
    )

results = tuner.fit()
best_result = results.get_best_result("loss", "min")
print("Best trial config: {}".format(best_result.config))
print("Best trial final validation loss: {}".format(
    best_result.metrics["loss"]))
print("Best trial final validation accuracy: {}".format(
    best_result.metrics["accuracy"]))


2023-07-22 12:00:25,060	INFO worker.py:1636 -- Started a local Ray instance.
2023-07-22 12:00:27,490	INFO tune.py:226 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `Tuner(...)`.


0,1
Current time:,2023-07-22 12:02:44
Running for:,00:02:17.18
Memory:,13.9/15.9 GiB

Trial name,status,loc,batch_size,lr
train_fbm_f74c6_00000,RUNNING,127.0.0.1:20192,4,0.0142945
train_fbm_f74c6_00001,RUNNING,127.0.0.1:7988,4,0.00328217
train_fbm_f74c6_00002,PENDING,,16,0.0220845


[2m[36m(pid=20192)[0m D:/Documents/AICore/facebook-marketplaces-recommendation-ranking-system\training_data.csv
[2m[36m(pid=20192)[0m <class 'PIL.JpegImagePlugin.JpegImageFile'>


[2m[36m(train_fbm pid=20192)[0m Using cache found in C:\Users\user/.cache\torch\hub\NVIDIA_DeepLearningExamples_torchhub


[2m[36m(train_fbm pid=20192)[0m cuda
[2m[36m(train_fbm pid=20192)[0m Beginning 0 ...
[2m[36m(train_fbm pid=20192)[0m Beginning Batches for epoch 0
[2m[36m(train_fbm pid=20192)[0m 2206
[2m[36m(train_fbm pid=20192)[0m completed: 1
[2m[36m(train_fbm pid=7988)[0m D:/Documents/AICore/facebook-marketplaces-recommendation-ranking-system\training_data.csv[32m [repeated 3x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[2m[36m(pid=7988)[0m completed: 1
[2m[36m(train_fbm pid=7988)[0m completed: 4
[2m[36m(train_fbm pid=7988)[0m completed: 8
[2m[36m(train_fbm pid=7988)[0m completed: 8
[2m[36m(train_fbm pid=7988)[0m completed: 8
[2m[36m(train_fbm pid=20192)[0m completed: 61[32m [repeated 129x across cluster][0m
[2m[36m(train_fbm pid=7988)[0m completed: 161[32m [repeated 190x across cluster]

Trial name,date,done,hostname,iterations_since_restore,loss,node_ip,pid,time_since_restore,time_this_iter_s,time_total_s,timestamp,training_iteration,trial_id
train_fbm_f74c6_00000,2023-07-22_12-02-47,False,mesh-pc,1,1.89645,127.0.0.1,20192,133.381,133.381,133.381,1690023767,1,f74c6_00000


[2m[36m(train_fbm pid=20192)[0m Ending 0 ...
[2m[36m(train_fbm pid=20192)[0m Beginning 1 ...
[2m[36m(train_fbm pid=20192)[0m Beginning Batches for epoch 1
[2m[36m(train_fbm pid=20192)[0m 2206
[2m[36m(train_fbm pid=20192)[0m features + labels extracted: 0
[2m[36m(train_fbm pid=20192)[0m features + labels added to device: 0
[2m[36m(train_fbm pid=20192)[0m zero grad: 0
[2m[36m(train_fbm pid=20192)[0m prediction: 0
[2m[36m(train_fbm pid=20192)[0m cross entropy: 0
[2m[36m(train_fbm pid=20192)[0m criterion backward: 0
[2m[36m(train_fbm pid=20192)[0m optimiser: 0
[2m[36m(train_fbm pid=20192)[0m features + labels extracted: 1
[2m[36m(train_fbm pid=20192)[0m features + labels added to device: 1
[2m[36m(train_fbm pid=20192)[0m zero grad: 1
[2m[36m(train_fbm pid=20192)[0m prediction: 1
[2m[36m(train_fbm pid=20192)[0m cross entropy: 1
[2m[36m(train_fbm pid=20192)[0m criterion backward: 1
[2m[36m(train_fbm pid=20192)[0m optimiser: 1
[2m[36m(tr

[2m[36m(train_fbm pid=7988)[0m 2023-07-22 12:02:52,083	ERROR syncer.py:466 -- Caught sync error: Sync process failed: GetFileInfo() yielded path 'C:/Users/user/ray_results/train_fbm_2023-07-22_12-00-21/train_fbm_f74c6_00001_1_batch_size=4,lr=0.0033_2023-07-22_12-00-27/events.out.tfevents.1690023633.mesh-pc', which is outside base dir 'C:\Users\user\ray_results\train_fbm_2023-07-22_12-00-21\train_fbm_f74c6_00001_1_batch_size=4,lr=0.0033_2023-07-22_12-00-27\'. Retrying after sleeping for 1.0 seconds...
[2m[36m(train_fbm pid=7988)[0m 2023-07-22 12:02:52,083	ERROR syncer.py:466 -- Caught sync error: Sync process failed: GetFileInfo() yielded path 'C:/Users/user/ray_results/train_fbm_2023-07-22_12-00-21/train_fbm_f74c6_00001_1_batch_size=4,lr=0.0033_2023-07-22_12-00-27/events.out.tfevents.1690023633.mesh-pc', which is outside base dir 'C:\Users\user\ray_results\train_fbm_2023-07-22_12-00-21\train_fbm_f74c6_00001_1_batch_size=4,lr=0.0033_2023-07-22_12-00-27\'. Retrying after sleeping f

[2m[36m(train_fbm pid=20192)[0m prediction: 63
[2m[36m(train_fbm pid=20192)[0m cross entropy: 63
[2m[36m(train_fbm pid=20192)[0m criterion backward: 63
[2m[36m(train_fbm pid=20192)[0m optimiser: 63
[2m[36m(train_fbm pid=20192)[0m features + labels extracted: 64
[2m[36m(train_fbm pid=20192)[0m features + labels added to device: 64
[2m[36m(train_fbm pid=20192)[0m zero grad: 64
[2m[36m(train_fbm pid=20192)[0m prediction: 64
[2m[36m(train_fbm pid=20192)[0m cross entropy: 64
[2m[36m(train_fbm pid=20192)[0m criterion backward: 64
[2m[36m(train_fbm pid=20192)[0m optimiser: 64
[2m[36m(train_fbm pid=20192)[0m features + labels extracted: 65
[2m[36m(train_fbm pid=20192)[0m features + labels added to device: 65
[2m[36m(train_fbm pid=20192)[0m zero grad: 65
[2m[36m(train_fbm pid=20192)[0m prediction: 65
[2m[36m(train_fbm pid=20192)[0m cross entropy: 65
[2m[36m(train_fbm pid=20192)[0m criterion backward: 65
[2m[36m(train_fbm pid=20192)[0m optimi

[2m[36m(train_fbm pid=7988)[0m 2023-07-22 12:02:53,097	ERROR syncer.py:466 -- Caught sync error: Sync process failed: GetFileInfo() yielded path 'C:/Users/user/ray_results/train_fbm_2023-07-22_12-00-21/train_fbm_f74c6_00001_1_batch_size=4,lr=0.0033_2023-07-22_12-00-27/events.out.tfevents.1690023633.mesh-pc', which is outside base dir 'C:\Users\user\ray_results\train_fbm_2023-07-22_12-00-21\train_fbm_f74c6_00001_1_batch_size=4,lr=0.0033_2023-07-22_12-00-27\'. Retrying after sleeping for 1.0 seconds...


[2m[36m(train_fbm pid=20192)[0m features + labels extracted: 92
[2m[36m(train_fbm pid=20192)[0m features + labels added to device: 92
[2m[36m(train_fbm pid=20192)[0m zero grad: 92
[2m[36m(train_fbm pid=20192)[0m prediction: 92
[2m[36m(train_fbm pid=20192)[0m cross entropy: 92
[2m[36m(train_fbm pid=20192)[0m criterion backward: 92
[2m[36m(train_fbm pid=20192)[0m optimiser: 92
[2m[36m(train_fbm pid=20192)[0m features + labels extracted: 93
[2m[36m(train_fbm pid=20192)[0m features + labels added to device: 93
[2m[36m(train_fbm pid=20192)[0m zero grad: 93
[2m[36m(train_fbm pid=20192)[0m prediction: 93
[2m[36m(train_fbm pid=20192)[0m cross entropy: 93
[2m[36m(train_fbm pid=20192)[0m criterion backward: 93
[2m[36m(train_fbm pid=20192)[0m optimiser: 93
[2m[36m(train_fbm pid=20192)[0m features + labels extracted: 94
[2m[36m(train_fbm pid=20192)[0m features + labels added to device: 94
[2m[36m(train_fbm pid=20192)[0m zero grad: 94
[2m[36m(tra

[2m[36m(train_fbm pid=7988)[0m 2023-07-22 12:02:54,103	ERROR syncer.py:466 -- Caught sync error: Sync process failed: GetFileInfo() yielded path 'C:/Users/user/ray_results/train_fbm_2023-07-22_12-00-21/train_fbm_f74c6_00001_1_batch_size=4,lr=0.0033_2023-07-22_12-00-27/events.out.tfevents.1690023633.mesh-pc', which is outside base dir 'C:\Users\user\ray_results\train_fbm_2023-07-22_12-00-21\train_fbm_f74c6_00001_1_batch_size=4,lr=0.0033_2023-07-22_12-00-27\'. Retrying after sleeping for 1.0 seconds...


[2m[36m(train_fbm pid=20192)[0m features + labels extracted: 119
[2m[36m(train_fbm pid=20192)[0m features + labels added to device: 119
[2m[36m(train_fbm pid=20192)[0m zero grad: 119
[2m[36m(train_fbm pid=20192)[0m prediction: 119
[2m[36m(train_fbm pid=20192)[0m cross entropy: 119
[2m[36m(train_fbm pid=20192)[0m criterion backward: 119
[2m[36m(train_fbm pid=20192)[0m optimiser: 119
[2m[36m(train_fbm pid=20192)[0m features + labels extracted: 120
[2m[36m(train_fbm pid=20192)[0m features + labels added to device: 120
[2m[36m(train_fbm pid=20192)[0m zero grad: 120
[2m[36m(train_fbm pid=20192)[0m prediction: 120
[2m[36m(train_fbm pid=20192)[0m cross entropy: 120
[2m[36m(train_fbm pid=20192)[0m criterion backward: 120
[2m[36m(train_fbm pid=20192)[0m optimiser: 120
[2m[36m(train_fbm pid=20192)[0m features + labels extracted: 121
[2m[36m(train_fbm pid=20192)[0m features + labels added to device: 121
[2m[36m(train_fbm pid=20192)[0m zero grad: 

[2m[36m(train_fbm pid=7988)[0m 2023-07-22 12:02:55,114	ERROR trainable.py:671 -- Could not upload checkpoint to c://\Users\user\ray_results\train_fbm_2023-07-22_12-00-21\train_fbm_f74c6_00001_1_batch_size=4,lr=0.0033_2023-07-22_12-00-27 even after 3 retries.Please check if the credentials expired and that the remote filesystem is supported. For large checkpoints or artifacts, consider increasing `SyncConfig(sync_timeout)` (current value: 1800 seconds).


[2m[36m(train_fbm pid=20192)[0m prediction: 148
[2m[36m(train_fbm pid=20192)[0m cross entropy: 148
[2m[36m(train_fbm pid=20192)[0m criterion backward: 148
[2m[36m(train_fbm pid=20192)[0m optimiser: 148
[2m[36m(train_fbm pid=20192)[0m features + labels extracted: 149
[2m[36m(train_fbm pid=20192)[0m features + labels added to device: 149
[2m[36m(train_fbm pid=20192)[0m zero grad: 149
[2m[36m(train_fbm pid=20192)[0m prediction: 149
[2m[36m(train_fbm pid=20192)[0m cross entropy: 149
[2m[36m(train_fbm pid=20192)[0m criterion backward: 149
[2m[36m(train_fbm pid=20192)[0m optimiser: 149
[2m[36m(train_fbm pid=20192)[0m features + labels extracted: 150
[2m[36m(train_fbm pid=20192)[0m features + labels added to device: 150
[2m[36m(train_fbm pid=20192)[0m zero grad: 150
[2m[36m(train_fbm pid=20192)[0m prediction: 150
[2m[36m(train_fbm pid=20192)[0m cross entropy: 150
[2m[36m(train_fbm pid=20192)[0m criterion backward: 150
[2m[36m(train_fbm pid=

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.