# Sandbox

First, we will create a Pandas Dataframe out of the Products.csv and clean it.

In [216]:
from data_cleaning import DataCleaning
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision.io import read_image
from torchvision import transforms as T
import os
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models

In [5]:
product_path = 'EC2_files/Products.csv'
products_df = pd.read_csv(product_path, lineterminator='\n')

products_df.drop(columns='Unnamed: 0', inplace=True) #dropping unnecessary index column

clean_products_df = DataCleaning(products_df).clean_product_table()

NUMBER OF ROWS REMAINING: 7156

Cleaning prices...
NUMBER OF ROWS REMAINING: 7156

Cleaning nulls...
NUMBER OF ROWS REMAINING: 7156

Displaying nulls in each column:
id                     0
product_name           0
category               0
product_description    0
price                  0
location               0
dtype: int64


Next we can create a column called 'root_category which will extract the main category from the 'category' column e.g. root category of "Home & Garden / Dining, Living Room Furniture / Mirrors, Clocks & Ornaments" is "Home & Garden". 

Making a set out of this column give us all the unique entries, allowing us to make an encoder(enumerating each category) and decoder(finding the category from the assigned number).

In [48]:
clean_products_df['root_category'] = clean_products_df['category'].apply(lambda x:x.split(' / ')[0])

root_set = set(clean_products_df['root_category'])

encoder = {category: index for index, category in enumerate(root_set)}
decoder = {category: index for index, category in encoder.items()}

with open('encoder.pkl', 'wb') as handle:
    pickle.dump(encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('encoder.pkl', 'rb') as checker: #creating a file out of the encoder dict and checking it was exported correctly
    copy = pickle.load(checker)

print(encoder == copy)

clean_products_df['label'] = clean_products_df['root_category'].apply(lambda x:encoder[x])


True


Merging the Images and Product dataframes to produce a training set with the encoded labels for each image

In [13]:
images_df = pd.read_csv('EC2_files/Images.csv')
images_df.drop(columns='Unnamed: 0', inplace=True) #dropping unnecessary index column

clean_products_df.rename(columns={'id': 'product_id'}, inplace=True) #renaming id column to product_id for table merge

merged_df = pd.merge(clean_products_df, images_df, on='product_id')

training_df = merged_df.drop(columns=['product_id', 'product_name', 'category', 'product_description', 'price', 'location','root_category'])

training_df.to_csv('training_data.csv')

We can create an ImageDataset class that inherits from the torch.utils.data.Dataset class so that it is compatible with the tensor transformations to come. The dataset can then be split into training, validation and testing sets and turned into DataLoader objects.

In [214]:
class ImageDataset(Dataset):
    
    def __init__(self, to_image = False):
        super().__init__()
        self.labels = pd.read_csv('training_data.csv',lineterminator='\n')
        self.to_image = to_image

    def __getitem__(self, index):
        img_path = f'cleaned_images/{self.labels['id'].iloc[index]}.jpg'
        image = read_image(img_path).float()
        label = self.labels['label'].iloc[index]
        if self.to_image == True:    
            transform = T.ToPILImage()
            image = transform(image)
        return image, label

    def __len__(self):
        return len(self.labels)
    
data = ImageDataset(to_image=False)

train_size = int(0.8 * len(data))
val_size = int(0.1 * len(data))
test_size = len(data) - (train_size + val_size)

train_data, val_data, test_data = torch.utils.data.random_split(data, [train_size, val_size, test_size])

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64, shuffle=False)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

In [215]:
def manual_checker(idx): 
    """
    function that retrieves the image and original data 
    from merged_df to check individual items
    """
    transform = T.ToPILImage()
    image = transform(data[idx][0])
    image.show()
    label = data[idx][1]
    print(decoder[label])
    print(merged_df.iloc[idx])

manual_checker(8889)

Phones, Mobile Phones & Telecoms
product_id                          6f0cb3ab-3c46-463e-b808-6985d7721cd9
product_name           Red iPhone XR 64GB | in Ayr, South Ayrshire | ...
category               Phones, Mobile Phones & Telecoms / Mobile Phon...
product_description    Like new £230 Ono comes with box and charger c...
price                                                              230.0
location                                             Ayr, South Ayrshire
root_category                           Phones, Mobile Phones & Telecoms
label                                                                 12
id                                  126148aa-f18e-4f4a-8836-ca8942b0fb79
Name: 8889, dtype: object


In [205]:
class ImageClassifier(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.resnet50 = torch.hub.load(
            'NVIDIA/DeepLearningExamples:torchhub', 'nvidia_resnet50', pretrained=True)
        self.resnet50.fc = torch.nn.Linear(2048, 13)

    def forward(self, X):
        return self.resnet50(X)
    
def train(model, dataloader, epochs=10):

    optimiser = torch.optim.SGD(model.parameters(), lr=0.1)

    writer = SummaryWriter()
    batch_idx = 0
    
    for epoch in range(epochs):
        for batch in dataloader:
            feature, label = batch
            # print(feature)
            prediction = model(feature)
            loss = F.cross_entropy(prediction, label)
            loss.backward()
            print(loss.item())
            optimiser.step()
            optimiser.zero_grad()
            writer.add_scalar('Loss', loss.item(), batch_idx)
            batch_idx += 1

In [206]:
imgmodel = ImageClassifier()

train(imgmodel, train_loader)


Using cache found in /Users/varghesejacob/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub


2.5812928676605225


KeyboardInterrupt: 

In [185]:
imgmodel


ImageClassifier(
  (resnet50): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layers): Sequential(
      (0): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu): ReLU(inplace=Tr