In [None]:
import pandas as pd

import os
import requests
import pandas as pd
import multiprocessing
import time
from time import time as timer
from tqdm import tqdm
import numpy as np
from pathlib import Path
from functools import partial
import requests
import urllib
from PIL import Image

# Unit vocabularies shared by several entities. They are kept as tuples and
# copied into a fresh set per entity so that no two entities share one set.
_LENGTH_UNITS = ('centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard')
_WEIGHT_UNITS = (
    'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton',
)
_VOLTAGE_UNITS = ('kilovolt', 'millivolt', 'volt')
_WATTAGE_UNITS = ('kilowatt', 'watt')
_VOLUME_UNITS = (
    'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre',
    'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre',
    'millilitre', 'pint', 'quart',
)

# Maps each entity name to the set of units that are valid for it.
entity_unit_map = {
    'width': set(_LENGTH_UNITS),
    'depth': set(_LENGTH_UNITS),
    'height': set(_LENGTH_UNITS),
    'item_weight': set(_WEIGHT_UNITS),
    'maximum_weight_recommendation': set(_WEIGHT_UNITS),
    'voltage': set(_VOLTAGE_UNITS),
    'wattage': set(_WATTAGE_UNITS),
    'item_volume': set(_VOLUME_UNITS),
}

# Every unit that is valid for at least one entity.
allowed_units = set().union(*entity_unit_map.values())

# # Download function
# def download_images(image_urls, save_dir):
#     if not os.path.exists(save_dir):
#         os.makedirs(save_dir)
#     for url in tqdm(image_urls):
#         img_name = os.path.join(save_dir, url.split("/")[-1])
#         img_data = requests.get(url).content
#         with open(img_name, 'wb') as handler:
#             handler.write(img_data)
def create_placeholder_image(image_save_path):
    """Write a 100x100 black RGB image to ``image_save_path``.

    This is a best-effort helper: any error raised while creating or saving
    the image is swallowed and the function simply returns ``None``.

    Args:
        image_save_path: Destination path handed to ``Image.save``.
    """
    try:
        Image.new('RGB', (100, 100), color='black').save(image_save_path)
    except Exception:
        # Deliberately ignored: a placeholder that cannot be written is skipped.
        return

def download_image(image_link, save_folder, retries=3, delay=3):
    """Download one image into ``save_folder``, falling back to a placeholder.

    The file name is the last path component of ``image_link``. Nothing is
    done when ``image_link`` is not a string or when the target file already
    exists. The download is written to a per-process ``.part`` file and only
    moved to its final name once the transfer succeeded, so an interrupted
    transfer never leaves a truncated file that a later run would mistake for
    a finished download.

    Args:
        image_link: URL of the image; non-string values are ignored.
        save_folder: Existing directory that receives the file.
        retries: Maximum number of download attempts.
        delay: Seconds to wait between two attempts.

    Returns:
        None. When every attempt fails, ``create_placeholder_image`` is
        called for the target path instead.
    """
    # Imported here so the ``urllib.request`` submodule is guaranteed to be
    # loaded; ``import urllib`` on its own does not import submodules.
    import urllib.request

    if not isinstance(image_link, str):
        return

    filename = Path(image_link).name
    image_save_path = os.path.join(save_folder, filename)

    if os.path.exists(image_save_path):
        return

    # The PID keeps pool workers that handle a duplicated link from writing
    # to the same temporary file.
    partial_path = f"{image_save_path}.{os.getpid()}.part"

    for attempt in range(retries):
        try:
            urllib.request.urlretrieve(image_link, partial_path)
            os.replace(partial_path, image_save_path)
            return
        except Exception:
            # ``Exception`` (not a bare ``except``) so Ctrl-C still interrupts.
            try:
                os.remove(partial_path)
            except OSError:
                pass  # Nothing was written for this attempt.
            if attempt < retries - 1:
                # No point in sleeping after the final attempt.
                time.sleep(delay)

    create_placeholder_image(image_save_path) #Create a black placeholder image for invalid links/images

def download_images(image_links, download_folder, allow_multiprocessing=True):
    """Download every link in ``image_links`` into ``download_folder``.

    The folder is created when missing. Each link is handled by
    ``download_image`` with 3 retries and a 3 second delay, and progress is
    reported through ``tqdm``.

    Args:
        image_links: Sized iterable of image URLs (``len()`` is used for the
            progress bar total).
        download_folder: Directory that receives the images.
        allow_multiprocessing: When true, the downloads are distributed over
            a ``multiprocessing.Pool`` of 64 workers; otherwise they run
            sequentially in the current process.
    """
    # ``exist_ok`` avoids the check-then-create race of ``os.path.exists``.
    os.makedirs(download_folder, exist_ok=True)

    if allow_multiprocessing:
        download_image_partial = partial(
            download_image, save_folder=download_folder, retries=3, delay=3)

        with multiprocessing.Pool(64) as pool:
            # Drain the iterator only to drive the progress bar; the results
            # (all ``None``) are not worth collecting in a list.
            for _ in tqdm(pool.imap(download_image_partial, image_links), total=len(image_links)):
                pass
            pool.close()
            pool.join()
    else:
        for image_link in tqdm(image_links, total=len(image_links)):
            download_image(image_link, save_folder=download_folder, retries=3, delay=3)
# # Download train and test images
# Load the train and test tables; both paths are relative to the current
# working directory.
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')

# The downloads are disabled; uncomment to fetch the 'image_link' column of
# each table into 'train_images' / 'test_images'.
# download_images(train_df['image_link'], 'train_images', allow_multiprocessing=True)
# download_images(test_df['image_link'], 'test_images', allow_multiprocessing=True)



In [None]:
# Explore the unique entities
# Print the column index of the training table.
print("train datafframe columns : ",train_df.columns)

# Print the distinct values of the 'entity_name' and 'entity_value' columns.
print("unique entity name: ",train_df['entity_name'].unique())
print("unique entity value: ",train_df['entity_value'].unique())

import re

# Building blocks of the entity_value grammar. A unit is one or more
# whitespace-separated words so that 'fluid ounce' or 'cubic foot' is captured
# whole instead of being cut after the first word.
_NUMBER_RE = r'\d+\.?\d*'
_UNIT_RE = r'[a-zA-Z]+(?:\s+[a-zA-Z]+)*'

# '6 kilogram to 9 kilogram'
_RANGE_PATTERN = re.compile(
    rf'({_NUMBER_RE})\s*({_UNIT_RE})\s+to\s+({_NUMBER_RE})\s*({_UNIT_RE})')
# '[9, 11] kilogram' and '[100.0, 240.0] volt' (decimals are allowed)
_LIST_PATTERN = re.compile(
    rf'\[\s*({_NUMBER_RE}(?:\s*,\s*{_NUMBER_RE})*)\s*\]\s*({_UNIT_RE})')
# '500.0 gram'
_SINGLE_PATTERN = re.compile(rf'({_NUMBER_RE})\s*({_UNIT_RE})')


def _normalize_unit(unit):
    """Collapse any run of whitespace inside a unit to a single space."""
    return ' '.join(unit.split())


# Define a function to handle different entity_value formats
def extract_numeric_value_and_unit(value):
    """Parse an ``entity_value`` string into ``(number, unit)``.

    Three formats are recognised, tried in this order:

    * range  -- ``'6 kilogram to 9 kilogram'`` -> mean of both ends
    * list   -- ``'[100.0, 240.0] volt'``      -> mean of all entries
    * single -- ``'500.0 gram'``

    Args:
        value: The raw entity value. Anything that is not a string (for
            example a missing value read as NaN) is treated as unparseable.

    Returns:
        A ``(float, str)`` tuple, or ``(None, None)`` when the value cannot
        be parsed or when the two ends of a range use different units.
    """
    if not isinstance(value, str):
        return None, None

    text = value.strip()

    # Check for ranges
    range_match = _RANGE_PATTERN.match(text)
    if range_match:
        low, low_unit, high, high_unit = range_match.groups()
        low_unit = _normalize_unit(low_unit)
        high_unit = _normalize_unit(high_unit)

        # Ensure both units are the same (to be consistent)
        if low_unit != high_unit:
            return None, None  # Flag this for further investigation
        return (float(low) + float(high)) / 2, low_unit

    # Check for lists
    list_match = _LIST_PATTERN.match(text)
    if list_match:
        numbers = [float(n) for n in list_match.group(1).split(',')]
        # Use the average for simplicity
        return sum(numbers) / len(numbers), _normalize_unit(list_match.group(2))

    # Check for single values
    single_match = _SINGLE_PATTERN.match(text)
    if single_match:
        return float(single_match.group(1)), _normalize_unit(single_match.group(2))

    # If nothing matches, return None (invalid or missing data)
    return None, None

# Apply the function to the entity_value column
# apply() yields one (value, unit) tuple per row; zip(*...) transposes those
# tuples into two sequences that become the two new columns.
train_df['numeric_value'], train_df['unit'] = zip(*train_df['entity_value'].apply(extract_numeric_value_and_unit))

# Display the first few rows to verify
print(train_df[['entity_value', 'numeric_value', 'unit']])



train datafframe columns :  Index(['image_link', 'group_id', 'entity_name', 'entity_value'], dtype='object')
unique entity name:  ['item_weight' 'item_volume' 'voltage' 'wattage'
 'maximum_weight_recommendation' 'height' 'depth' 'width']
unique entity value:  ['500.0 gram' '1.0 cup' '0.709 gram' ... '21.38 inch' '63.3 inch'
 '4.1 metre']
           entity_value  numeric_value        unit
0            500.0 gram        500.000        gram
1               1.0 cup          1.000         cup
2            0.709 gram          0.709        gram
3            0.709 gram          0.709        gram
4        1400 milligram       1400.000   milligram
...                 ...            ...         ...
263854   5.0 centimetre          5.000  centimetre
263855         8.5 inch          8.500        inch
263856  43.2 centimetre         43.200  centimetre
263857   9.1 centimetre          9.100  centimetre
263858  27.5 centimetre         27.500  centimetre

[263859 rows x 3 columns]


In [None]:
# Check for rows where extraction failed
invalid_rows = train_df[train_df['numeric_value'].isna()]
print("Invalid rows (failed extraction):", invalid_rows.shape[0])

# Check if there are any rows where the units didn't match in ranges
# (the parser returns a missing unit both for a range with mismatched units
# and for a value it could not parse at all, so this counts both cases)
unit_mismatch_rows = train_df[train_df['unit'].isna()]
print("Rows with unit mismatch or missing data:", unit_mismatch_rows.shape[0])

# Inspect a few problematic rows (if any)
print(invalid_rows.head())
print(unit_mismatch_rows.head())


# Check the shape of the cleaned dataframe
print(f"Original DataFrame size: {train_df.shape[0]}")

# Keep only rows for which both the number and the unit were extracted.
train_df = train_df.dropna(subset=['numeric_value', 'unit'])

print(f"Cleaned DataFrame size: {train_df.shape[0]}")

Invalid rows (failed extraction): 3276
Rows with unit mismatch or missing data: 3276
                                            image_link  group_id  entity_name  \
139  https://m.media-amazon.com/images/I/71Oo4M3Apx...    752266      voltage   
152  https://m.media-amazon.com/images/I/71P0BToikA...    459516  item_volume   
169  https://m.media-amazon.com/images/I/81Qf73SxLa...    752266      voltage   
215  https://m.media-amazon.com/images/I/71L-+M3VVP...    507619  item_weight   
228  https://m.media-amazon.com/images/I/71iSbwHDcd...    648011      voltage   

                entity_value  numeric_value  unit  
139      [100.0, 240.0] volt            NaN  None  
152  [8.0, 12.0] fluid ounce            NaN  None  
169       [85.0, 265.0] volt            NaN  None  
215       [25.0, 30.0] pound            NaN  None  
228      [175.0, 265.0] volt            NaN  None  
                                            image_link  group_id  entity_name  \
139  https://m.media-amazon.com/ima

In [None]:
# Sanity check: after the cleaning step neither parsed column should still
# contain a missing value.
still_missing = train_df['numeric_value'].isna() | train_df['unit'].isna()
invalid_rows_after_cleaning = train_df[still_missing]
print(f"Invalid rows after cleaning: {len(invalid_rows_after_cleaning)}")


Invalid rows after cleaning: 0


In [None]:
# Multipliers that convert a value to the base unit of its physical quantity.
# Built once at import time instead of on every call.
# NOTE(review): 'ton' is intentionally absent -- short, long and metric tons
# differ, and nothing here says which one the data uses. Values in 'ton' are
# therefore still returned unconverted; add the factor once that is confirmed.
_UNIT_CONVERSION_FACTORS = {
    # mass -> grams
    'microgram': 1e-6,
    'milligram': 1e-3,
    'gram': 1,
    'kilogram': 1000,
    'ounce': 28.3495,
    'pound': 453.592,
    # length -> centimetres
    'millimetre': 0.1,
    'centimetre': 1,
    'metre': 100,
    'inch': 2.54,
    'foot': 30.48,
    'yard': 91.44,
    # volume -> millilitres (US customary sizes, matching the existing cup
    # and pint factors, except for the explicitly imperial gallon)
    'microlitre': 1e-3,
    'millilitre': 1,
    'centilitre': 10,
    'decilitre': 100,
    'litre': 1000,
    'cubic inch': 16.387064,
    'cubic foot': 28316.846592,
    'fluid ounce': 29.5735,
    'cup': 236.588,
    'pint': 473.176,
    'quart': 946.353,
    'gallon': 3785.41,
    'imperial gallon': 4546.09,
    # electric potential -> volts
    'millivolt': 1e-3,
    'volt': 1,
    'kilovolt': 1000,
    # power -> watts
    'watt': 1,
    'kilowatt': 1000,
}


def standardize_units(value, unit):
    """Convert ``value`` expressed in ``unit`` to the quantity's base unit.

    Base units are gram, centimetre, millilitre, volt and watt.

    Args:
        value: Numeric magnitude.
        unit: Unit name, e.g. ``'kilogram'`` or ``'fluid ounce'``.

    Returns:
        ``value`` multiplied by the unit's conversion factor, or ``value``
        unchanged when the unit is not in the conversion table.
    """
    factor = _UNIT_CONVERSION_FACTORS.get(unit)
    if factor is None:
        return value
    return value * factor

# Apply the conversion to the numeric_value and unit columns
# axis=1 makes apply() hand each row to the lambda, which needs both columns.
train_df['standardized_value'] = train_df.apply(
    lambda row: standardize_units(row['numeric_value'], row['unit']), axis=1)

# Check the standardized values
print(train_df[['entity_value', 'numeric_value', 'unit', 'standardized_value']].head(10))

# Show the resulting column layout.
print(train_df.columns)

     entity_value  numeric_value       unit  standardized_value
0      500.0 gram        500.000       gram             500.000
1         1.0 cup          1.000        cup             236.588
2      0.709 gram          0.709       gram               0.709
3      0.709 gram          0.709       gram               0.709
4  1400 milligram       1400.000  milligram               1.400
5  1400 milligram       1400.000  milligram               1.400
6  1400 milligram       1400.000  milligram               1.400
7  1400 milligram       1400.000  milligram               1.400
8  1400 milligram       1400.000  milligram               1.400
9  1400 milligram       1400.000  milligram               1.400
Index(['image_link', 'group_id', 'entity_name', 'entity_value',
       'numeric_value', 'unit', 'standardized_value'],
      dtype='object')


In [None]:
import os
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

class ProductDataset(Dataset):
    """Dataset that yields ``(image, target)`` pairs for regression training.

    Rows are addressed positionally: column 0 must hold the image URL (only
    its last path component is used as the file name) and column 4 the
    numeric target. In this notebook those are the ``image_link`` and
    ``numeric_value`` columns of ``train_df``.

    Args:
        dataframe: Table with one row per sample.
        image_folder: Directory that contains the downloaded images.
        transform: Optional callable applied to every PIL image.
    """

    def __init__(self, dataframe, image_folder, transform=None):
        self.dataframe = dataframe
        self.image_folder = image_folder
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the image and float32 target tensor of row ``idx``.

        A missing, unreadable or corrupt image file is replaced by a gray
        256x256 placeholder with a target of ``0.0``.
        """
        img_name = os.path.join(self.image_folder, self.dataframe.iloc[idx, 0].split("/")[-1])

        try:
            # The context manager closes the file handle; convert() returns
            # an independent, fully loaded copy.
            with Image.open(img_name) as raw_image:
                image = raw_image.convert('RGB')
        except OSError:
            # OSError covers FileNotFoundError as well as the errors Pillow
            # raises for unidentifiable or truncated image files.
            image = Image.new('RGB', (256, 256), color='gray')
            numeric_value = 0.0
        else:
            numeric_value = self.dataframe.iloc[idx, 4]

        # The placeholder goes through the transform too; otherwise a batch
        # would mix tensors with a raw PIL image.
        if self.transform:
            image = self.transform(image)

        # Convert numeric_value to float32
        numeric_value = torch.tensor(numeric_value, dtype=torch.float32)

        return image, numeric_value




In [None]:
# Preprocessing pipeline passed to the training dataset below.
transform = transforms.Compose([
    transforms.Resize((256, 256)),  # Resize image
    transforms.ToTensor(),  # Convert to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalize
])


In [None]:
def image_exists(image_path):
    """Return True when ``image_path`` refers to an existing filesystem entry."""
    is_present = os.path.exists(image_path)
    return is_present

# Apply the filter
# Keep only rows whose image file (last path component of the link) exists
# in the 'train_images' folder.
train_df = train_df[train_df['image_link'].apply(lambda x: image_exists(os.path.join('train_images', x.split('/')[-1])))]


# Wrap the filtered table in a dataset and iterate it in shuffled batches of 32.
train_dataset = ProductDataset(train_df, 'train_images', transform=transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)


In [None]:
import torch
import torch.nn as nn
import torchvision.models as models

class ProductModel(nn.Module):
    """ResNet-18 backbone followed by dropout and a one-output linear layer.

    The backbone's final fully connected layer is replaced by a layer with
    100 outputs; dropout (p=0.5) and a ``100 -> 1`` linear layer follow it.
    """

    def __init__(self):
        super().__init__()

        backbone = models.resnet18(pretrained=True)
        backbone.fc = nn.Linear(backbone.fc.in_features, 100)
        self.cnn = backbone

        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(100, 1)

    def forward(self, x):
        """Run ``x`` through the backbone, the dropout and the output layer."""
        return self.fc(self.dropout(self.cnn(x)))



In [None]:
from sklearn.metrics import f1_score
import numpy as np

def compute_f1_score(true_labels, predictions):
    """Return the weighted-average F1 score of ``predictions``.

    Both arguments are converted to NumPy arrays and passed to
    ``f1_score(..., average='weighted')``.

    NOTE(review): ``f1_score`` is imported from ``sklearn.metrics`` and the
    training loop passes continuous regression outputs to this function --
    confirm that this metric accepts such inputs.
    """
    return f1_score(np.array(true_labels), np.array(predictions), average='weighted')


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import pandas as pd

# Define the model, criterion, and optimizer
model = ProductModel()
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = optim.Adam(model.parameters(), lr=0.005)

num_epochs = 10
accuracy_threshold = 0.1  # A prediction is "correct" within 10% relative error

# Predictions and ground truth of the current epoch for the F1 score calculation
all_predictions = []
all_ground_truth = []

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    total_absolute_error = 0.0
    correct_predictions = 0
    total_predictions = 0
    num_batches = len(train_loader)

    # Start every epoch with empty buffers so the epoch summary is not mixed
    # with (and memory does not grow by) the outputs of earlier epochs.
    all_predictions.clear()
    all_ground_truth.clear()

    with tqdm(total=num_batches, desc=f"Epoch {epoch+1}/{num_epochs}", unit='batch') as pbar:
        for images, labels in train_loader:
            optimizer.zero_grad()

            # Ensure images and labels are of float32
            images = images.to(torch.float32)
            labels = labels.to(torch.float32)

            outputs = model(images)

            # The final layer has a single output, so the model returns
            # (batch, 1). squeeze(1) drops only that axis; a bare squeeze()
            # would also collapse a batch of size one to a 0-d tensor.
            batch_outputs = outputs.to(torch.float32).squeeze(1)

            loss = criterion(batch_outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # The metrics below are for reporting only; keep them out of autograd.
            with torch.no_grad():
                # Collect predictions and ground truth
                all_predictions.extend(batch_outputs.cpu().numpy())
                all_ground_truth.extend(labels.cpu().numpy())

                # Calculate Mean Absolute Error (MAE) for this batch
                absolute_error = torch.abs(batch_outputs - labels)
                total_absolute_error += absolute_error.mean().item()

                # Calculate accuracy based on custom threshold. Dividing by
                # |label| (clamped away from zero) avoids inf/nan for zero
                # labels and a negative ratio for negative labels.
                relative_error = absolute_error / labels.abs().clamp_min(1e-8)
                correct_predictions += (relative_error < accuracy_threshold).sum().item()
                total_predictions += len(labels)

            # Update progress bar
            pbar.set_postfix({'Loss': running_loss / (pbar.n + 1), 'MAE': total_absolute_error / (pbar.n + 1), 'Accuracy': correct_predictions / total_predictions})
            pbar.update(1)

    # Print epoch summary (max(..., 1) keeps an empty loader from dividing by zero)
    avg_loss = running_loss / max(num_batches, 1)
    avg_mae = total_absolute_error / max(num_batches, 1)
    avg_accuracy = correct_predictions / max(total_predictions, 1)
    try:
        avg_f1_score = compute_f1_score(all_ground_truth, all_predictions)
    except ValueError as err:
        # A failing metric must not abort training. NOTE(review): the F1
        # score is a classification metric fed with regression values here;
        # decide how predictions should be discretised before relying on it.
        print(f"F1 score unavailable: {err}")
        avg_f1_score = float('nan')
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}, MAE: {avg_mae:.4f}, Accuracy: {avg_accuracy:.4f}, F1 Score: {avg_f1_score:.4f}")


Epoch 1/10:  19%|█▊        | 58/310 [14:17<1:02:07, 14.79s/batch, Loss=7.99e+22, MAE=9.14e+9, Accuracy=0.0269]


UnidentifiedImageError: cannot identify image file 'train_images/716B6PlYipL.jpg'

In [None]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torchvision.transforms as transforms

class TestDataset(Dataset):
    """Dataset that yields ``(image, entity_name)`` pairs for inference.

    Rows are addressed positionally: column 1 must hold the image URL (only
    its last path component is used as the file name) and column 3 the
    entity name.

    Args:
        csv_file: Path of the CSV file to read.
        img_dir: Directory that contains the downloaded images.
        transform: Optional callable applied to every PIL image.
    """

    def __init__(self, csv_file, img_dir, transform=None):
        self.data_frame = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the CSV file."""
        return len(self.data_frame)

    def __getitem__(self, idx):
        """Return the image and the entity name of row ``idx``.

        A missing, unreadable or corrupt image file is replaced by a gray
        256x256 placeholder, so every row still yields a sample and the
        positions of the predictions stay aligned with the CSV rows.
        """
        img_name = self.data_frame.iloc[idx, 1]  # image_link column
        img_path = f"{self.img_dir}/{img_name.split('/')[-1]}"

        try:
            # The context manager closes the file handle; convert() returns
            # an independent, fully loaded copy.
            with Image.open(img_path) as raw_image:
                image = raw_image.convert("RGB")
        except OSError:
            # OSError covers FileNotFoundError as well as the errors Pillow
            # raises for unidentifiable or truncated image files.
            image = Image.new("RGB", (256, 256), color="gray")

        if self.transform:
            image = self.transform(image)

        return image, self.data_frame.iloc[idx, 3]  # entity_name column

# Inference must see inputs preprocessed exactly like the training inputs:
# same 256x256 size and the same per-channel normalization.
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Same CSV location as the one the test table is loaded from at the top of
# the notebook. batch_size=1 and shuffle=False keep one prediction per row,
# in file order.
test_dataset = TestDataset(csv_file='dataset/test.csv', img_dir='test_images', transform=transform)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)


In [None]:
import torch
import pandas as pd

model.eval()  # Set the model to evaluation mode

# One dict per test sample, collected in loader order.
predictions = []

with torch.no_grad():
    for idx, (images, entity_names) in enumerate(test_loader):
        images = images.to(torch.float32)
        outputs = model(images)
        # .item() needs a single-element tensor, so this relies on the loader
        # using batch_size=1.
        predicted_value = outputs.item()

        # Append results to predictions list
        # NOTE(review): 'index' is the position within the loader, not a value
        # read from the CSV -- confirm that is the expected identifier.
        # NOTE(review): the text after the number is the entity name yielded
        # by the dataset, not a unit -- confirm the expected output format.
        predictions.append({
            'index': idx,
            'prediction': f'{predicted_value} {entity_names[0]}'
        })

# Save predictions to CSV
predictions_df = pd.DataFrame(predictions)
predictions_df.to_csv('test_out.csv', index=False)


In [None]:
# import shutil

# # Source path (the folder you want to copy)
# source_folder = 'drive/MyDrive/train_images'

# # Destination path (where you want to copy the folder)
# destination_folder = 'train_images'

# # Copy the folder
# shutil.copytree(source_folder, destination_folder)

# print(f"Folder copied to {destination_folder}")

# import shutil

# # Source path (the folder you want to move)
# source_folder = 'train_images'

# # Destination path (where you want to move the folder)
# destination_folder = 'drive/MyDrive/train_images'

# # Move the folder
# shutil.move(source_folder, destination_folder)

# print(f"Folder moved to {destination_folder}")
# import os

# # Get the current directory
# current_dir = os.getcwd()

# # List directories in the current directory
# directories = [d for d in os.listdir(current_dir) if os.path.isdir(os.path.join(current_dir, d))]

# print(directories)



# import shutil
# import os

# folder_path = "train_images"

# # Function to delete a folder
# def delete_folder(folder_path):
#     if os.path.exists(folder_path):
#         try:
#             shutil.rmtree(folder_path)
#             print(f"Folder '{folder_path}' and its contents deleted successfully.")
#         except OSError as e:
#             print(f"Error: {e.strerror}")
#     else:
#         print(f"Folder '{folder_path}' does not exist.")

# # Call the function
# delete_folder(folder_path)

In [None]:
def save_model(model, save_path='trained_model.pth'):
    """
    Write the model's ``state_dict`` to ``save_path`` and report the location.

    Args:
        model (torch.nn.Module): The trained model.
        save_path (str): Destination file for the parameters.
    """
    # Only the parameters are stored, not the module object itself.
    parameters = model.state_dict()
    torch.save(parameters, save_path)
    print(f"Model parameters saved to {save_path}")


In [None]:
def load_model(model, load_path='trained_model.pth', map_location='cpu'):
    """
    Load the model parameters from a file and switch the model to eval mode.

    Args:
        model (torch.nn.Module): The model architecture.
        load_path (str): The file path from which to load the model parameters.
        map_location: Forwarded to ``torch.load``. The default ``'cpu'``
            allows a checkpoint saved on a GPU to be read on a machine
            without one; ``load_state_dict`` then copies the values into the
            model's existing parameters.
    """
    # SECURITY: torch.load unpickles the file; only load checkpoints that
    # come from a trusted source.
    state_dict = torch.load(load_path, map_location=map_location)

    # Load the saved model state_dict into the model
    model.load_state_dict(state_dict)
    model.eval()  # Set the model to evaluation mode
    print(f"Model parameters loaded from {load_path}")
