In [9]:
!pip install duckdb -q

In [12]:
!pip install paddleocr -q

In [None]:
!pip install tensorflow==2.15 -q

In [2]:
import re
#import constants
import os
import requests
import pandas as pd
import multiprocessing
import time
from time import time as timer
from tqdm import tqdm
import numpy as np
from pathlib import Path
from functools import partial
import requests
import urllib
from PIL import Image
import argparse

In [25]:
from torch.utils.data import Dataset, DataLoader

In [3]:
def common_mistake(unit):
    if unit in constants.allowed_units:
        return unit
    if unit.replace('ter', 'tre') in constants.allowed_units:
        return unit.replace('ter', 'tre')
    if unit.replace('feet', 'foot') in constants.allowed_units:
        return unit.replace('feet', 'foot')
    return unit

def parse_string(s):
    s_stripped = "" if s==None or str(s)=='nan' else s.strip()
    if s_stripped == "":
        return None, None
    pattern = re.compile(r'^-?\d+(\.\d+)?\s+[a-zA-Z\s]+$')
    if not pattern.match(s_stripped):
        raise ValueError("Invalid format in {}".format(s))
    parts = s_stripped.split(maxsplit=1)
    number = float(parts[0])
    unit = common_mistake(parts[1])
    if unit not in constants.allowed_units:
        raise ValueError("Invalid unit [{}] found in {}. Allowed units: {}".format(
            unit, s, constants.allowed_units))
    return number, unit


def create_placeholder_image(image_save_path):
    try:
        placeholder_image = Image.new('RGB', (100, 100), color='black')
        placeholder_image.save(image_save_path)
    except Exception as e:
        return

def download_image(image_link, save_folder, retries=3, delay=3):
    if not isinstance(image_link, str):
        return

    filename = Path(image_link).name
    image_save_path = os.path.join(save_folder, filename)

    if os.path.exists(image_save_path):
        return

    for _ in range(retries):
        try:
            urllib.request.urlretrieve(image_link, image_save_path)
            return
        except:
            time.sleep(delay)

    create_placeholder_image(image_save_path) #Create a black placeholder image for invalid links/images

def download_images(image_links, download_folder, allow_multiprocessing=True):
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    if allow_multiprocessing:
        download_image_partial = partial(
            download_image, save_folder=download_folder, retries=3, delay=3)

        with multiprocessing.Pool(64) as pool:
            list(tqdm(pool.imap(download_image_partial, image_links), total=len(image_links)))
            pool.close()
            pool.join()
    else:
        for image_link in tqdm(image_links, total=len(image_links)):
            download_image(image_link, save_folder=download_folder, retries=3, delay=3)


In [4]:
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram',
        'kilogram',
        'microgram',
        'milligram',
        'ounce',
        'pound',
        'ton'},
    'maximum_weight_recommendation': {'gram',
        'kilogram',
        'microgram',
        'milligram',
        'ounce',
        'pound',
        'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre',
        'cubic foot',
        'cubic inch',
        'cup',
        'decilitre',
        'fluid ounce',
        'gallon',
        'imperial gallon',
        'litre',
        'microlitre',
        'millilitre',
        'pint',
        'quart'}
}

allowed_units = {unit for entity in entity_unit_map for unit in entity_unit_map[entity]}

In [7]:
df=pd.read_csv('/content/train.csv')

In [28]:
train_path='/content/train.csv'

In [29]:
df=duckdb.sql(f"""(SELECT *
                        FROM '{train_path}'
                        WHERE entity_name='width'
                        ORDER BY random()
                        LIMIT 10)
                        UNION ALL
                        (SELECT *
                        FROM '{train_path}'
                        WHERE entity_name='depth'
                        ORDER BY random()
                        LIMIT 10)
                        UNION ALL
                        (SELECT *
                        FROM '{train_path}'
                        WHERE entity_name='height'
                        ORDER BY random()
                        LIMIT 10)
                        UNION ALL
                        (SELECT *
                        FROM '{train_path}'
                        WHERE entity_name='item_weight'
                        ORDER BY random()
                        LIMIT 10)
                        UNION ALL
                        (SELECT *
                        FROM '{train_path}'
                        WHERE entity_name='maximum_weight_recommendation'
                        ORDER BY random()
                        LIMIT 10)
                        UNION ALL
                        (SELECT *
                        FROM '{train_path}'
                        WHERE entity_name='voltage'
                        ORDER BY random()
                        LIMIT 10)
                        UNION ALL
                        (SELECT *
                        FROM '{train_path}'
                        WHERE entity_name='wattage'
                        ORDER BY random()
                        LIMIT 10)
                        UNION ALL
                        (SELECT *
                        FROM '{train_path}'
                        WHERE entity_name='item_volume'
                        ORDER BY random()
                        LIMIT 10)""").to_df()

In [30]:
download_images(df['image_link'], '/content/train_images')

100%|██████████| 80/80 [00:01<00:00, 67.75it/s]


In [31]:
df.to_csv('unique1.csv', index=False)

In [26]:
class ImageCaptionDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.data = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.label_mapping = {name: idx for idx, name in enumerate(self.data['entity_name'].unique())}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = os.path.basename(self.data.iloc[idx, 0])
        img_path = os.path.join(self.root_dir, img_name)
        if not os.path.isfile(img_path):
            print(f"File {img_path} not found.")
            return None, None

        image = Image.open(img_path).convert('RGB')
        label = self.data.iloc[idx, 2]
        label = self.label_mapping[label]

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label)

In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.models import resnet50
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader
import torch.nn.functional as F

# Data Augmentation for training set
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(20),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

class CustomRegressor(nn.Module):
    def __init__(self, pretrained_model): # Changed _init_ to __init__
        super(CustomRegressor, self).__init__() # Changed _init_ to __init__
        self.backbone = pretrained_model
        self.backbone.fc = nn.Identity()  # Remove the classification layer

        # Fully connected layers for regression
        self.fc1 = nn.Linear(2048, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.5)

        self.fc2 = nn.Linear(512, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.dropout2 = nn.Dropout(0.5)

        self.fc_numeric = nn.Linear(128, 1)  # For numeric output (e.g. item weight)
        self.fc_unit = nn.Linear(128, 1)     # For unit prediction (optional)

    def forward(self, x):
        x = self.backbone(x)
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)

        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)

        numeric_output = self.fc_numeric(x)
        unit_output = self.fc_unit(x)
        return numeric_output, unit_output

# Training function
def train_model(model, train_loader, criterion, optimizer, scheduler, num_epochs=25):
    best_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        total = 0

        for inputs, labels in train_loader:
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            numeric_pred, _ = outputs  # For now, we are focusing on numeric prediction
            loss = criterion(numeric_pred.squeeze(1), labels.float())

            # Backpropagation and optimization
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
            total += labels.size(0)

        scheduler.step()
        epoch_loss = running_loss / total
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

        # Save best model based on loss
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            torch.save(model.state_dict(), 'best_model.pth')


In [27]:
def collate_fn(batch):
    images = [item[0] for item in batch if item[0] is not None]
    labels = [item[1] for item in batch if item[1] is not None]
    if images:
        images = torch.stack(images, dim=0)
    else:
        images = torch.empty(0)

    if labels:
        labels = torch.tensor(labels)
    else:
        labels = torch.empty(0)

    return images, labels

In [34]:
train_dataset = ImageCaptionDataset('/content/unique1.csv', 'train_images', transform=train_transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2, collate_fn=collate_fn)

In [23]:
# Load the pretrained ResNet50 model
pretrained_model = resnet50(pretrained=True)
for param in pretrained_model.parameters():
    param.requires_grad = False  # Freeze pretrained layers

# Only fine-tune the last few layers
for param in pretrained_model.layer4.parameters():
    param.requires_grad = True

# Instantiate the custom regression model, loss function, and optimizer
model = CustomRegressor(pretrained_model)
criterion = nn.MSELoss()  # Mean Squared Error for regression task
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)  # L2 regularization
scheduler = CosineAnnealingLR(optimizer, T_max=10)



In [35]:
# Train the model
train_model(model, train_loader, criterion, optimizer, scheduler, num_epochs=25)

Epoch [1/25], Loss: 18.4542
Epoch [2/25], Loss: 15.5891
Epoch [3/25], Loss: 12.9108
Epoch [4/25], Loss: 10.5949
Epoch [5/25], Loss: 8.6016
Epoch [6/25], Loss: 8.1794
Epoch [7/25], Loss: 6.5162
Epoch [8/25], Loss: 6.7084
Epoch [9/25], Loss: 5.9417
Epoch [10/25], Loss: 6.2570
Epoch [11/25], Loss: 6.4298
Epoch [12/25], Loss: 5.6403
Epoch [13/25], Loss: 6.6272
Epoch [14/25], Loss: 6.0464
Epoch [15/25], Loss: 5.3896
Epoch [16/25], Loss: 5.6620
Epoch [17/25], Loss: 4.4905
Epoch [18/25], Loss: 4.3901
Epoch [19/25], Loss: 3.2424
Epoch [20/25], Loss: 2.8637
Epoch [21/25], Loss: 2.2112
Epoch [22/25], Loss: 2.3658
Epoch [23/25], Loss: 2.0478
Epoch [24/25], Loss: 1.9424
Epoch [25/25], Loss: 1.2122


In [38]:
def predict_values(model, data_loader, df):
    model.eval()
    predictions = []
    idx_offset = 0

    with torch.no_grad():
        for images, _ in data_loader: # Assuming your data loader returns a tuple of (images, labels)
            if isinstance(images, tuple): # Check if images is a tuple
                images = images[0] # If it is, extract the first element (assuming it's the image tensor)
            outputs = model(images)
            if isinstance(outputs, tuple): # Check if the model output is a tuple
                outputs = outputs[0] # If it is, extract the first element (assuming it's the prediction tensor)
            outputs = outputs.squeeze(1) # Now apply squeeze to the tensor
            for i, output in enumerate(outputs):
                predicted_value = output.item()
                entity_name = df.iloc[idx_offset + i]['entity_name']
                actual_value = df.iloc[idx_offset + i]['entity_value']

                print(f"Entity Name: {entity_name}, Actual Value: {actual_value}, Predicted Value: {predicted_value}")
                predictions.append((entity_name, actual_value, predicted_value))

            idx_offset += len(images)
    return predictions

In [39]:
predictions = predict_values(model, train_loader, df)

Entity Name: width, Actual Value: 140.0 centimetre, Predicted Value: 4.452517986297607
Entity Name: width, Actual Value: 6.3 inch, Predicted Value: 5.587839126586914
Entity Name: width, Actual Value: 1.0 inch, Predicted Value: 0.47619763016700745
Entity Name: width, Actual Value: 1.929 inch, Predicted Value: 7.478664875030518
Entity Name: width, Actual Value: 140.0 centimetre, Predicted Value: 4.313096523284912
Entity Name: width, Actual Value: 2.8 inch, Predicted Value: 0.905134379863739
Entity Name: width, Actual Value: 23.0 centimetre, Predicted Value: 0.6642702221870422
Entity Name: width, Actual Value: 63.0 inch, Predicted Value: 2.9115726947784424
Entity Name: width, Actual Value: 3.3 inch, Predicted Value: 4.87693977355957
Entity Name: width, Actual Value: 14.0 centimetre, Predicted Value: 0.3742329776287079
Entity Name: depth, Actual Value: 66.0 centimetre, Predicted Value: 0.3043697476387024
Entity Name: depth, Actual Value: 30.0 centimetre, Predicted Value: 3.4547390937805176