### Basic library imports

In [1]:
import os
import pandas as pd
import re

### Read Dataset

In [2]:
DATASET_FOLDER = '../dataset/'
train = pd.read_csv(os.path.join(DATASET_FOLDER, 'train.csv'))
test = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))
sample_test = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test.csv'))
sample_test_out = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test_out.csv'))

### Run Sanity check using src/sanity.py

In [3]:
!python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out.csv

Parsing successfull for file: ../dataset/sample_test_out.csv


In [4]:
!python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out_fail.csv

Error: Invalid unit [lbs] found in 6.75 lbs. Allowed units: {'milligram', 'gallon', 'microgram', 'volt', 'inch', 'fluid ounce', 'kilovolt', 'gram', 'kilogram', 'metre', 'cubic inch', 'watt', 'foot', 'centimetre', 'pound', 'pint', 'yard', 'centilitre', 'litre', 'decilitre', 'millilitre', 'microlitre', 'ounce', 'imperial gallon', 'cup', 'quart', 'ton', 'millivolt', 'millimetre', 'cubic foot', 'kilowatt'}


### Download images

In [None]:
from utils import download_images
download_images(train['image_link'], '../images')

In [12]:
assert len(os.listdir('../images')) > 0

**PreProcess**

In [50]:
df_train = pd.read_csv("C://Users//ASUS//Desktop//student_resource 3//dataset//train.csv")

In [49]:
len(df_test['group_id'].unique())

924

In [48]:
df_test = pd.read_csv("C://Users//ASUS//Desktop//student_resource 3//dataset//test.csv")

In [54]:
len(set(train['group_id'].unique().tolist()).difference(set(df_test['group_id'].unique().tolist())))

1

In [4]:
def cross_check_test_groups(df,filtered_groups, column):
    filtered_grp = list(df[column].unique())
    filtered_grp, filtered_groups = set(filtered_grp), set(filtered_groups)
    filtered_groups_difference = filtered_groups.difference(filtered_grp)
    return filtered_groups_difference, len(filtered_groups_difference)

In [5]:
def filter_by_grp(df,df_test,column):
    grp = df.groupby(column)
    filtered_grp = [key for key, group in grp if len(group) >= 100]
    diff, len_of_diff = cross_check_test_groups(df_test,filtered_grp,'group_id')
    if len_of_diff > 0:
        filtered_grp.extend(list(diff))
    filtered_df = df[df[column].isin(filtered_grp)]
    return filtered_df, filtered_grp

In [6]:
def filter_data_without_images(df : pd.DataFrame):
    to_be_included = []
    for i in range(len(df)):
        img_path = df.loc[i]['image_link'].split("/")[-1]
        img_path = os.path.join("C://Users//ASUS//Desktop//student_resource 3//images",img_path)
        if not os.path.exists(img_path):
            continue
        to_be_included.append(df.loc[i]['image_link'])
    out = df[df['image_link'].isin(to_be_included)]
    return out

In [7]:
data = filter_data_without_images(df_train)

In [8]:
data, filtered_groups = filter_by_grp(data,df_test,'group_id')

In [9]:
diff, len_of_diff = cross_check_test_groups(df_test,filtered_groups,'group_id')

In [11]:
len_of_diff

1

In [13]:
data.to_csv("train_data.csv")

In [3]:
unit_set = {
    'cm', 'ft', 'in', 'm', 'mm', 'yd',  # Length (width, depth, height)
    'g', 'gm', 'gms', 'kg', 'µg', 'mcg', 'mg', 'oz', 'ounce', 'lb', 'lbs', 'ton', 'tons',  # Weight (item_weight, maximum_weight_recommendation)
    'kv', 'mv', 'v', 'volt',  # Voltage
    'kw', 'w', 'watt',  # Wattage
    'cl', 'centilitre', 'cu ft', 'cubic foot', 'cu in', 'cubic inch', 'cup', 'dl', 'decilitre', 
    'fl oz', 'fluid ounce', 'gal', 'gallon', 'imp gal', 'imperial gallon', 'l', 'litre', 'µl', 'microlitre', 
    'ml', 'millilitre', 'pt', 'pint', 'qt', 'quart'  # Volume (item_volume)
}


In [37]:
def clean_extracted_value(match):
    number = re.sub(r'[^\d.]', '', match.group(1))
    if number.count('.') > 1:
        parts = number.split('.')
        number = f"{parts[0]}.{parts[1]}"
    

    unit = match.group(2).strip()
    return f"{number} {unit}"

pattern = re.compile(r'([0-9a-zA-Z.]+)\s*([a-zA-Z]+)')


In [39]:
unit_pattern = r'\b(?:' + '|'.join(re.escape(unit) for unit in unit_set) + r')\b'

# Compile the regex pattern
pattern = re.compile(r'(\d+\.?\d*)\s*(' + unit_pattern + r')')

def extract_units(text):
    matches = pattern.findall(text)
    results = [f"{match[0]} {match[1]}" for match in matches]
    return results

# Example usage
text = "weight = 17lbs, 14.54 kv, 54.564   mm, 23k.3s3 mm"
extracted_units = extract_units(text)
print(extracted_units)

['14.54 kv', '54.564 mm', '3 mm']


In [38]:

text = "weight = 17lbs, 14.54 kv, 54.564   mm, 23k.3s3 mm"

# Apply the regex and clean each match
cleaned_results = [clean_extracted_value(match) for match in pattern.finditer(text)]

print(cleaned_results)

[' t', '17 s', '14.54 kv', '54.564 mm', '23.33 mm']


In [30]:
res = [i for i in pattern.finditer(text)]

In [35]:
number = re.sub(r'[^\d.]', '', res[0].group(1))

In [36]:
number

'17'

**transform**

In [80]:
from sklearn.preprocessing import OneHotEncoder
import torch
from transformers import (
    DistilBertTokenizer, DistilBertModel, 
    RobertaTokenizer, RobertaModel, 
    AutoTokenizer, AutoModel
)
from sklearn.decomposition import PCA

In [40]:
train = pd.read_csv("C://Users//ASUS//Desktop//student_resource 3//train_data.csv")

In [44]:
len(train['group_id'].unique())

384

In [57]:
len(set(df_test['group_id'].unique().tolist()).difference(set(df_train['group_id'].unique().tolist())))

286

In [None]:
set(df_test['group_id'].unique().tolist())

In [60]:
encoder = OneHotEncoder()
data = encoder.fit_transform(df_train[['group_id']])

In [62]:
data.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [67]:
df_train['entity_name'].unique()

array(['item_weight', 'item_volume', 'voltage', 'wattage',
       'maximum_weight_recommendation', 'height', 'depth', 'width'],
      dtype=object)

In [77]:
class EmbedColumns:

    def __init__(self, use_pca=True, pca_components=200):
        self.tokenizer = None
        self.model = None
        self.pca = PCA(n_components=pca_components) if use_pca else None
        self.use_pca = use_pca

    def __set_model(self, kind):
        if kind == "DistilBERT":
            self.tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
            self.model = DistilBertModel.from_pretrained("distilbert-base-uncased")
        elif kind == "RoBERTa":
            self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
            self.model = RobertaModel.from_pretrained("roberta-base")
        elif kind == "MiniLM":
            self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
            self.model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        else:
            raise ValueError(f"Model kind '{kind}' not supported")

    def __get_pca(self, x):
        if self.pca is not None:
            return self.pca.transform(x.reshape(1, -1))
        return x

    def __apply_pooling(self, out, strategy='mean'):
        """
        Applies pooling strategies to get a fixed-size embedding.
        'mean' or 'max' pooling supported.
        """
        if strategy == 'mean':
            return out.mean(dim=1)  # Mean pooling across tokens
        elif strategy == 'max':
            return out.max(dim=1).values  # Max pooling across tokens
        elif strategy == 'cls':
            return out[:, 0, :]  # CLS token embedding
        else:
            raise ValueError(f"Pooling strategy '{strategy}' not supported")

    def fit_pca(self, dataset_embeddings):
        """
        Fit PCA to a larger dataset of embeddings to avoid fitting on single inputs.
        """
        if self.pca:
            self.pca.fit(dataset_embeddings)

    def get_embeddings(self, val, kind="DistilBERT", pooling_strategy='mean'):
        """
        Get embeddings for the input text. Supports optional pooling strategies.
        """
        self.__set_model(kind)
        inputs = self.tokenizer(val, return_tensors='pt')

        with torch.no_grad():
            outputs = self.model(**inputs)

        hidden_states = outputs.last_hidden_state
        pooled_output = self.__apply_pooling(hidden_states, strategy=pooling_strategy)

        # Convert to numpy
        embedding = pooled_output.squeeze(0).numpy()

        # Apply PCA if enabled
        if self.use_pca:
            embedding = self.__get_pca(embedding)

        return embedding

In [78]:
embed = EmbedColumns()

In [79]:
out = embed.get_embeddings("voltage","DistilBERT")



[ 6.11527979e-01 -2.28224769e-01 -4.01042849e-02 -2.47147188e-01
  4.35910195e-01 -1.66923553e-02 -7.56520554e-02  7.90213719e-02
 -2.58715481e-01 -3.08596820e-01 -6.49185032e-02 -1.68260247e-01
  5.50051779e-02  5.48649311e-01 -2.42879018e-01 -1.05485179e-01
  3.05630803e-01  9.46164653e-02  8.54177415e-01  1.71992838e-01
  7.55001605e-03  1.70443505e-01  4.06949878e-01  2.58254349e-01
 -1.13708079e-01 -1.55428592e-02 -9.15202722e-02  1.37162656e-01
 -1.69410974e-01 -6.69397786e-02  3.10414165e-01 -3.01319897e-01
  2.40785718e-01  3.84524465e-01 -3.98116350e-01 -4.17677760e-01
 -8.69933665e-02  1.55031562e-01 -1.93881586e-01 -1.76166773e-01
  2.04109088e-01 -6.25809550e-01 -3.99705544e-02 -2.62136191e-01
  4.28313136e-01 -2.23178893e-01  5.17215312e-01 -1.09079719e-01
 -1.40012279e-02 -1.33956671e-01 -6.63979232e-01 -9.14008170e-03
 -3.65600467e-01  4.92871344e-01  2.19606534e-01 -1.17567301e-01
 -1.12364128e-01  2.11490214e-01  3.42640132e-01 -1.90454960e-01
  2.82018125e-01 -1.46814

ValueError: n_components=200 must be between 0 and min(n_samples, n_features)=1 with svd_solver='full'

In [72]:
out[:,1,:]

tensor([[ 6.1153e-01, -2.2822e-01, -4.0104e-02, -2.4715e-01,  4.3591e-01,
         -1.6692e-02, -7.5652e-02,  7.9021e-02, -2.5872e-01, -3.0860e-01,
         -6.4919e-02, -1.6826e-01,  5.5005e-02,  5.4865e-01, -2.4288e-01,
         -1.0549e-01,  3.0563e-01,  9.4616e-02,  8.5418e-01,  1.7199e-01,
          7.5500e-03,  1.7044e-01,  4.0695e-01,  2.5825e-01, -1.1371e-01,
         -1.5543e-02, -9.1520e-02,  1.3716e-01, -1.6941e-01, -6.6940e-02,
          3.1041e-01, -3.0132e-01,  2.4079e-01,  3.8452e-01, -3.9812e-01,
         -4.1768e-01, -8.6993e-02,  1.5503e-01, -1.9388e-01, -1.7617e-01,
          2.0411e-01, -6.2581e-01, -3.9971e-02, -2.6214e-01,  4.2831e-01,
         -2.2318e-01,  5.1722e-01, -1.0908e-01, -1.4001e-02, -1.3396e-01,
         -6.6398e-01, -9.1401e-03, -3.6560e-01,  4.9287e-01,  2.1961e-01,
         -1.1757e-01, -1.1236e-01,  2.1149e-01,  3.4264e-01, -1.9045e-01,
          2.8202e-01, -1.4681e-01, -5.5124e-02,  1.8556e-01,  2.2007e-01,
          1.9641e-01,  2.9406e-01, -1.