### Importing packages

In [1]:
import os
import pandas as pd
import numpy as np
from PIL import Image

## Reading CSV files

**Dataset Files:**

1. dataset/train.csv: Training file with labels (`entity_value`).
2. dataset/test.csv: Test file without output labels (`entity_value`). Generate predictions using your model/solution on this file's data and format the output file to match `sample_test_out.csv` (Refer to the "Output Format" section above).
3. dataset/sample_test.csv: Sample test input file.
4. dataset/sample_test_out.csv: Sample outputs for `sample_test.csv`. The output for `test.csv` must be formatted in the exact same way.Note: The predictions in the file might not be correct.

In [2]:
DATASET_FOLDER = '../dataset'
train = pd.read_csv(os.path.join(DATASET_FOLDER, 'train.csv'))
test = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))
sample_test = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test.csv'))
sample_test_out = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test_out.csv'))

## Data Read


In [3]:
train.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram


In [4]:
test.head()

Unnamed: 0,index,image_link,group_id,entity_name
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth


In [5]:
train['image_link'] = train['image_link'].apply(lambda x: x.split('/')[-1])
test['image_link'] = test['image_link'].apply(lambda x: x.split('/')[-1])
train['entity_name'] = train['entity_name'].apply(lambda x: x.split('_')[-1])

In [6]:
train.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,61I9XdN6OFL.jpg,748919,weight,500.0 gram
1,71gSRbyXmoL.jpg,916768,volume,1.0 cup
2,61BZ4zrjZXL.jpg,459516,weight,0.709 gram
3,612mrlqiI4L.jpg,459516,weight,0.709 gram
4,617Tl40LOXL.jpg,731432,weight,1400 milligram


In [7]:
test.head()

Unnamed: 0,index,image_link,group_id,entity_name
0,0,110EibNyclL.jpg,156839,height
1,1,11TU2clswzL.jpg,792578,width
2,2,11TU2clswzL.jpg,792578,height
3,3,11TU2clswzL.jpg,792578,depth
4,4,11gHj8dhhrL.jpg,792578,depth


In [8]:
train['entity_name'].value_counts()

entity_name
weight            102786
depth              45127
width              44183
height             43597
voltage             9466
wattage             7755
volume              7682
recommendation      3263
Name: count, dtype: int64

In [9]:
# spliting the target variable to numerical and units
train['value_num'] = train['entity_value'].str.split().apply(lambda x: x[0].strip())
train['value_unit'] = train['entity_value'].str.split().apply(lambda x: x[1].strip())

In [10]:
train['value_unit'].value_counts()

value_unit
centimetre     65667
gram           63630
inch           45481
millimetre     18895
kilogram       13220
               ...  
325.0]             1
milliampere        1
41.0]              1
31.0]              1
930.0]             1
Name: count, Length: 290, dtype: int64

In [11]:
train['value_unit'].unique()

array(['gram', 'cup', 'milligram', 'kilogram', 'ounce', 'gallon', 'volt',
       'watt', 'pound', 'millilitre', 'cubic', '240.0]', '12.0]', 'fluid',
       '265.0]', 'ton', 'decilitre', '30.0]', 'litre', 'microgram',
       '100.0]', 'centimetre', '21.0]', '16.0]', 'quart', '17.0]',
       '250.0]', '2.0]', 'horsepower', '25.0]', '150.0]', 'kilowatt',
       '11.0]', '120.0]', '5.0]', '2015.0]', '10.0]', '1.5]', '40.0]',
       '15.0]', '15.88]', 'gigabyte', '55.0]', 'millimetre', '127.0]',
       '50.0]', '20.0]', '4928.0]', '1.6]', 'pint', '13.0]', '130.0]',
       'centilitre', '4.0]', '3.5]', '277.0]', '200.0]', '60.0]', '3.1]',
       '35.0]', 'candela', '3.2]', '31.0]', '24.8]', '2000.0]', '1000.0]',
       '3.0]', '80.0]', '7.5]', '415.0]', '65.0]', '8.4]', 'inch',
       'person', '300.0]', '8.0]', '19.8]', '48.0]', '2001.0]', '220.0]',
       '38.0]', '490.0]', '18.0]', '9.0]', 'metre', '75.0]', '2006.0]',
       '14.0]', '36.0]', '14.4]', '27.0]', '23.0]', '260.0]', '66.0]',


In [12]:
train

Unnamed: 0,image_link,group_id,entity_name,entity_value,value_num,value_unit
0,61I9XdN6OFL.jpg,748919,weight,500.0 gram,500.0,gram
1,71gSRbyXmoL.jpg,916768,volume,1.0 cup,1.0,cup
2,61BZ4zrjZXL.jpg,459516,weight,0.709 gram,0.709,gram
3,612mrlqiI4L.jpg,459516,weight,0.709 gram,0.709,gram
4,617Tl40LOXL.jpg,731432,weight,1400 milligram,1400,milligram
...,...,...,...,...,...,...
263854,612J1R1xHlL.jpg,558806,height,5.0 centimetre,5.0,centimetre
263855,61Blzh2+28L.jpg,470067,height,8.5 inch,8.5,inch
263856,51MsegDL9VL.jpg,204245,height,43.2 centimetre,43.2,centimetre
263857,510KhVw4VSL.jpg,752266,height,9.1 centimetre,9.1,centimetre


In [13]:
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram',
        'kilogram',
        'microgram',
        'milligram',
        'ounce',
        'pound',
        'ton'},
    'maximum_weight_recommendation': {'gram',
        'kilogram',
        'microgram',
        'milligram',
        'ounce',
        'pound',
        'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre',
        'cubic foot',
        'cubic inch',
        'cup',
        'decilitre',
        'fluid ounce',
        'gallon',
        'imperial gallon',
        'litre',
        'microlitre',
        'millilitre',
        'pint',
        'quart'}
}
allowed_units = {unit for entity in entity_unit_map for unit in entity_unit_map[entity]}

In [14]:
unit_map = {
    'length': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}
data = {'image_link': train["image_link"],
    'group_id': train["group_id"],
    'entity_name':train["entity_name"],
    'entity_value':train["entity_value"],
    'value_unit': train["value_unit"]}
train = pd.DataFrame(data)
# Creating a set of all valid units that is provided by amazon ml team
valid_units = set()
for units in unit_map.values():
    valid_units.update(units)

# Filtering only the 'value_unit' column and keeping other columns intact
train['value_unit'] = train['value_unit'].apply(lambda x: x if x in valid_units else None)
train = train.dropna(subset=['value_unit'])

print("Filtered DataFrame:")
print(train)

Filtered DataFrame:
             image_link  group_id entity_name     entity_value  value_unit
0       61I9XdN6OFL.jpg    748919      weight       500.0 gram        gram
1       71gSRbyXmoL.jpg    916768      volume          1.0 cup         cup
2       61BZ4zrjZXL.jpg    459516      weight       0.709 gram        gram
3       612mrlqiI4L.jpg    459516      weight       0.709 gram        gram
4       617Tl40LOXL.jpg    731432      weight   1400 milligram   milligram
...                 ...       ...         ...              ...         ...
263854  612J1R1xHlL.jpg    558806      height   5.0 centimetre  centimetre
263855  61Blzh2+28L.jpg    470067      height         8.5 inch        inch
263856  51MsegDL9VL.jpg    204245      height  43.2 centimetre  centimetre
263857  510KhVw4VSL.jpg    752266      height   9.1 centimetre  centimetre
263858  51lzTNLQ-6S.jpg    416664      height  27.5 centimetre  centimetre

[242813 rows x 5 columns]


In [15]:
train['value_unit'].value_counts()

value_unit
centimetre    65667
gram          63630
inch          45481
millimetre    18895
kilogram      13220
pound          9148
ounce          8737
milligram      8016
millilitre     3554
metre          1530
ton            1349
foot           1341
litre           895
microgram       748
gallon          175
pint            107
quart           104
decilitre        97
cup              87
centilitre       32
Name: count, dtype: int64

In [16]:
train['value_unit'].unique()

array(['gram', 'cup', 'milligram', 'kilogram', 'ounce', 'gallon', 'pound',
       'millilitre', 'ton', 'decilitre', 'litre', 'microgram',
       'centimetre', 'quart', 'millimetre', 'pint', 'centilitre', 'inch',
       'metre', 'foot'], dtype=object)

In [17]:
#FILTERED CSV TRAIN DATA
train.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value,value_unit
0,61I9XdN6OFL.jpg,748919,weight,500.0 gram,gram
1,71gSRbyXmoL.jpg,916768,volume,1.0 cup,cup
2,61BZ4zrjZXL.jpg,459516,weight,0.709 gram,gram
3,612mrlqiI4L.jpg,459516,weight,0.709 gram,gram
4,617Tl40LOXL.jpg,731432,weight,1400 milligram,milligram


In [18]:
train.to_csv("../dataset/filter_train.csv",index=None)