### Importing packages

In [2]:
import os
import pandas as pd
import numpy as np
from PIL import Image

## Reading CSV files

**Dataset Files:**

1. dataset/train.csv: Training file with labels (`entity_value`).
2. dataset/test.csv: Test file without output labels (`entity_value`). Generate predictions using your model/solution on this file's data and format the output file to match `sample_test_out.csv` (Refer to the "Output Format" section above).
3. dataset/sample_test.csv: Sample test input file.
4. dataset/sample_test_out.csv: Sample outputs for `sample_test.csv`. The output for `test.csv` must be formatted in the exact same way.Note: The predictions in the file might not be correct.

In [3]:
DATASET_FOLDER = '../dataset'
train = pd.read_csv(os.path.join(DATASET_FOLDER, 'train.csv'))
test = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))
sample_test = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test.csv'))
sample_test_out = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test_out.csv'))

## Data Read


In [4]:
train.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram


In [5]:
test.head()

Unnamed: 0,index,image_link,group_id,entity_name
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth


In [6]:
train['image_link'] = train['image_link'].apply(lambda x: x.split('/')[-1])
test['image_link'] = test['image_link'].apply(lambda x: x.split('/')[-1])
train['entity_name'] = train['entity_name'].apply(lambda x: x.split('_')[-1])

In [7]:
train.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,61I9XdN6OFL.jpg,748919,weight,500.0 gram
1,71gSRbyXmoL.jpg,916768,volume,1.0 cup
2,61BZ4zrjZXL.jpg,459516,weight,0.709 gram
3,612mrlqiI4L.jpg,459516,weight,0.709 gram
4,617Tl40LOXL.jpg,731432,weight,1400 milligram


In [8]:
train['entity_name'].value_counts()

entity_name
weight            102786
depth              45127
width              44183
height             43597
voltage             9466
wattage             7755
volume              7682
recommendation      3263
Name: count, dtype: int64

In [9]:
# spliting the target variable to numerical and units
train['value_num'] = train['entity_value'].str.split().apply(lambda x: x[0].strip())
train['value_unit'] = train['entity_value'].str.split().apply(lambda x: x[1].strip())

In [11]:
train.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value,value_num,value_unit
0,61I9XdN6OFL.jpg,748919,weight,500.0 gram,500.0,gram
1,71gSRbyXmoL.jpg,916768,volume,1.0 cup,1.0,cup
2,61BZ4zrjZXL.jpg,459516,weight,0.709 gram,0.709,gram
3,612mrlqiI4L.jpg,459516,weight,0.709 gram,0.709,gram
4,617Tl40LOXL.jpg,731432,weight,1400 milligram,1400.0,milligram


In [10]:
units = train['value_unit'].value_counts().reset_index()
units

Unnamed: 0,value_unit,count
0,centimetre,65667
1,gram,63630
2,inch,45481
3,millimetre,18895
4,kilogram,13220
...,...,...
285,325.0],1
286,milliampere,1
287,41.0],1
288,31.0],1


In [16]:
units = units[units['value_unit'].str[-1]!=']']
units

Unnamed: 0,value_unit,count
0,centimetre,65667
1,gram,63630
2,inch,45481
3,millimetre,18895
4,kilogram,13220
5,pound,9148
6,ounce,8737
7,milligram,8016
8,volt,7691
9,watt,7340


In [20]:
from constants import allowed_units
units[units['value_unit'].apply(lambda x : x in allowed_units)]

Unnamed: 0,value_unit,count
0,centimetre,65667
1,gram,63630
2,inch,45481
3,millimetre,18895
4,kilogram,13220
5,pound,9148
6,ounce,8737
7,milligram,8016
8,volt,7691
9,watt,7340


In [None]:
# now use this units to filter the main train data 