### Basic library imports

In [3]:
import os
import pandas as pd

### Read Dataset

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
DATASET_FOLDER = '/content/sample_data/Images'
train = pd.read_csv(os.path.join(DATASET_FOLDER, '/content/train.csv'))
test = pd.read_csv(os.path.join(DATASET_FOLDER, '/content/test.csv'))

### Run Sanity check using src/sanity.py

In [6]:
# !python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out.csv

In [7]:
# !python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out_fail.csv

### Download images

In [8]:
%pip install constants
import re
import constants
import os
import requests
import pandas as pd
import multiprocessing
import time
from time import time as timer
from tqdm import tqdm
import numpy as np
from pathlib import Path
from functools import partial
import requests
import urllib
from PIL import Image

def common_mistake(unit):
    if unit in constants.allowed_units:
        return unit
    if unit.replace('ter', 'tre') in constants.allowed_units:
        return unit.replace('ter', 'tre')
    if unit.replace('feet', 'foot') in constants.allowed_units:
        return unit.replace('feet', 'foot')
    return unit

def parse_string(s):
    s_stripped = "" if s==None or str(s)=='nan' else s.strip()
    if s_stripped == "":
        return None, None
    pattern = re.compile(r'^-?\d+(\.\d+)?\s+[a-zA-Z\s]+$')
    if not pattern.match(s_stripped):
        raise ValueError("Invalid format in {}".format(s))
    parts = s_stripped.split(maxsplit=1)
    number = float(parts[0])
    unit = common_mistake(parts[1])
    if unit not in constants.allowed_units:
        raise ValueError("Invalid unit [{}] found in {}. Allowed units: {}".format(
            unit, s, constants.allowed_units))
    return number, unit


def create_placeholder_image(image_save_path):
    try:
        placeholder_image = Image.new('RGB', (100, 100), color='black')
        placeholder_image.save(image_save_path)
    except Exception as e:
        return

def download_image(image_link, save_folder, retries=3, delay=3):
    if not isinstance(image_link, str):
        return

    filename = Path(image_link).name
    image_save_path = os.path.join(save_folder, filename)

    if os.path.exists(image_save_path):
        return

    for _ in range(retries):
        try:
            urllib.request.urlretrieve(image_link, image_save_path)
            return
        except:
            time.sleep(delay)

    create_placeholder_image(image_save_path) #Create a black placeholder image for invalid links/images

def download_images(image_links, download_folder, allow_multiprocessing=True):
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    if allow_multiprocessing:
        download_image_partial = partial(
            download_image, save_folder=download_folder, retries=3, delay=3)

        with multiprocessing.Pool(64) as pool:
            list(tqdm(pool.imap(download_image_partial, image_links), total=len(image_links)))
            pool.close()
            pool.join()
    else:
        for image_link in tqdm(image_links, total=len(image_links)):
            download_image(image_link, save_folder=download_folder, retries=3, delay=3)

download_images(train['image_link'], '/content/sample_data/Images')
#Image path : /content/sample_data/Images



100%|██████████| 263859/263859 [31:17<00:00, 140.57it/s]


In [9]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input
import numpy as np
import pandas as pd
import os
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import re

# Load pre-trained ResNet50 model without the top layers
base_model = ResNet50(weights='imagenet', include_top=False)

# Add custom layers for entity value extraction
image_input = base_model.input
x = base_model.output
x = GlobalAveragePooling2D()(x)

# Add numeric value branch (output is a single value)
numeric_output = Dense(1024, activation='relu')(x)
numeric_output = Dense(1, activation='linear', name='numeric_output')(numeric_output)

# Add unit output branch (output is a categorical class)
unit_output = Dense(1024, activation='relu')(x)
unit_output = Dense(50, activation='softmax', name='unit_output')(unit_output)  # 50 unit categories (example size)

# Create the final model
model = Model(inputs=image_input, outputs=[numeric_output, unit_output])

# Freeze the ResNet base model layers
for layer in base_model.layers:
    layer.trainable = False

# Compile the model with two losses (MSE for numeric and categorical cross-entropy for units)
model.compile(optimizer='adam', loss={'numeric_output': 'mean_squared_error', 'unit_output': 'categorical_crossentropy'})

# Load your CSV data
data = pd.read_csv('train.csv')

# Lists to store training data
x_train = []
y_train_numeric = []
y_train_units = []

# Dictionary to map various unit names to a standardized unit
unit_mapping = {
    'cm': 'centimeter', 'centimetres': 'centimeter', 'centimeters': 'centimeter', 'centimetre': 'centimeter',
    'in': 'inch', 'inch': 'inch', 'inches': 'inch',
    'mm': 'millimeter', 'millimetres': 'millimeter', 'millimeters': 'millimeter',
    'm': 'meter', 'metres': 'meter', 'meters': 'meter', 'metre': 'meter',
    'ft': 'foot', 'foot': 'foot', 'feet': 'foot',
    'yd': 'yard', 'yards': 'yard', 'yard': 'yard',
    'g': 'gram', 'grams': 'gram', 'gram': 'gram',
    'kg': 'kilogram', 'kgs': 'kilogram', 'kilograms': 'kilogram', 'kilogram': 'kilogram',
    'mg': 'milligram', 'milligrams': 'milligram', 'milligram': 'milligram',
    'lb': 'pound', 'lbs': 'pound', 'pounds': 'pound', 'pound': 'pound',
    'oz': 'ounce', 'ounces': 'ounce', 'ounce': 'ounce',
    't': 'ton', 'tons': 'ton', 'tonnes': 'ton', 'tonne': 'ton',
    'ml': 'millilitre', 'milliliters': 'millilitre', 'millilitres': 'millilitre',
    'l': 'litre', 'liters': 'litre', 'litres': 'litre',
    'cl': 'centilitre', 'centiliters': 'centilitre', 'centilitres': 'centilitre',
    'dl': 'decilitre', 'deciliters': 'decilitre', 'decilitres': 'decilitre',
    'kv': 'kilovolt', 'kilovolts': 'kilovolt',
    'mv': 'millivolt', 'millivolts': 'millivolt',
    'w': 'watt', 'watts': 'watt',
    'kw': 'kilowatt', 'kilowatts': 'kilowatt'
}

# Function to map units to standardized form
def map_unit(unit):
    return unit_mapping.get(unit.lower(), 'unknown')

# Function to preprocess the image
def preprocess_image(img_path):
    img = image.load_img(img_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)
    return img_array

# Function to parse numeric values and units from the 'entity_value' column
def parse_value_and_unit(value):
    try:
        # Handle ranges and lists
        if '[' in value and ']' in value:
            value = re.sub(r'\[|\]', '', value)  # Remove brackets
            values = [float(v.strip()) for v in value.split(',')]
            numeric_value = np.mean(values)  # Use mean for ranges
        else:
            value_parts = value.split()
            numeric_value = float(value_parts[0])

        unit = map_unit(value_parts[1]) if len(value_parts) > 1 else 'unknown'
        return numeric_value, unit
    except (IndexError, ValueError):
        return None, None

# Set the correct path to your images
image_dir = r"/content/sample_data/Images"
print(f"Looking for images in: {image_dir}")

# Limit the number of images to process
max_images = 1000
processed_images = 0

# Iterate over rows of the dataset
for i, row in data.iterrows():
    if processed_images >= max_images:
        break

    img_filename = f"{row['image_link'].split('/')[-1]}"
    img_path = os.path.join(image_dir, img_filename)

    if os.path.exists(img_path):
        try:
            # Preprocess and append the image data
            img_array = preprocess_image(img_path)
            x_train.append(img_array)

            # Parse numeric value and unit
            value_str = str(row['entity_value'])
            numeric_value, unit = parse_value_and_unit(value_str)

            if numeric_value is not None and unit != 'unknown':
                y_train_numeric.append(numeric_value)
                y_train_units.append(unit)
            else:
                print(f"Skipping row {i}: Unable to process '{value_str}'")
                continue

            processed_images += 1

            if processed_images % 100 == 0:
                print(f"Processed {processed_images} images")

        except Exception as e:
            print(f"Error processing image {img_path}: {str(e)}")
    else:
        print(f"Image not found: {img_path}")

# Convert lists to arrays for training
if x_train:
    X_train = np.vstack(x_train)
    y_train_numeric = np.array(y_train_numeric)

    # Encode units as integers and then convert to one-hot
    label_encoder = LabelEncoder()
    y_train_units_encoded = label_encoder.fit_transform(y_train_units)
    y_train_units_categorical = to_categorical(y_train_units_encoded)

    print(f"Training on {len(X_train)} images")

    # Check sizes before training
    if len(X_train) == len(y_train_numeric) == len(y_train_units_categorical):
        # Train the model with both numeric values and units
        model.fit(X_train, {'numeric_output': y_train_numeric, 'unit_output': y_train_units_categorical},
                  epochs=10, batch_size=32, validation_split=0.2)
    else:
        print("Data sizes do not match. Check your data processing pipeline.")

else:
    print("No images found or loaded. Please check the paths and files.")

# Save the model
model.save('entity_value_unit_model.h5')
print("Model saved as 'entity_value_unit_model.h5'")


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
Looking for images in: /content/sample_data/Images
Skipping row 1: Unable to process '1.0 cup'
Skipping row 21: Unable to process '4.0 gallon'
Skipping row 25: Unable to process '48.0 volt'
Skipping row 36: Unable to process '800.0 watt'
Skipping row 37: Unable to process '36.0 volt'
Skipping row 42: Unable to process '150.0 watt'
Skipping row 43: Unable to process '150.0 watt'
Skipping row 44: Unable to process '30.0 watt'
Skipping row 45: Unable to process '30.0 watt'
Skipping row 48: Unable to process '250.0 watt'
Skipping row 62: Unable to process '60.0 watt'
Skipping row 63: Unable to process '30.0 millilitre'
Skipping row 64: Unable to process '30.0 millilitre'
Skipping row 65: Unable to process '30.0 millilitre'
Skipping row 66: Unable to process '30.0 millilitre'
Skipping row 67: Unable to process '30.0 millilitre'
Skipping row 81: Unabl

  saving_api.save_model(


Model saved as 'entity_value_unit_model.h5'


In [10]:
%pip install constants
import re
import constants
import os
import requests
import pandas as pd
import multiprocessing
import time
from time import time as timer
from tqdm import tqdm
import numpy as np
from pathlib import Path
from functools import partial
import requests
import urllib
from PIL import Image

def common_mistake(unit):
    if unit in constants.allowed_units:
        return unit
    if unit.replace('ter', 'tre') in constants.allowed_units:
        return unit.replace('ter', 'tre')
    if unit.replace('feet', 'foot') in constants.allowed_units:
        return unit.replace('feet', 'foot')
    return unit

def parse_string(s):
    s_stripped = "" if s==None or str(s)=='nan' else s.strip()
    if s_stripped == "":
        return None, None
    pattern = re.compile(r'^-?\d+(\.\d+)?\s+[a-zA-Z\s]+$')
    if not pattern.match(s_stripped):
        raise ValueError("Invalid format in {}".format(s))
    parts = s_stripped.split(maxsplit=1)
    number = float(parts[0])
    unit = common_mistake(parts[1])
    if unit not in constants.allowed_units:
        raise ValueError("Invalid unit [{}] found in {}. Allowed units: {}".format(
            unit, s, constants.allowed_units))
    return number, unit


def create_placeholder_image(image_save_path):
    try:
        placeholder_image = Image.new('RGB', (100, 100), color='black')
        placeholder_image.save(image_save_path)
    except Exception as e:
        return

def download_image(image_link, save_folder, retries=3, delay=3):
    if not isinstance(image_link, str):
        return

    filename = Path(image_link).name
    image_save_path = os.path.join(save_folder, filename)

    if os.path.exists(image_save_path):
        return

    for _ in range(retries):
        try:
            urllib.request.urlretrieve(image_link, image_save_path)
            return
        except:
            time.sleep(delay)

    create_placeholder_image(image_save_path) #Create a black placeholder image for invalid links/images

def download_images(image_links, download_folder, allow_multiprocessing=True):
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    if allow_multiprocessing:
        download_image_partial = partial(
            download_image, save_folder=download_folder, retries=3, delay=3)

        with multiprocessing.Pool(64) as pool:
            list(tqdm(pool.imap(download_image_partial, image_links), total=len(image_links)))
            pool.close()
            pool.join()
    else:
        for image_link in tqdm(image_links, total=len(image_links)):
            download_image(image_link, save_folder=download_folder, retries=3, delay=3)

download_images(test['image_link'], '/content/sample_data/Test_Images')
#Image path : /content/sample_data/Images



  self.pid = os.fork()
100%|██████████| 131187/131187 [18:01<00:00, 121.26it/s]
  self.pid = os.fork()


In [None]:
import numpy as np
import pandas as pd
import os
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input
from sklearn.preprocessing import LabelEncoder

# Load the saved model
model = tf.keras.models.load_model('entity_value_unit_model.h5')

# Load your test data
test_data = pd.read_csv('/content/test.csv')

# Set the correct path to your images
test_image_dir = r"/content/sample_data/Test_Images"
print(f"Looking for test images in: {test_image_dir}")

# Ensure the prediction directory exists
prediction_dir = '/content/sample_data/Prediction'
if not os.path.exists(prediction_dir):
    os.makedirs(prediction_dir)

# Function to preprocess the image
def preprocess_image(img_path):
    img = image.load_img(img_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)
    return img_array

# Load the label encoder used during training to map unit predictions back to unit names
label_encoder = LabelEncoder()
unit_labels = list(unit_mapping.values())  # List of all possible unit labels
label_encoder.fit(unit_labels)

# Lists to store predictions
predictions = []

# Iterate over the test dataset
for i, row in test_data.iterrows():
    img_filename = f"{row['image_link'].split('/')[-1]}"
    img_path = os.path.join(test_image_dir, img_filename)

    if os.path.exists(img_path):
        try:
            print(f"Processing image {i+1}/{len(test_data)}: {img_path}")
            # Preprocess the test image
            img_array = preprocess_image(img_path)

            # Predict both numeric value and unit
            numeric_pred, unit_pred = model.predict(img_array)

            # Get the predicted numeric value
            predicted_value = numeric_pred[0][0]

            # Get the predicted unit (from softmax probabilities)
            predicted_unit_index = np.argmax(unit_pred, axis=1)[0]
            predicted_unit = label_encoder.inverse_transform([predicted_unit_index])[0]

            # If both value and unit are valid, format the prediction
            if predicted_value is not None and predicted_unit is not None:
                prediction = f"{predicted_value:.2f} {predicted_unit}s" if predicted_value > 1 else f"{predicted_value:.2f} {predicted_unit}"
            else:
                # If either value or unit prediction fails, append an empty string
                prediction = ""

            predictions.append(prediction)

        except Exception as e:
            print(f"Error processing image {img_path}: {str(e)}")
            predictions.append("")  # Append an empty string if prediction fails
    else:
        print(f"Image not found: {img_path}")
        predictions.append("")  # Append an empty string if image is not found

# Add the predictions to the test dataset and only keep the 'index' and 'entity_value' columns
test_data['entity_value'] = predictions
output_data = test_data[['index', 'entity_value']]

# Save the results to a new CSV file with only 'index' and 'entity_value' columns
output_path = os.path.join(prediction_dir, '/content/sample_data/Prediction/test_predictions.csv')
output_data.to_csv(output_path, index=False)

print(f"Predictions saved to {output_path}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Error processing image /content/sample_data/Test_Images/518LBxrFS8L.jpg: y contains previously unseen labels: [27]
Processing image 28325/131187: /content/sample_data/Test_Images/518LOZXorML.jpg
Processing image 28326/131187: /content/sample_data/Test_Images/518LOZXorML.jpg
Processing image 28327/131187: /content/sample_data/Test_Images/518LYuosi3L.jpg
Processing image 28328/131187: /content/sample_data/Test_Images/518Lbr56u9L.jpg
Processing image 28329/131187: /content/sample_data/Test_Images/518Lbr56u9L.jpg
Processing image 28330/131187: /content/sample_data/Test_Images/518LggywMGL.jpg
Processing image 28331/131187: /content/sample_data/Test_Images/518LglB17ZL.jpg
Processing image 28332/131187: /content/sample_data/Test_Images/518LipDbc7L.jpg
Processing image 28333/131187: /content/sample_data/Test_Images/518LofR6h2L.jpg
Processing image 28334/131187: /content/sample_data/Test_Images/518LxlLt1fL.jpg
Processing image 283

In [None]:
res = pd.read_csv('/content/sample_data/Prediction/test_predictions.csv')
res.head(100)