### Basic library imports

In [3]:
import os
import pandas as pd
import re

### Read Dataset

In [2]:
DATASET_FOLDER = '../dataset/'
train = pd.read_csv(os.path.join(DATASET_FOLDER, 'train.csv'))
test = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))
sample_test = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test.csv'))
sample_test_out = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test_out.csv'))

### Run Sanity check using src/sanity.py

In [3]:
!python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out.csv

Parsing successfull for file: ../dataset/sample_test_out.csv


In [4]:
!python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out_fail.csv

Error: Invalid unit [lbs] found in 6.75 lbs. Allowed units: {'milligram', 'gallon', 'microgram', 'volt', 'inch', 'fluid ounce', 'kilovolt', 'gram', 'kilogram', 'metre', 'cubic inch', 'watt', 'foot', 'centimetre', 'pound', 'pint', 'yard', 'centilitre', 'litre', 'decilitre', 'millilitre', 'microlitre', 'ounce', 'imperial gallon', 'cup', 'quart', 'ton', 'millivolt', 'millimetre', 'cubic foot', 'kilowatt'}


### Download images

In [None]:
from utils import download_images
download_images(train['image_link'], '../images')

In [12]:
assert len(os.listdir('../images')) > 0

**PreProcess**

In [4]:
df_train = pd.read_csv("C://Users//ASUS//Desktop//student_resource 3//dataset//train.csv")

In [49]:
len(df_test['group_id'].unique())

924

In [5]:
df_test = pd.read_csv("C://Users//ASUS//Desktop//student_resource 3//dataset//test.csv")

In [54]:
len(set(train['group_id'].unique().tolist()).difference(set(df_test['group_id'].unique().tolist())))

1

In [4]:
def cross_check_test_groups(df,filtered_groups, column):
    filtered_grp = list(df[column].unique())
    filtered_grp, filtered_groups = set(filtered_grp), set(filtered_groups)
    filtered_groups_difference = filtered_groups.difference(filtered_grp)
    return filtered_groups_difference, len(filtered_groups_difference)

In [5]:
def filter_by_grp(df,df_test,column):
    grp = df.groupby(column)
    filtered_grp = [key for key, group in grp if len(group) >= 100]
    diff, len_of_diff = cross_check_test_groups(df_test,filtered_grp,'group_id')
    if len_of_diff > 0:
        filtered_grp.extend(list(diff))
    filtered_df = df[df[column].isin(filtered_grp)]
    return filtered_df, filtered_grp

In [6]:
def filter_data_without_images(df : pd.DataFrame):
    to_be_included = []
    for i in range(len(df)):
        img_path = df.loc[i]['image_link'].split("/")[-1]
        img_path = os.path.join("C://Users//ASUS//Desktop//student_resource 3//images",img_path)
        if not os.path.exists(img_path):
            continue
        to_be_included.append(df.loc[i]['image_link'])
    out = df[df['image_link'].isin(to_be_included)]
    return out

In [7]:
data = filter_data_without_images(df_train)

In [8]:
data, filtered_groups = filter_by_grp(data,df_test,'group_id')

In [9]:
diff, len_of_diff = cross_check_test_groups(df_test,filtered_groups,'group_id')

In [11]:
len_of_diff

1

In [13]:
data.to_csv("train_data.csv")

In [31]:
unit_set = {
    'cm', 'ft', 'in', 'm', 'mm', 'yd',
    'g', 'gm', 'gms', 'kg', 'µg', 'mcg', 'mg', 'oz', 'ounce', 'lb', 'lbs', 'ton', 'tons',
    'kv', 'mv', 'v', 'volt', 
    'kw', 'w', 'watt',
    'cl', 'centilitre', 'cu ft', 'cubic foot', 'cu in', 'cubic inch', 'cup', 'dl', 'decilitre', 
    'fl oz', 'fluid ounce', 'gal', 'gallon', 'imp gal', 'imperial gallon', 'l', 'litre', 'µl', 'microlitre', 
    'ml', 'millilitre', 'pt', 'pint', 'qt', 'quart'
}


In [37]:
def clean_extracted_value(match):
    number = re.sub(r'[^\d.]', '', match.group(1))
    if number.count('.') > 1:
        parts = number.split('.')
        number = f"{parts[0]}.{parts[1]}"
    

    unit = match.group(2).strip()
    return f"{number} {unit}"

pattern = re.compile(r'([0-9a-zA-Z.]+)\s*([a-zA-Z]+)')


In [39]:
unit_pattern = r'\b(?:' + '|'.join(re.escape(unit) for unit in unit_set) + r')\b'

# Compile the regex pattern
pattern = re.compile(r'(\d+\.?\d*)\s*(' + unit_pattern + r')')

def extract_units(text):
    matches = pattern.findall(text)
    results = [f"{match[0]} {match[1]}" for match in matches]
    return results

# Example usage
text = "weight = 17lbs, 14.54 kv, 54.564   mm, 23k.3s3 mm"
extracted_units = extract_units(text)
print(extracted_units)

['14.54 kv', '54.564 mm', '3 mm']


In [38]:

text = "weight = 17lbs, 14.54 kv, 54.564   mm, 23k.3s3 mm"

# Apply the regex and clean each match
cleaned_results = [clean_extracted_value(match) for match in pattern.finditer(text)]

print(cleaned_results)

[' t', '17 s', '14.54 kv', '54.564 mm', '23.33 mm']


In [30]:
res = [i for i in pattern.finditer(text)]

In [35]:
number = re.sub(r'[^\d.]', '', res[0].group(1))

In [36]:
["17 lbs", "14.54 kv", "54.564 mm", "23.33 mm"]

'17'

**transform**

In [1]:
from sklearn.preprocessing import OneHotEncoder
import torch
from transformers import (
    DistilBertTokenizer, DistilBertModel, 
    RobertaTokenizer, RobertaModel, 
    AutoTokenizer, AutoModel,
    TrOCRProcessor, VisionEncoderDecoderModel, DonutProcessor
)
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from PIL import Image
import requests
from PIL import Image, ImageOps
from io import BytesIO
from paddleocr import PaddleOCR
import easyocr
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train = pd.read_csv("C://Users//ASUS//Desktop//student_resource 3//dataset//train.csv")

In [44]:
len(train['group_id'].unique())

384

In [57]:
len(set(df_test['group_id'].unique().tolist()).difference(set(df_train['group_id'].unique().tolist())))

286

In [None]:
set(df_test['group_id'].unique().tolist())

In [60]:
encoder = OneHotEncoder()
data = encoder.fit_transform(df_train[['group_id']])

In [62]:
data.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [67]:
df_train['entity_name'].unique()

array(['item_weight', 'item_volume', 'voltage', 'wattage',
       'maximum_weight_recommendation', 'height', 'depth', 'width'],
      dtype=object)

In [10]:
class EmbedColumns:

    def __init__(self, use_pca=False, pca_components=200):
        self.tokenizer = None
        self.model = None
        self.pca = PCA(n_components=pca_components) if use_pca else None
        self.use_pca = use_pca

    def __set_model(self, kind):
        if kind == "DistilBERT":
            self.tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
            self.model = DistilBertModel.from_pretrained("distilbert-base-uncased")
        elif kind == "RoBERTa":
            self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
            self.model = RobertaModel.from_pretrained("roberta-base")
        elif kind == "MiniLM":
            self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
            self.model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        else:
            raise ValueError(f"Model kind '{kind}' not supported")

    def __get_pca(self, x):
        if self.pca is not None:
            return self.pca.transform(x.reshape(1, -1))
        return x

    def __apply_pooling(self, out, strategy='mean'):
        """
        Applies pooling strategies to get a fixed-size embedding.
        'mean' or 'max' pooling supported.
        """
        if strategy == 'mean':
            return out.mean(dim=1)
        elif strategy == 'max':
            return out.max(dim=1).values
        elif strategy == 'cls':
            return out[:, 0, :]
        else:
            raise ValueError(f"Pooling strategy '{strategy}' not supported")

    def fit_pca(self, dataset_embeddings):
        """
        Fit PCA to a larger dataset of embeddings to avoid fitting on single inputs.
        """
        if self.pca:
            self.pca.fit(dataset_embeddings)

    def get_embeddings(self, val, kind="DistilBERT", pooling_strategy='cls'):
        """
        Get embeddings for the input text. Supports optional pooling strategies.
        """
        self.__set_model(kind)
        inputs = self.tokenizer(val, return_tensors='pt')

        with torch.no_grad():
            outputs = self.model(**inputs)

        hidden_states = outputs.last_hidden_state

        pooled_output = self.__apply_pooling(hidden_states, strategy=pooling_strategy)

        embedding = pooled_output.squeeze(0).numpy()

        if self.use_pca:
            embedding = self.__get_pca(embedding)

        return embedding

In [None]:
class OCRFeatureExtractor:
    def __init__(self, model_name="TrOCR", pca_components=200):
        self.pca_components = pca_components
        self.model_name = model_name
        
        if model_name == "TrOCR":
            self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
            self.model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
        elif model_name == "Donut":
            self.processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
            self.model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")
        else:
            raise ValueError("Unsupported model name. Use 'TrOCR' or 'Donut'.")
    def get_arch(self):
        return self.model
    
    def extract_features(self, image_path):
        image = Image.open(image_path)
        pixel_values = self.processor(image, return_tensors="pt").pixel_values
        
        with torch.no_grad():
            encoder_outputs = self.model.encoder(pixel_values).last_hidden_state
        
        encoder_features = encoder_outputs.squeeze(0).numpy()
        
        return encoder_features

    def extract_features_from_folder(self, folder_path):
        import os
        features_list = []
        for filename in os.listdir(folder_path):
            image_path = os.path.join(folder_path, filename)
            if os.path.isfile(image_path):
                features = self.extract_features(image_path)
                features_list.append(features)
        
        return features_list

In [None]:
class PreProcessPipeline:

    def __init__(self, data, ocr_model = "", resnet_model = "",
                 use_group_id  = False,
                 use_resnet = False,
                 image_root_path = "",
                 max_metric_count = 5,
                 kind = "train"):

        self.data = data
        unique_grp_ids = sorted(list(data["group_id"].unique()))
        self.group_id_mappings = {i : grp_id for i, grp_id in enumerate(unique_grp_ids)}
        self.rev_group_id_mappings = {grp_id : i for i, grp_id in enumerate(unique_grp_ids)}
        self.ocr_feature_extrac = OCRFeatureExtractor(ocr_model)
        self.max_metric_count = max_metric_count
        if use_resnet:
            self.resnet_feature_extractor = None
        self.image_root_path = image_root_path
        self.kind = kind
        self.s_one_hot_encode = OneHotEncoder()
        self.met_one_hot = OneHotEncoder()
        self.ocr = PaddleOCR(use_angle_cls=True, lang='en')
        self.map_to_unit= {
            # Length
            'cm': 'centimetre', 'mm': 'millimetre', 'm': 'metre', 'in': 'inch', 'ft': 'foot', 'yd': 'yard',
            
            # Weight
            'g': 'gram', 'gm': 'gram', 'gms': 'gram', 'kg': 'kilogram', 'mg': 'milligram', 'µg': 'microgram',
            'mcg': 'microgram', 'oz': 'ounce', 'lb': 'pound', 'lbs': 'pound', 'ton': 'ton', 'tons': 'ton',
            
            # Voltage
            'v': 'volt', 'kv': 'kilovolt', 'mv': 'millivolt',
            
            # Wattage
            'w': 'watt', 'kw': 'kilowatt',
            
            # Volume
            'cl': 'centilitre', 'cu ft': 'cubic foot', 'cubic foot': 'cubic foot', 'cu in': 'cubic inch', 
            'cubic inch': 'cubic inch', 'cup': 'cup', 'dl': 'decilitre', 'decilitre': 'decilitre', 'fl oz': 'fluid ounce', 
            'fluid ounce': 'fluid ounce', 'gal': 'gallon', 'gallon': 'gallon', 'imp gal': 'imperial gallon',
            'imperial gallon': 'imperial gallon', 'l': 'litre', 'litre': 'litre', 'µl': 'microlitre', 'microlitre': 'microlitre',
            'ml': 'millilitre', 'millilitre': 'millilitre', 'pt': 'pint', 'pint': 'pint', 'qt': 'quart', 'quart': 'quart'
        }
        self.unique_metrics = list(set(self.map_to_unit.values()))
        self.__fit_one_hot()
        
# Example usage
    def __get_full_unit_name(self,unit):
        if unit in self.unique_metrics:
            return unit
        out = self.map_to_unit.get(unit.lower(), unit)
        return out

    
    def get_one_hot_encode(self):
        return self.s_one_hot_encode

    def __fit_one_hot(self):
        self.s_one_hot_encode.fit(self.data[['entity_name']])
        self.met_one_hot.fit([self.unique_metrics])

    def extract_value_unit_from_image(self,image_path):
            img = Image.open(image_path)
            gray_img = ImageOps.grayscale(img)
            
            np_image = np.array(gray_img)
            result = self.ocr.ocr(np_image, cls=True)
            extracted_text = " ".join([res[1][0] for res in result[0]])
            pattern = r'(\d+\.?\d*)\s?(cm³|liters|ml|g|kg|m³|in³|L|oz|fl oz|lb|centimetre|gram|l|pt|metre|g|cm|ton|ft|volt|millilitre|millimetre|kg|v|millivolt|imperial gallon|centilitre|cl|gal|m|kv|microlitre|qt|mv|microgram|w|milligram|µl|lbs|imp gal|fluid ounce|litre|tons|gallon|pound|quart|µg|foot|mg|ounce|mcg|kilovolt|cubic foot|gm|kilowatt|yd|cup|dl|oz|mm|cu ft|kw|cubic inch|gms|yard|kilogram|in|watt|fl oz|inch|decilitre|ml|pint|cu in)'  # Modify as per expected units
            matches = re.findall(pattern, extracted_text, re.IGNORECASE)
            if matches:
                return matches
            else:
                return ""

    def __transform_for_one_record(self,record):
        image_link ,group_id, entity_name, entity_value = None, None, None, None
        if self.kind == "train":
            image_link ,group_id, entity_name, entity_value = record['image_link'], record['group_id'], record['entity_name'], record['entity_value']
        elif self.kind == "test":
            image_link ,group_id, entity_name = record['image_link'], record['group_id'], record['entity_name']
        img_path = os.path.join(self.image_root_path,image_link.split("/")[-1])
        transformed_img = self.ocr_feature_extrac.extract_features(img_path)
        transformed_grp = self.group_id_mappings[group_id]
        transformed_entity_name = self.one_hot_encode.transform([[entity_name]]).toarray()
        if entity_value:
            value, unit = entity_value.split(" ")
            value = float(value)
            transformed_unit = self.met_one_hot.transform([unit])
            entity_value = (value, transformed_unit)
        
        matches = self.extract_value_unit_from_image(img_path)
        if matches: 
            metric_set = []
            for i in range(len(matches)):
                val , uni = matches[i][0], matches[i][1]

                uni = uni.strip()
                mapped_unit = self.__get_full_unit_name(uni)

                transformed_unit = self.met_one_hot.transform([mapped_unit])

                val = float(val)

                metric_set.append([val,transformed_unit])
            if len(metric_set) > self.max_metric_count:
                metric_set = metric_set[:self.max_metric_count]
            
            else:
                
                
                




        if 
        return transformed_img, transformed_grp, transformed_entity_name, entity_value
    
    def preprocess():
    

        



In [4]:

from PIL import Image, ImageOps
import numpy as np
import re

# Initialize PaddleOCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')  # Modify the parameters if necessary

def extract_value_unit_from_image(image_path):
    # Open and preprocess the image
    img = Image.open(image_path)
    gray_img = ImageOps.grayscale(img)
    
    # Convert the image to a numpy array for PaddleOCR
    np_image = np.array(gray_img)
    
    # Perform OCR using PaddleOCR
    result = ocr.ocr(np_image, cls=True)
    
    # Extract the recognized text
    extracted_text = " ".join([res[1][0] for res in result[0]])  # Concatenate the OCR text
    
    # Use regex to find value + unit (e.g., numbers followed by units like cm³, L, ml, etc.)
    pattern = r'(\d+\.?\d*)\s?(cm³|liters|ml|g|kg|m³|in³|L|oz|fl oz|gallon|µg|kg|ml|litre|fluid ounce|m|ft|quart|kv|fl oz|volt|pt|yd|in|v|tons|gal|l|ounce|cl|µl|lbs|microlitre|mg|w|cubic inch|cm|lb|cubic foot|watt|cu in|cu ft|g|pint|mcg|ton|dl|decilitre|imp gal|cup|gms|gm|centilitre|kw|imperial gallon|mm|millilitre|oz|qt|mv|gallon)'  # Modify as per expected units
    matches = re.findall(pattern, extracted_text, re.IGNORECASE)
    
    if matches:
        # Return the first match (value + unit)
        return matches
    else:
        # Return blank if no match is found
        return " "

# Example usage:
# result = extract_value_unit_from_image('path_to_image.jpg')
# print(result)


[2024/09/14 23:20:45] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\ASUS/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\ASUS/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6,

In [5]:
matches = extract_value_unit_from_image()

[2024/09/14 23:20:55] ppocr DEBUG: dt_boxes num : 4, elapsed : 0.08312630653381348
[2024/09/14 23:20:55] ppocr DEBUG: cls num  : 4, elapsed : 0.026586055755615234
[2024/09/14 23:20:55] ppocr DEBUG: rec_res num  : 4, elapsed : 0.11325645446777344


In [7]:
for i in range(len(matches)):
    result = extract_value_unit_from_image(i)
    print(matches[i][0], matches[i][1])
    

6.985 cm
2.75 in
182.88 cm
381 cm
72 in
150 in


In [27]:

reader = easyocr.Reader(['en'], gpu=True)  # Set gpu=True for GPU usage

# Function to preprocess and perform OCR on the image URL
def extract_value_unit_from_image(image_url):
    try:
        # Send a request to get the image from the URL
        response = requests.get(image_url)
        response.raise_for_status()  # Ensure the request was successful

        # Open the image and convert it to grayscale
        img = Image.open(BytesIO(response.content))
        gray_img = ImageOps.grayscale(img)
        
        # Convert Pillow image to numpy array for OCR
        np_image = np.array(gray_img)
        
        # Perform OCR using EasyOCR
        result = reader.readtext(np_image, detail=0)  # detail=0 returns only the text
        extracted_text = " ".join(result)  # Join the OCR result into a single string
        
        # Use regex to find the value + unit (e.g., numbers followed by units like cm³, L, ml, etc.)
        pattern = r'(\d+\.?\d*)\s?(cm³|liters|ml|g|kg|m³|in³|L|oz|fl oz|gallon|µg|kg|ml|litre|fluid ounce|m|ft|quart|kv|fl oz|volt|pt|yd|in|v|tons|gal|l|ounce|cl|µl|lbs|microlitre|mg|w|cubic inch|cm|lb|cubic foot|watt|cu in|cu ft|g|pint|mcg|ton|dl|decilitre|imp gal|cup|gms|gm|centilitre|kw|imperial gallon|mm|millilitre|oz|qt|mv)'  # Modify as per expected units
        matches = re.findall(pattern, extracted_text, re.IGNORECASE)
        
        if matches:
            # Return the first match (value + unit)
            return f"{matches[0][0]} {matches[0][1]}"
        else:
            # Return blank if no match is found
            return " "
    
    except requests.exceptions.RequestException as e:
        print(f"Error downloading image from {image_url}: {e}")
        return " "  # Return blank in case of an error

Downloading detection model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

Downloading recognition model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))


In [3]:
unit_set = {'cm', 'ft', 'in', 'm', 'mm', 'yd','g', 'gm', 'gms', 'kg', 'µg', 'mcg', 'mg', 'oz', 'ounce', 'lb', 'lbs', 'ton', 'tons','kv', 'mv', 'v', 'volt', 'kw', 'w', 'watt','cl', 'centilitre', 'cu ft', 'cubic foot', 'cu in', 'cubic inch', 'cup', 'dl', 'decilitre', 'fl oz', 'fluid ounce', 'gal', 'gallon', 'imp gal', 'imperial gallon', 'l', 'litre', 'µl', 'microlitre', 'ml', 'millilitre', 'pt', 'pint', 'qt', 'quart', 'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard', 'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton', 'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton', 'kilovolt', 'millivolt', 'volt' , 'kilowatt', 'watt', 'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}


In [4]:
units = '|'.join(unit for unit in unit_set)

In [5]:
units

'lb|centimetre|gram|l|pt|metre|g|cm|ton|ft|volt|millilitre|millimetre|kg|v|millivolt|imperial gallon|centilitre|cl|gal|m|kv|microlitre|qt|mv|microgram|w|milligram|µl|lbs|imp gal|fluid ounce|litre|tons|gallon|pound|quart|µg|foot|mg|ounce|mcg|kilovolt|cubic foot|gm|kilowatt|yd|cup|dl|oz|mm|cu ft|kw|cubic inch|gms|yard|kilogram|in|watt|fl oz|inch|decilitre|ml|pint|cu in'

In [21]:
train['entity_name'].unique()

array(['item_weight', 'item_volume', 'voltage', 'wattage',
       'maximum_weight_recommendation', 'height', 'depth', 'width'],
      dtype=object)

In [None]:
extract_value_unit_from_image()

In [12]:
embed = OneHotEncoder()

In [14]:
embed.fit(train[['entity_name']])

In [25]:
embed.transform([["item_volume"]]).toarray()



array([[0., 0., 1., 0., 0., 0., 0., 0.]])

In [86]:
out = embed.get_embeddings("voltage","DistilBERT")



In [7]:
train.loc[0]

image_link      https://m.media-amazon.com/images/I/61I9XdN6OF...
group_id                                                   748919
entity_name                                           item_weight
entity_value                                           500.0 gram
Name: 0, dtype: object

In [4]:
mod = OCRFeatureExtractor(model_name="Donut")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [5]:
out = mod.extract_features("C://Users//ASUS//Desktop//student_resource 3//images//31+ZLMVIYaL.jpg")

In [6]:
out.shape

(4800, 1024)

In [8]:
pca = PCA(200)

In [10]:
result = pca.fit_transform(out)

In [12]:
result.shape

(4800, 200)

In [13]:
from torch import nn

In [28]:
n = nn.Linear(200, 200)
n2 = nn.Linear(200,50)
n3 = nn.Linear(240000,200)

In [29]:
out = n2(n(torch.tensor(result)))

In [30]:
out = out.view(out.shape[0]*out.shape[1])

In [31]:
out.shape

torch.Size([240000])

In [32]:
out = n3(out)

In [33]:
out.shape

torch.Size([200])