### Basic library imports

In [136]:
import os
import pandas as pd

### Read Dataset

In [137]:
# Folder that holds the CSV files used by this notebook.
DATASET_FOLDER = '../Projects/dataset/'


def _load_split(csv_name):
    """Read one CSV file located directly inside DATASET_FOLDER."""
    return pd.read_csv(os.path.join(DATASET_FOLDER, csv_name))


train = _load_split('train.csv')
test = _load_split('test.csv')

# The sample files are currently not loaded:
#sample_test = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test.csv'))
#sample_test_out = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test_out.csv'))

In [138]:
# Unit vocabularies (full names plus abbreviations) shared by several entity types.
_LENGTH_UNITS = (
    'centimetre', 'cm', 'foot', 'ft', 'inch', 'in',
    'metre', 'm', 'millimetre', 'mm', 'yard', 'yd',
)
_WEIGHT_UNITS = (
    'gram', 'g', 'gm',
    'kilogram', 'kg',
    'microgram', 'µg',
    'milligram', 'mg',
    'ounce', 'oz',
    'pound', 'lb',
    'ton', 't',
)
_VOLUME_UNITS = (
    'centilitre', 'cL',
    'cubic foot', 'ft³',
    'cubic inch', 'in³',
    'cup', 'c',
    'decilitre', 'dL',
    'fluid ounce', 'fl oz',
    'gallon', 'gal',
    'imperial gallon', 'imp gal',
    'litre', 'L',
    'microlitre', 'µL',
    'millilitre', 'mL',
    'pint', 'pt',
    'quart', 'qt',
)

# Maps each entity name to the set of unit spellings accepted for it.
# Every entity gets its own set object, so mutating one entry never affects another.
entity_unit_map = {
    'width': set(_LENGTH_UNITS),
    'depth': set(_LENGTH_UNITS),
    'height': set(_LENGTH_UNITS),
    'item_weight': set(_WEIGHT_UNITS),
    'maximum_weight_recommendation': set(_WEIGHT_UNITS),
    'voltage': {'kilovolt', 'kV', 'millivolt', 'mV', 'volt', 'V'},
    'wattage': {'kilowatt', 'kW', 'watt', 'W'},
    'item_volume': set(_VOLUME_UNITS),
}


### Run Sanity check using src/sanity.py

In [139]:
# Run the sanity checker on the sample test file and its expected-output file.
# NOTE(review): the recorded output below shows this run failed because
# ../dataset/sample_test.csv was not found; the CSVs earlier in this notebook are read
# from '../Projects/dataset/' — confirm which path is intended.
!python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out.csv

Error: Filepath: ../dataset/sample_test.csv invalid or not found.


In [140]:
# Same sanity check, pointed at the `sample_test_out_fail.csv` output file.
# NOTE(review): the recorded output below shows the same "file not found" error for
# ../dataset/sample_test.csv, so the check itself never ran — confirm the path.
!python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out_fail.csv

Error: Filepath: ../dataset/sample_test.csv invalid or not found.


### Download images

In [141]:
# Disabled cell: the download code is wrapped in a string literal, so nothing is executed;
# the cell only echoes the string back as its output.
# NOTE(review): `t` is not defined in any visible cell — confirm which frame was meant
# before re-enabling.
'''from utils import download_images
download_images(t['image_link'], '../images')'''

"from utils import download_images\ndownload_images(t['image_link'], '../images')"

In [142]:
#assert len(os.listdir('../images')) > 0

In [143]:
rm -rf ../images


## Mapping each image URL to its file name

In [144]:
from urllib.parse import urlparse
def _filename_from_url(url):
    """Return the last path component of ``url`` (query string and fragment excluded)."""
    return os.path.basename(urlparse(url).path)


# Derive the image file name from every link and store it next to the link.
train['image_filename'] = train['image_link'].apply(_filename_from_url)

# Preview the links side by side with the extracted file names.
print(train[['image_link', 'image_filename']].head())

                                          image_link   image_filename
0  https://m.media-amazon.com/images/I/61I9XdN6OF...  61I9XdN6OFL.jpg
1  https://m.media-amazon.com/images/I/71gSRbyXmo...  71gSRbyXmoL.jpg
2  https://m.media-amazon.com/images/I/61BZ4zrjZX...  61BZ4zrjZXL.jpg
3  https://m.media-amazon.com/images/I/612mrlqiI4...  612mrlqiI4L.jpg
4  https://m.media-amazon.com/images/I/617Tl40LOX...  617Tl40LOXL.jpg


In [145]:
# Show five randomly chosen rows; no random_state is passed, so the rows differ on every run.
train.sample(5)

Unnamed: 0,image_link,group_id,entity_name,entity_value,image_filename
244003,https://m.media-amazon.com/images/I/51zf4b8EGO...,521308,height,35.0 inch,51zf4b8EGOL.jpg
239233,https://m.media-amazon.com/images/I/51tDiN7Cq7...,569206,height,5.0 centimetre,51tDiN7Cq7L.jpg
166431,https://m.media-amazon.com/images/I/41HdswhLQ-...,858439,width,9.8 centimetre,41HdswhLQ-L.jpg
245044,https://m.media-amazon.com/images/I/614hd17CI9...,858439,height,29.0 centimetre,614hd17CI9L.jpg
12479,https://m.media-amazon.com/images/I/71iWpPgI0G...,558374,item_weight,100 gram,71iWpPgI0GL.jpg


In [146]:
# Remove duplicate rows based on the 'image_filename' column, keeping the first row per file.
# NOTE(review): if one image carries several entity rows, only the first one survives this
# step — confirm that dropping the other labels is intended.
train_cleaned = train.drop_duplicates(subset='image_filename', keep='first')

# NOTE(review): this assigns to `trained_cleaned` (not `train_cleaned`), a name that no other
# visible cell reads; it looks like a typo or a leftover 100-row debugging subset — confirm.
trained_cleaned=train_cleaned.head(100)
# Display the de-duplicated frame as the cell output.
train_cleaned

Unnamed: 0,image_link,group_id,entity_name,entity_value,image_filename
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram,61I9XdN6OFL.jpg
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup,71gSRbyXmoL.jpg
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram,61BZ4zrjZXL.jpg
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram,612mrlqiI4L.jpg
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram,617Tl40LOXL.jpg
...,...,...,...,...,...
263854,https://m.media-amazon.com/images/I/612J1R1xHl...,558806,height,5.0 centimetre,612J1R1xHlL.jpg
263855,https://m.media-amazon.com/images/I/61Blzh2+28...,470067,height,8.5 inch,61Blzh2+28L.jpg
263856,https://m.media-amazon.com/images/I/51MsegDL9V...,204245,height,43.2 centimetre,51MsegDL9VL.jpg
263857,https://m.media-amazon.com/images/I/510KhVw4VS...,752266,height,9.1 centimetre,510KhVw4VSL.jpg


In [147]:
# Sanity check: after de-duplication no image file name should occur more than once.
has_duplicate_filenames = train_cleaned['image_filename'].duplicated().any()
if has_duplicate_filenames:
    print("Warning: Duplicate image IDs found in CSV!")

# Show the column labels of the cleaned frame as the cell output.
train_cleaned.columns

Index(['image_link', 'group_id', 'entity_name', 'entity_value',
       'image_filename'],
      dtype='object')

In [148]:
from PIL import ImageFile
# Ask PIL to load image files whose data is truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True
import tensorflow as tf

# Number of directory entries in the local image folder.
image_count = len(os.listdir('../Projects/images'))

In [149]:
# Report how many entries were found in the image folder.
print(image_count)

111723


In [150]:
# Disabled cell: everything below is one string literal, so none of it runs; the cell only
# echoes the string. The disabled code would (1) keep the rows of `train_cleaned` whose
# image file exists in ../Projects/images/, (2) rename those files on disk to 1.jpg, 2.jpg, ...
# and (3) write the new names into the filtered frame.
# NOTE(review): the rename is destructive and not repeatable once it has been run — confirm
# before re-enabling.
'''import os
import pandas as pd

# Define the image directory path
image_directory = "../Projects/images/"

image_files_in_dir = set(os.listdir(image_directory))

# Filter the DataFrame to keep only rows where image_filename exists in the directory
train_cleaned_filtered = train_cleaned[train_cleaned['image_filename'].isin(image_files_in_dir)]

for new_index, row in enumerate(train_cleaned_filtered.itertuples(), start=1):
    old_image_name = row.image_filename
    old_image_path = os.path.join(image_directory, old_image_name)

    # Create the new image name
    new_image_name = f"{new_index}.jpg"
    new_image_path = os.path.join(image_directory, new_image_name)

    # Rename the image file in the directory
    os.rename(old_image_path, new_image_path)

    # Update the DataFrame with the new image filename
    train_cleaned_filtered.at[row.Index, "image_filename"] = new_image_name'''



'import os\nimport pandas as pd\n\n# Define the image directory path\nimage_directory = "../Projects/images/"\n\nimage_files_in_dir = set(os.listdir(image_directory))\n\n# Filter the DataFrame to keep only rows where image_filename exists in the directory\ntrain_cleaned_filtered = train_cleaned[train_cleaned[\'image_filename\'].isin(image_files_in_dir)]\n\nfor new_index, row in enumerate(train_cleaned_filtered.itertuples(), start=1):\n    old_image_name = row.image_filename\n    old_image_path = os.path.join(image_directory, old_image_name)\n\n    # Create the new image name\n    new_image_name = f"{new_index}.jpg"\n    new_image_path = os.path.join(image_directory, new_image_name)\n\n    # Rename the image file in the directory\n    os.rename(old_image_path, new_image_path)\n\n    # Update the DataFrame with the new image filename\n    train_cleaned_filtered.at[row.Index, "image_filename"] = new_image_name'

In [151]:
# Disabled cell: the code is wrapped in a string literal, so nothing is written or printed
# by it. When enabled it would save `train_cleaned_filtered` to 'filtered_dataframe.csv'
# (without the index) and report the number of remaining rows.
'''# Save the filtered DataFrame if needed
train_cleaned_filtered.to_csv('filtered_dataframe.csv', index=False)


print("Filtering completed. Remaining rows:", len(train_cleaned_filtered))
train_cleaned_filtered'''

'# Save the filtered DataFrame if needed\ntrain_cleaned_filtered.to_csv(\'filtered_dataframe.csv\', index=False)\n\n\nprint("Filtering completed. Remaining rows:", len(train_cleaned_filtered))\ntrain_cleaned_filtered'

In [153]:
# Reload the filtered frame from 'filtered_dataframe.csv', the file the disabled save cell
# above writes `train_cleaned_filtered` to.
df2 = pd.read_csv( 'filtered_dataframe.csv')

# `train_cleaned_filtered` is only assigned inside a disabled (string-literal) cell, yet the
# OCR cell at the end of the notebook is called with it. Bind it here from the same CSV so
# the notebook runs top to bottom on a fresh kernel; a copy keeps `df2` untouched when the
# OCR step later adds its 'Text' column.
train_cleaned_filtered = df2.copy()

## Avoiding out-of-memory errors on the GPU

In [154]:
import tensorflow as tf

# Enable memory growth on every GPU that TensorFlow can see.
gpus = tf.config.list_physical_devices('GPU')
try:
    # With no GPU present the loop body never runs, so nothing is configured.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Report the problem and carry on; the remaining GPUs are left unconfigured.
    print(e)


## Image processing

In [158]:
import numpy as np
import cv2
import os
from PIL import Image, ImageEnhance
import tensorflow as tf

class CustomImageDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that loads images from disk and prepares them for OCR.

    Every image of a batch is read with OpenCV, resized to ``image_size`` and run
    through :meth:`preprocess_image`.

    Parameters
    ----------
    image_paths : sequence of str
        Paths of the image files, in the order they should be served.
    batch_size : int
        Number of paths consumed per batch.
    image_size : tuple of int
        Target size handed to ``cv2.resize`` as its ``dsize`` argument.
    contrast_factor : float, default 1.1
        Factor passed to ``ImageEnhance.Contrast(...).enhance``.
    **kwargs
        Forwarded unchanged to the base class constructor.
    """

    def __init__(self, image_paths, batch_size, image_size, contrast_factor=1.1, **kwargs):
        # Initialise the base class first; the original skipped this call.
        super().__init__(**kwargs)
        self.image_paths = image_paths
        self.batch_size = batch_size
        self.image_size = image_size
        self.contrast_factor = contrast_factor

    def preprocess_image(self, image):
        """Turn a BGR image array into a contrast-enhanced binary image array.

        Steps: grayscale -> non-local-means denoising -> CLAHE -> adaptive Gaussian
        threshold -> morphological opening -> PIL contrast enhancement.
        """
        # Convert to grayscale
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Denoise using Non-Local Means Denoising
        denoised_image = cv2.fastNlMeansDenoising(gray_image, None, 5, 7, 21)

        # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization) for better contrast
        clahe = cv2.createCLAHE(clipLimit=1.0, tileGridSize=(16, 16))
        enhanced_contrast = clahe.apply(denoised_image)

        # Adaptive thresholding for better binary conversion in uneven lighting
        binary_image = cv2.adaptiveThreshold(enhanced_contrast, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                             cv2.THRESH_BINARY, 11, 2)

        # Morphological opening to clean up small noise.
        # NOTE(review): with a 1x1 kernel this step presumably leaves the image unchanged —
        # confirm whether a larger kernel was intended.
        kernel = np.ones((1, 1), np.uint8)
        morphed_image = cv2.morphologyEx(binary_image, cv2.MORPH_OPEN, kernel)

        # Convert to PIL Image
        pil_image = Image.fromarray(morphed_image)

        # Enhance contrast using PIL for fine-tuning
        enhancer = ImageEnhance.Contrast(pil_image)
        enhanced_image = enhancer.enhance(self.contrast_factor)

        return np.array(enhanced_image)

    def __len__(self):
        """Number of batches: ``len(image_paths) / batch_size`` rounded up."""
        return int(np.ceil(len(self.image_paths) / self.batch_size))

    def load_batch(self, index):
        """Load batch ``index`` and return ``(paths, images)`` with matching order.

        Files that cannot be read or processed are reported with ``print`` and left out
        of BOTH results, so ``paths[k]`` always belongs to ``images[k]``.

        Returns
        -------
        tuple
            ``(list of str, numpy.ndarray)``.
        """
        batch_paths = self.image_paths[index * self.batch_size:(index + 1) * self.batch_size]
        loaded_paths = []
        batch_images = []

        for path in batch_paths:
            try:
                image = cv2.imread(path)
                if image is None:
                    print(f"Warning: Unable to load image {path}")
                    continue
                image = cv2.resize(image, self.image_size)  # Resize image
                image = self.preprocess_image(image)  # Apply the preprocessing pipeline
            except Exception as e:
                # Best effort: one bad file must not abort the whole batch.
                print(f"Error processing image {path}: {e}")
                continue

            loaded_paths.append(path)
            batch_images.append(image)

        return loaded_paths, np.array(batch_images)

    def __getitem__(self, index):
        """Return the preprocessed images of batch ``index`` as one array.

        Unreadable files are skipped, so the result can hold fewer images than the
        matching slice of ``image_paths``; use :meth:`load_batch` when each image has
        to be paired with its path.
        """
        _, batch_images = self.load_batch(index)
        return batch_images
# Number of image paths the generator consumes per batch.
batch_size = 100
image_size = (1536, 1536)  # Set image size to 1536x1536
# NOTE(review): `image_paths` is not assigned in any cell visible in this notebook, so this
# line raises NameError on a fresh kernel unless it is defined elsewhere — confirm its origin.
data_generator = CustomImageDataGenerator(image_paths, batch_size=batch_size, image_size=image_size)

## Splitting entity_value

In [159]:
import pandas as pd
import numpy as np
import re
import pytesseract
from pytesseract import Output
from concurrent.futures import ThreadPoolExecutor
from PIL import Image  # Ensure PIL is imported for image processing
import os

# Allowed unit spellings (full names and abbreviations) for every entity type.
# NOTE: this re-declares the mapping already defined near the top of the notebook.
entity_unit_map = {
    # The three dimension entities accept the same length units.
    **{
        dimension: {'centimetre', 'cm', 'foot', 'ft', 'inch', 'in', 'metre', 'm', 'millimetre', 'mm', 'yard', 'yd'}
        for dimension in ('width', 'depth', 'height')
    },
    # Both weight entities accept the same weight units.
    **{
        weight_entity: {'gram', 'g', 'gm', 'kilogram', 'kg', 'microgram', 'µg', 'milligram', 'mg', 'ounce', 'oz', 'pound', 'lb', 'ton', 't'}
        for weight_entity in ('item_weight', 'maximum_weight_recommendation')
    },
    'voltage': {'kilovolt', 'kV', 'millivolt', 'mV', 'volt', 'V'},
    'wattage': {'kilowatt', 'kW', 'watt', 'W'},
    'item_volume': {
        'centilitre', 'cL', 'cubic foot', 'ft³', 'cubic inch', 'in³', 'cup', 'c',
        'decilitre', 'dL', 'fluid ounce', 'fl oz', 'gallon', 'gal',
        'imperial gallon', 'imp gal', 'litre', 'L', 'microlitre', 'µL',
        'millilitre', 'mL', 'pint', 'pt', 'quart', 'qt',
    },
}

def extract_value_and_unit(text, allowed_units):
    """Extract a numeric value and a unit from free text such as OCR output.

    The function first looks for a number that is directly followed by one of
    ``allowed_units`` (optionally separated by whitespace), e.g. ``"500g"`` or
    ``"5.5 kg"``, so the value and the unit come from the same measurement. Only when
    no such pair exists does it fall back to the first number and the first
    stand-alone unit found anywhere in the text.

    Parameters
    ----------
    text : str
        Text to search. An empty string or ``None`` yields ``(None, None)``.
    allowed_units : iterable of str
        Unit spellings to accept; matching ignores letter case.

    Returns
    -------
    tuple
        ``(value, unit)`` as strings exactly as they appear in ``text``; either element
        is ``None`` when it could not be found.
    """
    if not text:
        return None, None

    number = r'\d+(?:\.\d+)?'
    first_number = re.search(number, text)
    value = first_number.group() if first_number else None
    if not allowed_units:
        return value, None

    # Longest spelling first (ties broken alphabetically) so the pattern is the same on
    # every run, whatever the iteration order of the incoming set is.
    alternation = '|'.join(
        re.escape(unit)
        for unit in sorted(allowed_units, key=lambda unit: (-len(unit), unit))
    )

    # Preferred: a number immediately followed by a unit. The look-behind keeps the match
    # from starting in the middle of a word or of a longer number; no word boundary is
    # required between the digits and the unit, so "500g" is accepted.
    paired = re.search(
        r'(?<![\w.])(' + number + r')\s*(' + alternation + r')\b',
        text,
        re.IGNORECASE,
    )
    if paired:
        return paired.group(1), paired.group(2)

    # Fallback: first number plus first stand-alone unit, found independently.
    unit_match = re.search(r'\b(?:' + alternation + r')\b', text, re.IGNORECASE)
    return value, (unit_match.group() if unit_match else None)

def extract_entities_from_image(image, image_filename):
    """Run OCR on ``image`` and collect the "<value> <unit>" pairs found in its text.

    The recognised text is searched once per entry of ``entity_unit_map``; every entry
    that yields both a value and a unit contributes one "<value> <unit>" string.

    Parameters
    ----------
    image : array
        Image array handed to ``Image.fromarray``.
    image_filename : str
        Name reported in the debug output and in the result.

    Returns
    -------
    dict
        ``{'image_filename': <name>, 'Text': <pairs joined with ", ">}``.
    """
    # pytesseract receives a PIL image built from the array.
    ocr_text = pytesseract.image_to_string(Image.fromarray(image), output_type=Output.STRING)

    candidates = (
        extract_value_and_unit(ocr_text, units)
        for units in entity_unit_map.values()
    )
    extracted_values = [
        f'{value} {unit}'
        for value, unit in candidates
        if value and unit
    ]

    # Debug trace of what was recognised for this file.
    print(f'Extracted from {image_filename}: {", ".join(extracted_values)}')

    # All pairs are combined into a single string.
    return {'image_filename': image_filename, 'Text': ', '.join(extracted_values)}

def process_batch(images, filenames):
    """OCR a batch of images on a pool of four worker threads.

    ``images`` and ``filenames`` are paired positionally; the returned list holds one
    result dict per pair, in input order.
    """
    def _extract(pair):
        image, filename = pair
        return extract_entities_from_image(image, filename)

    with ThreadPoolExecutor(max_workers=4) as pool:
        return list(pool.map(_extract, zip(images, filenames)))


In [None]:
def process_batches(data_generator, train_cleaned_filtered, batch_size=64, num_batches=None):
    """
    Run OCR over batches of preprocessed images and store the text in the DataFrame.

    For every processed batch the 'Text' column of ``train_cleaned_filtered`` is updated
    in place for the rows whose 'image_filename' matches a processed image, and the whole
    frame is written to 'updated_train_cleaned_filtered.csv' as a checkpoint.

    Parameters:
    - data_generator: Indexable batch source with ``__len__``, ``__getitem__`` and an
      ``image_paths`` attribute. If it offers ``load_batch(index)`` returning
      ``(paths, images)``, that is used so images and paths stay paired even when some
      files could not be loaded.
    - train_cleaned_filtered: DataFrame with an 'image_filename' column (modified in place).
    - batch_size: Fallback batch size for slicing ``image_paths``; only used when the
      generator has no ``batch_size`` attribute (default: 64).
    - num_batches: Number of batches to process (None means process all batches).
    """
    # The path slice has to follow the generator's own batching; slicing with a different
    # size would pair images with the wrong file names.
    effective_batch_size = getattr(data_generator, 'batch_size', batch_size)
    total_batches = len(data_generator) if num_batches is None else min(num_batches, len(data_generator))

    # File names do not change while processing, so build the lookup set once.
    known_filenames = set(train_cleaned_filtered['image_filename'])

    for i in range(total_batches):
        if hasattr(data_generator, 'load_batch'):
            batch_filenames, batch_images = data_generator.load_batch(i)
        else:
            batch_images = data_generator[i]  # Get preprocessed images for this batch
            batch_filenames = data_generator.image_paths[i * effective_batch_size:(i + 1) * effective_batch_size]
            if len(batch_images) != len(batch_filenames):
                # The generator dropped images; positions no longer line up, so pairing
                # by index would attach text to the wrong files.
                print(f"Skipping batch {i + 1}: got {len(batch_images)} images for {len(batch_filenames)} paths")
                continue

        # Keep only the images whose file name is present in the DataFrame.
        matching = [
            (image, os.path.basename(path))
            for image, path in zip(batch_images, batch_filenames)
            if os.path.basename(path) in known_filenames
        ]

        # Check for empty batches
        if not matching:
            print(f"No matching images found for batch {i + 1}")
            continue

        matching_images = [image for image, _ in matching]
        matching_names = [name for _, name in matching]

        # Extract text from the batch of matching images
        new_data = process_batch(matching_images, matching_names)
        text_by_filename = {row['image_filename']: row['Text'] for row in new_data}

        # Update the 'Text' field for all matching rows in one aligned assignment.
        if text_by_filename:
            mask = train_cleaned_filtered['image_filename'].isin(list(text_by_filename))
            train_cleaned_filtered.loc[mask, 'Text'] = (
                train_cleaned_filtered.loc[mask, 'image_filename'].map(text_by_filename)
            )

        print(f'Processed batch {i + 1}')

        # Save the updated dataframe after each batch
        train_cleaned_filtered.to_csv('updated_train_cleaned_filtered.csv', index=False)

# Example usage
# Requires `data_generator` and `train_cleaned_filtered` to be defined by earlier cells.
num_batches_to_process = 5  # Change this to the number of batches you want to process
# Runs OCR on the first `num_batches_to_process` batches; process_batches() rewrites
# 'updated_train_cleaned_filtered.csv' after every batch it processes.
process_batches(data_generator, train_cleaned_filtered, batch_size=100, num_batches=num_batches_to_process)
