# **AMAZON ML CHALLENGE**
# `- BY THE DECODERS`

STEP 1. Download and Preprocess Images

In [None]:
import os
import requests
from PIL import Image
from io import BytesIO
import pandas as pd

# Function to download a single image
def download_image(url, save_path):
    try:
        response = requests.get(url)
        img = Image.open(BytesIO(response.content))
        img.save(save_path)
        return True
    except Exception as e:
        print(f"Error downloading image from {url}: {e}")
        return False

# Function to download all images from a CSV
def download_images(csv_file, image_dir):
    # Create the image directory if it doesn't exist
    if not os.path.exists(image_dir):
        os.makedirs(image_dir)

    # Load the CSV file
    data = pd.read_csv(csv_file)

    # Ensure there is an 'index' column (using DataFrame index if not present)
    if 'index' not in data.columns:
        data['index'] = data.index

    # Iterate over the rows and download images
    for idx, row in data.iterrows():
        url = row['image_link']
        img_name = f"{row['index']}.jpg"
        save_path = os.path.join(image_dir, img_name)
        success = download_image(url, save_path)
        if success:
            print(f"Downloaded {img_name}")
        else:
            print(f"Failed to download {img_name}")

# Paths to CSV files and directories to save images
train_csv_path = 'dataset\\train.csv'
test_csv_path = 'dataset\\test.csv'
train_image_dir = 'ML Images\\Train'
test_image_dir = 'ML Images\\Test'

# Download images for train and test datasets
# download_images(train_csv_path, train_image_dir)
download_images(test_csv_path, test_image_dir)


In [None]:
import pandas as pd

# Load the dataset
train_df = pd.read_csv('dataset\\train.csv')

# Inspect the columns
print(train_df.columns)


 STEP-2 Extract Text using OCR

In [None]:
import os
import pytesseract
import cv2
import pandas as pd

# Set the path to your Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'  # Adjust this path if necessary

# Paths to the image directories
train_image_dir = 'ML Images\\Sample_Train'
test_image_dir = 'ML Images\\Sample_Test'

def perform_ocr(image_dir):
    """
    Perform OCR on all images in the specified directory.
    
    Parameters:
        image_dir (str): Path to the directory containing images.
    
    Returns:
        results (list): A list of tuples (index, prediction).
    """
    results = []
    index = 0

    # Iterate through each image file in the directory
    for filename in os.listdir(image_dir):
        image_path = os.path.join(image_dir, filename)
        
        # Read the image
        image = cv2.imread(image_path)
        if image is None:
            print(f"Warning: Could not read image {filename}")
            results.append((index, ""))
            index += 1
            continue

        # Perform OCR using pytesseract
        ocr_result = pytesseract.image_to_string(image)
        
        # Clean and format the OCR result
        if ocr_result.strip():
            # Convert multiline text to a single comma-separated string
            formatted_text = ', '.join([line.strip() for line in ocr_result.splitlines() if line.strip()])
        else:
            formatted_text = ""

        # Append the result to the list
        results.append((index, formatted_text))
        index += 1
    
    return results

# Perform OCR on Train and Test images
train_results = perform_ocr(train_image_dir)
test_results = perform_ocr(test_image_dir)

# Combine train and test results (if needed, modify based on requirement)
# combined_results = train_results + test_results

# Create a DataFrame and save to CSV
df = pd.DataFrame(train_results, columns=['index', 'prediction'])
print(df)

In [None]:
# Output CSV file path
output_csv = 'D:\\Amazon ML Challenge\\student_resource 3\\ocr_output.csv'

df.to_csv(output_csv, index=False)

print(f"OCR processing completed. Results saved to {output_csv}")

EXTRACT TEXT USING VIT

In [4]:
import pandas as pd
from torchvision import transforms
from PIL import Image
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import ViTForImageClassification
import torch.nn as nn


# Custom Dataset class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, csv_file, img_dir, transform=None):
        self.data_frame = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform
        
        # Ensure entity_name and entity_value are strings
        self.data_frame['entity_name'] = self.data_frame['entity_name'].astype(str)
        self.data_frame['entity_value'] = self.data_frame['entity_value'].astype(str)

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, f"{self.data_frame.iloc[idx, 0]}.jpg")  # Ensure proper file extension
        image = Image.open(img_name).convert("RGB")
        
        entity_name = self.data_frame.iloc[idx, 3]  # Index of entity_name
        entity_value = self.data_frame.iloc[idx, 4]  # Index of entity_value

        if self.transform:
            image = self.transform(image)
        
        return image, entity_name, entity_value

# Define image transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load dataset
train_dataset = CustomDataset(csv_file='ML Images\\sample_train.csv', img_dir='ML Images\\Sample_Train', transform=transform)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

In [None]:
from transformers import ViTForImageClassification
import torch
from torch import nn

class CustomViTModel(nn.Module):
    def __init__(self, num_labels):
        super(CustomViTModel, self).__init__()
        self.vit = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224-in21k', num_labels=num_labels)
    
    def forward(self, images):
        return self.vit(images).logits

# Initialize model
num_labels = len(train_dataset.data_frame['entity_name'].unique())  # Example number of labels, adjust as needed
model = CustomViTModel(num_labels=num_labels)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [None]:
import torch

def train_model(model, dataloader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        for images, entity_names, entity_values in dataloader:
            # Map entity_names to numerical labels
            labels = torch.tensor([name_to_label[name] for name in entity_names], dtype=torch.long)
            
            # Forward pass
            outputs = model(images)
            # Compute loss
            loss = criterion(outputs, labels)
            
            # Compute predictions
            _, predicted = torch.max(outputs, 1)
            
            # Update counts for accuracy
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)
            
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * images.size(0)
        
        epoch_loss = running_loss / len(dataloader.dataset)
        epoch_accuracy = correct_predictions / total_predictions
        
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

# Example label encoding dictionary
name_to_label = {name: idx for idx, name in enumerate(train_dataset.data_frame['entity_name'].unique())}

# Train the model
train_model(model, train_loader, criterion, optimizer)


In [None]:
# Save the trained model
torch.save(model.state_dict(), 'vit_model.pth')
print("Model saved to 'vit_model.pth'")

In [None]:
# Load the model
model = CustomViTModel(num_labels=num_labels)
model.load_state_dict(torch.load('vit_model.pth'))
model.eval()
print("Model loaded from 'vit_model.pth'")

In [14]:
import pandas as pd
from torchvision import transforms
from PIL import Image
import os
import torch

# Define the test dataset class
class TestDataset(torch.utils.data.Dataset):
    def __init__(self, csv_file, img_dir, transform=None):
        self.data_frame = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform

        # Ensure entity_name is a string
        self.data_frame['entity_name'] = self.data_frame['entity_name'].astype(str)

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, f"{self.data_frame.iloc[idx, 0]}.jpg")  # Ensure proper file extension
        image = Image.open(img_name).convert("RGB")
        
        entity_name = self.data_frame.iloc[idx, 1]  # Index of entity_name

        if self.transform:
            image = self.transform(image)
        
        return image, entity_name

# Load test dataset
test_dataset = TestDataset(csv_file='ML Images\\sample_Test.csv', img_dir='ML Images\\Sample_Test', transform=transform)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

ANOTHER APPROACH FOR THE ABOVE ONE ONLY

In [32]:
import os
import cv2
import pytesseract
import pandas as pd

# Path to the directory containing images
images_path = 'ML Images\\Sample_Train'

# Path to the Tesseract executable (if required, usually on Windows)
# Uncomment and set the path if necessary
pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

# Initialize an empty list to store the results
results = []

# Iterate over each image in the directory
for idx, image_file in enumerate(os.listdir(images_path)):
    # Construct the full image path
    image_path = os.path.join(images_path, image_file)
    
    # Read the image using OpenCV
    image = cv2.imread(image_path)
    
    # Convert the image to grayscale (optional, improves OCR accuracy)
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    
    # Apply some preprocessing if needed (e.g., thresholding)
    # _, preprocessed_image = cv2.threshold(gray_image, 150, 255, cv2.THRESH_BINARY)
    
    # Perform OCR on the image using Tesseract
    extracted_text = pytesseract.image_to_string(gray_image)
    
    # Clean up the extracted text (remove line breaks, extra spaces, etc.)
    cleaned_text = ' '.join(extracted_text.split()).strip()
    
    # Append the result to the list as (index, predictions)
    results.append({'index': idx, 'predictions': cleaned_text})

# Create a DataFrame from the results
df_results = pd.DataFrame(results, columns=['index', 'predictions'])

# Display the DataFrame
print(df_results)


     index                                        predictions
0        0       PROPSS’ N RE INGREDIENT MENAGER 100% NATUREL
1        1                  LEBENSMITTELECHT GEPRAGTES DESIGN
2        2  G 4@m® Das Material ist ca. 5mm dick und die F...
3        3  Nature’s Way To Wellness “N91 . y, @ EASY TO S...
4        4  KIM JOHANSON Kaufe 3 Produkte von uns und spar...
..     ...                                                ...
196    196                                                   
197    197  iscover wellness GREEN COFFEE (| BURNFAT __) F...
198    198  GREEN COFFEE Helps Control Supports Blood by A...
199    199  16G Thick High Grade 304 Stainless Steel \ \ a...
200    200  cg @ Puppyiser WITH CHICKEN SHPURINA? @ PuPPYi...

[201 rows x 2 columns]


In [33]:
# Optionally, save the DataFrame to a CSV file
df_results.to_csv('ocr_extracted_text.csv', index=False)

In [36]:
import re
import pandas as pd
from src.constants import entity_unit_map, allowed_units  # Import from constants.py

# Load the DataFrame containing OCR extracted text
df_results = pd.read_csv('ocr_extracted_text.csv')

# Initialize an empty list to store the final predictions
final_predictions = []

# Define a function to extract values and units from text
def extract_value_and_unit(text):
    # Ensure the text is a string
    if not isinstance(text, str):
        return ""
    
    # Define regular expression to find numerical values followed by units
    # Example pattern: "34 gram", "12.5 centimetre"
    pattern = r'(\d+\.?\d*)\s*([a-zA-Z]+)'
    
    # Find all matches in the text
    matches = re.findall(pattern, text)
    
    # Process each match to find valid value and unit
    for match in matches:
        value, unit = match
        
        # Check if the unit is in the allowed units
        if unit in allowed_units:
            # Return the first valid match found
            return f"{float(value)} {unit}"
    
    # If no valid match is found, return an empty string
    return ""

# Iterate over each row in the DataFrame to extract values and units
for _, row in df_results.iterrows():
    # Get the extracted text from the 'predictions' column
    extracted_text = row['predictions']
    
    # Extract the value and unit from the text
    prediction = extract_value_and_unit(extracted_text)
    
    # Append the result to the final predictions list
    final_predictions.append(prediction)

# Add the final predictions to the DataFrame
df_results['prediction'] = final_predictions

# Drop the original 'predictions' column if not needed
df_results.drop(columns=['predictions'], inplace=True)

# Display the updated DataFrame
print(df_results.head())

# Save the final DataFrame to a CSV file
df_results.to_csv('final_predictions.csv', index=False)


   index prediction
0      0           
1      1           
2      2           
3      3           
4      4           


OCR EXTRACTION USING EASY-OCR

In [1]:
import os
import pandas as pd
import easyocr

# Initialize the OCR reader
reader = easyocr.Reader(['en'])  # Use English for OCR

# Directory containing the images
image_dir = 'ML Images\\Sample_Train'

# Create a list to store the extracted text
results = []

# Iterate over all the images in the directory
for idx, image_file in enumerate(sorted(os.listdir(image_dir))):
    # Get the full path to the image
    image_path = os.path.join(image_dir, image_file)

    # Perform OCR on the image
    try:
        # Extract text using EasyOCR
        ocr_result = reader.readtext(image_path, detail=0)  # detail=0 returns only text
        
        # Join the extracted text into a single string separated by commas
        extracted_text = ', '.join(ocr_result)
        
        # Append the result to the list
        results.append({'index': idx, 'predictions': extracted_text})
    
    except Exception as e:
        # In case of error, log the error and save an empty string for this image
        print(f"Error processing image {image_path}: {e}")
        results.append({'index': idx, 'predictions': ''})

# Convert results to a DataFrame
df_results = pd.DataFrame(results)

# Save the results to a CSV file if needed
df_results.to_csv('ocr_extracted_text_easyocr.csv', index=False)

# Display the DataFrame
print(df_results)


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


     index                                        predictions
0        0  PROPOS', NATUREJ, INGREDIENT MENAGER, MULTI-US...
1        1  TrLeeda=_, Uer, R, RRIFIC;, LEBENSMITTELECHT, ...
2        2  GroBe Kapazicat; Tragiahigkeit bis zu 30KG, Da...
3        3  Nature' s, Way To Wellness, Ir1Q1, EASY TO, SW...
4        4  KIM JOHANSON, Kaufe 3 Produkte von, uns und sp...
..     ...                                                ...
196    196                                               8, 2
197    197  NHerbal max, Di5 € 0 V e [, w eilne $ s, GREEN...
198    198  Herbal max, BENEFITS OF GREEN COFFEE, Di $ € 0...
199    199  166 Thick High Grade 304 Stainless Steel, Anti...
200    200  NEW, 00s Food OnLT, #, NEW, RPURINAP, @, SUPER...

[201 rows x 2 columns]


In [18]:
import pandas as pd
import re

# Load the train.csv and ocr_extracted.csv
train_df = pd.read_csv('ML Images\\sample_train.csv')
ocr_df = pd.read_csv('ocr_extracted_text_easyocr.csv')

# Import allowed units from constants.py (assuming constants.py is in the current directory)
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

# Flatten the units into a single set
allowed_units = {unit for units in entity_unit_map.values() for unit in units}

# Create a list to store matched entity names and their values
matched_entities = []

# Iterate through each row in train.csv
for idx, row in train_df.iterrows():
    index = row['index']
    entity_name = row['entity_name']
    entity_value = str(row['entity_value']).strip().lower()
    
    # Find the corresponding OCR text using the index
    ocr_text = ocr_df.loc[ocr_df['index'] == index, 'predictions'].values
    if len(ocr_text) == 0:
        matched_entities.append({'index': index, 'ocr_text': '', 'entity_name': entity_name, 'matched_value': ''})
        continue
    
    # Join the OCR text into one string and lowercase it
    ocr_text_str = ', '.join(ocr_text).lower()

    # Search for numerical values in the OCR text
    number_matches = re.findall(r'\b\d+(\.\d+)?\b', ocr_text_str)
    
    # Search for allowed units in the OCR text
    unit_matches = [unit for unit in allowed_units if unit in ocr_text_str]
    
    # Try to find exact match from extracted text to entity_value in train.csv
    exact_match_found = False
    matched_value = ''

    # Check combinations of numbers and units
    for number in number_matches:
        for unit in unit_matches:
            combined_value = f"{number} {unit}"
            if combined_value == entity_value:
                matched_value = combined_value
                exact_match_found = True
                break
        if exact_match_found:
            break

    # If no exact match, use the first found number and unit as a fallback
    if not exact_match_found:
        if number_matches and unit_matches:
            matched_value = f"{number_matches[0]} {unit_matches[0]}"
        elif number_matches:
            matched_value = number_matches[0]
        elif unit_matches:
            matched_value = unit_matches[0]
    
    # Append the result to the matched entities list
    matched_entities.append({
        'index': index,
        'ocr_text': ocr_text_str,
        'entity_name': entity_name,
        'matched_value': matched_value
    })

# Convert matched entities to a DataFrame
matched_df = pd.DataFrame(matched_entities)

# Save matched DataFrame for reference
matched_df.to_csv('matched_entities_with_units.csv', index=False)

# Display the DataFrame
print(matched_df.head())

   index                                           ocr_text  entity_name  \
0      0  propos', naturej, ingredient menager, multi-us...  item_weight   
1      1  trleeda=_, uer, r, rrific;, lebensmittelecht, ...  item_volume   
2      2  grobe kapazicat; tragiahigkeit bis zu 30kg, da...  item_weight   
3      3  nature' s, way to wellness, ir1q1, easy to, sw...  item_weight   
4      4  kim johanson, kaufe 3 produkte von, uns und sp...  item_weight   

  matched_value  
0           ton  
1                
2                
3                
4                
