In [1]:
!git clone https://github.com/Aniruddha-Ponnuri/Amazon--ML-challenge.git

Cloning into 'Amazon--ML-challenge'...
remote: Enumerating objects: 35, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 35 (delta 6), reused 34 (delta 5), pack-reused 0 (from 0)[K
Receiving objects: 100% (35/35), 4.96 MiB | 12.20 MiB/s, done.
Resolving deltas: 100% (6/6), done.


In [2]:
!pip install pytesseract spacy

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [3]:
!apt-get update
!apt-get install -y tesseract-ocr
!pip install pytesseract

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.82)] [Connected to cloud.r-                                                                                                    Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
                                                                                                    Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [2 InRelease 63.4 kB/128 kB 50%] [Connecting to security.ubuntu.com (185.125.190.82)] [Connected 0% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.edu (192.17.190.167)]                                                                                                    Get:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
0% [4 InRelease 15.6 kB/127 kB 12%] [Waiting for headers] [Connected to r2u.s

In [4]:
import pandas as pd
import numpy as np
import requests
from io import BytesIO
from PIL import Image
import pytesseract
import re
import random
import spacy
from spacy.util import minibatch, compounding
from tqdm import tqdm
import os
import sys
sys.path.append('/content/Amazon--ML-challenge/')
from src.utils import parse_string
from src.constants import entity_unit_map
from src.sanity import sanity_check

In [5]:
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

In [6]:
# 1. Load Data
train_df = pd.read_csv('/content/Amazon--ML-challenge/dataset/sample.csv')
test_df = pd.read_csv('/content/Amazon--ML-challenge/dataset/sample_test.csv')

In [7]:
# 2. Process Images Directly from URLs
def load_image_from_url(url):
    try:
        response = requests.get(url, timeout=10)
        img = Image.open(BytesIO(response.content)).convert('RGB')
        return img
    except Exception as e:
        print(f"Error loading image from {url}: {e}")
        # Create a black placeholder image
        img = Image.new('RGB', (100, 100), color='black')
        return img

In [8]:
# 3. Extract Text from Images Using OCR
def extract_text_from_image(img):
    try:
        text = pytesseract.image_to_string(img)
        return text
    except Exception as e:
        print(f"Error during OCR: {e}")
        return ''


In [9]:
print("Extracting OCR text from training images...")
train_texts = []
for idx, row in tqdm(train_df.iterrows(), total=train_df.shape[0]):
    image_link = row['image_link']
    img = load_image_from_url(image_link)
    text = extract_text_from_image(img)
    train_texts.append(text)

train_df['ocr_text'] = train_texts

Extracting OCR text from training images...


100%|██████████| 199/199 [04:34<00:00,  1.38s/it]


In [10]:
# 5. Create Training Data for NER Model
def create_training_data(df):
    training_data = []
    for idx, row in df.iterrows():
        text = row['ocr_text']
        entity_value = str(row['entity_value']).strip()
        if not text or not entity_value:
            continue
        start = text.lower().find(entity_value.lower())
        if start != -1:
            end = start + len(entity_value)
            entities = [(start, end, 'ENTITY')]
            training_data.append((text, {"entities": entities}))
    return training_data

In [11]:
print("Creating training data for NER model...")
training_data = create_training_data(train_df)
print(f"Number of training examples: {len(training_data)}")

Creating training data for NER model...
Number of training examples: 2


In [12]:
nlp = spacy.blank('en')

if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner', last=True)
else:
    ner = nlp.get_pipe('ner')

ner.add_label('ENTITY')

n_iter = 10

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    print("Training the NER model...")
    for itn in range(n_iter):
        random.shuffle(training_data)
        losses = {}
        batches = minibatch(training_data, size=compounding(4.0, 32.0, 1.5))
        for batch in batches:
            texts, annotations = zip(*batch)
            try:
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            except Exception as e:
                continue
        print(f"Iteration {itn + 1}/{n_iter}, Losses: {losses}")


Training the NER model...
Iteration 1/10, Losses: {}
Iteration 2/10, Losses: {}
Iteration 3/10, Losses: {}
Iteration 4/10, Losses: {}
Iteration 5/10, Losses: {}
Iteration 6/10, Losses: {}
Iteration 7/10, Losses: {}
Iteration 8/10, Losses: {}
Iteration 9/10, Losses: {}
Iteration 10/10, Losses: {}


In [13]:
model_output_dir = 'ner_model'
nlp.to_disk(model_output_dir)
print(f"Model saved to {model_output_dir}")

Model saved to ner_model


In [14]:
test_df = pd.read_csv('/content/Amazon--ML-challenge/dataset/test.csv')

In [None]:
# 7. Predict Entity Values for test.csv
print("Extracting OCR text from test images...")
test_texts = []
for idx, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    image_link = row['image_link']
    img = load_image_from_url(image_link)
    text = extract_text_from_image(img)
    test_texts.append(text)

test_df['ocr_text'] = test_texts

Extracting OCR text from test images...


  7%|▋         | 8921/131187 [1:00:05<11:51:25,  2.86it/s]

In [None]:
# Function to extract entity value using the NER model
def extract_entity_value(text):
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == 'ENTITY':
            return ent.text.strip()
    return ''

In [None]:
# Function to format prediction
def format_prediction(pred_text, entity_name):
    if not pred_text:
        return ''
    allowed_units = entity_unit_map[entity_name]
    units_pattern = r'|'.join([re.escape(unit) for unit in allowed_units])
    pattern = rf'(\d+\.?\d*)\s*({units_pattern})\b'
    match = re.search(pattern, pred_text, re.IGNORECASE)
    if match:
        value, unit = match.groups()
        try:
            value = str(float(value))
            unit = unit.lower()
            if unit in allowed_units:
                return f"{value} {unit}"
        except ValueError:
            return ''
    return ''

In [None]:
# Predict and format
print("Predicting entity values for test data...")
formatted_predictions = []
for idx, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    text = row['ocr_text']
    entity_name = row['entity_name']
    pred_text = extract_entity_value(text)
    formatted_prediction = format_prediction(pred_text, entity_name)
    formatted_predictions.append(formatted_prediction)

In [None]:
# 8. Save Predictions to CSV
output_df = pd.DataFrame({
    'index': test_df['index'],
    'prediction': formatted_predictions
})

assert len(output_df) == len(test_df), "Mismatch in number of predictions."

output_df.to_csv('test_out.csv', index=False)
print("Predictions saved to test_out.csv")

# 9. Run Sanity Checker
print("Running sanity checker...")
try:
    sanity_check('dataset/test.csv', 'test_out.csv')
    print("Sanity check passed.")
except Exception as e:
    print(f"Sanity check failed: {e}")