# In [3]:
import pandas as pd
import pytesseract
import re
from PIL import Image
import requests
from io import BytesIO
#from utils import download_images

# Set the path for tesseract if needed
# pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'

# Function to download images from URLs
def download_image(url, timeout=10):
    """Download the image at *url* and return it as a PIL image.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would stall the whole run.

    Returns:
        The decoded ``PIL.Image.Image``, or ``None`` if the download or the
        decoding failed (the error is printed, not raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error statuses (404, 500, ...) as download failures
        # instead of trying to parse an error page as an image.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        # Image.open is lazy; force the decode here so that truncated or
        # corrupt data is reported by this function rather than later on.
        img.load()
        return img
    except Exception as e:
        # Best-effort by design: one bad URL must not abort the batch.
        print(f"Error downloading image from {url}: {e}")
        return None

# Units recognised for each entity. Width, height and depth share the same
# units, so the text alone cannot tell them apart: the first measurement found
# is returned for any of them.
_ENTITY_UNITS = {
    'width': ('cm', 'mm', 'inches', 'inch'),
    'height': ('cm', 'mm', 'inches', 'inch'),
    'depth': ('cm', 'mm', 'inches', 'inch'),
    'voltage': ('volt', 'v'),
    'wattage': ('watt', 'kw', 'w'),
    'item_weight': ('kg', 'g', 'gram', 'ton'),
    'maximum_weight_recommendation': ('kg', 'g', 'gram', 'ton'),
}


def _compile_entity_pattern(units):
    """Build the compiled "<number> <unit>" regex for a tuple of unit names."""
    # Longest spelling first, so "gram" is not cut short to "g".
    alternatives = '|'.join(sorted(units, key=len, reverse=True))
    # Optional plural "s", then a lookahead that rejects units which are only
    # the prefix of a longer word (e.g. the "g" in "5 goats").
    return re.compile(rf'(\d+(?:\.\d+)?)\s*({alternatives})s?(?![a-z])')


# Compiled once at import time instead of on every call.
_ENTITY_PATTERNS = {
    name: _compile_entity_pattern(units)
    for name, units in _ENTITY_UNITS.items()
}


# Extract the value for the given entity from the image text
def extract_value_from_text(text, entity_name):
    """Return the first "<number> <unit>" measurement found for *entity_name*.

    Args:
        text: Raw text (e.g. OCR output). Matching is case-insensitive.
        entity_name: One of the keys of ``_ENTITY_UNITS``.

    Returns:
        A string such as ``"20 cm"`` (unit in lowercase, singular as listed in
        ``_ENTITY_UNITS``), or ``""`` when the entity is unknown, the text is
        empty, or no measurement is found.
    """
    pattern = _ENTITY_PATTERNS.get(entity_name)
    if pattern is None or not text:
        return ""

    match = pattern.search(text.lower())  # Normalize text to lowercase
    if match is None:
        return ""

    value, unit = match.groups()
    return f"{value} {unit}"

# Process a single image to extract the entity value
def process_image(image_url, entity_name):
    """Download one image, OCR it and extract the value for *entity_name*.

    Args:
        image_url: Address of the image to process.
        entity_name: Entity to look for in the recognised text.

    Returns:
        The value produced by ``extract_value_from_text``, or ``""`` when the
        image could not be downloaded or the OCR step failed.

    Raises:
        pytesseract.TesseractNotFoundError: If the tesseract binary is not
            installed. This is a setup problem that affects every image, so
            it is not swallowed.
    """
    img = download_image(image_url)
    if img is None:
        return ""

    try:
        extracted_text = pytesseract.image_to_string(img)
    except pytesseract.TesseractNotFoundError:
        raise
    except Exception as e:
        # Same best-effort policy as download_image: a single image that
        # cannot be OCR'd must not abort the whole batch.
        print(f"Error running OCR on image from {image_url}: {e}")
        return ""

    return extract_value_from_text(extracted_text, entity_name)

# Main function to process the entire DataFrame and predict values
def predict_entity_values(df):
    """Predict an entity value for every row of *df* and save the results.

    Args:
        df: DataFrame with the columns ``index``, ``image_link`` and
            ``entity_name``.

    Returns:
        A DataFrame with the columns ``index`` and ``prediction`` (an empty
        string where nothing could be extracted). The same table is written
        to ``predictions.csv`` in the current working directory.
    """
    rows = [
        # "or ''" keeps the prediction column free of None/NaN values.
        (row_index, process_image(image_url, entity_name) or "")
        for row_index, image_url, entity_name in zip(
            df['index'], df['image_link'], df['entity_name']
        )
    ]

    # Name the columns explicitly so that an empty input still yields a
    # DataFrame (and a CSV header) with the expected columns.
    prediction_df = pd.DataFrame(rows, columns=['index', 'prediction'])

    # Save to CSV
    prediction_df.to_csv('predictions.csv', index=False)

    return prediction_df

# Example usage: run the pipeline on the sample test set and preview the result.
SAMPLE_TEST_CSV = '/Users/kaustubh/Downloads/student_resource 3/dataset/sample_test.csv'

df = pd.read_csv(SAMPLE_TEST_CSV)  # Load the test dataset
result_df = predict_entity_values(df)  # Also writes predictions.csv
print(result_df.head())


# Output:
#    index prediction
# 0      0      20 cm
# 1      1      20 cm
# 2      2
# 3      3
# 4      4
