## Import Libraries


In [2]:
import os
import requests
from PIL import Image
from io import BytesIO
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor  # Example model
from sklearn.metrics import mean_squared_error

# === Step 1: Image Downloading === #

In [3]:
# Folder that receives the downloaded images; created up front, and an
# already-existing folder is left untouched.
image_dir = 'images'
os.makedirs(name=image_dir, exist_ok=True)

## Load Datasets

In [4]:
# Read the training table from train.csv in the working directory
train_df = pd.read_csv(filepath_or_buffer='train.csv')

## Download Images

In [6]:
def download_images(image_url, image_id, save_dir, timeout=30):
    """Download one image and save it as ``<save_dir>/<image_id>.jpg``.

    The outcome is reported with ``print``; any exception raised while
    fetching, decoding or saving is caught and printed, never propagated.

    Args:
        image_url: Address passed to ``requests.get``.
        image_id: Value used as the file name stem.
        save_dir: Directory the ``.jpg`` file is written into.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection blocks the caller
            indefinitely.

    Returns:
        None.
    """
    try:
        response = requests.get(image_url, timeout=timeout)
        if response.status_code == 200:
            image = Image.open(BytesIO(response.content))
            # The JPEG writer only accepts these modes; anything else (for
            # example RGBA or palette images) would raise in save() and the
            # image would be lost, so convert those to RGB first.
            if image.mode not in ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr'):
                image = image.convert('RGB')
            image_path = os.path.join(save_dir, f'{image_id}.jpg')
            image.save(image_path)
            print(f"Image {image_id} downloaded successfully!")
        else:
            print(f"Failed to download image {image_id}: {response.status_code}")
    except Exception as e:
        # Best effort: one bad URL or corrupt payload must not stop a bulk run.
        print(f"Error downloading image {image_id}: {e}")
# Fetch the image behind every row's 'image_link'; the row's index label
# becomes the image id (and therefore the file name).
for image_id, image_url in train_df['image_link'].items():
    download_images(image_url, image_id, image_dir)

Streaming output truncated to the last 5000 lines.
Image 21314 downloaded successfully!
Image 21315 downloaded successfully!
Image 21316 downloaded successfully!
Image 21317 downloaded successfully!
Image 21318 downloaded successfully!
Image 21319 downloaded successfully!
Image 21320 downloaded successfully!
Image 21321 downloaded successfully!
Image 21322 downloaded successfully!
Image 21323 downloaded successfully!
Image 21324 downloaded successfully!
Image 21325 downloaded successfully!
Image 21326 downloaded successfully!
Image 21327 downloaded successfully!
Image 21328 downloaded successfully!
Image 21329 downloaded successfully!
Image 21330 downloaded successfully!
Image 21331 downloaded successfully!
Image 21332 downloaded successfully!
Image 21333 downloaded successfully!
Image 21334 downloaded successfully!
Image 21335 downloaded successfully!
Image 21336 downloaded successfully!
Image 21337 downloaded successfully!
Image 21338 downloaded successfully!
Image 2133

KeyboardInterrupt: 

# === Step 2: OCR and Preprocessing === #

In [None]:
def preprocess_image(image_path, threshold=150):
    """Load an image, convert it to grayscale and binarize it.

    Args:
        image_path: Path of the image file to open.
        threshold: Gray level cut-off passed to ``cv2.threshold`` with
            ``THRESH_BINARY``: pixels above it become 255, the rest 0.

    Returns:
        A PIL image built from the thresholded array.
    """
    # The context manager closes the file handle once convert() has produced
    # the grayscale copy.
    with Image.open(image_path) as source:
        gray = np.array(source.convert('L'))
    # Mode 'L' data is already single-channel. The previous RGB->BGR
    # cvtColor call rejects single-channel input, so every call failed;
    # the threshold is applied to the grayscale array directly.
    _, binary = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)
    return Image.fromarray(binary)

def extract_text_from_image(image_path):
    """Run OCR on the preprocessed image at ``image_path``.

    Args:
        image_path: Path handed to ``preprocess_image``.

    Returns:
        The string produced by ``pytesseract.image_to_string``, or ``""``
        when preprocessing or OCR raises (the error is printed).

    Raises:
        ImportError: If ``pytesseract`` is not installed.
    """
    # pytesseract is not among the notebook's top-level imports. Without this
    # import the call below raised NameError, the handler swallowed it, and
    # every image silently produced "". It is imported outside the try block
    # so a missing installation fails once, loudly, instead of once per image.
    import pytesseract

    try:
        img = preprocess_image(image_path)
        return pytesseract.image_to_string(img)
    except Exception as e:
        # Best effort per image: report and continue with an empty result.
        print(f"Error processing image {image_path}: {e}")
        return ""

# OCR every .jpg in the image folder; results are keyed by file name
ocr_results = {
    jpg_name: extract_text_from_image(os.path.join(image_dir, jpg_name))
    for jpg_name in os.listdir(image_dir)
    if jpg_name.endswith(".jpg")
}


# === Step 3: Preprocess Entity Values === #

In [None]:
# "<number> <unit>": optional minus sign, digits with an optional decimal
# part, whitespace, then a unit made of letters and spaces. Compiled once
# instead of on every call.
_ENTITY_VALUE_PATTERN = re.compile(r'^-?\d+(\.\d+)?\s+[a-zA-Z\s]+$')


def preprocess_entity_value(value):
    """Split an entity value such as ``"500.0 gram"`` into number and unit.

    Args:
        value: The raw cell content; converted with ``str`` and stripped.

    Returns:
        ``(number, unit)`` with ``number`` as a float and ``unit`` as the text
        after the first whitespace run, or ``(None, None)`` when the value is
        missing (None/NaN) or blank.

    Raises:
        ValueError: If the non-blank text does not match "<number> <unit>".
    """
    # A missing cell would otherwise be stringified to 'nan' / 'None' and
    # rejected by the pattern, aborting the whole column conversion.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None, None
    value = str(value).strip()
    if not value:
        return None, None
    if not _ENTITY_VALUE_PATTERN.match(value):
        raise ValueError(f"Invalid format in {value}")
    number_text, unit = value.split(maxsplit=1)
    return float(number_text), unit

# Parse each entity_value once, then build both columns with a single
# DataFrame construction. Expanding the tuples with .apply(pd.Series) creates
# one Series per row, which is very slow on large frames, and yields no
# columns at all (so the assignment fails) when the frame is empty.
parsed_values = train_df['entity_value'].map(preprocess_entity_value)
train_df[['value', 'unit']] = pd.DataFrame(
    parsed_values.tolist(), index=train_df.index, columns=['value', 'unit']
)


# === Step 4: Encode Categorical Features and Prepare Dataset === #

In [None]:
# preprocess_entity_value returns (None, None) for blank input, so 'value'
# can be missing. The regressor cannot be fitted on a missing target, so
# those rows are left out of the modelling data.
labeled_df = train_df.dropna(subset=['value'])

# One-hot encode the categorical columns; the target is forced to float so an
# object-typed 'value' column does not reach the estimator.
X = pd.get_dummies(labeled_df[['group_id', 'entity_name']])
y = labeled_df['value'].astype(float)

# Split dataset into train and test sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === Step 5: Train the Model === #

In [None]:
# Fit a 100-tree random forest with a fixed seed on the training split
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out split with mean squared error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')


# === Step 6: Make Predictions on New Data === #

In [None]:
test_df = pd.read_csv('test.csv')

# One-hot encoding the test file on its own yields whatever categories occur
# there. Align to the training feature columns so the model sees the same
# columns in the same order: categories absent from the test file are filled
# with 0, and categories never seen in training are dropped.
X_test_new = (
    pd.get_dummies(test_df[['group_id', 'entity_name']])
    .reindex(columns=X.columns, fill_value=0)
)
y_pred_new = model.predict(X_test_new)

# Add predictions to the test DataFrame
test_df['predicted_value'] = y_pred_new

# Save predictions to CSV (for competition submission)
test_df.to_csv('test_predictions.csv', index=False)