In [1]:
import os
import cv2
import pandas as pd
import numpy as np
from PIL import Image
import pytesseract
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from utils import parse_string
import constants

In [2]:
# Load data
train_df = pd.read_csv('../dataset/train.csv')
test_df = pd.read_csv('../dataset/test.csv')

# Fail fast on schema problems: every column listed here is read by a later
# cell (training loop, prediction loop, or the export), and a missing one would
# otherwise only surface as a KeyError in the middle of the per-image OCR loops.
for _csv_name, _frame, _required in (
    ('train.csv', train_df, ('image_link', 'entity_name', 'entity_value')),
    ('test.csv', test_df, ('index', 'image_link', 'entity_name')),
):
    _missing = [col for col in _required if col not in _frame.columns]
    if _missing:
        raise KeyError(f"{_csv_name} is missing required column(s): {_missing}")


In [None]:
def preprocess_image(image_path):
    """Open the image at ``image_path`` and return it with its pixel data loaded.

    ``Image.open`` is lazy: it only reads the header and keeps the underlying
    file open until the pixel data is actually needed. Loading inside a
    ``with`` block decodes the image here (so unreadable files fail in this
    function rather than later during OCR) and releases the file handle before
    returning, which matters when this is called once per dataset row.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The opened image object, already loaded into memory.
    """
    with Image.open(image_path) as img:
        # Decode now; after the block exits the file is closed but the loaded
        # pixel data stays usable.
        img.load()
    # Add your image preprocessing steps here
    return img

In [None]:
def extract_text(img):
    """Return the text that pytesseract recognises in ``img``.

    Args:
        img: Image to pass to ``pytesseract.image_to_string``.

    Returns:
        The value produced by ``pytesseract.image_to_string(img)``, unmodified.
    """
    return pytesseract.image_to_string(img)

In [None]:
def extract_features(text, entity_name):
    """Placeholder feature extractor: currently yields no features.

    Both arguments are ignored for now and a fresh empty list is returned on
    every call.

    Args:
        text: Text associated with the sample (the callers pass OCR output).
        entity_name: Name of the entity the features are meant to describe.

    Returns:
        A new, empty list.
    """
    # TODO: implement feature extraction based on the entity type; this stub
    # needs to be expanded to match the chosen modelling approach.
    return []

In [None]:
def train_model(X, y, n_estimators=100, random_state=42):
    """Fit a ``RandomForestClassifier`` on ``X`` / ``y`` and return it.

    Args:
        X: Training samples, passed straight to ``fit``.
        y: Target labels, passed straight to ``fit``.
        n_estimators: Forwarded to ``RandomForestClassifier``. Defaults to 100,
            the value that was previously hard-coded.
        random_state: Forwarded to ``RandomForestClassifier``. Defaults to 42,
            the value that was previously hard-coded.

    Returns:
        The fitted classifier.
    """
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X, y)
    return model

In [None]:
def predict_entity_value(model, features):
    """Predict with ``model`` for ``features`` and return the formatted result.

    Args:
        model: Object exposing ``predict``.
        features: Either a flat (1-D) feature vector describing one sample, or
            an already 2-D ``(n_samples, n_features)`` structure. 2-D input is
            forwarded to ``model.predict`` untouched.

    Returns:
        Whatever ``format_prediction`` returns for the model's prediction.
    """
    if np.ndim(features) == 1:
        # scikit-learn estimators reject 1-D input ("Expected 2D array"); a flat
        # vector is one sample, so turn it into a single-row matrix.
        features = np.asarray(features).reshape(1, -1)
    prediction = model.predict(features)
    # Convert prediction to the required format (value + unit)
    return format_prediction(prediction)

In [None]:
def format_prediction(prediction):
    """Turn a model prediction into the output string.

    The previous version always read ``prediction[1]``, which raises
    ``IndexError`` for the length-1 result that ``predict`` produces for a
    single sample. Inputs with two or more elements are formatted exactly as
    before.

    Args:
        prediction: Indexable, sized sequence holding the prediction.

    Returns:
        * ``""`` when ``prediction`` is empty
          (NOTE(review): confirm an empty string is the desired "no prediction"
          output).
        * ``str`` of the only element when there is exactly one
          (presumably already a complete "value unit" label, since the model
          is trained on ``entity_value`` -- TODO confirm).
        * ``"<prediction[0]> <prediction[1]>"`` otherwise.
    """
    # This is still a placeholder and should be adapted to the model's real output.
    part_count = len(prediction)
    if part_count == 0:
        return ""
    if part_count == 1:
        return f"{prediction[0]}"
    return f"{prediction[0]} {prediction[1]}"

In [None]:

# Prepare training data: one feature list (X) and one target (y) per training row.
X = []
y = []
for _, row in train_df.iterrows():
    # Images are looked up locally by the file-name part of the link.
    img_path = f"../images/train/{os.path.basename(row['image_link'])}"
    img = preprocess_image(img_path)
    text = extract_text(img)
    features = extract_features(text, row['entity_name'])
    X.append(features)
    y.append(row['entity_value'])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the classifier here so that `model` exists for the prediction cell; without
# this call a fresh "Restart & Run All" stops with a NameError on `model`.
model = train_model(X_train, y_train)

# Put the held-out 20% to use: mean accuracy of the fitted model on the validation split.
print(f"Validation accuracy: {model.score(X_val, y_val):.4f}")


In [None]:
# Make predictions on test set
# One prediction per test row, in row order. `model` must already exist in the
# kernel when this cell runs.
test_predictions = []
for image_link, entity_name in zip(test_df['image_link'], test_df['entity_name']):
    img = preprocess_image(f"../images/test/{os.path.basename(image_link)}")
    features = extract_features(extract_text(img), entity_name)
    test_predictions.append(predict_entity_value(model, features))

In [None]:
# Create output file
# Pair each test row's `index` value with its prediction, then write the
# two-column table to test_out.csv without the DataFrame's own row index.
output_df = test_df[['index']].assign(prediction=test_predictions)
output_df.to_csv('test_out.csv', index=False)


In [None]:
# Run sanity check: shell out to sanity.py, handing it the test CSV and the
# predictions file written by the export cell (test_out.csv).
# NOTE(review): what sanity.py actually verifies is not visible in this notebook --
# presumably the output format; confirm against the script.
!python sanity.py --test_filename ../dataset/test.csv --output_filename test_out.csv