In [1]:
# Explicit %pip magic: installs into the running kernel's environment and does not
# depend on automagic being enabled (bare `pip ...` only works via automagic).
%pip install easyocr

Note: you may need to restart the kernel to use updated packages.


OCR EXTRACTION FOR TEST IMAGES

In [None]:
import os
import pandas as pd
import easyocr

# English-only EasyOCR reader, created once and reused for every image.
reader = easyocr.Reader(['en'])

# Folder that holds the test images.
image_dir = '/kaggle/input/test-amazonml/Test'

# One record per directory entry: 'index' is the entry's position in the
# sorted listing, 'predictions' is the OCR text (empty when OCR failed).
# NOTE(review): later cells merge on 'index'; sorted() orders file names as
# strings, so confirm this position lines up with the CSV's index column.
results = []

for position, file_name in enumerate(sorted(os.listdir(image_dir))):
    full_path = os.path.join(image_dir, file_name)

    try:
        # detail=0 makes readtext return only the recognised strings.
        fragments = reader.readtext(full_path, detail=0)
        record = {'index': position, 'predictions': ', '.join(fragments)}
    except Exception as err:
        # Best effort: report the failure and keep an empty row for this entry.
        print(f"Error processing image {full_path}: {err}")
        record = {'index': position, 'predictions': ''}

    results.append(record)

# Tabulate the records, persist them for the modelling cells, and show them.
df_results = pd.DataFrame(results)
df_results.to_csv('ocr_extracted_text_test_easyocr.csv', index=False)
print(df_results)


  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


OCR EXTRACTION ON TRAIN SET

In [None]:
import os
import pandas as pd
import easyocr

# English-only EasyOCR reader, created once and reused for every image.
reader = easyocr.Reader(['en'])

# Folder that holds the training images.
image_dir = '/kaggle/input/train-amazonml/Train'

# One record per directory entry: 'index' is the entry's position in the
# sorted listing, 'predictions' is the OCR text (empty when OCR failed).
# NOTE(review): later cells merge on 'index'; sorted() orders file names as
# strings, so confirm this position lines up with the CSV's index column.
results = []

for position, file_name in enumerate(sorted(os.listdir(image_dir))):
    full_path = os.path.join(image_dir, file_name)

    try:
        # detail=0 makes readtext return only the recognised strings.
        fragments = reader.readtext(full_path, detail=0)
        record = {'index': position, 'predictions': ', '.join(fragments)}
    except Exception as err:
        # Best effort: report the failure and keep an empty row for this entry.
        print(f"Error processing image {full_path}: {err}")
        record = {'index': position, 'predictions': ''}

    results.append(record)

# Tabulate the records, persist them for the modelling cells, and show them.
df_results = pd.DataFrame(results)
df_results.to_csv('ocr_extracted_text_easyocr.csv', index=False)
print(df_results)

ML MODEL ON THE EXTRACTED TEXT FOR PREDICTING ENTITY VALUES ON THE TEST SET

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from src.constants import entity_unit_map

def preprocess_text(text):
    """Normalise a piece of OCR text for vectorisation.

    Args:
        text: Any scalar value; missing values (as judged by ``pd.isna``)
            are accepted.

    Returns:
        ``""`` for a missing value; otherwise ``str(text)`` lowercased with
        every character that is neither a word character nor whitespace
        removed.
    """
    if not pd.isna(text):
        lowered = str(text).lower()
        # Strip punctuation/symbols; letters, digits, underscore and whitespace stay.
        return re.sub(r'[^\w\s]', '', lowered)
    return ""

In [None]:
def extract_entity_value(text, entity_name, unit_map=None):
    """Extract a numeric value, with a unit when one is known, from OCR text.

    Lookup order:
      1. If ``entity_name`` has units in the mapping, return the first
         ``"<number> <unit>"`` where the number is directly followed (optional
         whitespace, case-insensitive) by that unit.
      2. Otherwise, if the text contains any number and the entity has at
         least one unit, return that number paired with the first candidate
         unit (an arbitrary but now deterministic choice).
      3. Otherwise return the first number on its own, or ``None`` when the
         text contains no number.

    Units held in a ``set``/``frozenset`` are tried longest-first (ties broken
    alphabetically). Set iteration order for strings is not stable between
    processes, so the original result could differ from run to run, and a
    short unit could shadow a longer one it is a prefix of. Ordered
    containers (list/tuple) keep their own order.

    Args:
        text: OCR text; missing values (``pd.isna``) yield ``None``.
        entity_name: Key looked up in the unit mapping.
        unit_map: Optional mapping of entity name -> iterable of unit strings.
            Defaults to the module-level ``entity_unit_map``.

    Returns:
        ``"<number> <unit>"``, ``"<number>"``, or ``None``.
    """
    if pd.isna(text):
        return None
    text = str(text)

    if unit_map is None:
        unit_map = entity_unit_map

    number_pattern = r'\d+(?:\.\d+)?'

    candidates = []
    if entity_name in unit_map:
        units = unit_map[entity_name]
        if isinstance(units, (set, frozenset)):
            # Deterministic order; longest first so e.g. a longer unit wins
            # over a shorter unit that is its prefix.
            candidates = sorted(units, key=lambda unit: (-len(unit), unit))
        else:
            candidates = list(units)

        for unit in candidates:
            match = re.search(
                rf'({number_pattern})\s*{re.escape(unit)}', text, re.IGNORECASE
            )
            if match:
                return f"{match.group(1)} {unit}"

    # No number+unit pair found: fall back to the first bare number.
    number_match = re.search(number_pattern, text)
    if number_match is None:
        return None
    if candidates:
        # An empty unit collection previously raised StopIteration here.
        return f"{number_match.group()} {candidates[0]}"
    return number_match.group()


In [None]:
def train_model(train_df, ocr_df):
    """Fit a TF-IDF + multinomial naive Bayes classifier on OCR text.

    The two frames are merged on their ``'index'`` column, the
    ``'predictions'`` column is passed through ``preprocess_text``, and the
    resulting text is used to predict the ``'entity_name'`` column.

    Args:
        train_df: Frame providing ``'index'`` and ``'entity_name'``.
        ocr_df: Frame providing ``'index'`` and ``'predictions'``.

    Returns:
        The fitted ``Pipeline``.

    Raises:
        Exception: Any failure is printed and then re-raised unchanged.
    """
    try:
        joined = pd.merge(train_df, ocr_df, on='index')
        joined['processed_text'] = joined['predictions'].apply(preprocess_text)

        features = joined['processed_text']
        labels = joined['entity_name']

        steps = [
            ('tfidf', TfidfVectorizer()),
            ('clf', MultinomialNB()),
        ]
        classifier = Pipeline(steps)
        classifier.fit(features, labels)
        return classifier
    except Exception as exc:
        print(f"Error in train_model: {str(exc)}")
        raise

In [None]:
def predict_entity_values(model, test_df, ocr_df):
    """Extract an entity value from the OCR text of every test row.

    The frames are merged on ``'index'``; for each merged row
    ``extract_entity_value`` is called with the row's ``'predictions'`` text
    and its ``'entity_name'``.

    Args:
        model: Accepted for interface compatibility; it is not used by this
            function (values come from regex extraction only).
        test_df: Frame providing ``'index'`` and ``'entity_name'``.
        ocr_df: Frame providing ``'index'`` and ``'predictions'``.

    Returns:
        A DataFrame with exactly two columns: ``'index'`` and
        ``'predictions'`` (the extracted value, or ``None`` when nothing was
        found).

    Raises:
        Exception: Any failure is printed and then re-raised unchanged.
    """
    try:
        merged_df = pd.merge(test_df, ocr_df, on='index')

        # Walk the two needed columns directly instead of iterrows(); the
        # previously computed 'processed_text' column was never read, so it is
        # no longer built.
        predicted_values = [
            extract_entity_value(ocr_text, entity_name)
            for ocr_text, entity_name in zip(
                merged_df['predictions'], merged_df['entity_name']
            )
        ]

        # Only 'index' and 'predictions' are kept in the output.
        return pd.DataFrame({
            'index': merged_df['index'],
            'predictions': predicted_values
        })
    except Exception as e:
        print(f"Error in predict_entity_values: {str(e)}")
        raise

In [None]:
def main():
    """Run the pipeline end to end: load CSVs, train, predict, save.

    Reads the train/test CSVs and the two OCR result CSVs, fits the model via
    ``train_model``, obtains predictions via ``predict_entity_values`` and
    writes them to ``test_predictions.csv``. Any exception is reported with
    ``print`` and is not re-raised.
    """
    try:
        # Training labels and the OCR text extracted from the training images.
        train_frame = pd.read_csv('/kaggle/input/csv-for-amazon-ml-challenge/train.csv')
        train_ocr = pd.read_csv('ocr_extracted_text_easyocr.csv')
        # Test rows and the OCR text extracted from the test images.
        test_frame = pd.read_csv('/kaggle/input/csv-for-amazon-ml-challenge/test.csv')
        test_ocr = pd.read_csv('ocr_extracted_text_test_easyocr.csv')

        fitted_model = train_model(train_frame, train_ocr)
        predictions = predict_entity_values(fitted_model, test_frame, test_ocr)

        # Output holds only the 'index' and 'predictions' columns.
        predictions.to_csv('test_predictions.csv', index=False)
        print("Predictions saved to 'test_predictions.csv'")
    except Exception as err:
        print(f"Error in main function: {str(err)}")

if __name__ == "__main__":
    main()