
**1. Extract Text from Images Using EasyOCR and Save Results to CSV:**

*Libraries Used: cv2, numpy, easyocr, os, pandas, torch.*
Process:
1. Images are loaded from a folder using os.listdir().
2. Text is extracted from each image using EasyOCR, a deep learning-based OCR library that supports GPU acceleration via CUDA.
3. The recognize_text() function uses EasyOCR to extract text from each image.
4. The extracted text is saved in a CSV file along with the corresponding image names. A recognized text fragment is only included if its recognition probability is at least 0.2.
5. The GPU (CUDA) is used for text recognition if available, otherwise, the CPU is used.
6. The resulting CSV contains two columns: Image Name and Detected Text.

In [None]:
import cv2
import numpy as np
import easyocr
import os
import pandas as pd
import torch

# Folder holding the images to process; its entries are listed once up front
img_folder_path = r"E:\ML_challenge_DATASET\TEST\Test_data\Folder_10"


def create_path(f):
    """Return the full path of the entry named *f* inside ``img_folder_path``."""
    return os.path.join(img_folder_path, f)


test_image_files = os.listdir(img_folder_path)

# Recognize text function using EasyOCR with CUDA support
def recognize_text(img_path, reader):
    """Run *reader* on the image at *img_path* and return its raw detections.

    The result is passed through unchanged from ``reader.readtext``. Whether
    the GPU is used is decided by how *reader* was constructed, not here.
    """
    detections = reader.readtext(img_path)
    return detections

# Extracting text from all images in the folder and saving to a CSV
def save_ocr_results_to_csv(folder_path, csv_filename):
    """Run OCR on every image in a folder and save the results to a CSV.

    Files are processed in sorted name order so the CSV is reproducible
    (``os.listdir`` returns entries in arbitrary order). An image that fails
    is reported and skipped instead of aborting the rest of the folder.
    Whatever has been collected is always written out, even if the run is
    interrupted.

    Args:
        folder_path: Folder whose image files (.png, .jpg, .jpeg, .bmp,
            .tiff; matched case-insensitively) are processed.
        csv_filename: Destination CSV path. It receives two columns,
            'Image Name' and 'Detected Text'.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
    min_probability = 0.2  # fragments recognised below this are dropped

    data = []       # [image name, detected text] rows
    failed = []     # names of images that raised during OCR
    completed = False

    # Check if CUDA is available
    use_cuda = torch.cuda.is_available()
    device = 'GPU' if use_cuda else 'CPU'
    print(f"Using {device} for computation.")

    # Create the EasyOCR reader with GPU support if available
    reader = easyocr.Reader(['en'], gpu=use_cuda)

    try:
        for img_file in sorted(os.listdir(folder_path)):
            if not img_file.lower().endswith(image_extensions):
                continue

            print(f"Processing image: {img_file}")
            img_path = os.path.join(folder_path, img_file)

            # Isolate failures per image: one corrupt or unreadable file must
            # not discard the OCR work for every file that follows it.
            try:
                result = recognize_text(img_path, reader)
                detected_text = " ".join(
                    text for (_, text, prob) in result if prob >= min_probability
                )
            except Exception as e:
                print(f"Error encountered while processing {img_file}: {e}")
                failed.append(img_file)
                continue

            data.append([img_file, detected_text])

        completed = True

    except OSError as e:
        # The folder itself could not be listed (missing, not a directory, ...)
        print(f"Error encountered: {e}")

    finally:
        # Always persist what was collected, including on KeyboardInterrupt
        df = pd.DataFrame(data, columns=['Image Name', 'Detected Text'])
        df.to_csv(csv_filename, index=False)
        if completed and not failed:
            print(f"Results saved to {csv_filename}")
        else:
            print(f"Partial results saved to {csv_filename}")
        if failed:
            print(f"Skipped {len(failed)} image(s) that could not be processed: {failed}")

# Destination CSV for the OCR output of the image folder
csv_output_path = r"E:\ML_challenge_DATASET\TEST\Test_data\Folder10_OCR.csv"

# Run OCR over every image in the folder and write the results to the CSV
save_ocr_results_to_csv(folder_path=img_folder_path, csv_filename=csv_output_path)


**2. Extract and Standardize Units from Detected Text:**

*Libraries Used: pandas, re.*
Process:
1. A dictionary, entity_unit_map, defines unit types (e.g., length, weight, volume) and their possible variations (e.g., "cm", "mm", "kg").
2. A unit_conversion_map converts unit abbreviations or variants into a standardized form.
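3. The correct_o_to_0() function corrects OCR errors where the letter 'O' is mistakenly read in place of the digit '0' in numeric values (specifically, a capital 'O' that directly follows three digits, as in "140OmG").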
4. The extract_numbers_with_units() function extracts numbers followed by units (e.g., "20 kg") from the detected text.
5. A CSV with image names and detected text is read, and for each row, unit-specific information (e.g., width, weight, height) is extracted and standardized into the desired units.
6. The results are saved in a new CSV file with the standardized values.

In [None]:
import pandas as pd
import re

# Define the unit map
# Unit spellings shared by all length entities. Keeping one source set means
# width, depth and height always recognise the same spellings (previously
# only "height" accepted "FEET").
_LENGTH_UNITS = frozenset({
    "centimetre", "centimeter", "centimeters", "cm", "CM",
    "foot", "ft", "FEET",
    "millimetre", "millimeter", "mm",
    "metre", "meter", "meters", "m",
    "inch", "in", "inches", "Inches", '"', "'",
    "yard", "yd",
})

# Unit spellings shared by both weight entities.
_WEIGHT_UNITS = frozenset({
    "milligram", "mg", "MG", "mG",
    "kilogram", "kg",
    "microgram", "µg",
    "gram", "g",
    "ounce", "oz",
    "ton",
    "pound", "lb", "lbs", "LBS",
})

# Maps each entity name to the set of unit spellings searched for in the OCR
# text. Every entity gets its own mutable set, so editing one does not
# silently change another.
entity_unit_map = {
    "width": set(_LENGTH_UNITS),
    "depth": set(_LENGTH_UNITS),
    "height": set(_LENGTH_UNITS),
    "item_weight": set(_WEIGHT_UNITS),
    "maximum_weight_recommendation": set(_WEIGHT_UNITS),
    "voltage": {"millivolt", "mv", "kilovolt", "kv", "volt", "v", "MV"},
    "wattage": {"kilowatt", "kw", "watt", "w", "Watt", "WATT", "KW"},
    "item_volume": {
        "cubic foot", "ft³", "microlitre", "µl", "cup", "Cup",
        "fluid ounce", "fl oz", "centilitre", "cl", "imperial gallon",
        "gal", "pint", "pt", "decilitre", "dl", "litre", "l",
        "millilitre", "ml", "quart", "qt", "cubic inch", "in³", "gallon",
    },
}

# Define the unit conversion map
# Maps a unit spelling as it may appear in OCR text to its canonical unit
# name. Spellings absent from this map are left as-is by the extraction step,
# so every alias listed in entity_unit_map should have an entry here.
unit_conversion_map = {
    "cm": "centimetre", "CM": "centimetre", "Cm": "centimetre", "cM": "centimetre",
    "ft": "foot", "FT": "foot", "Ft": "foot", "fT": "foot", "FEET": "foot",
    "m": "metre", "M": "metre",
    "mm": "millimetre", "MM": "millimetre", "Mm": "millimetre", "mM": "millimetre",
    "kg": "kilogram", "KG": "kilogram", "Kg": "kilogram", "kG": "kilogram",
    "µg": "microgram", "µG": "microgram", "Mg": "milligram", "MG": "milligram",
    "g": "gram", "G": "gram",
    "oz": "ounce", "OZ": "ounce", "Oz": "ounce", "oZ": "ounce",
    "lb": "pound", "lbs": "pound", "LBS": "pound", "Lb": "pound", "lB": "pound", "LB": "pound", "Lbs": "pound",
    "mG": "milligram", "mg": "milligram",
    "mv": "millivolt", "MV": "millivolt", "Mv": "millivolt", "mV": "millivolt",
    "kv": "kilovolt", "KV": "kilovolt", "Kv": "kilovolt", "kV": "kilovolt",
    "v": "volt", "V": "volt",
    "w": "watt", "W": "watt", "WATT": "watt", "Watt": "watt",
    "kw": "kilowatt", "KW": "kilowatt", "Kw": "kilowatt", "kW": "kilowatt",
    "ft³": "cubic foot", "FT³": "cubic foot", "Ft³": "cubic foot", "fT³": "cubic foot",
    "µl": "microlitre", "µL": "microlitre", "ul": "microlitre", "uL": "microlitre",
    "Cup": "cup", "cup": "cup", "CUP": "cup",
    "fl oz": "fluid ounce", "FL OZ": "fluid ounce", "Fl Oz": "fluid ounce", "fl Oz": "fluid ounce", "FL Oz": "fluid ounce", "floz": "fluid ounce",
    "cl": "centilitre", "CL": "centilitre", "Cl": "centilitre",
    "gal": "gallon", "GAL": "gallon", "Gal": "gallon", "gAl": "gallon",
    "pt": "pint", "PT": "pint", "Pt": "pint",
    "dl": "decilitre", "DL": "decilitre", "Dl": "decilitre",
    "ml": "millilitre", "ML": "millilitre", "Ml": "millilitre", "mL": "millilitre",
    "qt": "quart", "QT": "quart", "Qt": "quart",
    "in³": "cubic inch", "IN³": "cubic inch", "In³": "cubic inch",
    "L": "litre",
    "inch": "inch", "Inch": "inch", "INCH": "inch", "iNCH": "inch", "in": "inch", "IN": "inch", "iN": "inch", "In": "inch", "Inches": "inch", '"': "inch",
    # Aliases that entity_unit_map can match but that previously had no
    # canonical form and therefore reached the output unstandardized.
    "centimeter": "centimetre", "centimeters": "centimetre",
    "millimeter": "millimetre",
    "meter": "metre", "meters": "metre",
    "inches": "inch",
    "feet": "foot",
    "yd": "yard",
    "l": "litre",
}


# Function to correct 'O' to '0' when it's the 4th character in numeric patterns
def correct_o_to_0(text):
    """Replace a capital 'O' that directly follows three digits with '0'.

    For example '140OmG' becomes '1400mG'. An 'O' preceded by fewer than
    three digits is left alone. Non-string input is returned unchanged.
    """
    if isinstance(text, str):
        three_digits_then_o = re.compile(r'(\d{3})O')
        # Keep the three digits (group 1) and swap only the trailing letter
        return three_digits_then_o.sub(r'\g<1>0', text)

    return text

# Function to extract numbers with specified units
def extract_numbers_with_units(text, units):
    """Extract "number unit" pairs for the given units from OCR text.

    A number may be a plain or decimal value, may contain one letter O
    between digits (an OCR misread of zero, normalized to '0'), and may be a
    range written with a dash or the word "to" (normalized to 'a-b').
    Units are matched case-insensitively, with or without whitespace after
    the number, and are converted to their canonical name.

    Args:
        text: Detected text for one image. Non-string values (e.g. a missing
            cell) yield an empty result.
        units: Collection of unit spellings to look for.

    Returns:
        The matches formatted as "number unit" and joined with ', ', or an
        empty string when nothing matches.
    """
    if not isinstance(text, str):
        return ''
    # An empty alternation would match every bare number in the text
    if not units:
        return ''

    # Correct 'O' to '0' in the text
    text = correct_o_to_0(text)

    # Longest spellings first so that e.g. "mm" wins over "m"
    sorted_units = sorted(units, key=len, reverse=True)
    unit_pattern = '|'.join(re.escape(unit) for unit in sorted_units)

    number = r'\d+(?:[Oo]\d+)?(?:\.\d+)?'
    plain_number = r'\d+(?:\.\d+)?'
    # One or more dashes, or the word "to". A character class such as
    # [-–—to] would also accept a lone "t" or "o" as a separator.
    separator = r'(?:[-–—]+|to)'

    # The trailing assertion plays the role of a word boundary for units that
    # end in a letter ("5 m" must not match inside "5 minutes"), while still
    # letting symbol units such as '"' match at the end of the text or before
    # a space, which a plain \b would reject.
    pattern = (
        rf'({number}(?:\s*{separator}\s*{plain_number})?)'
        rf'\s*({unit_pattern})'
        r'(?:(?<!\w)|(?!\w))'
    )

    matches = re.findall(pattern, text, flags=re.IGNORECASE)

    results = []
    for number_range, unit in matches:
        # A letter O sandwiched between digits is a misread zero
        number_range = re.sub(r'(?<=\d)[Oo](?=\d)', '0', number_range)
        # Collapse "a to b" / "a – b" (any case, any spacing) into "a-b"
        number_range = re.sub(
            rf'\s*{separator}\s*', '-', number_range, flags=re.IGNORECASE
        )
        results.append(f'{number_range} {_standardize_unit(unit)}')

    # Join the results into a single string
    return ', '.join(results)


def _standardize_unit(unit):
    """Return the canonical name for a unit spelling matched in the text."""
    # NOTE(review): a single quote is reported as inch here, as in the
    # original code, although it conventionally denotes feet - confirm.
    if unit in {'"', "'"} or unit.lower() == 'inches':
        return 'inch'

    if unit in unit_conversion_map:
        return unit_conversion_map[unit]

    # The text is matched case-insensitively, so the captured spelling may
    # use a casing the map does not list (e.g. "kG" vs "Kg"); retry ignoring
    # case before giving up.
    folded = unit.lower()
    for alias, canonical in unit_conversion_map.items():
        if alias.lower() == folded:
            return canonical

    # Unknown to the map: at least report it in a uniform lower-case form
    return folded

# Read the OCR output produced earlier (image names and their detected text)
input_csv = r"E:\ML_challenge_DATASET\TEST\Test_data\Detected_OCR\Folder1_OCR.csv"
df = pd.read_csv(input_csv)

# One output column per entity, each holding the "number unit" matches found
# in that row's detected text for the entity's units
results_df = pd.DataFrame()
for entity, units in entity_unit_map.items():
    detected_text = df['Detected Text']
    results_df[entity] = detected_text.apply(extract_numbers_with_units, args=(units,))

# Put the identifying columns first, followed by the per-entity columns
identifying_columns = df[['Image Name', 'Detected Text']]
results_df = pd.concat([identifying_columns, results_df], axis=1)

# Write the combined table next to the input file
output_csv = r"E:\ML_challenge_DATASET\TEST\Test_data\Detected_OCR\Folder1_OCR_CHECK.csv"
results_df.to_csv(output_csv, index=False)

print(f'Results saved to {output_csv}')


**3. Merging Two CSV Files on Image Names:**

*Libraries Used: pandas.*

Process:
1. Two CSV files are loaded: one with image links and entity names, the other with image names and detected text from OCR.
2. The image name is extracted from the image link (assuming the file name is at the end of the URL).
3. The two datasets are merged on the common field, 'Image Name', using a left join.
4. The merged dataset is saved as a new CSV file, containing the image name, link, entity name, and OCR-detected text.

In [None]:
import pandas as pd

# Inputs: the test set (index, image link, and entity name) and the OCR
# output for one folder (image names)
csv1 = pd.read_csv(r"C:\Users\ANNA MANI\Downloads\66e31d6ee96cd_student_resource_3\student_resource 3\dataset\test.csv")
csv2 = pd.read_csv(r"E:\ML_challenge_DATASET\TEST\Check_detection\DETECTION_2\Folder10_OCR.csv")

# The image name is taken to be whatever follows the last '/' in the link
csv1['Image Name'] = csv1['image_link'].apply(lambda link: link.rsplit('/', 1)[-1])

# Left join keeps every OCR row and attaches the matching test-set columns
merged_csv = csv2.merge(csv1, on='Image Name', how='left')

# Write the joined table to a new CSV file
merged_csv.to_csv(r"E:\ML_challenge_DATASET\TEST\Check_detection\DETECTION_2\Folder10_OCR_Merged.csv", index=False)

print("Merged CSV file saved successfully.")


**4. Concatenating Multiple CSV Files:**

*Libraries Used: pandas, os.*

Process:
1. The folder path containing multiple CSV files is provided.
2. Each CSV file in the folder is read into a DataFrame and stored in a list.
3. All DataFrames are concatenated into a single large DataFrame.
4. The combined data from all CSV files is sorted by the 'index' column and saved to a new CSV file for further analysis.

In [None]:
import pandas as pd
import os

# Path to the folder containing the CSV files
folder_path = r"E:\ML_challenge_DATASET\TEST\Check_detection\DETECTION_2\Merged"
# List to store dataframes
dfs = []

# Iterate through all CSV files in the folder, in file-name order. The
# extension is compared case-insensitively so that e.g. "X.CSV" is not skipped.
for filename in sorted(os.listdir(folder_path)):
    if filename.lower().endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        # Append DataFrame to list
        dfs.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that names the folder instead.
if not dfs:
    raise FileNotFoundError(f"No CSV files found in {folder_path}")

# Concatenate all DataFrames in the list into one DataFrame
merged_df = pd.concat(dfs, ignore_index=True)

# Sort by the 'index' column carried over from the input CSVs
merged_df.sort_values(by='index', inplace=True)

# Reset the row labels of the merged DataFrame after sorting
merged_df.reset_index(drop=True, inplace=True)

# Save the merged DataFrame to a new CSV file
merged_df.to_csv(r"E:\ML_challenge_DATASET\TEST\Check_detection\DETECTION_2\Merged_OUTPUT.csv", index=False)

print("CSV files have been merged and saved successfully.")
