# In [None]:
import cv2
import numpy as np
import easyocr
import pandas as pd
import requests
from io import BytesIO
from PIL import Image

def fetch_image_from_url(url, timeout=30):
    """Download an image and return it as an RGB NumPy array.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would stall the caller
            forever, which matters when thousands of URLs are fetched in a
            loop.

    Returns:
        The decoded image as a NumPy array in RGB channel order, or ``None``
        if the download or decoding failed for any reason (the error is
        printed, not raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))

        # Normalise palette, grayscale, RGBA, ... images to plain RGB so that
        # callers always receive the same channel layout.
        if image.mode != 'RGB':
            image = image.convert('RGB')

        return np.array(image)
    except Exception as e:
        # Deliberately broad: this is a best-effort fetch, and any network,
        # HTTP or decoding problem should skip the image rather than abort
        # the whole batch.
        print(f"Error fetching image from URL {url}: {e}")
        return None


def preprocess_image(image):
    """Turn a colour image into a black-and-white (binarised) image.

    The image is converted to grayscale (the conversion constant treats the
    input channels as BGR), smoothed with a 5x5 Gaussian kernel, and then
    thresholded with Otsu's method so that every pixel becomes 0 or 255.

    Args:
        image: Three-channel colour image as accepted by ``cv2.cvtColor``.

    Returns:
        The thresholded single-channel image.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    smoothed = cv2.GaussianBlur(grayscale, (5, 5), 0)
    # With THRESH_OTSU the threshold argument (0) is ignored and chosen
    # automatically; only the thresholded image is needed here.
    _, thresholded = cv2.threshold(
        smoothed, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    return thresholded

def _mean_text_angle(results):
    """Return the mean tilt, in degrees, of the top edges of the OCR boxes."""
    angles = []
    for result in results:
        box = result[0]
        # Corners 0 and 1 are taken as top-left and top-right, so this vector
        # follows the text line. (Using corner 2, the opposite corner, would
        # measure the box diagonal and report a tilt even for level text.)
        x1, y1 = box[0]
        x2, y2 = box[1]
        angles.append(np.arctan2(y2 - y1, x2 - x1) * (180 / np.pi))
    return float(np.mean(angles))


def detect_text_orientation_and_extract(image, reader, rotation_threshold=25):
    """Run OCR on an image, straightening it first if the text is tilted.

    The image is read once to locate text boxes. If the mean tilt of those
    boxes is at least ``rotation_threshold`` degrees, the image is rotated
    about its centre to level the text and read a second time; otherwise the
    first-pass results are used.

    Args:
        image: Colour image array, or ``None`` (e.g. a failed download).
        reader: OCR engine exposing ``readtext(image)`` whose results are
            sequences of ``(box, text, ...)``; an ``easyocr.Reader`` in this
            file.
        rotation_threshold: Minimum absolute mean tilt, in degrees, that
            triggers the rotate-and-reread step.

    Returns:
        All recognised text fragments joined by single spaces, with newlines
        replaced by spaces and surrounding whitespace stripped. An empty
        string if ``image`` is ``None`` or no text was detected.
    """
    if image is None:
        return ""

    # NOTE(review): fetch_image_from_url already returns RGB data, so this
    # conversion effectively swaps the channels into BGR order before OCR.
    # Kept unchanged to preserve the OCR input - confirm which order is wanted.
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    results = reader.readtext(rgb_image)
    if not results:
        return ""

    average_angle = _mean_text_angle(results)

    # Small tilts are left alone; OCR copes with them and every rotation
    # costs a second OCR pass.
    if abs(average_angle) >= rotation_threshold:
        height, width = image.shape[:2]
        center = (width // 2, height // 2)
        # The tilt is measured in image coordinates (y grows downwards).
        # getRotationMatrix2D treats positive angles as counter-clockwise, and
        # warpAffine applies the matrix as a forward map, so passing the
        # measured angle itself brings the text back to horizontal.
        rotation_matrix = cv2.getRotationMatrix2D(center, average_angle, 1.0)
        # The output keeps the original canvas size, so corners may be clipped.
        rotated_image = cv2.warpAffine(image, rotation_matrix, (width, height))

        rgb_rotated_image = cv2.cvtColor(rotated_image, cv2.COLOR_BGR2RGB)
        results = reader.readtext(rgb_rotated_image)

    extracted_text = " ".join(result[1] for result in results)
    return extracted_text.replace("\n", " ").strip()

def process_images_from_csv(csv_file, output_file, start=50000, stop=75000):
    """OCR a range of images listed in a CSV and save the texts to a new CSV.

    Reads ``csv_file``, downloads the image behind each ``image_link`` in the
    positional row range ``[start, stop)``, extracts its text, and stores it
    in an ``extracted_text`` column. The complete table (all rows, not just
    the processed range) is then written to ``output_file``.

    Args:
        csv_file: Path of the input CSV; must contain an ``image_link`` column.
        output_file: Path of the CSV to write.
        start: Position of the first row to process.
        stop: Position one past the last row to process. The defaults keep
            the row window that used to be hard-coded.
    """
    df = pd.read_csv(csv_file)

    # Build the EasyOCR reader once and reuse it for every image.
    reader = easyocr.Reader(['en'])

    for index, row in df.iloc[start:stop].iterrows():
        url = row['image_link']
        print(index)  # progress indicator
        image = fetch_image_from_url(url)
        extracted_text = detect_text_orientation_and_extract(image, reader)

        # Store the text on the matching row of the full table.
        df.at[index, 'extracted_text'] = extracted_text

    # Nothing is written until the whole range has been processed.
    df.to_csv(output_file, index=False)
    print(f"[INFO] Extraction complete. Saved results to {output_file}")

# Example usage: runs the whole pipeline as soon as this cell/script executes.
# NOTE(review): the output name says "50_60", but process_images_from_csv
# works on the positional row slice 50000:75000 and writes every row of the
# input table back out - confirm the file name is the intended one.
csv_file = '/kaggle/input/ml-chal/student_resource 3/dataset/test.csv'  # Replace with your CSV file path
output_file = 'updated_test_with_text_50_60.csv'  # Output file
process_images_from_csv(csv_file, output_file)