In [None]:
pip install opencv-python-headless pytesseract PyMuPDF

In [None]:
pip install opencv-python-headless pytesseract PyMuPDF tensorflow


In [None]:
import cv2
import pytesseract
import numpy as np
import fitz


In [None]:

def ocr_image(image):
    """Return the text Tesseract recognises in ``image``.

    The image is first reduced to a single channel with
    ``cv2.COLOR_BGR2GRAY``; that grayscale array is what gets passed to
    ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))

def detect_green_boxes(image, lower_green=(40, 40, 40), upper_green=(80, 255, 255), min_area=500):
    """Locate green regions in an image and return their bounding boxes.

    The image is converted with ``cv2.COLOR_BGR2HSV``, thresholded to the
    inclusive HSV range ``[lower_green, upper_green]``, and the external
    contours of the resulting mask are extracted.

    Args:
        image: Colour image array accepted by ``cv2.cvtColor``.
        lower_green: Lower HSV bound; defaults to the previously hard-coded
            ``(40, 40, 40)``.
        upper_green: Upper HSV bound; defaults to the previously hard-coded
            ``(80, 255, 255)``.
        min_area: Contours whose area is not strictly greater than this are
            ignored. Tune it to the page resolution and expected box size.

    Returns:
        A list of ``(x1, y1, x2, y2)`` tuples, one per retained contour,
        where ``(x2, y2)`` is the top-left corner plus width/height.
    """
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    mask = cv2.inRange(hsv, np.array(lower_green), np.array(upper_green))
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    green_boxes = []
    for contour in contours:
        if cv2.contourArea(contour) <= min_area:
            continue  # too small to be a coupon box
        x, y, w, h = cv2.boundingRect(contour)
        green_boxes.append((x, y, x + w, y + h))
    return green_boxes

def classify_coupons(ocr_text, green_boxes, coupon_keywords=None):
    """Return the text of every box whose lines mention a coupon keyword.

    Args:
        ocr_text: Full OCR output for a page; split on newlines.
        green_boxes: Iterable of ``(x1, y1, x2, y2)`` tuples.
        coupon_keywords: Substrings that mark a coupon offer. Defaults to the
            three "DIGITAL ..." phrases. Previously this name was read as an
            undefined global, so the function raised ``NameError`` as soon as
            a box passed the range check.

    Returns:
        A list of newline-joined text snippets, one per box that contains at
        least one keyword. Boxes whose ``y`` range falls outside the OCR
        lines are skipped.
    """
    if coupon_keywords is None:
        coupon_keywords = ('DIGITAL-ONLY OFFER', 'DIGITAL REBATE', 'DIGITAL COUPON')

    ocr_lines = ocr_text.split('\n')

    products_with_coupons = []
    for _x1, y1, _x2, y2 in green_boxes:
        # NOTE(review): y1/y2 are used as indices into the OCR *lines*. If the
        # boxes come from pixel-space bounding rectangles, most of them will
        # fail this range check; mapping boxes to text properly would need
        # per-box OCR or word-level coordinates. Confirm the intended units.
        if y1 < 0 or y2 >= len(ocr_lines):
            continue

        box_text = '\n'.join(ocr_lines[y1:y2 + 1])
        if any(keyword in box_text for keyword in coupon_keywords):
            products_with_coupons.append(box_text)

    return products_with_coupons


def process_pdf(pdf_path):
    """Print, page by page, the coupon snippets found in a PDF.

    Each page is rasterised with PyMuPDF, OCR'd via ``ocr_image``, scanned
    for green regions via ``detect_green_boxes``, and the two results are
    combined by ``classify_coupons``.

    Args:
        pdf_path: Path of the PDF passed to ``fitz.open``.
    """
    # The context manager closes the document even if a page fails to process.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            image_matrix = page.get_pixmap(alpha=False)

            # PyMuPDF hands back RGB samples; OpenCV routines below expect BGR,
            # so convert explicitly instead of just reinterpreting the buffer.
            rgb = np.frombuffer(image_matrix.samples, dtype=np.uint8).reshape(
                image_matrix.h, image_matrix.w, image_matrix.n
            )
            image = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

            ocr_text = ocr_image(image)
            green_boxes = detect_green_boxes(image)
            products_with_coupons = classify_coupons(ocr_text, green_boxes)

            if products_with_coupons:
                print(f"Products with coupons on Page {page_num + 1}:")
                for product in products_with_coupons:
                    print(product)
            else:
                print(f"No products with coupons found on Page {page_num + 1}.")




In [None]:
# Entry point: run the pipeline on the sample PDF. The path is relative, so it
# is resolved against the kernel's current working directory.
if __name__ == "__main__":
    pdf_path = "Jewelosco.pdf"
    process_pdf(pdf_path)

In [None]:
# Report the Tesseract binary version that pytesseract will invoke.
# (A bare `tesseract --version` is evaluated as a Python expression and raises NameError.)
pytesseract.get_tesseract_version()


In [None]:
# =======================================

In [None]:
import cv2
import pytesseract
import numpy as np
import fitz
import tensorflow as tf

# Load the pre-trained text classification model from the local
# "text_classification_model" path. The resulting module-level `model` is the
# object whose predict() method classify_coupons() calls below.
model = tf.keras.models.load_model("text_classification_model")

def ocr_image(image):
    """Return the text Tesseract recognises in ``image``.

    The image is first reduced to a single channel with
    ``cv2.COLOR_BGR2GRAY``; that grayscale array is what gets passed to
    ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))

def detect_green_boxes(image, lower_green=(40, 40, 40), upper_green=(80, 255, 255), min_area=500):
    """Locate green regions in an image and return their bounding boxes.

    The image is converted with ``cv2.COLOR_BGR2HSV``, thresholded to the
    inclusive HSV range ``[lower_green, upper_green]``, and the external
    contours of the resulting mask are extracted.

    Args:
        image: Colour image array accepted by ``cv2.cvtColor``.
        lower_green: Lower HSV bound; defaults to the previously hard-coded
            ``(40, 40, 40)``.
        upper_green: Upper HSV bound; defaults to the previously hard-coded
            ``(80, 255, 255)``.
        min_area: Contours whose area is not strictly greater than this are
            ignored. Tune it to the page resolution and expected box size.

    Returns:
        A list of ``(x1, y1, x2, y2)`` tuples, one per retained contour,
        where ``(x2, y2)`` is the top-left corner plus width/height.
    """
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    mask = cv2.inRange(hsv, np.array(lower_green), np.array(upper_green))
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    green_boxes = []
    for contour in contours:
        if cv2.contourArea(contour) <= min_area:
            continue  # too small to be a coupon box
        x, y, w, h = cv2.boundingRect(contour)
        green_boxes.append((x, y, x + w, y + h))
    return green_boxes

def classify_coupons(ocr_text):
    """Translate the model's top-scoring class for ``ocr_text`` into a keyword.

    The module-level ``model`` is called with a one-element list holding the
    text. The arg-max over the first prediction row is used as an index into
    the keyword list; an index past the end of that list yields ``None``.
    """
    labels = ['DIGITAL-ONLY OFFER', 'DIGITAL REBATE', 'DIGITAL COUPON']

    scores = model.predict([ocr_text])
    best = np.argmax(scores[0])
    if best < len(labels):
        return labels[best]
    return None

def process_pdf(pdf_path):
    """Classify each page of a PDF and show pages that carry a coupon offer.

    Each page is rasterised with PyMuPDF, OCR'd via ``ocr_image``, and
    labelled by ``classify_coupons``. When an offer is reported, the green
    boxes from ``detect_green_boxes`` are drawn on a copy of the page image,
    which is then displayed (or written to disk when no GUI is available).

    Args:
        pdf_path: Path of the PDF passed to ``fitz.open``.
    """
    # The context manager closes the document even if a page fails to process.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            image_matrix = page.get_pixmap(alpha=False)

            # PyMuPDF hands back RGB samples; OpenCV expects BGR, so convert
            # explicitly instead of just reinterpreting the buffer.
            rgb = np.frombuffer(image_matrix.samples, dtype=np.uint8).reshape(
                image_matrix.h, image_matrix.w, image_matrix.n
            )
            image = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

            ocr_text = ocr_image(image)
            green_boxes = detect_green_boxes(image)
            coupon_offer = classify_coupons(ocr_text)

            if not coupon_offer:
                print(f"Page {page_num + 1} - No coupon offer found.")
                continue

            print(f"Page {page_num + 1} - Offer: {coupon_offer}")
            image_with_boxes = image.copy()
            for x1, y1, x2, y2 in green_boxes:
                cv2.rectangle(image_with_boxes, (x1, y1), (x2, y2), (0, 255, 0), 2)

            try:
                cv2.imshow(f"Page {page_num + 1}", image_with_boxes)
                cv2.waitKey(0)
                cv2.destroyAllWindows()
            except cv2.error:
                # opencv-python-headless (the build this notebook installs) has
                # no GUI backend, so imshow raises; save the image instead.
                output_image_path = f"output_page_{page_num + 1}.jpg"
                cv2.imwrite(output_image_path, image_with_boxes)
                print(f"Saved annotated page to {output_image_path}")

# Entry point: run the pipeline on the sample PDF. The path is relative, so it
# is resolved against the kernel's current working directory.
if __name__ == "__main__":
    pdf_path = "Jewelosco.pdf"
    process_pdf(pdf_path)


In [None]:
import cv2
import pytesseract
import numpy as np
import fitz
import openai

# Read the OpenAI API key from the environment instead of embedding it in the
# notebook. The key that was previously committed here must be treated as
# leaked and revoked.
import os

openai.api_key = os.environ.get("OPENAI_API_KEY")

def ocr_image(image):
    """Return the text Tesseract recognises in ``image``.

    The image is first reduced to a single channel with
    ``cv2.COLOR_BGR2GRAY``; that grayscale array is what gets passed to
    ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))

def detect_green_boxes(image, lower_green=(40, 40, 40), upper_green=(80, 255, 255), min_area=500):
    """Locate green regions in an image and return their bounding boxes.

    The image is converted with ``cv2.COLOR_BGR2HSV``, thresholded to the
    inclusive HSV range ``[lower_green, upper_green]``, and the external
    contours of the resulting mask are extracted.

    Args:
        image: Colour image array accepted by ``cv2.cvtColor``.
        lower_green: Lower HSV bound; defaults to the previously hard-coded
            ``(40, 40, 40)``.
        upper_green: Upper HSV bound; defaults to the previously hard-coded
            ``(80, 255, 255)``.
        min_area: Contours whose area is not strictly greater than this are
            ignored. Tune it to the page resolution and expected box size.

    Returns:
        A list of ``(x1, y1, x2, y2)`` tuples, one per retained contour,
        where ``(x2, y2)`` is the top-left corner plus width/height.
    """
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    mask = cv2.inRange(hsv, np.array(lower_green), np.array(upper_green))
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    green_boxes = []
    for contour in contours:
        if cv2.contourArea(contour) <= min_area:
            continue  # too small to be a coupon box
        x, y, w, h = cv2.boundingRect(contour)
        green_boxes.append((x, y, x + w, y + h))
    return green_boxes

def classify_coupons(ocr_text):
    """Ask an OpenAI completion model which coupon keyword the text contains.

    Args:
        ocr_text: OCR output for one page.

    Returns:
        The first keyword from the fixed list that appears in the model's
        reply, or ``None`` when the text is blank or no keyword is returned.
    """
    coupon_keywords = ['DIGITAL-ONLY OFFER', 'DIGITAL REBATE', 'DIGITAL COUPON']

    # Nothing to classify: avoid a pointless (and billable) API call.
    if not ocr_text or not ocr_text.strip():
        return None

    # The raw OCR text alone gives the model no task, so its completion would
    # be unrelated to the keywords. Spell out the classification instruction.
    prompt = (
        "Below is OCR text from a scanned page. If it contains one of these "
        "offer types, reply with exactly that phrase: "
        + ", ".join(coupon_keywords)
        + ". Otherwise reply with NONE.\n\n"
        + "Text:\n" + ocr_text + "\n\nAnswer:"
    )

    # This is the legacy Completions endpoint with text-davinci-002 (not
    # GPT-3.5 Turbo); `openai.Completion` is only available in openai<1.0.
    # temperature=0 keeps the label stable across runs.
    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=100,
        temperature=0
    )

    reply = response.choices[0].text.upper()
    for keyword in coupon_keywords:
        if keyword in reply:
            return keyword

    return None

def process_pdf(pdf_path):
    """Classify each page of a PDF and show pages that carry a coupon offer.

    Each page is rasterised with PyMuPDF, OCR'd via ``ocr_image``, and
    labelled by ``classify_coupons``. When an offer is reported, the green
    boxes from ``detect_green_boxes`` are drawn on a copy of the page image,
    which is then displayed (or written to disk when no GUI is available).

    Args:
        pdf_path: Path of the PDF passed to ``fitz.open``.
    """
    # The context manager closes the document even if a page fails to process.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            image_matrix = page.get_pixmap(alpha=False)

            # PyMuPDF hands back RGB samples; OpenCV expects BGR, so convert
            # explicitly instead of just reinterpreting the buffer.
            rgb = np.frombuffer(image_matrix.samples, dtype=np.uint8).reshape(
                image_matrix.h, image_matrix.w, image_matrix.n
            )
            image = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

            ocr_text = ocr_image(image)
            green_boxes = detect_green_boxes(image)
            coupon_offer = classify_coupons(ocr_text)

            if not coupon_offer:
                print(f"Page {page_num + 1} - No coupon offer found.")
                continue

            print(f"Page {page_num + 1} - Offer: {coupon_offer}")
            image_with_boxes = image.copy()
            for x1, y1, x2, y2 in green_boxes:
                cv2.rectangle(image_with_boxes, (x1, y1), (x2, y2), (0, 255, 0), 2)

            try:
                cv2.imshow(f"Page {page_num + 1}", image_with_boxes)
                cv2.waitKey(0)
                cv2.destroyAllWindows()
            except cv2.error:
                # opencv-python-headless (the build this notebook installs) has
                # no GUI backend, so imshow raises; save the image instead.
                output_image_path = f"output_page_{page_num + 1}.jpg"
                cv2.imwrite(output_image_path, image_with_boxes)
                print(f"Saved annotated page to {output_image_path}")

# Entry point: run the pipeline on the sample PDF. The path is relative, so it
# is resolved against the kernel's current working directory.
if __name__ == "__main__":
    pdf_path = "Jewelosco.pdf"
    process_pdf(pdf_path)


In [None]:
!pip install openai

In [None]:
import cv2
import pytesseract
import numpy as np
import fitz
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Load the DistilBERT tokenizer and classifier; both become module-level
# globals used by classify_coupons() below.
# NOTE(review): "distilbert-base-uncased" is the base checkpoint, not one
# fine-tuned for coupon detection — presumably its classification head is
# freshly initialised, so predictions would be arbitrary until the model is
# fine-tuned. Confirm before trusting the labels.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

def ocr_image(image):
    """Return the text Tesseract recognises in ``image``.

    The image is first reduced to a single channel with
    ``cv2.COLOR_BGR2GRAY``; that grayscale array is what gets passed to
    ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))

def classify_coupons(ocr_text):
    """Label OCR text as ``"Offer"`` or ``"No Offer"`` with the global classifier.

    The text is tokenized (TensorFlow tensors, truncation and padding on),
    fed to ``model.predict``, and the arg-max of the first row of the first
    output element decides the label: index 1 means ``"Offer"``.
    """
    encoded = tokenizer(ocr_text, return_tensors="tf", truncation=True, padding=True)
    logits = model.predict(encoded)[0]
    if np.argmax(logits[0]) == 1:
        return "Offer"
    return "No Offer"

def process_pdf(pdf_path):
    """Classify each page of a PDF and show the pages labelled as offers.

    Each page is rasterised with PyMuPDF, OCR'd via ``ocr_image``, and
    labelled by ``classify_coupons``. Pages labelled ``"Offer"`` are
    displayed (or written to disk when no GUI is available).

    Args:
        pdf_path: Path of the PDF passed to ``fitz.open``.
    """
    # The context manager closes the document even if a page fails to process.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            image_matrix = page.get_pixmap(alpha=False)

            # PyMuPDF hands back RGB samples; OpenCV expects BGR, so convert
            # explicitly instead of just reinterpreting the buffer.
            rgb = np.frombuffer(image_matrix.samples, dtype=np.uint8).reshape(
                image_matrix.h, image_matrix.w, image_matrix.n
            )
            image = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

            ocr_text = ocr_image(image)
            coupon_offer = classify_coupons(ocr_text)

            if coupon_offer != "Offer":
                print(f"Page {page_num + 1} - No coupon offer found.")
                continue

            print(f"Page {page_num + 1} - Offer: {coupon_offer}")
            try:
                # Display the image (optional)
                cv2.imshow(f"Page {page_num + 1}", image)
                cv2.waitKey(0)
                cv2.destroyAllWindows()
            except cv2.error:
                # opencv-python-headless (the build this notebook installs) has
                # no GUI backend, so imshow raises; save the image instead.
                output_image_path = f"output_page_{page_num + 1}.jpg"
                cv2.imwrite(output_image_path, image)
                print(f"Saved page image to {output_image_path}")

# Entry point: run the pipeline on the sample PDF. The path is relative, so it
# is resolved against the kernel's current working directory.
if __name__ == "__main__":
    pdf_path = "Jewelosco.pdf"
    process_pdf(pdf_path)


In [None]:
import cv2
import pytesseract
import numpy as np
import fitz
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Load the DistilBERT tokenizer and classifier; both become module-level
# globals used by preprocess_text() and classify_coupons() below.
# NOTE(review): "distilbert-base-uncased" is the base checkpoint, not one
# fine-tuned for coupon detection — presumably its classification head is
# freshly initialised, so predictions would be arbitrary until the model is
# fine-tuned. Confirm before trusting the labels.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

def ocr_image(image):
    """Return the text Tesseract recognises in ``image``.

    The image is first reduced to a single channel with
    ``cv2.COLOR_BGR2GRAY``; that grayscale array is what gets passed to
    ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))

def preprocess_text(ocr_text):
    """Tokenize OCR text and return the model inputs as NumPy arrays.

    The global ``tokenizer`` is called with TensorFlow tensors, truncation
    and padding enabled.

    Returns:
        A two-element list ``[input_ids, attention_mask]``, each produced by
        calling ``.numpy()`` on the corresponding tokenizer output.
    """
    encoded = tokenizer(ocr_text, return_tensors="tf", truncation=True, padding=True)
    input_ids = encoded["input_ids"].numpy()
    attention_mask = encoded["attention_mask"].numpy()
    return [input_ids, attention_mask]

def classify_coupons(X):
    """Return the arg-max class index the global classifier assigns to ``X``.

    ``X`` is passed straight to ``model.predict``; the first element of the
    result is taken, and the arg-max of its first row is returned.
    """
    logits = model.predict(X)[0]
    return np.argmax(logits[0])

def process_pdf(pdf_path):
    """Classify each page of a PDF and show the pages labelled as offers.

    Each page is rasterised with PyMuPDF, OCR'd via ``ocr_image``, tokenized
    via ``preprocess_text`` and labelled by ``classify_coupons``. Pages with
    label ``1`` are displayed (or written to disk when no GUI is available).

    Args:
        pdf_path: Path of the PDF passed to ``fitz.open``.
    """
    # The context manager closes the document even if a page fails to process.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            image_matrix = page.get_pixmap(alpha=False)

            # PyMuPDF hands back RGB samples; OpenCV expects BGR, so convert
            # explicitly instead of just reinterpreting the buffer.
            rgb = np.frombuffer(image_matrix.samples, dtype=np.uint8).reshape(
                image_matrix.h, image_matrix.w, image_matrix.n
            )
            image = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

            ocr_text = ocr_image(image)
            X = preprocess_text(ocr_text)
            coupon_label = classify_coupons(X)

            if coupon_label != 1:
                print(f"Page {page_num + 1} - No coupon offer found.")
                continue

            print(f"Page {page_num + 1} - Offer: Found")
            try:
                # Display the image (optional)
                cv2.imshow(f"Page {page_num + 1}", image)
                cv2.waitKey(0)
                cv2.destroyAllWindows()
            except cv2.error:
                # opencv-python-headless (the build this notebook installs) has
                # no GUI backend, so imshow raises; save the image instead.
                output_image_path = f"output_page_{page_num + 1}.jpg"
                cv2.imwrite(output_image_path, image)
                print(f"Saved page image to {output_image_path}")

# Entry point: run the pipeline on the sample PDF. The path is relative, so it
# is resolved against the kernel's current working directory.
if __name__ == "__main__":
    pdf_path = "Jewelosco.pdf"
    process_pdf(pdf_path)


In [None]:
import cv2
import pytesseract
import numpy as np
import fitz
import tensorflow as tf

# Load the pre-trained text classification model from the local
# "text_classification_model" path. The resulting module-level `model` is the
# object whose predict() method classify_coupons() calls below.
model = tf.keras.models.load_model("text_classification_model")

def ocr_image(image):
    """Return the text Tesseract recognises in ``image``.

    The image is first reduced to a single channel with
    ``cv2.COLOR_BGR2GRAY``; that grayscale array is what gets passed to
    ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))

def detect_green_boxes(image, lower_green=(40, 40, 40), upper_green=(80, 255, 255), min_area=500):
    """Locate green regions in an image and return their bounding boxes.

    The image is converted with ``cv2.COLOR_BGR2HSV``, thresholded to the
    inclusive HSV range ``[lower_green, upper_green]``, and the external
    contours of the resulting mask are extracted.

    Args:
        image: Colour image array accepted by ``cv2.cvtColor``.
        lower_green: Lower HSV bound; defaults to the previously hard-coded
            ``(40, 40, 40)``.
        upper_green: Upper HSV bound; defaults to the previously hard-coded
            ``(80, 255, 255)``.
        min_area: Contours whose area is not strictly greater than this are
            ignored. Tune it to the page resolution and expected box size.

    Returns:
        A list of ``(x1, y1, x2, y2)`` tuples, one per retained contour,
        where ``(x2, y2)`` is the top-left corner plus width/height.
    """
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    mask = cv2.inRange(hsv, np.array(lower_green), np.array(upper_green))
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    green_boxes = []
    for contour in contours:
        if cv2.contourArea(contour) <= min_area:
            continue  # too small to be a coupon box
        x, y, w, h = cv2.boundingRect(contour)
        green_boxes.append((x, y, x + w, y + h))
    return green_boxes

def classify_coupons(ocr_text):
    """Translate the model's top-scoring class for ``ocr_text`` into a keyword.

    The module-level ``model`` is called with a one-element list holding the
    text. The arg-max over the first prediction row is used as an index into
    the keyword list; an index past the end of that list yields ``None``.
    """
    labels = ['DIGITAL-ONLY OFFER', 'DIGITAL REBATE', 'DIGITAL COUPON']

    scores = model.predict([ocr_text])
    best = np.argmax(scores[0])
    if best < len(labels):
        return labels[best]
    return None

def process_pdf(pdf_path):
    """Classify each page of a PDF and save annotated images of offer pages.

    Each page is rasterised with PyMuPDF, OCR'd via ``ocr_image``, and
    labelled by ``classify_coupons``. When an offer is reported, the green
    boxes from ``detect_green_boxes`` are drawn on a copy of the page image,
    which is written to ``output_page_<n>.jpg`` in the working directory.

    Args:
        pdf_path: Path of the PDF passed to ``fitz.open``.
    """
    # The context manager closes the document even if a page fails to process.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            image_matrix = page.get_pixmap(alpha=False)

            # PyMuPDF hands back RGB samples; cv2.imwrite and the colour
            # conversions expect BGR, so convert explicitly. Without this the
            # saved JPEG has its red and blue channels swapped.
            rgb = np.frombuffer(image_matrix.samples, dtype=np.uint8).reshape(
                image_matrix.h, image_matrix.w, image_matrix.n
            )
            image = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

            ocr_text = ocr_image(image)
            green_boxes = detect_green_boxes(image)
            coupon_offer = classify_coupons(ocr_text)

            if not coupon_offer:
                print(f"Page {page_num + 1} - No coupon offer found.")
                continue

            print(f"Page {page_num + 1} - Offer: {coupon_offer}")
            # Save the image with green boxes to disk
            image_with_boxes = image.copy()
            for x1, y1, x2, y2 in green_boxes:
                cv2.rectangle(image_with_boxes, (x1, y1), (x2, y2), (0, 255, 0), 2)

            output_image_path = f"output_page_{page_num + 1}.jpg"
            cv2.imwrite(output_image_path, image_with_boxes)

# Entry point: run the pipeline on the sample PDF. The path is relative, so it
# is resolved against the kernel's current working directory.
if __name__ == "__main__":
    pdf_path = "Jewelosco.pdf"
    process_pdf(pdf_path)
