In [48]:
import pandas as pd
import os
from PyPDF2 import PdfReader
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBox, LTTextLine, LTChar

In [50]:

def parse_color(color):
    if isinstance(color, str):
        # Remove "RGB" prefix if present and strip parentheses
        color = color.replace("RGB", "").strip()
        # Extract RGB values from the string
        color = color.strip("()")
        parts = color.split(",")
        if len(parts) == 3:
            try:
                r = float(parts[0].strip())
                g = float(parts[1].strip())
                b = float(parts[2].strip())
                return r, g, b
            except ValueError:
                return None
    elif isinstance(color, tuple) and len(color) == 3:
        return color
    return None

def calculate_average_color(color):
    rgb = parse_color(color)
    if rgb:
        avg_color = sum(rgb) / len(rgb)
        return avg_color
    return None

def extract_font_information(pdf_file):
    font_data = []
    with open(pdf_file, 'rb') as file:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page_number, page in enumerate(PDFPage.get_pages(file), start=1):
            interpreter.process_page(page)
            layout = device.get_result()
            for element in layout:
                if isinstance(element, (LTTextBox, LTTextLine)):
                    for text_line in element:
                        line_text = text_line.get_text().strip()
                        for char in text_line:
                            if isinstance(char, LTChar):
                                color = char.graphicstate.ncolor
                                avg_color = calculate_average_color(color)
                                font_data.append({
                                    "Page": page_number,
                                    "Font": char.fontname,
                                    "Size": char.size,
                                    "Color": color,
                                    "Text": line_text,
                                    "Color_Avg": avg_color
                                })
    return font_data

In [51]:
def extract_metadata(pdf_file):
    reader = PdfReader(pdf_file)
    metadata = reader.metadata
    producer = metadata.get('producer', 'Unknown')
    return producer

In [52]:
def process_pdfs_in_directory(directory):
    all_font_data = []
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            pdf_file = os.path.join(directory, filename)
            producer = extract_metadata(pdf_file)
            font_info = extract_font_information(pdf_file)
            for entry in font_info:
                entry["Producer"] = producer
                entry["Filename"] = filename
            all_font_data.extend(font_info)
    return all_font_data

In [53]:
directory_path = "./certificates_dataset"

In [54]:
all_font_data = process_pdfs_in_directory(directory_path)
font_info_df = pd.DataFrame(all_font_data)

In [44]:
import re

def parse_color(color):
    if isinstance(color, str):
        # Remove "RGB" prefix if present and strip parentheses
        color = color.replace("RGB", "").strip()
        # Extract RGB values from the string
        match = re.match(r'\((\d+\.?\d*),\s*(\d+\.?\d*),\s*(\d+\.?\d*)\)', color)
        if match:
            return tuple(float(value) for value in match.groups())
    elif isinstance(color, tuple) and len(color) == 3:
        return color
    return None

def calculate_average_color(color):
    rgb = parse_color(color)
    if rgb:
        avg_color = sum(rgb) / len(rgb)
        return avg_color
    return None


In [45]:

# Convert the 'Color' column to average RGB values
'''font_info_df["Color_Avg"] = font_info_df["Color"].apply(calculate_average_color)
print(font_info_df.dtypes)'''

'font_info_df["Color_Avg"] = font_info_df["Color"].apply(calculate_average_color)\nprint(font_info_df.dtypes)'

In [55]:
font_info_df.head()

Unnamed: 0,Page,Font,Size,Color,Text,Color_Avg,Producer,Filename
0,1,PUGNCY+TimesNewRoman,9.27163,"(0, 0, 0)",(cid:1),0.0,Unknown,17_Sankalp Gunjal_JFO (1).pdf
1,1,BXMJHM+DejaVuSans,24.0,0,(cid:74)(cid:97)(cid:118)(cid:97)(cid:32)(cid:...,,Unknown,17_Sankalp Gunjal_JFO (1).pdf
2,1,BXMJHM+DejaVuSans,24.0,0,(cid:74)(cid:97)(cid:118)(cid:97)(cid:32)(cid:...,,Unknown,17_Sankalp Gunjal_JFO (1).pdf
3,1,BXMJHM+DejaVuSans,24.0,0,(cid:74)(cid:97)(cid:118)(cid:97)(cid:32)(cid:...,,Unknown,17_Sankalp Gunjal_JFO (1).pdf
4,1,BXMJHM+DejaVuSans,24.0,0,(cid:74)(cid:97)(cid:118)(cid:97)(cid:32)(cid:...,,Unknown,17_Sankalp Gunjal_JFO (1).pdf


In [47]:
def summarize_font_info(df):
    summary_df = df.groupby(["Filename", "Page", "Text", "Font", "Size", "Color", "Producer"]).size().reset_index(name='Count')
    return summary_df

font_info_summary_df = summarize_font_info(font_info_df)


TypeError: unhashable type: 'list'