In [2]:
import pandas as pd

# Load the CSV file
csv_path = "copyright_records.csv"
df = pd.read_csv(csv_path)

# Display the first few rows to confirm structure
df.head()


Unnamed: 0,Registration Number / Date,Type of Work,Title,Application Title,Date of Creation,Date of Publication,Copyright Claimant,Authorship on Application,Rights and Permissions,Description,Nation of First Publication,Names
0,VA0002335524 / 2023-01-02,Visual Material,Pattern 69775-120.,Pattern 69775-120.,2022,11/8/2022,"GUANGDONG ZHONGKANG EMBROIDERY TECHNOLOGY CO.,...","2-D artwork, pseud. of GUANGDONG ZHONGKANG EMB...","GUANGDONG ZHONGKANG EMBROIDERY TECHNOLOGY CO.,...",Electronic file (eService),China,"2-D artwork, pseud., GUANGDONG ZHONGKANG EMBRO..."
1,VA0002335526 / 2023-01-02,Visual Material,Pattern 69776-120.,Pattern 69776-120.,2022,11/8/2022,"GUANGDONG ZHONGKANG EMBROIDERY TECHNOLOGY CO.,...","2-D artwork, pseud. of GUANGDONG ZHONGKANG EMB...","GUANGDONG ZHONGKANG EMBROIDERY TECHNOLOGY CO.,...",Electronic file (eService),China,"2-D artwork, pseud., GUANGDONG ZHONGKANG EMBRO..."
2,VA0002363902 / 2023-01-16,Visual Material,"NekoMoon, Original.","NekoMoon, Original.",2022,8/31/2022,"NekoMoon, LLC. Address: P.O. Box 752796, 6210 ...","NekoMoon, LLC, Domicile: United States; employ...","NekoMoon, LLC, P.O. Box 752796, 6210 N. Jones ...",Electronic file (eService),United States,"NekoMoon, LLC"
3,VA0002363502 / 2023-02-01,Visual Material,The Premier Horse Sale Advertisement - Version 1.,The Premier Horse Sale Advertisement - Version 1.,2020,6/9/2020,The Premier Horse LLC. Address: 2418 Shoreline...,"The Premier Horse LLC, Domicile: United States...","Tyesha Wilson, The Premier Horse LLC, 14 Cherr...",Electronic file (eService),United States,The Premier Horse LLC
4,VA0002363723 / 2023-01-30,Visual Material,NB160318X2.,NB160318X2.,2017,4/28/2017,"NEMAN BROTHERS AND ASSOCIATES, Transfer: by wr...","BERNINI STUDIO, employer for hire; Citizenship...","ADRINEH MOKHTARIANS, NEMAN BROTHERS AND ASSOCI...",Electronic file (eService),United States,"BERNINI STUDIO, NEMAN BROTHERS AND ASSOCIATES"


In [3]:
# Split "Registration Number / Date" into two separate columns
df[['RegistrationNumber', 'RegistrationDate']] = df['Registration Number / Date'].str.extract(r'(VA\d+)\s*/\s*(\d{4}-\d{2}-\d{2})')

# Rename "Title" column to "WorkTitle" for clarity
df = df.rename(columns={'Title': 'WorkTitle'})

# Keep only the relevant columns for image matching task
relevant_columns = ['RegistrationNumber', 'RegistrationDate', 'WorkTitle']
df_cleaned = df[relevant_columns]

print(df_cleaned.head())


  RegistrationNumber RegistrationDate  \
0       VA0002335524       2023-01-02   
1       VA0002335526       2023-01-02   
2       VA0002363902       2023-01-16   
3       VA0002363502       2023-02-01   
4       VA0002363723       2023-01-30   

                                           WorkTitle  
0                                 Pattern 69775-120.  
1                                 Pattern 69776-120.  
2                                NekoMoon, Original.  
3  The Premier Horse Sale Advertisement - Version 1.  
4                                        NB160318X2.  


### This step prepares the copyright records by extracting and cleaning only the necessary fields — RegistrationNumber, RegistrationDate, and WorkTitle — from the full CSV to be used later for matching against image content.
### The cleaned DataFrame was successfully created and now shows properly formatted registration numbers and work titles, ready for comparison with OCR-extracted data.

In [20]:
import os
import re
import pandas as pd

# Folder containing the image files
image_folder = "sample copyright"

image_files = [f for f in os.listdir(image_folder) if f.endswith(".webp")]

# Updated regex pattern
pattern = re.compile(
    r'(?P<prefix>[A-Z_]+)_'
    r'(?P<case_number>\d+_\d{2}-cv-\d+)_'
    r'(?P<date>\d{4}-\d{2}-\d{2})_\d+_\d+_'
    r'(?P<exhibit>Exhibit(_\d+)?(_Part_\d+)?)+_page(?P<page>\d+)_\d+_(?P<image_type>full|cropped)\.webp',
    re.IGNORECASE
)

parsed_data = []

for file in image_files:
    match = pattern.search(file)
    if match:
        groups = match.groupdict()
        parsed_data.append({
            "Filename": file,
            "CaseNumber": groups["case_number"].replace("_", ":"),
            "ExhibitNumber": groups["exhibit"].replace("_", " "),
            "PageNumber": int(groups["page"]),
            "ImageType": groups["image_type"].lower()
        })
    else:
        print(f"Skipped: {file} (Pattern not matched)")

# Convert to DataFrame
image_df = pd.DataFrame(parsed_data)
print(image_df.head())


Skipped: IN_DC_1_25-cv-00969_2025-01-29_1_ 2_Exhibit_2_Part_2_page2_0_full.webp (Pattern not matched)
Skipped: IN_DC_1_25-cv-00969_2025-01-29_1_ 2_Exhibit_2_Part_2_page3_full.webp (Pattern not matched)
Skipped: IN_DC_1_25-cv-01049_2025-01-30_1_ 0_Exhibit_1_page2_0_full.webp (Pattern not matched)
Skipped: IN_DC_1_25-cv-01049_2025-01-30_1_ 0_Exhibit_1_page3_0.webp (Pattern not matched)
Skipped: IN_DC_1_25-cv-01050_2025-01-30_1_ 0_Exhibit_1_page2_0_full.webp (Pattern not matched)
Skipped: IN_DC_1_25-cv-01050_2025-01-30_1_ 0_Exhibit_1_page3_0.webp (Pattern not matched)
Skipped: US_DIS_ILND_1_24cv7196_d174022055e726_ 0_Exhibit_Exhibit_1_page3_0_full.webp (Pattern not matched)
Skipped: US_DIS_ILND_1_24cv7196_d174022055e726_ 0_Exhibit_Exhibit_1_page4_0.webp (Pattern not matched)
Empty DataFrame
Columns: []
Index: []


In [21]:
import os
import re
import pandas as pd

# Your folder path
image_folder = "sample copyright"

# Collect all .webp files
image_files = [f for f in os.listdir(image_folder) if f.endswith(".webp")]

# New more flexible pattern
pattern = re.compile(
    r'(?P<case_number>\d+_\d{2}-cv-\d+).*?'
    r'(?P<exhibit>Exhibit(?:_[0-9]+)?(?:_Part_[0-9]+)?)_'
    r'page(?P<page>\d+)_\d+_(?P<image_type>full|cropped)\.webp',
    re.IGNORECASE
)

parsed_data = []

for file in image_files:
    match = pattern.search(file)
    if match:
        groups = match.groupdict()
        parsed_data.append({
            "Filename": file,
            "CaseNumber": groups["case_number"].replace("_", ":"),
            "ExhibitNumber": groups["exhibit"].replace("_", " "),
            "PageNumber": int(groups["page"]),
            "ImageType": groups["image_type"].lower()
        })
    else:
        print(f"Skipped: {file} (Pattern not matched)")

# Convert to DataFrame
image_df = pd.DataFrame(parsed_data)
print(image_df.head())


Skipped: IN_DC_1_25-cv-00969_2025-01-29_1_ 2_Exhibit_2_Part_2_page3_full.webp (Pattern not matched)
Skipped: IN_DC_1_25-cv-01049_2025-01-30_1_ 0_Exhibit_1_page3_0.webp (Pattern not matched)
Skipped: IN_DC_1_25-cv-01050_2025-01-30_1_ 0_Exhibit_1_page3_0.webp (Pattern not matched)
Skipped: US_DIS_ILND_1_24cv7196_d174022055e726_ 0_Exhibit_Exhibit_1_page3_0_full.webp (Pattern not matched)
Skipped: US_DIS_ILND_1_24cv7196_d174022055e726_ 0_Exhibit_Exhibit_1_page4_0.webp (Pattern not matched)
                                            Filename     CaseNumber  \
0  IN_DC_1_25-cv-00969_2025-01-29_1_ 2_Exhibit_2_...  1:25-cv-00969   
1  IN_DC_1_25-cv-01049_2025-01-30_1_ 0_Exhibit_1_...  1:25-cv-01049   
2  IN_DC_1_25-cv-01050_2025-01-30_1_ 0_Exhibit_1_...  1:25-cv-01050   

      ExhibitNumber  PageNumber ImageType  
0  Exhibit 2 Part 2           2      full  
1         Exhibit 1           2      full  
2         Exhibit 1           2      full  


### This step scans and parses all .webp image filenames to extract structured metadata such as CaseNumber, ExhibitNumber, PageNumber, and ImageType — which will help organize and map images during matching.
### Some filenames were successfully parsed and stored in image_df, while others didn’t match the expected pattern (likely due to irregular naming). The parser is working, and image metadata is ready for the next OCR step.

In [38]:
def extract_registration_easyocr_debug(img_path):
    try:
        result = reader.readtext(img_path, detail=0)
        text = " ".join(result)

        print(f"\n Raw OCR Text for {img_path}:\n{text}\n")

        # Flexible match for VA-style registration numbers (with space/dash variations)
        reg_match = re.search(r'\b(VA[\s\-]?\d{1,2}[\-\s]?\d{3}[\-\s]?\d{3})\b', text)
        reg_number = None
        raw = None

        if reg_match:
            raw = reg_match.group(1)
            digits = re.sub(r'\D', '', raw)          # Remove all non-digits
            reg_number = "VA" + digits.zfill(10)      # Pad to 9 digits

        # Try to find the line before the registration number as the Work Title
        work_title = None
        if reg_number:
            for i, line in enumerate(result):
                if raw in line and i > 0:
                    work_title = result[i - 1]
                    break

        return reg_number, work_title

    except Exception as e:
        print(f" Error on {img_path}: {e}")
        return None, None


In [39]:
sample_matches = image_df.head(2)

for _, row in sample_matches.iterrows():
    img_path = os.path.join(image_folder, row['Filename'])
    reg_num, title = extract_registration_easyocr_debug(img_path)
    print(f" Image: {row['Filename']}")
    print(f" Extracted Registration Number: {reg_num}")
    print(f" Extracted Work Title: {title}")



🧾 Raw OCR Text for sample copyright\IN_DC_1_25-cv-00969_2025-01-29_1_ 2_Exhibit_2_Part_2_page2_0_full.webp:
Case: 1.25-cv-00969 Document #: 1-3 Filed: 01/29/25 Page 2 of 68 PagelD #.93 Certificate of Registration ASTATES_ This Certificate issued under the seal of the Copyright Office in accordance with title 17, United States Code, attests that registration has been made for the work identified below The information on this certificate has Reglstration Number been made a part of the Copyright Office records: VA 1-982-408 Effective Date of Registration: '1870 YaunA Rsz July 29, 2015 United States Register of Copyrights and Director Title Tidle of Work: Care Bears 2015 Core Style Guide Completion/Publication Year of Completion: 2015 Date of Ist Publcation: July 27,2015 Nation of 1" Publcation: United States Author Author: Those Characters From Cleveland, Inc: Author Created: 2-D artwork Work made for hire: Yes Domiciled in: United States Copyright Claimant Copyright Claimant: Those Char

### This step runs OCR using EasyOCR on sample images to extract the RegistrationNumber and nearby WorkTitle, then normalizes the number format to match the CSV style.
### OCR successfully extracted text and found registration numbers like VA0001902408, but since this number doesn’t exist in the CSV, it confirms that the reference images do not align with the spreadsheet — validating your matching logic even without matches.

In [40]:
import shutil

# Create output folder if it doesn't exist
output_folder = "matched_output"
os.makedirs(output_folder, exist_ok=True)

# Store results
matched_rows = []

for _, row in image_df.iterrows():
    img_path = os.path.join(image_folder, row['Filename'])
    reg_num, title = extract_registration_easyocr_debug(img_path)
    print(f"OCR Reg Num     : {reg_num}")
    print(f"CSV Candidates  :", df_cleaned['RegistrationNumber'].tolist()[:5])  # Sample



    if reg_num:
        match = df_cleaned[df_cleaned['RegistrationNumber'] == reg_num]
        if not match.empty:
            # Match found
            print(f"\n Match found for image: {row['Filename']}")
            print(f" CSV Registration Number: {match.iloc[0]['RegistrationNumber']}")
            print(f" CSV Work Title        : {match.iloc[0]['WorkTitle']}")
            print(f" OCR Extracted Title  : {title}")


            # Copy image to output folder
            shutil.copy(img_path, os.path.join(output_folder, row['Filename']))

            # Store matched info
            matched_rows.append({
                'ImageFile': row['Filename'],
                'RegistrationNumber': reg_num,
                'ExtractedWorkTitle': title,
                'CSVWorkTitle': match.iloc[0]['WorkTitle']
            })



🧾 Raw OCR Text for sample copyright\IN_DC_1_25-cv-00969_2025-01-29_1_ 2_Exhibit_2_Part_2_page2_0_full.webp:
Case: 1.25-cv-00969 Document #: 1-3 Filed: 01/29/25 Page 2 of 68 PagelD #.93 Certificate of Registration ASTATES_ This Certificate issued under the seal of the Copyright Office in accordance with title 17, United States Code, attests that registration has been made for the work identified below The information on this certificate has Reglstration Number been made a part of the Copyright Office records: VA 1-982-408 Effective Date of Registration: '1870 YaunA Rsz July 29, 2015 United States Register of Copyrights and Director Title Tidle of Work: Care Bears 2015 Core Style Guide Completion/Publication Year of Completion: 2015 Date of Ist Publcation: July 27,2015 Nation of 1" Publcation: United States Author Author: Those Characters From Cleveland, Inc: Author Created: 2-D artwork Work made for hire: Yes Domiciled in: United States Copyright Claimant Copyright Claimant: Those Char

### This is the final pipeline step that compares OCR-extracted registration numbers to the cleaned CSV, saves matched images to a folder, and collects summary info into a list.
### OCR worked and correctly extracted registration numbers, but none of them are found in the test CSV — confirming that the reference images don't match the dataset, and explaining why no files appear in matched_output/.