In [34]:

import os
import re
from pathlib import Path
from tqdm import tqdm
import shutil


In [None]:

# Root folder that holds both the raw and the cleaned chapter directories.
# NOTE(review): machine-specific absolute path -- edit this single line before
# running the notebook on another machine.
BASE_DIR = "/Users/tanmayagarwal/Desktop/My_Computer/Columbia/Fall_2025/RA_LLM/main_folder"

# Raw chapter files are read from INPUT_FOLDER; cleaned copies are written to
# OUTPUT_FOLDER. Both stay plain strings, as before.
INPUT_FOLDER = f"{BASE_DIR}/chemistry_chapters_v2"
OUTPUT_FOLDER = f"{BASE_DIR}/chemistry_chapters_cleaned"

# Only files whose names end with this suffix are selected for cleaning.
FILE_EXTENSION = ".md"

print(f"Input folder: {INPUT_FOLDER}")
print(f"Output folder: {OUTPUT_FOLDER}")
# The recorded cell output includes this line; keep code and output in sync.
print(f"Processing files with extension: {FILE_EXTENSION}")


Input folder: /Users/tanmayagarwal/Desktop/My_Computer/Columbia/Fall_2025/RA_LLM/main_folder/chemistry_chapters_v2
Output folder: /Users/tanmayagarwal/Desktop/My_Computer/Columbia/Fall_2025/RA_LLM/main_folder/chemistry_chapters_cleaned
Processing files with extension: .md


In [36]:
def remove_images(text):
    """Strip inline Markdown image tags from ``text``.

    Matches ``![alt](target)`` where ``alt`` may be empty or any run of
    characters other than square brackets, and ``target`` is a non-empty run
    of characters other than ``)``. Ordinary links (``[label](url)``, no
    leading ``!``) are left alone.

    Args:
        text: Markdown source as a single string.

    Returns:
        A ``(cleaned_text, images_removed)`` tuple: the text with every
        matched image tag deleted, and the number of tags deleted.
    """
    # Alt text excludes '[' and ']' so a match can never start at an earlier,
    # unrelated "![" and swallow the text in between; every tag the old
    # empty-alt pattern matched is still matched with the same boundaries.
    image_pattern = r'!\[[^\[\]]*\]\([^)]+\)'

    # subn removes and counts in a single pass over the text.
    cleaned_text, images_removed = re.subn(image_pattern, '', text)

    return cleaned_text, images_removed

def remove_formulas(text):
    """Delete dollar-delimited math from ``text`` and count the deletions.

    Two passes are made, in this order:

    1. Display math: ``$$ ... $$`` with no ``$`` inside (may span lines).
    2. Inline math: ``$ ... $`` whose first inner character is neither
       whitespace nor ``$``, with no ``$`` inside.

    Note that the inline pattern does not stop at line breaks, so any two
    ``$`` characters that satisfy rule 2 are treated as one formula together
    with everything between them.

    Args:
        text: Markdown source as a single string.

    Returns:
        A ``(cleaned_text, formulas_removed)`` tuple, where the count is the
        sum of display and inline matches removed.
    """
    # Display math must go first: run on "$$x$$", the inline pattern would
    # match only the inner "$x$" and leave the outer delimiters behind.
    passes = (
        r'\$\$[^$]*\$\$',
        r'\$[^\s$][^$]*\$',
    )

    removed_so_far = 0
    for delimiter_pattern in passes:
        text, hits = re.subn(delimiter_pattern, '', text)
        removed_so_far += hits

    return text, removed_so_far

def clean_markdown_file(file_path):
    """Read one file as UTF-8 and strip images, then formulas, from it.

    Args:
        file_path: Path of the file to read; nothing is written back.

    Returns:
        ``(content, images_removed, formulas_removed)`` on success. If any
        exception is raised while reading or cleaning, an error line is
        printed and ``(None, 0, 0)`` is returned instead.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            raw_text = source.read()

        # Images are removed before formulas, matching the original order.
        without_images, image_count = remove_images(raw_text)
        cleaned, formula_count = remove_formulas(without_images)
    except Exception as e:
        # Best-effort: report the problem and let the caller skip this file.
        print(f"Error processing {file_path}: {e}")
        return None, 0, 0

    return cleaned, image_count, formula_count


In [37]:
# Make sure the destination exists before any cleaned file is written.
Path(OUTPUT_FOLDER).mkdir(parents=True, exist_ok=True)
print(f"Output directory created/verified: {OUTPUT_FOLDER}")

# Always bind md_files. Without this, a missing input folder would leave the
# name undefined (NameError in the processing cell) or, worse, pointing at a
# stale list left over from an earlier run of this kernel.
md_files = []

input_path = Path(INPUT_FOLDER)
if not input_path.exists():
    print(f"Error: Input folder does not exist: {INPUT_FOLDER}")
else:
    # glob() yields entries in arbitrary filesystem order; sort so files are
    # processed and reported in a stable, reproducible order.
    md_files = sorted(input_path.glob(f"*{FILE_EXTENSION}"))
    print(f"Found {len(md_files)} {FILE_EXTENSION} files to process")


Output directory created/verified: /Users/tanmayagarwal/Desktop/My_Computer/Columbia/Fall_2025/RA_LLM/main_folder/chemistry_chapters_cleaned
Found 21 .md files to process


In [38]:
# Clean every discovered file, write the result to OUTPUT_FOLDER under the same
# file name, and print a summary. A file that cannot be read/cleaned or cannot
# be written is counted as failed and skipped; it does not stop the batch.
if md_files:
    total_images_removed = 0
    total_formulas_removed = 0
    processed_files = 0
    failed_files = 0

    # One dict of per-file statistics for each successfully written file.
    results = []

    # Loop-invariant: build the destination directory path once.
    output_dir = Path(OUTPUT_FOLDER)

    print("Processing files...")

    for file_path in tqdm(md_files, desc="Cleaning files"):
        # Clean the file
        cleaned_content, images_removed, formulas_removed = clean_markdown_file(file_path)

        # clean_markdown_file returns None content when it could not process the file.
        if cleaned_content is None:
            failed_files += 1
            print(f"✗ Failed to process: {file_path.name}")
            continue

        # Save the cleaned file. A write error (permissions, disk full, ...) is
        # treated like a read error: report it, count it, move on.
        output_file_path = output_dir / file_path.name
        try:
            output_file_path.write_text(cleaned_content, encoding='utf-8')
        except OSError as e:
            failed_files += 1
            print(f"✗ Failed to write {output_file_path}: {e}")
            continue

        # Update totals
        total_images_removed += images_removed
        total_formulas_removed += formulas_removed
        processed_files += 1

        # Store results for this file
        results.append({
            'filename': file_path.name,
            'images_removed': images_removed,
            'formulas_removed': formulas_removed,
            'total_removed': images_removed + formulas_removed
        })

        # Print progress only for files where something was actually removed
        if images_removed > 0 or formulas_removed > 0:
            print(f"✓ {file_path.name}: {images_removed} images, {formulas_removed} formulas removed")

    print("\n" + "="*60)
    print("PROCESSING COMPLETE")
    print("="*60)
    print(f"Files processed successfully: {processed_files}")
    print(f"Files failed: {failed_files}")
    print(f"Total images removed: {total_images_removed}")
    print(f"Total formulas removed: {total_formulas_removed}")
    print(f"Total items removed: {total_images_removed + total_formulas_removed}")
    print(f"\nCleaned files saved to: {OUTPUT_FOLDER}")
else:
    print("No markdown files found to process.")


Processing files...


Cleaning files: 100%|██████████| 21/21 [00:00<00:00, 804.23it/s]

✓ chapter_01.md: 40 images, 390 formulas removed
✓ chapter_11.md: 45 images, 493 formulas removed
✓ chapter_05.md: 27 images, 800 formulas removed
✓ chapter_15.md: 16 images, 920 formulas removed
✓ chapter_21.md: 37 images, 497 formulas removed
✓ chapter_04.md: 31 images, 745 formulas removed
✓ chapter_14.md: 26 images, 1073 formulas removed
✓ chapter_20.md: 100 images, 164 formulas removed
✓ chapter_10.md: 74 images, 596 formulas removed
✓ chapter_09.md: 48 images, 847 formulas removed
✓ chapter_19.md: 44 images, 547 formulas removed
✓ chapter_08.md: 57 images, 567 formulas removed
✓ chapter_18.md: 66 images, 1034 formulas removed
✓ chapter_07.md: 68 images, 773 formulas removed
✓ chapter_17.md: 20 images, 382 formulas removed
✓ chapter_03.md: 29 images, 699 formulas removed
✓ chapter_13.md: 10 images, 779 formulas removed
✓ chapter_02.md: 43 images, 431 formulas removed
✓ chapter_12.md: 40 images, 688 formulas removed
✓ chapter_06.md: 43 images, 756 formulas removed
✓ chapter_16.md: 




Detailed Results (in table form; note: "to be put into the slides")
