In [7]:
import os

def walk_through_dir(dir_path):
    """Print a per-directory summary of the tree rooted at ``dir_path``.

    For every directory visited by ``os.walk`` this reports how many immediate
    subdirectories it has and how many of its files end in ``.png`` or ``.jpg``
    (the match is case-sensitive), then lists each immediate subdirectory.

    Args:
        dir_path: Root directory to traverse.

    Returns:
        None. Everything is written to standard output.
    """
    image_suffixes = ('.png', '.jpg')
    for current_dir, subdirs, files in os.walk(dir_path):
        image_total = sum(1 for name in files if name.endswith(image_suffixes))
        print(f"There are {len(subdirs)} directories and {image_total} images in '{current_dir}'.")
        # One indented line per child directory, directly under its summary line.
        for child in subdirs:
            print(f"  Subdirectory: {child}")

# Print the directory/image summary for the tree rooted at the relative path 'data'.
walk_through_dir('data')


There are 657 directories and 0 images in 'data'.
  Subdirectory: 000
  Subdirectory: 001
  Subdirectory: 002
  Subdirectory: 003
  Subdirectory: 004
  Subdirectory: 005
  Subdirectory: 006
  Subdirectory: 007
  Subdirectory: 008
  Subdirectory: 009
  Subdirectory: 010
  Subdirectory: 011
  Subdirectory: 012
  Subdirectory: 013
  Subdirectory: 014
  Subdirectory: 015
  Subdirectory: 016
  Subdirectory: 017
  Subdirectory: 018
  Subdirectory: 019
  Subdirectory: 020
  Subdirectory: 021
  Subdirectory: 022
  Subdirectory: 023
  Subdirectory: 024
  Subdirectory: 025
  Subdirectory: 026
  Subdirectory: 027
  Subdirectory: 028
  Subdirectory: 029
  Subdirectory: 030
  Subdirectory: 031
  Subdirectory: 032
  Subdirectory: 033
  Subdirectory: 034
  Subdirectory: 035
  Subdirectory: 036
  Subdirectory: 037
  Subdirectory: 038
  Subdirectory: 039
  Subdirectory: 040
  Subdirectory: 041
  Subdirectory: 042
  Subdirectory: 043
  Subdirectory: 044
  Subdirectory: 045
  Subdirectory: 046
  Subdirec

In [9]:
import os

def walk_through_dir(dir_path):
    """Walk ``dir_path``, report image counts, and collect placeholder texts.

    For each directory visited, prints how many immediate subdirectories and
    image files it contains, prints the path of every image found, and appends
    one placeholder string per image (standing in for real OCR output) to the
    result.

    A file counts as an image when its name ends in ``.png`` or ``.jpg``,
    compared case-insensitively so that names such as ``scan.PNG`` are not
    silently skipped. Directories and files are visited in sorted order, which
    makes the returned list reproducible across runs and file systems.

    Args:
        dir_path: Root directory to traverse.

    Returns:
        list[str]: One ``"Text extracted from <filename>"`` entry per image, in
        traversal order. Empty when no images are found or the directory does
        not exist (``os.walk`` yields nothing in that case).
    """
    image_suffixes = ('.png', '.jpg')
    texts = []
    for dirpath, dirnames, filenames in os.walk(dir_path):
        # os.walk yields names in arbitrary order; in a top-down walk, sorting
        # dirnames in place also fixes the order in which children are visited.
        dirnames.sort()
        # Filter once and reuse the result for both the count and the loop.
        image_names = sorted(
            name for name in filenames if name.lower().endswith(image_suffixes)
        )
        print(f"There are {len(dirnames)} directories and {len(image_names)} images in '{dirpath}'.")

        # Placeholder for extracting text from each image
        for filename in image_names:
            image_path = os.path.join(dirpath, filename)
            print(f"Found image: {image_path}")

            # Simulate OCR output
            extracted_text = f"Text extracted from {filename}"  # Placeholder text
            texts.append(extracted_text)

    return texts

# Run directory traversal
texts = walk_through_dir('data')  # Change 'data' to your directory path
# Report the count and a short preview only: the list holds one entry per image,
# so printing it in full floods the cell output.
print(f"Extracted {len(texts)} texts; first 5: {texts[:5]}")


There are 657 directories and 0 images in 'data'.
There are 0 directories and 59 images in 'data\000'.
Found image: data\000\label_10_a01-049u.png
Found image: data\000\label_11_a01-049x.png
Found image: data\000\label_12_a01-053u.png
Found image: data\000\label_13_a01-058u.png
Found image: data\000\label_14_a01-063u.png
Found image: data\000\label_15_a01-068u.png
Found image: data\000\label_16_a01-072u.png
Found image: data\000\label_17_a01-077u.png
Found image: data\000\label_18_a01-082u.png
Found image: data\000\label_19_a01-087u.png
Found image: data\000\label_1_a01-000u.png
Found image: data\000\label_20_a01-091u.png
Found image: data\000\label_21_a01-096u.png
Found image: data\000\label_22_a01-102u.png
Found image: data\000\label_23_a01-107u.png
Found image: data\000\label_24_a01-113u.png
Found image: data\000\label_25_a01-117u.png
Found image: data\000\label_26_a01-122u.png
Found image: data\000\label_27_a01-128u.png
Found image: data\000\label_28_a01-132u.png
Found image: data\

In [10]:
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer

class TextDataset(Dataset):
    """Map-style dataset yielding equal-length token-id tensors for a list of texts.

    Each text is tokenized once at construction time and every item is
    right-padded to the length of the longest tokenized text, so the default
    ``DataLoader`` collation can stack items into a batch.

    Padding is done here rather than through ``encode(..., padding=True)``:
    for a single sequence that option pads to the longest sequence "in the
    batch" (i.e. not at all), and tokenizers that define no pad token, such as
    GPT-2, raise when asked to pad.

    Args:
        texts: Sequence of strings to tokenize.
        tokenizer: Tokenizer providing ``encode(text, return_tensors="pt", ...)``
            plus ``pad_token_id`` and/or ``eos_token_id``.
        max_length: Optional cap on the number of tokens per item, forwarded to
            the tokenizer's truncation. ``None`` keeps the tokenizer's default.

    Attributes:
        pad_token_id: Id used as filler; the tokenizer's pad id when it has one,
            otherwise its EOS id.

    Raises:
        ValueError: If the tokenizer exposes neither a pad nor an EOS token id.
    """

    def __init__(self, texts, tokenizer, max_length=None):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Fall back to EOS when the tokenizer has no dedicated pad token.
        pad_id = getattr(tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = getattr(tokenizer, "eos_token_id", None)
        if pad_id is None:
            raise ValueError("tokenizer must define pad_token_id or eos_token_id")
        self.pad_token_id = pad_id

        # Tokenize up front so the common padded length is known before batching.
        self._encoded = [self._encode(text) for text in texts]
        self._padded_length = max((ids.numel() for ids in self._encoded), default=0)

    def _encode(self, text):
        """Return the 1-D integer tensor of token ids for ``text`` (no padding)."""
        ids = self.tokenizer.encode(
            text, return_tensors="pt", truncation=True, max_length=self.max_length
        )
        # encode() returns shape (1, seq_len); drop the leading batch axis.
        return ids.squeeze(0).long()

    def __len__(self):
        return len(self._encoded)

    def __getitem__(self, idx):
        ids = self._encoded[idx]
        # Start from an all-padding row and copy the real tokens into its head.
        padded = ids.new_full((self._padded_length,), self.pad_token_id)
        padded[: ids.numel()] = ids
        return padded

# Build the pretrained GPT-2 tokenizer, wrap the texts in a dataset, and serve
# them in shuffled mini-batches of two.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
dataset = TextDataset(texts=texts, tokenizer=tokenizer)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=2)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
from transformers import GPT2LMHeadModel
import torch

# Choose the compute device: CUDA when one is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained GPT-2 language model and place it on that device.
# Module.to() moves the parameters and returns the same module object.
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

# AdamW over every model parameter
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)


In [None]:
# Training Loop
model.train()
num_epochs = 2  # Number of epochs

# Id of the filler token used to bring the sequences of a batch to equal length.
# GPT-2 defines no pad token; EOS is the usual stand-in in that case.
# NOTE(review): when EOS doubles as padding, genuine EOS tokens are hidden from
# the loss as well - confirm that is acceptable for this data.
pad_token_id = tokenizer.pad_token_id
if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id

for epoch in range(num_epochs):
    total_loss = 0
    num_batches = 0
    for batch in dataloader:
        inputs = batch.to(device)  # Move inputs to GPU if available

        # 1 for real tokens, 0 for padding.
        attention_mask = (inputs != pad_token_id).long()
        if not attention_mask.any():
            # A batch made only of padding has no target tokens; its loss would
            # be NaN and the optimizer step would corrupt the weights.
            continue

        # Causal-LM labels are the inputs themselves; positions set to -100 are
        # ignored by the loss, so padding does not contribute to it.
        labels = inputs.masked_fill(attention_mask == 0, -100)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Average over the batches actually trained on; guard against zero batches.
    avg_loss = total_loss / max(num_batches, 1)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")


In [None]:
# Write the fine-tuned model and its tokenizer into the same output directory
model_save_path = "fine_tuned_gpt2"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")
