# Amharic Character Dataset - Exploratory Data Analysis

This notebook explores the processed Amharic character dataset.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import os
import random

# Config
# Both locations are relative to the notebook's working directory.
# Directory that holds the processed character images.
IMG_DIR = '../data/processed/amharic_chars'
# CSV index of the processed dataset, written by the preprocessing step.
INDEX_FILE = '../data/processed/dataset_index.csv'


In [None]:
# Load Index
# Read the dataset index into `df`; if the CSV is absent, only print a hint
# (in that case `df` is not defined by this cell).
if not os.path.exists(INDEX_FILE):
    print("Index file not found! Run preprocessing first.")
else:
    df = pd.read_csv(INDEX_FILE)
    print(f"Loaded {len(df)} records.")
    print(df.head())

In [None]:
# Class Distribution
# value_counts() orders classes by descending frequency, so the leading
# entries are the most common classes.
class_counts = df['class'].value_counts()
print(f"Number of classes: {len(class_counts)}")
print(f"Min samples per class: {class_counts.min()}")
print(f"Max samples per class: {class_counts.max()}")

# Bar chart of the 50 most frequent classes on an explicitly created axes.
fig, ax = plt.subplots(figsize=(20, 5))
class_counts[:50].plot(kind='bar', ax=ax)
ax.set_title('Top 50 Class Distribution')
plt.show()

In [None]:
# Visualize Samples
def show_samples(n=10):
    """Display up to ``n`` randomly sampled character images in a single row.

    Rows are drawn from the global ``df``. Each row's ``path`` column locates
    the image file and its ``class`` column is used as the subplot title.

    Args:
        n: Number of samples to show. Capped at ``len(df)`` so that asking
            for more samples than there are records does not raise.
    """
    # DataFrame.sample() without replacement raises ValueError when n exceeds
    # the number of rows, so clamp the request to what is available.
    count = min(n, len(df))
    samples = df.sample(count)

    plt.figure(figsize=(20, 4))
    for i, (_, row) in enumerate(samples.iterrows()):
        ax = plt.subplot(1, count, i + 1)
        # Hide the frame unconditionally so a missing file leaves a clean,
        # labelled gap instead of a bare set of axes with ticks.
        ax.axis('off')

        # The index may store paths relative to the project root rather than
        # to this notebook's directory, so also try one directory up.
        candidates = (row['path'], os.path.join('..', row['path']))
        img_path = next((p for p in candidates if os.path.exists(p)), None)
        if img_path is None:
            ax.set_title(f"{row['class']} (missing)")
            continue

        # imshow() reads the pixel data into an array when called, so the
        # underlying file handle can be released immediately afterwards.
        with Image.open(img_path) as img:
            ax.imshow(img, cmap='gray')
        ax.set_title(row['class'])
    plt.show()


show_samples(10)

In [None]:
# Split Distribution
# Number of records assigned to each value of the `split` column.
split_counts = df['split'].value_counts()
print(split_counts)
