# **Build a Dataset Class for Horse Breeds**

https://www.kaggle.com/datasets/olgabelitskaya/horse-breeds

In [1]:
import kagglehub

# Fetch the latest version of the dataset and keep the directory it was placed in
dataset_handle = "olgabelitskaya/horse-breeds"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'horse-breeds' dataset.
Path to dataset files: /kaggle/input/horse-breeds


### Imports

In [2]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision import models

from sklearn.model_selection import train_test_split

# Use a CUDA GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs unchanged on CPU-only runtimes.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cpu


### Split the data into train, val, and test sets (stratified)

In [3]:
img_dir = path

# Keep only the PNG files; sorting gives the rows a fixed order before splitting.
img_files = sorted(name for name in os.listdir(img_dir) if name.endswith('.png'))

breed_names = [
    'Akhal-Teke',
    'Appaloosa',
    'Orlov Trotter',
    'Vladimir Heavy Draft',
    'Percheron',
    'Arabian',
    'Friesian',
]

# The first two characters of each file name are parsed as a 1-based class id
# and shifted to a 0-based label.
# NOTE(review): presumably label i corresponds to breed_names[i] - confirm.
labels = [int(name[:2]) - 1 for name in img_files]
df = pd.DataFrame(dict(filename=img_files, label=labels))

# Step 1: hold out 20% of the rows, stratified by label (fixed seed).
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

# Step 2: cut the held-out part in half (again stratified) to get val and test.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label'],
    random_state=42,
)

print(f'Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}')
print(f'Train distribution:\n{train_df["label"].value_counts().sort_index()}')

Train: 536, Val: 67, Test: 67
Train distribution:
label
0    98
1    84
2    86
3    30
4    45
5    97
6    96
Name: count, dtype: int64


### Dataset Class

In [None]:
class HorseDataset(Dataset):
    """Map-style dataset that pairs image files with their labels.

    Args:
        dataframe: Table with a 'filename' column (file name inside
            ``img_dir``) and a 'label' column. The index is discarded so
            that rows are addressed by position.
        img_dir: Directory that contains the image files.
        transform: Optional callable applied to the RGB PIL image before it
            is returned. ``None`` returns the PIL image unchanged.
    """

    def __init__(self, dataframe, img_dir, transform=None):
        # Drop whatever index the frame arrived with (e.g. one left over from
        # a split) so positional lookups line up with 0..len-1.
        self.df = dataframe.reset_index(drop=True)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of samples (rows in the dataframe)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the sample at position ``idx``.

        Returns:
            A ``(image, label)`` tuple: the image converted to RGB (and passed
            through ``transform`` when one was given) and the value stored in
            the row's 'label' column.

        Raises:
            IndexError: If ``idx`` is outside the dataframe (raised by
                ``DataFrame.iloc``).
        """
        # Materialise the row once and reuse it for both fields.
        row = self.df.iloc[idx]
        img_path = os.path.join(self.img_dir, row['filename'])

        # convert() returns a new image object, so the source file can be
        # closed as soon as the conversion is done instead of waiting for
        # garbage collection.
        with Image.open(img_path) as source:
            image = source.convert('RGB')

        # Explicit None check: a transform object that defines __len__ or
        # __bool__ must not be skipped just because it evaluates as falsy.
        if self.transform is not None:
            image = self.transform(image)

        return image, row['label']

### Transforms

### Create Dataloader objects

#### Display some images

### Define Model 

### Define Loss and Optimizer

#### Build one_epoch_training function loop 

#### Build one_epoch_validation function loop 

### Combine all to train the model
It should save the best model and track the train and val loss and accuracy.


### Test the model on the test set

### Show some predictions with the images

### Analyze the results
Is the model overfitting/underfitting?
Plot the training and validation loss/accuracy curves

### Load the model