# *Configuration*

In [None]:
# Mount Google Drive at /content/drive; every data path used below lives under it.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [1]:
import os
# Presumably silences TensorFlow's C++ logging; TensorFlow itself is not imported
# in the cells visible here, so this only matters if it is imported later.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import random
# Seed Python's RNG: the augmentation code uses random.choice and random.shuffle.
random.seed(42)

import numpy as np
# Seed NumPy's global RNG: the augmentation code draws from np.random.* directly.
np.random.seed(42)

# *Import*

In [None]:
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.utils import shuffle
from skimage.exposure import equalize_adapthist
from scipy.ndimage import rotate, gaussian_filter, map_coordinates, zoom

# *Dataset*

In [None]:
data_path = "/content/drive/MyDrive/Teeth/Final"

# Later cells write train.npz / Validation.npz / Test.npz into this same folder.
# Those names have no "_"-separated label/code fields, so only entries that do
# have them are kept; otherwise re-running this cell would raise an IndexError
# while parsing. Sorting makes the split (and the seeded shuffles that follow)
# independent of the arbitrary order returned by os.listdir.
files = sorted(name for name in os.listdir(data_path) if len(name.split("_")) >= 3)
print("Dataset size: ", len(files))

# Codes whose samples always go to the validation split.
val_codes_primary = ["u7", "v8", "w9", "x10", "y11", "z12", "a61", "b62", "c63", "d64", "e65", "f66", "a67", "b68", "c69", "d70", "e71", "f72"]
# Codes whose samples go to the validation split only when the binary label is "1".
val_codes_label_based = ["a73", "b74", "c75", "d76", "e77", "f78"]

val_data = []
train_data = []

for file_name in files:
    # File names are parsed as "<prefix>_<binary label>_<code>-<rest>".
    parts = file_name.split("_")
    binary_label = parts[1]
    code = parts[2].split("-")[0]

    if code in val_codes_primary or (code in val_codes_label_based and binary_label == "1"):
        val_data.append(file_name)
    else:
        train_data.append(file_name)

Dataset size:  540


In [None]:
x_train = []
y_train = []

for file in train_data:
    # Each archive stores its volume under NumPy's default key "arr_0"; the
    # context manager closes the file handle instead of leaking one per sample.
    with np.load(os.path.join(data_path, file)) as archive:
        x = archive["arr_0"]

    # Min-max scale to [0, 1]; a constant volume would divide by zero, so map it to zeros.
    lo, hi = x.min(), x.max()
    x = (x - lo) / (hi - lo) if hi != lo else np.zeros_like(x, dtype=np.float32)
    x_train.append(x)

    # The label is read from the character at index 5 of the file name.
    y_train.append(int(file[5]))

# Append a trailing channel axis and shuffle samples and labels together.
x_train  = np.expand_dims(np.array(x_train, dtype=np.float32), axis=-1)
y_train  = np.array(y_train, dtype=np.int32)
x_train, y_train = shuffle(x_train, y_train, random_state=42)

# np.savez_compressed does not create missing folders.
os.makedirs("/content/drive/MyDrive/Init", exist_ok=True)
np.savez_compressed("/content/drive/MyDrive/Init/Train.npz", x=x_train, y=y_train)
print("Train has been saved!")

In [None]:
x_val = []
y_val = []

# Bug fix: iterate over val_data. The original looped over train_data, which made
# Validation.npz a copy of the training set.
for file in val_data:
    # Each archive stores its volume under NumPy's default key "arr_0"; the
    # context manager closes the file handle instead of leaking one per sample.
    with np.load(os.path.join(data_path, file)) as archive:
        x = archive["arr_0"]

    # Min-max scale to [0, 1]; a constant volume would divide by zero, so map it to zeros.
    lo, hi = x.min(), x.max()
    x = (x - lo) / (hi - lo) if hi != lo else np.zeros_like(x, dtype=np.float32)
    x_val.append(x)

    # The label is read from the character at index 5 of the file name.
    y_val.append(int(file[5]))

# Append a trailing channel axis and shuffle samples and labels together.
x_val  = np.expand_dims(np.array(x_val, dtype=np.float32), axis=-1)
y_val  = np.array(y_val, dtype=np.int32)
x_val, y_val = shuffle(x_val, y_val, random_state=42)

# np.savez_compressed does not create missing folders.
os.makedirs("/content/drive/MyDrive/Init", exist_ok=True)
np.savez_compressed("/content/drive/MyDrive/Init/Validation.npz", x=x_val, y=y_val)
print("Validation has been saved!")

In [None]:
x_test = []
y_test = []

test_dir = "/content/drive/MyDrive/Teeth/Final_Test"

# Sorted so the seeded shuffle below yields the same order on every run
# (os.listdir returns entries in arbitrary order).
for file in sorted(os.listdir(test_dir)):
    # Each archive stores its volume under NumPy's default key "arr_0"; the
    # context manager closes the file handle instead of leaking one per sample.
    with np.load(os.path.join(test_dir, file)) as archive:
        x = archive["arr_0"]

    # Min-max scale to [0, 1]; a constant volume would divide by zero, so map it to zeros.
    lo, hi = x.min(), x.max()
    x = (x - lo) / (hi - lo) if hi != lo else np.zeros_like(x, dtype=np.float32)
    x_test.append(x)

    # The label is the text between the last "_" and the first "." after it.
    last_part = file.split('_')[-1]
    y_test.append(int(last_part.split('.')[0]))

# Append a trailing channel axis and shuffle samples and labels together.
x_test  = np.expand_dims(np.array(x_test, dtype=np.float32), axis=-1)
y_test  = np.array(y_test, dtype=np.int32)
x_test, y_test = shuffle(x_test, y_test, random_state=42)

# np.savez_compressed does not create missing folders.
os.makedirs("/content/drive/MyDrive/Init", exist_ok=True)
np.savez_compressed("/content/drive/MyDrive/Init/Test.npz", x=x_test, y=y_test)
print("Test has been saved!")

In [None]:
def prepare_data(base_dir="/content/drive/MyDrive/Init"):
    """Load the raw train / validation / test splits saved by the cells above.

    The default folder matches where those cells write ``Train.npz``,
    ``Validation.npz`` and ``Test.npz``. The previous hard-coded
    ``/content/drive/MyDrive/Teeth/Init`` location is never written by this
    notebook, so loading from it fails on a fresh run.

    Args:
        base_dir: Folder containing the three ``.npz`` archives, each holding
            arrays under the keys ``x`` and ``y``.

    Returns:
        Tuple ``(x_train, y_train, x_val, y_val, x_test, y_test)``. Only
        ``x_test`` is explicitly cast to ``float32``; the other arrays keep the
        dtype they were stored with.
    """
    with np.load(os.path.join(base_dir, "Train.npz")) as data:
        x_train = data['x']
        y_train = data['y']

    with np.load(os.path.join(base_dir, "Validation.npz")) as data:
        x_val = data['x']
        y_val = data['y']

    with np.load(os.path.join(base_dir, "Test.npz")) as data:
        x_test = data['x'].astype(np.float32)
        y_test = data['y']

    return x_train, y_train, x_val, y_val, x_test, y_test


# Reload the saved splits from disk; the cells below work on these arrays.
x_train, y_train, x_val, y_val, x_test, y_test = prepare_data()

# *Fix Padding*

In [None]:
def remove_padding_borders(x, tol=1e-3):
    """Return a copy of ``x`` whose constant border planes are set to zero.

    ``x`` is indexed as ``x[sample, :, :, :, 0]``, so it must be 5-D; only
    channel 0 of each sample is cleaned and the input itself is not modified.
    A plane counts as padding when all of its values are close to the plane's
    first element (``np.allclose`` with ``atol=tol`` and NumPy's default
    relative tolerance). Along each of the three volume axes, the leading and
    trailing runs of such planes are zeroed.

    Args:
        x: Batch of volumes, indexed as described above.
        tol: Absolute tolerance used when testing a plane for constancy.

    Returns:
        A cleaned copy of ``x``.
    """
    def _is_constant_plane(volume, axis, position):
        plane = np.take(volume, indices=position, axis=axis)
        return np.allclose(plane, plane.flat[0], atol=tol)

    def find_padding_extent(arr, axis):
        """Return ``(start, end)`` such that ``arr[start:end]`` along ``axis`` excludes border padding."""
        length = arr.shape[axis]

        leading = 0
        while leading < length and _is_constant_plane(arr, axis, leading):
            leading += 1

        trailing = 0
        while trailing < length and _is_constant_plane(arr, axis, length - 1 - trailing):
            trailing += 1

        return leading, length - trailing

    x_clean = np.copy(x)

    for idx in tqdm(range(x.shape[0])):
        # A view into x_clean: writes below land directly in the output array.
        volume = x_clean[idx, :, :, :, 0]

        # Measure all three axes before zeroing anything, so that zeroing one
        # axis cannot influence the extent found on another.
        extents = [find_padding_extent(volume, axis) for axis in range(3)]

        for axis, (first_kept, last_kept) in enumerate(extents):
            before = [slice(None)] * 3
            after = [slice(None)] * 3
            before[axis] = slice(None, first_kept)
            after[axis] = slice(last_kept, None)
            volume[tuple(before)] = 0
            volume[tuple(after)] = 0

    return x_clean


# Zero the constant border planes of every split; each call returns a cleaned copy.
x_train = remove_padding_borders(x_train)
x_val = remove_padding_borders(x_val)
x_test = remove_padding_borders(x_test)

# *Prepare Train*

In [None]:
def elastic_deformation(data, alpha=15, sigma=3):
    """Warp a 3-D volume with a smooth random displacement field.

    For each of the three axes, Gaussian noise of the same shape as ``data`` is
    smoothed with ``gaussian_filter(sigma)`` and scaled by ``alpha``. The volume
    is then resampled at the displaced grid positions with linear
    interpolation (``order=1``) and ``'reflect'`` boundary handling.

    Args:
        data: Volume to deform; its first three axes define the sampling grid.
        alpha: Scale applied to the smoothed displacement.
        sigma: Standard deviation of the Gaussian smoothing.

    Returns:
        The deformed volume as returned by ``map_coordinates``.
    """
    dims = data.shape

    # Draw the displacement components in axis order (0, 1, 2) so a seeded
    # global NumPy RNG always reproduces the same warp.
    displacements = [
        gaussian_filter(np.random.randn(*dims), sigma) * alpha
        for _ in range(3)
    ]

    grid = np.meshgrid(
        np.arange(dims[0]), np.arange(dims[1]), np.arange(dims[2]),
        indexing='ij',
    )
    sample_points = tuple(base + shift for base, shift in zip(grid, displacements))
    return map_coordinates(data, sample_points, order=1, mode='reflect')

def random_rotation(data, max_angle=10):
    """Rotate ``data`` by a random angle in a randomly chosen plane.

    The angle is drawn uniformly from ``[-max_angle, max_angle]`` with NumPy's
    global RNG and passed unchanged to ``scipy.ndimage.rotate``. The rotation
    plane is one of the three axis pairs of a 3-D volume, picked with Python's
    ``random`` module. The output keeps the input shape (``reshape=False``)
    and uses ``'nearest'`` boundary handling.

    Args:
        data: Volume to rotate.
        max_angle: Largest absolute rotation angle.

    Returns:
        The rotated volume.
    """
    candidate_planes = [(0, 1), (0, 2), (1, 2)]

    theta = np.random.uniform(-max_angle, max_angle)
    plane = random.choice(candidate_planes)

    return rotate(
        data,
        theta,
        axes=plane,
        reshape=False,
        mode='nearest',
    )

def random_flip(data):
    """Independently mirror ``data`` along axes 0, 1 and 2.

    One ``np.random.rand()`` draw is made per axis, in axis order; the axis is
    flipped when the draw exceeds 0.5.

    Args:
        data: Array with at least three axes.

    Returns:
        The (possibly) flipped array; the input object itself when no axis is flipped.
    """
    for axis in range(3):
        if np.random.rand() > 0.5:
            data = np.flip(data, axis=axis)
    return data

def add_noise(data, min_scale=0.05, max_scale=0.15):
    """Add zero-mean Gaussian noise with a randomly chosen standard deviation.

    The standard deviation is drawn uniformly from ``[min_scale, max_scale]``
    first, then one noise value per element of ``data`` is drawn; both draws
    use NumPy's global RNG.

    Args:
        data: Array to perturb (not modified).
        min_scale: Lower bound for the noise standard deviation.
        max_scale: Upper bound for the noise standard deviation.

    Returns:
        ``data`` plus the sampled noise.
    """
    sigma = np.random.uniform(min_scale, max_scale)
    return data + np.random.normal(loc=0.0, scale=sigma, size=data.shape)

def random_zoom(data, zoom_range=(1.1, 1.5)):
    """Zoom ``data`` by a random factor and return a result of the same shape.

    The factor is drawn uniformly from ``zoom_range`` and applied to every axis
    with linear interpolation (``order=1``) and ``'nearest'`` boundary handling.
    Axes that grew are centre-cropped back to the original size. Axes that
    shrank (factor below 1) are edge-padded back to the original size;
    previously such a zoom returned a smaller volume, which breaks stacking
    samples into one array. With the default range every axis grows, so the
    default behaviour is unchanged.

    Args:
        data: Volume to zoom.
        zoom_range: ``(low, high)`` bounds for the uniform zoom factor.

    Returns:
        The zoomed volume, with the same shape as ``data``.
    """
    zoom_factor = np.random.uniform(*zoom_range)
    zoomed = zoom(data, zoom_factor, order=1, mode='nearest')

    target_shape = data.shape

    # Centre-crop every axis that is now larger than the original.
    crop_slices = tuple(
        slice((z - o) // 2, (z - o) // 2 + o) if z > o else slice(None)
        for o, z in zip(target_shape, zoomed.shape)
    )
    zoomed = zoomed[crop_slices]

    # Edge-pad every axis that is now smaller, splitting the deficit between
    # both sides. Edge values are repeated, matching mode='nearest' above.
    deficits = [o - z for o, z in zip(target_shape, zoomed.shape)]
    if any(deficit > 0 for deficit in deficits):
        pad_width = [(deficit // 2, deficit - deficit // 2) for deficit in deficits]
        zoomed = np.pad(zoomed, pad_width, mode='edge')

    return zoomed

def apply_augmentations(data):
    """Create two augmented variants of ``data`` from a shuffled augmentation pool.

    The five augmentations are shuffled with Python's ``random`` module. The
    first three are chained on one copy of ``data`` and the remaining two on
    another copy, so every augmentation is used exactly once per call.

    Args:
        data: Volume to augment; must provide ``.copy()``.

    Returns:
        Tuple ``(data, augmented1, augmented2)`` where ``data`` is the input
        object itself.
    """
    pool = [
        random_rotation,
        random_flip,
        add_noise,
        random_zoom,
        elastic_deformation,
    ]
    random.shuffle(pool)

    def _run_chain(steps):
        result = data.copy()
        for step in steps:
            result = step(result)
        return result

    # The three-step chain is evaluated first, then the two-step chain, which
    # keeps the order of RNG draws fixed.
    first_variant = _run_chain(pool[:3])
    second_variant = _run_chain(pool[3:])
    return data, first_variant, second_variant

In [None]:
def threshold_background(x, threshold=0.3):
    """Return a copy of ``x`` in which values of small magnitude are set to zero.

    Args:
        x: Input array (not modified).
        threshold: Elements with ``abs(value) < threshold`` become 0.

    Returns:
        The thresholded copy.
    """
    cleaned = np.copy(x)
    np.putmask(cleaned, np.abs(cleaned) < threshold, 0)
    return cleaned

def local_contrast_enhancement(data, clip_limit=0.03):
    """Enhance local contrast with ``skimage.exposure.equalize_adapthist``.

    Thin wrapper that forwards ``data`` and ``clip_limit`` unchanged.

    Args:
        data: Image or volume to enhance. The callers in this notebook pass
            values already scaled to [0, 1] — presumably required by skimage
            for float input; confirm against its documentation.
        clip_limit: Clipping limit forwarded to ``equalize_adapthist``.

    Returns:
        The equalized array returned by ``equalize_adapthist``.
    """
    return equalize_adapthist(data, clip_limit=clip_limit)

In [None]:
def process_files(x_init, y_init):
    """Build the augmented training set and save it as ``train.npz``.

    For every sample, channel 0 is background-thresholded, cropped to indices
    ``20:120`` along the first volume axis and contrast-enhanced. The enhanced
    volume plus two augmented variants are each given a trailing channel axis
    and min-max scaled to [0, 1], so the output holds three volumes per input
    sample, all sharing that sample's label.

    Args:
        x_init: Batch of volumes, indexed as ``x_init[idx, 20:120, :, :, 0]``.
        y_init: Labels aligned with the first axis of ``x_init``.

    Returns:
        None. Writes ``x`` (float32) and ``y`` (int32), shuffled together with
        a fixed seed, to ``/content/drive/MyDrive/Teeth/Final/train.npz``.
    """
    def _min_max_scale(volume):
        # A constant volume has zero range; return zeros instead of dividing by zero.
        lo, hi = np.min(volume), np.max(volume)
        if hi == lo:
            return np.zeros_like(volume)
        return (volume - lo) / (hi - lo)

    X, y = [], []

    x_init_rm = threshold_background(x_init)

    for idx in tqdm(range(x_init.shape[0])):
        enhanced = local_contrast_enhancement(x_init_rm[idx, 20:120, :, :, 0])

        # apply_augmentations returns (original, augmented1, augmented2).
        for volume in apply_augmentations(enhanced):
            X.append(_min_max_scale(volume[..., np.newaxis]))

        label = int(y_init[idx])
        y.extend([label, label, label])

    X = np.array(X, dtype=np.float32)
    y = np.array(y, dtype=np.int32)
    X, y = shuffle(X, y, random_state=42)

    np.savez_compressed("/content/drive/MyDrive/Teeth/Final/train.npz", x=X, y=y)
    print("Train set has been saved!")


# Build and save the augmented training set (three volumes per input sample).
process_files(x_train, y_train)

100%|██████████| 450/450 [04:39<00:00,  1.61it/s]


Train-augmented has been saved!


100%|██████████| 450/450 [00:15<00:00, 28.52it/s]


Train has been saved!


100%|██████████| 90/90 [00:04<00:00, 18.97it/s]


Validation has been saved!


# *Prepare Evaluation*

In [None]:
def process_eval_files(x_init, y_init, file_name):
    """Preprocess an evaluation split without augmentation and save it.

    For every sample, channel 0 is background-thresholded, cropped to indices
    ``20:120`` along the first volume axis, contrast-enhanced, given a trailing
    channel axis and min-max scaled to [0, 1].

    Args:
        x_init: Batch of volumes, indexed as ``x_init[idx, 20:120, :, :, 0]``.
        y_init: Labels aligned with the first axis of ``x_init``.
        file_name: Base name of the output archive (without ``.npz``).

    Returns:
        None. Writes ``x`` (float32) and ``y`` (int32), shuffled together with
        a fixed seed, to ``/content/drive/MyDrive/Teeth/Final/<file_name>.npz``.
    """
    def _min_max_scale(volume):
        # A constant volume has zero range; return zeros instead of dividing by zero.
        lo, hi = np.min(volume), np.max(volume)
        if hi == lo:
            return np.zeros_like(volume)
        return (volume - lo) / (hi - lo)

    X, y = [], []

    x_init_rm = threshold_background(x_init)

    for idx in tqdm(range(x_init.shape[0])):
        enhanced = local_contrast_enhancement(x_init_rm[idx, 20:120, :, :, 0])
        X.append(_min_max_scale(enhanced[..., np.newaxis]))
        y.append(int(y_init[idx]))

    X = np.array(X, dtype=np.float32)
    y = np.array(y, dtype=np.int32)
    X, y = shuffle(X, y, random_state=42)

    np.savez_compressed(f"/content/drive/MyDrive/Teeth/Final/{file_name}.npz", x=X, y=y)
    print(f"{file_name} set has been saved!")


# Preprocess and save the evaluation splits (no augmentation is applied to them).
process_eval_files(x_test, y_test, "Test")
process_eval_files(x_val, y_val, "Validation")

# *Evaluate Dataset*

In [None]:
def prepare_data(base_dir="/content/drive/MyDrive/Teeth/Final"):
    """Load the processed train / validation / test sets.

    Note: this definition replaces the ``prepare_data`` defined earlier in the
    notebook, which loads the raw splits; after this cell runs, the name refers
    to this loader.

    Args:
        base_dir: Folder containing ``train.npz``, ``Validation.npz`` and
            ``Test.npz``, each holding arrays under the keys ``x`` and ``y``.

    Returns:
        Tuple ``(x_train, y_train, x_val, y_val, x_test, y_test)``. Only
        ``x_test`` is explicitly cast to ``float32``; the other arrays keep the
        dtype they were stored with.
    """
    with np.load(os.path.join(base_dir, "train.npz")) as data:
        x_train = data['x']
        y_train = data['y']

    with np.load(os.path.join(base_dir, "Validation.npz")) as data:
        x_val = data['x']
        y_val = data['y']

    with np.load(os.path.join(base_dir, "Test.npz")) as data:
        x_test = data['x'].astype(np.float32)
        y_test = data['y']

    return x_train, y_train, x_val, y_val, x_test, y_test


# Rebind the split variables to the processed arrays loaded from disk.
x_train, y_train, x_val, y_val, x_test, y_test = prepare_data()

In [None]:
# 1. Shape
print("Shapes:")
for _split_name, _features in (("x_train", x_train), ("x_val", x_val), ("x_test", x_test)):
    print(f"{_split_name}:", _features.shape)

# 2. Label Balance
print("\nLabel Distribution:")
for _split_name, _labels in (("Train", y_train), ("Validation", y_val), ("Test", y_test)):
    print(f"{_split_name}:", Counter(_labels))

# 3. Normalization Check
def check_normalization(x, name):
    """Print the minimum, maximum, mean and standard deviation of ``x``.

    Args:
        x: Array to summarise.
        name: Label printed in the header line.
    """
    print(f"\n{name} stats:")
    summaries = (
        ("Min", np.min),
        ("Max", np.max),
        ("Mean", np.mean),
        ("Std", np.std),
    )
    for label, reducer in summaries:
        print(f"{label}:", reducer(x))

# Print summary statistics for each processed split.
check_normalization(x_train, "Train")
check_normalization(x_val, "Validation")
check_normalization(x_test, "Test")

# 4. Visualization - Hint-based plot
def plot_dataset(dataset, stop=90, step=10, slice_index=25):
    """Show one 2-D slice from every ``step``-th sample of ``dataset``.

    The sample range is clipped to the dataset length; the previous fixed
    ``range(0, 90, 10)`` raised ``IndexError`` on datasets with fewer than 81
    samples.

    Args:
        dataset: Array indexed as ``dataset[i, :, slice_index, :, 0]``.
        stop: Exclusive upper bound on the sample index.
        step: Stride between plotted samples.
        slice_index: Position along axis 2 of ``dataset`` at which each
            sample is sliced.
    """
    upper = min(stop, len(dataset))
    for i in range(0, upper, step):
        plt.imshow(dataset[i, :, slice_index, :, 0], cmap='gray')
        plt.title(f"Sample {i} (slice {slice_index})")
        plt.show()

# Visual spot check of a few samples from each processed split.
plot_dataset(x_train)
plot_dataset(x_test)
plot_dataset(x_val)