In [1]:
import os
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

import tensorflow as tf
from tensorflow.keras import layers, models, regularizers, mixed_precision
from tensorflow.keras.constraints import MinMaxNorm
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2_as_graph

# GPUメモリ設定
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

# Mixed Precision
mixed_precision.set_global_policy('mixed_float16')
print("TensorFlow Version:", tf.__version__)

2026-01-08 13:37:40.056669: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  if not hasattr(np, "object"):


TensorFlow Version: 2.20.0


In [2]:
BATCH_SIZE = 128 # ResNet20ならバッチサイズを増やしても大丈夫です
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.cifar10.load_data()

# 正規化
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0

def augment(image, label):
    # 1. 少し大きくリサイズ (32 -> 36)
    image = tf.image.resize(image, [36, 36])
    # 2. ランダムクロップ (36 -> 32)
    image = tf.image.random_crop(image, size=[32, 32, 3])
    # 3. 水平反転
    image = tf.image.random_flip_left_right(image)
    # 4. その他の調整
    image = tf.image.random_brightness(image, max_delta=0.2)
    image = tf.image.random_contrast(image, lower=0.8, upper=1.2)
    return image, label

def resize_only(image, label):
    # テストデータはそのまま (32x32)
    # image = tf.image.resize(image, [32, 32]) # 必要ならリサイズ
    return image, label

# パイプライン構築
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)) \
    .shuffle(5000) \
    .map(augment, num_parallel_calls=tf.data.AUTOTUNE) \
    .batch(BATCH_SIZE) \
    .prefetch(tf.data.AUTOTUNE)

test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)) \
    .map(resize_only, num_parallel_calls=tf.data.AUTOTUNE) \
    .batch(BATCH_SIZE) \
    .prefetch(tf.data.AUTOTUNE)

  d = cPickle.load(f, encoding="bytes")
I0000 00:00:1767847076.435234     749 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1767847076.437326     749 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3582 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6
2026-01-08 13:37:56.442070: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 614400000 exceeds 10% of free system memory.
2026-01-08 13:37:58.798938: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 614400000 exceeds 10% of free system memory.


In [3]:
class ParticipatingConv2D(layers.Layer):
    """
    ナッシュ均衡pruning用の畳み込み層 (修正版)
    - カーネル正則化 (Weight Decay) を追加
    - beta, gamma, kernel_regularizer_l2 を初期化引数で設定可能
    """
    def __init__(self, filters, kernel_size, strides=1, padding='same', 
                 beta=1e-4, gamma=1e-4, use_bias=False, 
                 kernel_regularizer_l2=1e-4, # 追加: カーネル自体の重み減衰
                 **kwargs):
        super().__init__(**kwargs)
        self.filters = filters
        self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
        self.strides = strides
        self.padding = padding.upper()
        self.use_bias = use_bias
        
        # ペナルティ係数
        self.beta = beta   # L2 (参加度×カーネル)
        self.gamma = gamma # L1 (参加度のみ)
        self.kernel_regularizer_l2 = kernel_regularizer_l2 # L2 (カーネルのみ)
        
    def build(self, input_shape):
        self.kernel = self.add_weight(
            name='kernel',
            shape=(*self.kernel_size, input_shape[-1], self.filters),
            initializer='glorot_uniform',
            trainable=True
        )
        
        if self.use_bias:
            self.bias = self.add_weight(
                name='bias',
                shape=(self.filters,),
                initializer='zeros',
                trainable=True
            )
        
        self.participation = self.add_weight(
            name='participation',
            shape=(self.filters,),
            initializer='ones',
            trainable=True,
            constraint=MinMaxNorm(min_value=0.0, max_value=1.0)
        )

    def call(self, inputs, training=None):
        if self.padding == 'SAME':
            y = tf.nn.conv2d(inputs, self.kernel, strides=self.strides, padding='SAME')
        else:
            y = tf.nn.conv2d(inputs, self.kernel, strides=self.strides, padding='VALID')
        
        if self.use_bias:
            y = tf.nn.bias_add(y, self.bias)
        
        # 参加度を適用
        y = y * self.participation
        
        if training:
            # (A) カーネル自体の正則化 (Weight Decay)
            kernel_loss = self.kernel_regularizer_l2 * tf.reduce_sum(tf.square(self.kernel))
            
            # (B) Nash均衡Pruning用のペナルティ
            l2_per_filter = tf.reduce_sum(tf.square(self.kernel), axis=[0, 1, 2])
            nash_l2 = self.beta * tf.reduce_sum(l2_per_filter * tf.square(self.participation))
            nash_l1 = self.gamma * tf.reduce_sum(tf.abs(self.participation))

            # 合計をLossに追加
            self.add_loss(kernel_loss + nash_l2 + nash_l1)
        
        return y
    
    def get_active_filters(self, threshold=0.01):
        return tf.reduce_sum(tf.cast(self.participation > threshold, tf.int32)).numpy()
    
    def get_sparsity(self, threshold=0.01):
        active = self.get_active_filters(threshold)
        return 1.0 - (active / self.filters)

In [4]:
def basic_block_participating(x, filters, stride=1, beta=1e-4, gamma=1e-4, kernel_reg=1e-4):
    """
    ResNet20用 Basic Block (Participating)
    """
    shortcut = x
    
    # Pre-activationっぽい構成にするか、標準的な構成にするか
    # ここでは標準的な構成 (Conv -> BN -> ReLU) を採用
    
    # 1. Conv 3x3
    y = ParticipatingConv2D(filters, 3, strides=stride, padding='same', 
                           beta=beta, gamma=gamma, kernel_regularizer_l2=kernel_reg)(x)
    y = layers.BatchNormalization()(y)
    y = layers.Activation('relu')(y)

    # 2. Conv 3x3
    y = ParticipatingConv2D(filters, 3, strides=1, padding='same', 
                           beta=beta, gamma=gamma, kernel_regularizer_l2=kernel_reg)(y)
    y = layers.BatchNormalization()(y)

    # Shortcut調整 (次元が違う場合)
    if stride > 1 or x.shape[-1] != filters:
        shortcut = layers.Conv2D(filters, 1, strides=stride, padding='same', 
                                 kernel_regularizer=regularizers.l2(kernel_reg))(x)
        shortcut = layers.BatchNormalization()(shortcut)

    # Add
    y = layers.Add()([shortcut, y])
    y = layers.Activation('relu')(y)
    return y

def build_resnet20_participating(input_shape=(32, 32, 3), classes=10, 
                                 beta=1e-4, gamma=1e-4, kernel_reg=1e-4):
    """
    ResNet20 (3 stages x 3 blocks + stem + head)
    フィルタ数は標準の [16, 32, 64]
    """
    inputs = layers.Input(input_shape)
    
    # Stem
    x = layers.Conv2D(16, 3, strides=1, padding='same', 
                      kernel_regularizer=regularizers.l2(kernel_reg))(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    
    # Stage 1 (16 filters)
    for _ in range(3):
        x = basic_block_participating(x, 16, stride=1, beta=beta, gamma=gamma, kernel_reg=kernel_reg)
        
    # Stage 2 (32 filters)
    x = basic_block_participating(x, 32, stride=2, beta=beta, gamma=gamma, kernel_reg=kernel_reg)
    for _ in range(2):
        x = basic_block_participating(x, 32, stride=1, beta=beta, gamma=gamma, kernel_reg=kernel_reg)
        
    # Stage 3 (64 filters)
    x = basic_block_participating(x, 64, stride=2, beta=beta, gamma=gamma, kernel_reg=kernel_reg)
    for _ in range(2):
        x = basic_block_participating(x, 64, stride=1, beta=beta, gamma=gamma, kernel_reg=kernel_reg)
        
    # Head
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dropout(0.3)(x) # Dropout追加
    outputs = layers.Dense(classes, activation='softmax', dtype='float32')(x)
    
    return models.Model(inputs, outputs, name="ResNet20_Nash")

In [5]:
class PruningStatsCallback(tf.keras.callbacks.Callback):
    def __init__(self, threshold=0.01):
        super().__init__()
        self.threshold = threshold
        self.history = {'sparsity': [], 'mean_participation': []}
    
    def on_epoch_end(self, epoch, logs=None):
        total_filters = 0
        active_filters = 0
        participation_sum = 0.0
        
        for layer in self.model.layers:
            if isinstance(layer, ParticipatingConv2D):
                total_filters += layer.filters
                active_filters += layer.get_active_filters(self.threshold)
                participation_sum += tf.reduce_sum(layer.participation).numpy()
        
        sparsity = 1.0 - (active_filters / total_filters) if total_filters > 0 else 0.0
        mean_s = participation_sum / total_filters if total_filters > 0 else 0.0
        
        self.history['sparsity'].append(sparsity)
        self.history['mean_participation'].append(mean_s)
        
        print(f"\n[Pruning] Sparsity: {sparsity*100:.1f}%, Mean s: {mean_s:.4f}")

# --- 学習実行 ---
# 係数設定 (実験で調整するならここを変える)
BETA = 1e-4
GAMMA = 1e-4
KERNEL_L2 = 1e-4

model = build_resnet20_participating(
    input_shape=(32, 32, 3), # CIFAR-10の元サイズ
    classes=10,
    beta=BETA,
    gamma=GAMMA,
    kernel_reg=KERNEL_L2
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# コールバック
pruning_stats = PruningStatsCallback()
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)

print(f"\nTraining Start: Beta={BETA}, Gamma={GAMMA}, L2={KERNEL_L2}")

history = model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=50, # ResNet20なら50epochでも数十分で終わるはずです
    callbacks=[pruning_stats, early_stop, reduce_lr]
)


Training Start: Beta=0.0001, Gamma=0.0001, L2=0.0001
Epoch 1/50


2026-01-08 13:38:03.159250: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 614400000 exceeds 10% of free system memory.


TypeError: Cannot convert a list containing a tensor of dtype <dtype: 'float16'> to <dtype: 'float32'> (Tensor is: <tf.Tensor 'Sum_3:0' shape=() dtype=float16>)

In [None]:
# 可視化
epochs = range(1, len(history.history['loss']) + 1)

plt.figure(figsize=(15, 5))

# Accuracy
plt.subplot(1, 3, 1)
plt.plot(epochs, history.history['accuracy'], label='Train')
plt.plot(epochs, history.history['val_accuracy'], label='Val')
plt.title('Accuracy')
plt.legend()

# Loss
plt.subplot(1, 3, 2)
plt.plot(epochs, history.history['loss'], label='Train')
plt.plot(epochs, history.history['val_loss'], label='Val')
plt.title('Loss')
plt.legend()

# Sparsity
plt.subplot(1, 3, 3)
plt.plot(epochs, pruning_stats.history['sparsity'], label='Sparsity', color='green')
plt.title('Sparsity Evolution')
plt.xlabel('Epoch')
plt.legend()

plt.tight_layout()
plt.show()