In [1]:
import numpy as np 
import pandas as pd 

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D , MaxPool2D , Flatten , Dropout , BatchNormalization
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from keras.callbacks import ReduceLROnPlateau

In [3]:
# Load the train and test CSV files from the working directory into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('sign_mnist_train.csv', 'sign_mnist_test.csv')
)

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the label and pixel columns.
train_df.describe()

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
count,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0,...,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0
mean,12.318813,145.419377,148.500273,151.247714,153.546531,156.210891,158.411255,160.472154,162.339683,163.954799,...,141.104863,147.495611,153.325806,159.125332,161.969259,162.736696,162.906137,161.966454,161.137898,159.824731
std,7.287552,41.358555,39.942152,39.056286,38.595247,37.111165,36.125579,35.016392,33.661998,32.651607,...,63.751194,65.512894,64.427412,63.708507,63.738316,63.444008,63.50921,63.298721,63.610415,64.396846
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,121.0,126.0,130.0,133.0,137.0,140.0,142.0,144.0,146.0,...,92.0,96.0,103.0,112.0,120.0,125.0,128.0,128.0,128.0,125.5
50%,13.0,150.0,153.0,156.0,158.0,160.0,162.0,164.0,165.0,166.0,...,144.0,162.0,172.0,180.0,183.0,184.0,184.0,182.0,182.0,182.0
75%,19.0,174.0,176.0,178.0,179.0,181.0,182.0,183.0,184.0,185.0,...,196.0,202.0,205.0,207.0,208.0,207.0,207.0,206.0,204.0,204.0
max,24.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,...,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0


The train_df dataset consists of a first column holding the class label (values 0 to 24) followed by 784 pixel columns. The labels are extracted into a separate Series called `y_train`, and the `label` column is dropped to build the feature frame `X_train`, which contains only the 784 pixel values for each image. The test set is split the same way into `y_test` and `X_test`.

In [5]:
# Separate the target column from the pixel columns for both splits.
# `drop` returns a new frame, so train_df / test_df themselves are left untouched.
y_train, X_train = train_df['label'], train_df.drop('label', axis=1)
y_test, X_test = test_df['label'], test_df.drop('label', axis=1)

In [6]:
# Function to split dataset based on label ranges
def split_dataset(X, y):
    """Partition samples into three groups according to their label value.

    The groups are labels 0-7, labels 8-13 and labels 14-24 (bounds inclusive).

    Rows of ``X`` and entries of ``y`` are matched by position rather than by
    index label, so the function also works when the two inputs carry
    different indexes (e.g. ``X`` is a NumPy array and ``y`` is a Series with
    a non-default index).

    Parameters
    ----------
    X : DataFrame or array-like
        Feature rows, one per sample.
    y : Series or array-like
        Labels, one per row of ``X`` and in the same order.

    Returns
    -------
    tuple
        ``((X_set1, y_set1), (X_set2, y_set2), (X_set3, y_set3))`` where each
        ``X_set`` is a DataFrame and each ``y_set`` is a Series.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """
    # Convert to DataFrame/Series for easy indexing
    X = pd.DataFrame(X)
    y = pd.Series(y)

    if len(X) != len(y):
        raise ValueError(
            "X and y must have the same number of samples "
            "(got {} and {})".format(len(X), len(y))
        )

    # Plain boolean arrays select rows positionally; boolean Series would be
    # aligned on index labels and fail (or mis-select) when indexes differ.
    label_values = y.to_numpy()
    in_set1 = (label_values >= 0) & (label_values <= 7)
    in_set2 = (label_values > 7) & (label_values <= 13)
    in_set3 = (label_values > 13) & (label_values <= 24)

    # Split data
    X_set1, y_set1 = X[in_set1], y[in_set1]
    X_set2, y_set2 = X[in_set2], y[in_set2]
    X_set3, y_set3 = X[in_set3], y[in_set3]

    return (X_set1, y_set1), (X_set2, y_set2), (X_set3, y_set3)

# Partition the training data into the three label groups.
train_groups = split_dataset(X_train, y_train)
(X_train_1, y_train_1), (X_train_2, y_train_2), (X_train_3, y_train_3) = train_groups

# Partition the test data with the same label boundaries.
test_groups = split_dataset(X_test, y_test)
(X_test_1, y_test_1), (X_test_2, y_test_2), (X_test_3, y_test_3) = test_groups


In [7]:
# Reshape every flat row of 784 pixel values into a 28x28 single-channel image.
# np.asarray accepts both a DataFrame and an ndarray, so re-running this cell
# (after the names have already been rebound to arrays) no longer fails the way
# a second `.to_numpy()` call would.
IMAGE_SHAPE = (-1, 28, 28, 1)

X_train_1 = np.asarray(X_train_1).reshape(IMAGE_SHAPE)
X_test_1 = np.asarray(X_test_1).reshape(IMAGE_SHAPE)

X_train_2 = np.asarray(X_train_2).reshape(IMAGE_SHAPE)
X_test_2 = np.asarray(X_test_2).reshape(IMAGE_SHAPE)

X_train_3 = np.asarray(X_train_3).reshape(IMAGE_SHAPE)
X_test_3 = np.asarray(X_test_3).reshape(IMAGE_SHAPE)


In [8]:
from tensorflow.keras.utils import to_categorical

# Map raw label values onto contiguous class indices 0..NUM_CLASSES-1 and one-hot encode.
#
# The raw labels span 0-24, so the previous `label - 1` sent label 0 to -1, which is
# not a valid class index (with NumPy-style indexing it would silently wrap onto the
# last class and collide with label 24). Instead, each label is encoded by its rank
# among the distinct label values present in the full training set.
NUM_CLASSES = 24  # must match the number of units in the model's softmax layer

label_values = np.sort(np.asarray(y_train.unique()))
assert len(label_values) <= NUM_CLASSES, (
    "found {} distinct labels but the model only has {} output classes".format(
        len(label_values), NUM_CLASSES
    )
)


def encode_labels(labels):
    """One-hot encode raw labels using their rank among the training label values.

    Raises ValueError if a label is not one of the values seen in ``y_train``.
    """
    labels = np.asarray(labels)
    if not np.isin(labels, label_values).all():
        raise ValueError("encountered a label that does not occur in the training set")
    class_index = np.searchsorted(label_values, labels)
    return to_categorical(class_index, num_classes=NUM_CLASSES)


y_train_1 = encode_labels(y_train_1)
y_test_1 = encode_labels(y_test_1)
y_train_2 = encode_labels(y_train_2)
y_test_2 = encode_labels(y_test_2)
y_train_3 = encode_labels(y_train_3)
y_test_3 = encode_labels(y_test_3)


In [9]:
# Halve the learning rate (down to a floor of 1e-5) when validation accuracy
# has not improved for 2 epochs.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.5,
    patience=2,
    min_lr=0.00001,
    verbose=1,
)

Model Architecture

In [10]:
# CNN classifier: three Conv -> BatchNorm -> MaxPool stages followed by a dense head.
# The input shape is declared with an explicit keras.Input instead of passing
# `input_shape=` to the first Conv2D, which makes Keras emit a warning.
# With 'same' padding and stride-2 pooling the spatial size goes 28 -> 14 -> 7 -> 4.
model = Sequential([
    keras.Input(shape=(28, 28, 1)),
    Conv2D(75, (3, 3), strides=1, padding='same', activation='relu'),
    BatchNormalization(),
    MaxPool2D((2, 2), strides=2, padding='same'),
    Conv2D(50, (3, 3), strides=1, padding='same', activation='relu'),
    Dropout(0.2),
    BatchNormalization(),
    MaxPool2D((2, 2), strides=2, padding='same'),
    Conv2D(25, (3, 3), strides=1, padding='same', activation='relu'),
    BatchNormalization(),
    MaxPool2D((2, 2), strides=2, padding='same'),
    Flatten(),
    Dense(units=512, activation='relu'),
    Dropout(0.3),
    # One output unit per class; labels are fed as 24-wide one-hot vectors.
    Dense(units=24, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


EWC for Continual Learning Implementation

Split Data: Divide the dataset into three subsets:

Subset 1: Labels 0-7
Subset 2: Labels 8-13
Subset 3: Labels 14-24
Train Sequentially with EWC:

Train the model on Subset 1, compute Fisher Information Matrix (FIM), and save important weights.
Apply EWC loss when training on Subset 2, preventing drastic changes to important weights.
Repeat the process for Subset 3.


Train the model on the first subset and compute the Fisher Information Matrix (FIM).
Store the important parameters learned in the first task.
Apply EWC penalty when training on the second subset.
Repeat the process for the third subset.

In [11]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

class EWC:
    """Elastic Weight Consolidation (EWC) helper for sequential training.

    After a task has been learned, ``compute_fisher_information`` estimates the
    diagonal of the (empirical) Fisher information on that task's data and
    snapshots the current weights. ``ewc_loss`` then returns a quadratic
    penalty that discourages important weights from moving away from the
    snapshot, and ``apply_ewc`` adds that penalty to a task loss.

    Only one Fisher/anchor set is kept: every call to
    ``compute_fisher_information`` replaces the previous estimate.

    Parameters
    ----------
    model : Keras model whose ``trainable_variables`` are regularised.
    dataset : array-like of inputs, indexable along the first axis.
    labels : one-hot encoded labels, one row per input.
    lambda_ewc : float, weight of the penalty term.
    """

    def __init__(self, model, dataset, labels, lambda_ewc=0.1):
        self.model = model
        self.dataset = dataset
        # One-hot labels are kept as a float tensor; class indices are recovered via argmax.
        self.labels = tf.convert_to_tensor(labels, dtype=tf.float32)
        self.lambda_ewc = lambda_ewc
        self.fisher_information = {}  # variable key -> mean squared gradient
        self.optimal_params = {}      # variable key -> weight snapshot (NumPy array)

    @staticmethod
    def _variable_key(var):
        """Return a per-variable dictionary key.

        Keras 3 variables report short names such as ``"kernel"`` that repeat
        across layers and expose the unique identifier as ``path``; older
        variables only have a (unique) ``name``. Prefer ``path`` when present.
        """
        path = getattr(var, 'path', None)
        return path if path else var.name

    def compute_fisher_information(self, max_samples=None):
        """Estimate the diagonal empirical Fisher information and snapshot the weights.

        For each selected sample the gradient of its negative log-likelihood
        (cross-entropy against the stored label) is computed with the model in
        inference mode, squared, and averaged over the samples.

        Parameters
        ----------
        max_samples : int or None
            If given, only about this many evenly spaced samples are used,
            which makes the estimate much cheaper. ``None`` uses every sample.

        Raises
        ------
        ValueError
            If no samples are available.
        """
        variables = self.model.trainable_variables
        class_ids = tf.argmax(self.labels, axis=1)  # Convert one-hot to class indices
        total = int(class_ids.shape[0])
        if total == 0:
            raise ValueError("cannot estimate Fisher information from an empty dataset")

        if max_samples:
            stride = max(1, total // int(max_samples))
            sample_ids = list(range(0, total, stride))[:int(max_samples)]
        else:
            sample_ids = list(range(total))

        squared_sums = [None] * len(variables)
        for i in sample_ids:
            x = tf.cast(self.dataset[i:i + 1], tf.float32)
            target = class_ids[i:i + 1]
            with tf.GradientTape() as tape:
                # training=False: no dropout noise, BatchNorm uses its moving statistics.
                probs = self.model(x, training=False)
                nll = tf.reduce_sum(
                    tf.keras.losses.sparse_categorical_crossentropy(
                        target, probs, from_logits=False
                    )
                )
            grads = tape.gradient(nll, variables)
            # Square per sample *before* averaging; squaring the gradient of a
            # summed batch loss would not give the Fisher diagonal.
            for k, grad in enumerate(grads):
                if grad is None:
                    continue
                squared = tf.square(grad)
                squared_sums[k] = squared if squared_sums[k] is None else squared_sums[k] + squared

        fisher = {}
        anchors = {}
        n_used = float(len(sample_ids))
        for var, acc in zip(variables, squared_sums):
            if acc is None:
                continue
            key = self._variable_key(var)
            fisher[key] = acc / n_used
            anchors[key] = var.numpy()
        self.fisher_information = fisher
        self.optimal_params = anchors

    def ewc_loss(self):
        """Compute the EWC penalty: lambda * sum_i F_i * (theta_i - theta_i*)^2."""
        penalty = 0
        for var in self.model.trainable_variables:
            key = self._variable_key(var)
            if key in self.fisher_information:
                drift = tf.square(var - self.optimal_params[key])
                penalty += tf.reduce_sum(self.fisher_information[key] * drift)

        return self.lambda_ewc * penalty

    def apply_ewc(self, loss):
        """Return ``loss`` with the current EWC penalty added."""
        return loss + self.ewc_loss()


In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Define data augmentation
datagen = ImageDataGenerator(
    rotation_range=15,      # Rotate images by 15 degrees
    width_shift_range=0.1,  # Shift width by 10%
    height_shift_range=0.1, # Shift height by 10%
    shear_range=0.1,        # Apply shear transformation
    zoom_range=0.1,         # Random zoom
    horizontal_flip=True,   # Flip images horizontally
    brightness_range=[0.8, 1.2],  # Adjust brightness
    fill_mode='nearest'     # Fill in missing pixels
)

# --- Task 1: plain training on the first subset (no penalty yet) ---
ewc = EWC(model, X_train_1, y_train_1, lambda_ewc=0.1)

# Fit model using augmented data
train_generator_1 = datagen.flow(X_train_1, y_train_1, batch_size=32)
model.fit(train_generator_1, validation_data=(X_test_1, y_test_1), epochs=10, callbacks=[learning_rate_reduction])

# Estimate weight importance on task-1 data and snapshot the task-1 weights
ewc.compute_fisher_information()

# --- Task 2: train on the second subset with the EWC penalty ---
train_generator_2 = datagen.flow(X_train_2, y_train_2, batch_size=32)

def custom_loss(y_true, y_pred):
    """Categorical cross-entropy plus the current EWC penalty."""
    base_loss = tf.keras.losses.categorical_crossentropy(y_true, y_pred)
    return ewc.apply_ewc(base_loss)

model.compile(optimizer=Adam(), loss=custom_loss, metrics=['accuracy'])
model.fit(train_generator_2, validation_data=(X_test_2, y_test_2), epochs=10, callbacks=[learning_rate_reduction])

# Re-estimate importance on the task that was just learned. The EWC object still
# pointed at the task-1 data, so it has to be switched to task-2 data first.
# Note: EWC keeps a single Fisher/anchor set, so from here on the penalty is
# anchored to the weights as they are after task 2.
ewc.dataset = X_train_2
ewc.labels = tf.convert_to_tensor(y_train_2, dtype=tf.float32)
ewc.compute_fisher_information()

# --- Task 3: train on the third subset with the refreshed penalty ---
# Compile again so the training step is rebuilt: a step traced during the previous
# fit may still hold the old Fisher/anchor values as captured constants.
model.compile(optimizer=Adam(), loss=custom_loss, metrics=['accuracy'])
train_generator_3 = datagen.flow(X_train_3, y_train_3, batch_size=32)
model.fit(train_generator_3, validation_data=(X_test_3, y_test_3), epochs=10, callbacks=[learning_rate_reduction])


Epoch 1/10


  self._warn_if_super_not_called()


[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 43ms/step - accuracy: 0.5369 - loss: 1.3715 - val_accuracy: 0.7067 - val_loss: 0.8681 - learning_rate: 0.0010
Epoch 2/10
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 43ms/step - accuracy: 0.8997 - loss: 0.2686 - val_accuracy: 0.8665 - val_loss: 0.3134 - learning_rate: 0.0010
Epoch 3/10
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 43ms/step - accuracy: 0.9423 - loss: 0.1579 - val_accuracy: 0.9603 - val_loss: 0.0926 - learning_rate: 0.0010
Epoch 4/10
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 42ms/step - accuracy: 0.9719 - loss: 0.0823 - val_accuracy: 0.8683 - val_loss: 0.3632 - learning_rate: 0.0010
Epoch 5/10
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 42ms/step - accuracy: 0.9702 - loss: 0.0814 - val_accuracy: 0.9881 - val_loss: 0.0340 - learning_rate: 0.0010
Epoch 6/10
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1

In [None]:
# Score the final model on each held-out subset, in task order.
# evaluate() returns the loss followed by the compiled metric (accuracy).

# Subset 1
test_loss_1, test_accuracy_1 = model.evaluate(x=X_test_1, y=y_test_1)
print(f"Test Accuracy on Set 1: {test_accuracy_1:.4f}")

# Subset 2
test_loss_2, test_accuracy_2 = model.evaluate(x=X_test_2, y=y_test_2)
print(f"Test Accuracy on Set 2: {test_accuracy_2:.4f}")

# Subset 3
test_loss_3, test_accuracy_3 = model.evaluate(x=X_test_3, y=y_test_3)
print(f"Test Accuracy on Set 3: {test_accuracy_3:.4f}")


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predicted class = arg-max over the softmax outputs;
# true class = arg-max of the one-hot label row.
y_pred_1 = model.predict(X_test_1)
y_pred_1_classes = np.argmax(y_pred_1, axis=1)
y_test_1_classes = np.argmax(y_test_1, axis=1)

y_pred_2 = model.predict(X_test_2)
y_pred_2_classes = np.argmax(y_pred_2, axis=1)
y_test_2_classes = np.argmax(y_test_2, axis=1)

y_pred_3 = model.predict(X_test_3)
y_pred_3_classes = np.argmax(y_pred_3, axis=1)
y_test_3_classes = np.argmax(y_test_3, axis=1)

# Per-class precision / recall / F1 for each subset
print("Classification Report for Test Set 1:")
report_1 = classification_report(y_test_1_classes, y_pred_1_classes)
print(report_1)

print("Classification Report for Test Set 2:")
report_2 = classification_report(y_test_2_classes, y_pred_2_classes)
print(report_2)

print("Classification Report for Test Set 3:")
report_3 = classification_report(y_test_3_classes, y_pred_3_classes)
print(report_3)


In [None]:

# Function to plot confusion matrix
def plot_confusion_matrix(y_true, y_pred, title, num_classes=24):
    """Draw an annotated confusion-matrix heatmap.

    The matrix is always built over the class indices ``0..num_classes-1`` so
    that its rows/columns line up with the tick labels. Without an explicit
    ``labels`` list, ``confusion_matrix`` only includes the classes that occur
    in ``y_true``/``y_pred``, which made the fixed 0-23 ticks point at the
    wrong cells whenever a subset did not contain every class.

    Parameters
    ----------
    y_true : array-like of int
        True class indices.
    y_pred : array-like of int
        Predicted class indices.
    title : str
        Figure title.
    num_classes : int, default 24
        Number of classes shown on each axis.
    """
    class_ids = list(range(num_classes))
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_ids, yticklabels=class_ids)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title(title)
    plt.show()

# One heatmap per held-out subset, in task order.
plot_confusion_matrix(
    y_true=y_test_1_classes, y_pred=y_pred_1_classes, title="Confusion Matrix - Test Set 1"
)
plot_confusion_matrix(
    y_true=y_test_2_classes, y_pred=y_pred_2_classes, title="Confusion Matrix - Test Set 2"
)
plot_confusion_matrix(
    y_true=y_test_3_classes, y_pred=y_pred_3_classes, title="Confusion Matrix - Test Set 3"
)
