In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
import math

In [2]:
def print_digit_distribution(y, label="Distribution"):
    """Print the per-digit sample counts of a label matrix.

    Args:
        y: 2-D array-like of labels (one-hot in this notebook). Each column is
            summed, and column ``i`` is reported as the count for digit ``i``.
        label: Heading printed, followed by a colon, above the counts.
    """
    totals_per_digit = np.sum(y, axis=0)
    print(f"{label}:")
    rows = [f"Digit {digit}: {int(total)}" for digit, total in enumerate(totals_per_digit)]
    # The trailing empty entry produces the blank separator line after the table.
    print("\n".join(rows + [""]))

def introduce_bias(x_train, y_train, digit_to_reduce, reduction_percentage):
    """Return the training set with a share of one digit's samples removed.

    The samples to drop are picked at random with NumPy's global random state
    (``np.random.shuffle``); seed it beforehand for a reproducible subset.

    Args:
        x_train: Array-like of samples, indexed along its first axis.
        y_train: 2-D array-like of one-hot labels aligned with ``x_train``.
        digit_to_reduce: Class index whose samples are thinned out.
        reduction_percentage: Share of that digit's samples to remove, on a
            0-100 scale (``50`` removes half of them, ``0`` removes none,
            ``100`` removes all of them).

    Returns:
        Tuple ``(biased_x_train, biased_y_train)`` of NumPy arrays holding the
        surviving samples in their original order.

    Raises:
        ValueError: If ``reduction_percentage`` is outside ``[0, 100]``.
    """
    if not 0 <= reduction_percentage <= 100:
        raise ValueError(
            f"reduction_percentage must be between 0 and 100, got {reduction_percentage}"
        )

    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)

    # Find indices of the digit to reduce
    indices_of_digit = np.where(np.argmax(y_train, axis=1) == digit_to_reduce)[0]

    # Stay on the 0-100 scale throughout: the number of samples to drop is the
    # requested percentage of the samples that carry this digit.
    num_to_remove = int(round(len(indices_of_digit) * reduction_percentage / 100))

    # Randomly select the subset of the digit to remove
    np.random.shuffle(indices_of_digit)
    indices_to_remove = indices_of_digit[:num_to_remove]

    # A boolean mask filters both arrays in one pass, keeps the original
    # ordering, and preserves dtype/shape even when nothing is left or removed.
    keep_mask = np.ones(len(x_train), dtype=bool)
    keep_mask[indices_to_remove] = False
    return x_train[keep_mask], y_train[keep_mask]


def visualize_examples(model, x_test, y_test, num_examples=25, figsize=(10, 10), fontsize=10):
    """Show the first test images in a grid, titled with true and predicted digits.

    Args:
        model: Object with a ``predict`` method returning one score vector per
            input sample.
        x_test: Test images, indexed along the first axis.
        y_test: One-hot labels aligned with ``x_test``.
        num_examples: How many leading samples to show; clamped to
            ``len(x_test)``.
        figsize: Figure size passed to ``plt.figure``.
        fontsize: Font size of each subplot title.
    """
    num_examples = min(num_examples, len(x_test))
    if num_examples <= 0:
        # Nothing to draw.
        return

    # Only the displayed samples need predictions.
    predictions = model.predict(x_test[:num_examples])

    # Size the grid from the number of images so that requests above 25 still
    # fit; the default of 25 continues to give a 5x5 layout.
    num_cols = math.ceil(math.sqrt(num_examples))
    num_rows = math.ceil(num_examples / num_cols)

    # Visualize examples
    plt.figure(figsize=figsize)
    for i in range(num_examples):
        plt.subplot(num_rows, num_cols, i + 1)
        plt.imshow(x_test[i], cmap='gray')
        plt.title(f"True: {np.argmax(y_test[i])}, Predicted: {np.argmax(predictions[i])}", fontsize=fontsize)
        plt.axis('off')
    plt.show()


def plot_misclassifications(model, x_test, y_test):
    """Draw a bar chart of how many test samples of each digit were misclassified.

    Args:
        model: Object with a ``predict`` method returning one score vector per
            input sample.
        x_test: Test inputs passed to ``model.predict``.
        y_test: One-hot labels aligned with ``x_test``.
    """
    predicted = np.argmax(model.predict(x_test), axis=-1)
    actual = np.argmax(y_test, axis=1)

    # True labels of every sample the model got wrong.
    wrong_positions = np.flatnonzero(predicted != actual)
    wrong_true_labels = actual[wrong_positions]

    # Tally the errors per digit; ``np.add.at`` accumulates repeated labels.
    errors_per_digit = np.zeros(10)
    np.add.at(errors_per_digit, wrong_true_labels, 1)

    digit_axis = np.arange(10)  # digits 0 through 9
    plt.figure(figsize=(10, 6))
    plt.bar(digit_axis, errors_per_digit, color='red')
    plt.xlabel('Digits')
    plt.ylabel('Number of Misclassifications')
    plt.xticks(digit_axis)
    plt.title('Number of Misclassified Instances for Each Digit')
    plt.show()


def evaluate_model(model, x_test, y_test):
    """Print the loss and accuracy that ``model.evaluate`` reports for the test set.

    Args:
        model: Object whose ``evaluate(x_test, y_test)`` returns exactly two
            values, unpacked here as loss then accuracy.
        x_test: Test inputs.
        y_test: Test labels.
    """
    test_loss, test_accuracy = model.evaluate(x_test, y_test)
    report = (("Test Loss:", test_loss), ("Test Accuracy:", test_accuracy))
    for caption, value in report:
        print(caption, value)

def plot_confusion_matrix(model, x_test, y_test):
    """Plot a heatmap of the confusion matrix of ``model`` on the test set.

    Args:
        model: Object with a ``predict`` method returning one score vector per
            input sample.
        x_test: Test inputs passed to ``model.predict``.
        y_test: 2-D one-hot labels aligned with ``x_test``; its width defines
            the number of classes shown on both axes.
    """
    # Generate predictions
    predictions = np.argmax(model.predict(x_test), axis=-1)
    true_classes = np.argmax(y_test, axis=1)

    # Pin the label set explicitly. Without it, a class that appears in neither
    # the true nor the predicted labels is dropped from the matrix, which would
    # shift rows/columns relative to the tick labels below.
    class_labels = list(range(np.asarray(y_test).shape[1]))

    # Generate confusion matrix
    cm = confusion_matrix(true_classes, predictions, labels=class_labels)

    # Plot confusion matrix
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title('Confusion Matrix')
    plt.show()


def print_misclassified_images(model, x_test, y_test, num_images=100):
    """Report and display the misclassified samples among the first ``num_images``.

    Prints the number of misclassified samples, then shows each one in its own
    small figure titled with the predicted and true class.

    Args:
        model: Object with a ``predict`` method returning one score vector per
            input sample.
        x_test: Test images; each displayed image is reshaped to 28x28.
        y_test: One-hot labels aligned with ``x_test``.
        num_images: Number of leading test samples to inspect.
    """
    inspected_x = x_test[:num_images]
    inspected_y = y_test[:num_images]

    # Class indices for the model output and for the one-hot ground truth.
    predicted_classes = np.argmax(model.predict(inspected_x), axis=1)
    true_classes = np.argmax(inspected_y, axis=1)

    # Positions (within the inspected slice) where the two disagree.
    wrong_positions = np.flatnonzero(predicted_classes != true_classes)

    print(f"Total misclassified images in first {num_images} predictions: {len(wrong_positions)}")

    # One small figure per misclassified image.
    for position in wrong_positions:
        plt.figure(figsize=(2, 2))
        plt.imshow(x_test[position].reshape(28, 28), cmap='gray')
        plt.title(f"Predicted: {predicted_classes[position]}, True: {true_classes[position]}")
        plt.axis('off')
        plt.show()


def print_misclassified_images_by_label_in_grid(model, x_test, y_test, true_label, num_images=100):
    """Show, in one grid, the misclassified samples of a given true label.

    Only the first ``num_images`` test samples are inspected. A summary line is
    always printed; the figure is drawn only when at least one matching
    misclassification exists.

    Args:
        model: Object with a ``predict`` method returning one score vector per
            input sample.
        x_test: Test images; each displayed image is reshaped to 28x28.
        y_test: One-hot labels aligned with ``x_test``.
        true_label: Class index whose misclassified samples are displayed.
        num_images: Number of leading test samples to inspect.
    """
    # Generate predictions for the specified number of images in the test set
    predictions = model.predict(x_test[:num_images])

    # Convert predictions and one-hot labels to class numbers
    predicted_classes = np.argmax(predictions, axis=1)
    true_classes = np.argmax(y_test[:num_images], axis=1)

    # Misclassified samples whose true class is the requested label
    misclassified_indices = np.where((predicted_classes != true_classes) & (true_classes == true_label))[0]

    print(f"Total misclassified images of true label {true_label} in first {num_images} predictions: {len(misclassified_indices)}")

    num_misclassified = len(misclassified_indices)
    if num_misclassified == 0:
        # A 0x0 grid would request a zero-sized figure; there is nothing to draw.
        return

    # Smallest square grid that holds every image
    grid_size = math.ceil(math.sqrt(num_misclassified))

    # Create a figure with subplots in a square or nearly square layout
    plt.figure(figsize=(grid_size * 2, grid_size * 2))
    for i, misclassified_index in enumerate(misclassified_indices):
        plt.subplot(grid_size, grid_size, i + 1)
        plt.imshow(x_test[misclassified_index].reshape(28, 28), cmap='gray')  # Adjust the reshape dimensions as per your dataset
        plt.title(f"Pred: {predicted_classes[misclassified_index]}, True: {true_classes[misclassified_index]}", fontsize=10)
        plt.axis('off')
    plt.tight_layout()
    plt.show()




In [None]:
# Load MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Scale pixel values to [0, 1] and add the explicit channel axis that the
# Conv2D input_shape=(28, 28, 1) expects. This is the same preprocessing the
# biased-training cell applies, so the two runs see identically prepared data.
x_train = np.expand_dims(x_train / 255.0, axis=-1)
x_test = np.expand_dims(x_test / 255.0, axis=-1)

# Convert labels to one-hot encoding
y_train = to_categorical(y_train, num_classes=10)
y_test = to_categorical(y_test, num_classes=10)

# Keep only the first half of the training samples
half_index = len(x_train) // 2

x_train = x_train[:half_index]
y_train = y_train[:half_index]


# Baseline CNN: two conv/pool stages followed by a small dense classifier
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(10, activation='softmax')
])


# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model (the test set doubles as validation data here)
history = model.fit(x_train, y_train,
                    epochs=2,
                    validation_data=(x_test, y_test))

print("Running the model prior to introducing a bias in the training set.")

# Evaluating model
evaluate_model(model, x_test, y_test)

# Plotting the confusion matrix
plot_confusion_matrix(model, x_test, y_test)



In [None]:
# Per-digit error counts for the model trained in the previous cell
plot_misclassifications(model=model, x_test=x_test, y_test=y_test)

# First 25 test images with their true and predicted labels
visualize_examples(model=model, x_test=x_test, y_test=y_test, num_examples=25)

In [None]:
# Load MNIST dataset, scale pixels to [0, 1], add the channel axis and
# one-hot encode the labels
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train / 255.0
x_test = x_test / 255.0
x_train = np.expand_dims(x_train, axis=-1)
x_test = np.expand_dims(x_test, axis=-1)
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Print original dataset distribution
print_digit_distribution(y_train, label="Original Dataset Distribution")

# introduce_bias picks the samples to drop with NumPy's global RNG; seed it so
# the same subset is removed on every run. (Model weight initialisation and
# batch shuffling are not seeded here.)
np.random.seed(42)

# Introduce bias (reduce occurrences of digit '8' by 50%)
biased_x_train, biased_y_train = introduce_bias(x_train, y_train, digit_to_reduce=8, reduction_percentage=50)

# Print biased dataset distribution
print_digit_distribution(biased_y_train, label="Biased Dataset Distribution")

# Define CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(10, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model on the biased dataset
model.fit(biased_x_train, biased_y_train, epochs=2, batch_size=64, validation_split=0.1)

# Evaluating model
evaluate_model(model, x_test, y_test)

# Plotting the confusion matrix
plot_confusion_matrix(model, x_test, y_test)

# Analyze misclassifications for each digit
plot_misclassifications(model, x_test, y_test)

# Visualize predictions for the test set
visualize_examples(model, x_test, y_test, num_examples=25)

# Show every misclassified image among the first 100 test samples
print_misclassified_images(model, x_test, y_test, num_images=100)

# Visualize misclassified images for a specific digit (e.g., digit 8)
print_misclassified_images_by_label_in_grid(model, x_test, y_test, true_label=8, num_images=100)