In [None]:
# Create a new Jupyter Notebook file named 'mnist_analysis.ipynb'
# Copy and paste the following content into the file

# %% [markdown]
# # MNIST Handwritten Digit Recognition
# 
# This notebook explores the MNIST dataset of handwritten digits, visualizes it with PCA, and compares a Random Forest classifier with a small dense neural network.

# %% [markdown]
# ## 1. Importing Libraries

# %%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Set style for plots.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*';
# older releases only ship 'seaborn-whitegrid'. Pick whichever name this
# installation provides instead of failing with OSError on an older environment.
plot_style = 'seaborn-v0_8-whitegrid'
if plot_style not in plt.style.available:
    plot_style = 'seaborn-whitegrid'
plt.style.use(plot_style)
sns.set_palette("Set2")

# %% [markdown]
# ## 2. Loading and Exploring the Data

# %%
# Read the MNIST CSV file into a DataFrame
df = pd.read_csv('mnist.csv')

# Report the table dimensions, then preview the leading rows
dataset_shape = df.shape
print("Dataset shape:", dataset_shape)
print("\nFirst few rows:")
df.head(n=5)

# %%
# Check for missing values.
# isnull().sum() yields one count per column; the second .sum() collapses those
# into a single grand total, so the output is labelled as a total rather than
# as a per-column breakdown.
total_missing = int(df.isnull().sum().sum())
print(f"Total missing values across all columns: {total_missing}")  # 0 means the data is complete

# %%
# Bar chart of how many rows carry each value of the 'label' column
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='label', ax=ax)
ax.set_title('Distribution of Digits in the Dataset')
ax.set_xlabel('Digit')
ax.set_ylabel('Count')
plt.show()

# %% [markdown]
# ## 3. Data Preprocessing

# %%
# Target is the 'label' column; every other column is a feature
y = df['label']
X = df.drop(columns='label')

# Divide the features by 255 (0-255 pixel intensities -> 0-1)
X = X.div(255.0)

# Hold out 20% of the rows for testing; stratify on y so both splits keep
# the same class proportions, and fix the seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

# %% [markdown]
# ## 4. Visualizing the Digits

# %%
# Function to display digits
def plot_digits(images, labels, n_rows=2, n_cols=5, image_shape=(28, 28)):
    """Show the leading images in a grid, each titled with its label.

    Parameters
    ----------
    images : array-like
        2-D collection with one flattened image per row (NumPy array or
        DataFrame). Each row must hold ``image_shape[0] * image_shape[1]``
        values.
    labels : array-like
        Labels in the same positional order as ``images`` (Series, list or
        array). They are matched by position, not by pandas index.
    n_rows, n_cols : int
        Grid dimensions; at most ``n_rows * n_cols`` images are drawn.
    image_shape : tuple of int
        Shape each flattened row is reshaped to before it is displayed.
    """
    image_rows = np.asarray(images)
    label_values = np.asarray(labels)
    # Never index past the data: with fewer samples than grid cells the
    # trailing cells are simply left empty instead of raising IndexError.
    n_shown = min(n_rows * n_cols, len(image_rows), len(label_values))

    plt.figure(figsize=(12, 6))
    for i in range(n_shown):
        plt.subplot(n_rows, n_cols, i + 1)
        plt.imshow(image_rows[i].reshape(image_shape), cmap='gray')
        plt.title(f'Label: {label_values[i]}')
        plt.axis('off')
    plt.tight_layout()
    plt.show()

# Show a grid of the leading training samples with their labels
plot_digits(images=X_train.to_numpy(), labels=y_train)

# %% [markdown]
# ## 5. Dimensionality Reduction with PCA

# %%
# Apply PCA to reduce dimensions for visualization.
# random_state pins the projection whenever scikit-learn's 'auto' policy picks
# the randomized SVD solver (which solver is chosen depends on the installed
# version and the data shape), so the scatter plot is identical on every run.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_train)

# Plot the first two principal components.
# vmin/vmax of -0.5/9.5 give each of the ten digits its own equally sized
# 'tab10' colour band, and the integer ticks land in the middle of each band,
# so the colorbar reads as ten discrete classes instead of a continuous scale.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_train, cmap='tab10',
                      alpha=0.6, vmin=-0.5, vmax=9.5)
plt.colorbar(scatter, label='Digit', ticks=list(range(10)))
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('PCA of MNIST Dataset')
plt.show()

# %% [markdown]
# ## 6. Model Training - Random Forest

# %%
# Fit a 100-tree Random Forest with a fixed seed on the training split
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred_rf = rf_model.predict(X_test)

# Share of test samples whose predicted label matches the true label
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")

# %% [markdown]
# ## 7. Model Training - Neural Network

# %%
# Build a simple neural network.
# Seed Python, NumPy and the Keras backend first: weight initialisation,
# dropout masks and batch shuffling are all random, so without a seed the
# accuracy reported for this model changes from run to run.
# (Some GPU kernels may still introduce small run-to-run differences.)
keras.utils.set_random_seed(42)

model = keras.Sequential([
    # Declare the input explicitly; passing input_shape= to the first Dense
    # layer is discouraged by current Keras releases. The width is taken from
    # the training data rather than hard-coded.
    keras.Input(shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(10, activation='softmax')  # one output per class (digits 0-9)
])

# Compile the model.
# The 'sparse_' variant of the loss consumes integer class labels directly,
# so y_train does not need to be one-hot encoded.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model; validation_split holds back 20% of the training data so
# that per-epoch validation metrics are recorded in `history`.
history = model.fit(X_train, y_train,
                    epochs=15,
                    batch_size=32,
                    validation_split=0.2,
                    verbose=1)

# %% [markdown]
# ## 8. Model Evaluation

# %%
# Score the trained network on the held-out test split
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print(f"Neural Network Test Accuracy: {test_acc:.4f}")

# Training curves: one panel per metric, training vs. validation in each.
# Each entry: (train key, validation key, train legend, validation legend, title, y-axis label)
curve_specs = [
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Model Loss', 'Loss'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, (train_key, val_key, train_label, val_label, panel_title, y_label) in zip(axes, curve_specs):
    ax.plot(history.history[train_key], label=train_label)
    ax.plot(history.history[val_key], label=val_label)
    ax.set_title(panel_title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()

plt.tight_layout()
plt.show()

# %%
# Per-class scores for every test sample, then the highest-scoring class per row
y_pred_nn = model.predict(X_test)
y_pred_nn_classes = y_pred_nn.argmax(axis=1)

# Confusion matrix drawn as an annotated heatmap (integer cell counts)
cm = confusion_matrix(y_test, y_pred_nn_classes)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

# Text summary of the per-class metrics
print("Classification Report:")
print(classification_report(y_test, y_pred_nn_classes))

# %% [markdown]
# ## 9. Visualizing Misclassified Examples

# %%
# Positions within the test split where the predicted class differs from the true one
misclassified_idx = np.where(y_pred_nn_classes != y_test)[0]

# Draw up to ten of the mistakes in a 2x5 grid, or report that there are none
if len(misclassified_idx) == 0:
    print("No misclassified examples found!")
else:
    plt.figure(figsize=(12, 6))
    for slot, row_pos in enumerate(misclassified_idx[:10], start=1):
        # Positional lookup: the indices above are offsets, not pandas index labels
        pixels = X_test.iloc[row_pos].values.reshape(28, 28)
        plt.subplot(2, 5, slot)
        plt.imshow(pixels, cmap='gray')
        plt.title(f'True: {y_test.iloc[row_pos]}, Pred: {y_pred_nn_classes[row_pos]}')
        plt.axis('off')
    plt.tight_layout()
    plt.show()

# %% [markdown]
# ## 10. Conclusion

# %%
# Side-by-side test accuracy of the two classifiers
models = ['Random Forest', 'Neural Network']
accuracies = [accuracy_rf, test_acc]

plt.figure(figsize=(8, 6))
bars = plt.bar(models, accuracies, color=['skyblue', 'lightgreen'])
plt.title('Model Comparison')
plt.ylabel('Accuracy')
plt.ylim(0, 1)

# Write each accuracy just above its bar, centred horizontally
for rect in bars:
    bar_top = rect.get_height()
    centre_x = rect.get_x() + rect.get_width()/2.
    plt.text(centre_x, bar_top + 0.01,
             f'{bar_top:.4f}', ha='center', va='bottom')

plt.show()

# %%
print("Analysis Complete!")