### Step 1: Set Up the Environment

In [None]:
# Install the third-party packages imported by this notebook into the kernel's environment.
# NOTE(review): versions are unpinned, and joblib (imported below) is not listed —
# presumably it is pulled in as a scikit-learn dependency; confirm.
%pip install numpy pandas scikit-learn matplotlib seaborn

### Step 2: Import Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

### Step 3: Load and Explore the Dataset

In [None]:
# Load the MNIST dataset from OpenML.
# as_frame=True is requested explicitly because later cells rely on the pandas
# API (`X.head()`, `X_test.iloc[...]`, `y_test.values`); without it the returned
# container type depends on the default of the installed scikit-learn version.
mnist = fetch_openml('mnist_784', version=1, as_frame=True)
X, y = mnist["data"], mnist["target"]

# Display the shape of the dataset
print(f'Dataset shape: {X.shape}')
print(f'Target shape: {y.shape}')

# Display the first few rows of the dataset
print(X.head())
print(y.head())

In [None]:
# Cast the labels to 8-bit integers
y = y.astype("int8")

### Step 4: Preprocess the Data

In [None]:
# Hold out 10% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# Report how many samples ended up in each split
print(f'Training set size: {X_train.shape[0]} samples')
print(f'Testing set size: {X_test.shape[0]} samples')


In [None]:
# Fit the MinMaxScaler on the training split only, then reuse that fitted
# scaler to transform both splits
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

### Step 5: Reduce Dimensionality and Train a Logistic Regression Model

In [None]:
# Project the scaled features onto the principal components
pca = PCA(n_components=0.95, random_state=42)  # keep 95% of the variance
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)  # reuse the projection fitted on the training split

# Candidate hyperparameters: every C value is tried with every solver
param_grid = dict(
    C=[0.1, 1, 10, 100],
    solver=['lbfgs', 'liblinear', 'sag', 'saga'],
)

# Template estimator; GridSearchCV is given this object to tune
model = LogisticRegression(multi_class='ovr', max_iter=1000, random_state=42)

# 5-fold cross-validated search over the grid, parallelised with n_jobs=-1
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train_pca, y_train)

# Keep the estimator the search selected as best
best_model = grid_search.best_estimator_

### Step 6: Make Predictions

In [None]:
# Predict labels for the PCA-projected test split
y_pred = best_model.predict(X_test_pca)

# Show the first five predictions next to the corresponding true labels
print('First few predictions:', y_pred[:5])
print('First few actual values:', y_test.iloc[:5].values)

### Step 7: Evaluate the Model

In [None]:
# Compute all three evaluation artefacts up front, then report them
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
class_report = classification_report(y_true=y_test, y_pred=y_pred)

# Overall fraction of correctly classified test samples
print(f'Accuracy: {accuracy}')

# Confusion matrix, printed under its heading
print('Confusion Matrix:', conf_matrix, sep='\n')

# Per-class text report, printed under its heading
print('Classification Report:', class_report, sep='\n')

### Step 8: Visualize the Results

In [None]:
# Show the first 16 test digits in a 4x4 grid, each titled with its true and predicted label
fig, axes = plt.subplots(4, 4, figsize=(10, 10))
for i, ax in enumerate(axes.flat):
    # Each row holds one flattened image; reshape it back to 28x28 for display
    ax.imshow(X_test.iloc[i].values.reshape(28, 28), cmap='gray')
    ax.set_title(f'True: {y_test.iloc[i]}, Pred: {y_pred[i]}')
    ax.axis('off')
plt.show()

# Draw the confusion matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()

### Step 9: Save the Model

In [None]:
# Save the fitted estimator selected by the grid search.
# `model` is only the template that was handed to GridSearchCV and is never
# fitted itself, so dumping it would persist a classifier that cannot predict.
# NOTE: inputs must still be passed through the fitted `scaler` and `pca`
# before calling predict on the reloaded estimator.
joblib.dump(best_model, 'mnist_logistic_regression_model.pkl')

# Load the model from a file
# best_model = joblib.load('mnist_logistic_regression_model.pkl')