Import

In [None]:
# --- CELL 1: IMPORTS ---
import time
notebook_start = time.time()
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

import scipy.ndimage as ndimage
from scipy.ndimage import gaussian_filter, map_coordinates
from PIL import Image

# --- CELL 2: DATA LOADING ---
# Fetch MNIST from OpenML (or the local scikit-learn cache) as NumPy arrays.
mnist = fetch_openml(name='mnist_784', version=1, as_frame=False, cache=True, parser='auto')

# Normalise the pixel values immediately; labels are converted to uint8.
X = mnist.data / 255.0
y = mnist.target.astype(np.uint8)

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# --- CELL 3: DESKEWING ---
def deskew(image):
    """Straighten a slanted 28x28 digit using its intensity-weighted moments.

    The slant is estimated as ``mu11 / mu02`` (row/column covariance divided
    by the row variance) and removed with a shear along the column axis that
    keeps the point (14, 14) fixed.

    Parameters
    ----------
    image : numpy.ndarray
        Array holding 784 values, either flat or shaped (28, 28).

    Returns
    -------
    numpy.ndarray
        Flat array of 784 values.  When the moments are degenerate (no finite
        centre of mass, or almost no spread along the rows) the pixel values
        are returned untouched, but still flattened, so every call yields the
        same output shape regardless of the input shape.
    """
    img = image.reshape(28, 28)

    # A blank image makes center_of_mass divide by a zero total intensity.
    # Silence that warning and test for a non-finite result explicitly
    # (this also catches +/-inf, not only NaN).
    with np.errstate(divide='ignore', invalid='ignore'):
        mu = ndimage.center_of_mass(img)
    if not np.all(np.isfinite(mu)):
        return img.flatten()

    y_coords, x_coords = np.mgrid[:28, :28]
    mu11 = np.sum((x_coords - mu[1]) * (y_coords - mu[0]) * img)
    mu02 = np.sum((y_coords - mu[0]) ** 2 * img)
    # Too little vertical spread to estimate a slant reliably.
    if abs(mu02) < 1e-2:
        return img.flatten()

    skew = mu11 / mu02
    matrix = np.array([[1, 0], [skew, 1]])
    center = np.array([14, 14])
    # Offset chosen so that the shear maps `center` onto itself.
    offset = center - np.dot(matrix, center)
    sheared = ndimage.affine_transform(
        img, matrix, offset=offset, order=1, mode='constant', cval=0
    )
    return sheared.flatten()

# Deskew every image of both splits and collect the results in one array per split.
X_train_deskewed = np.array(list(map(deskew, X_train)))
X_test_deskewed = np.array(list(map(deskew, X_test)))

In [None]:
# --- CELL 4: DEFINITIONS AND DATA LOADING (COMPLETE) ---
import joblib
import os
import numpy as np
import scipy.ndimage as ndimage

# 1. DEFINE FUNCTIONS (global so that TTA can reach them)
def shift_image(image, dx, dy):
    """Translate a 28x28 digit by ``dx`` columns and ``dy`` rows.

    Areas uncovered by the shift are filled with 0.  The result is returned
    as a flat array of 784 values.
    """
    grid = image.reshape(28, 28)
    moved = ndimage.shift(grid, shift=[dy, dx], mode="constant", cval=0)
    return moved.flatten()

def rotate_image(image, angle):
    """Rotate a 28x28 digit by ``angle`` within its original 28x28 frame.

    ``reshape=False`` keeps the output the same size as the input; areas
    uncovered by the rotation are filled with 0.  Returns a flat array of
    784 values.
    """
    grid = image.reshape(28, 28)
    turned = ndimage.rotate(grid, angle, reshape=False, mode="constant", cval=0)
    return turned.flatten()

def zoom_image(image, zoom_factor):
    """Rescale a 28x28 digit by ``zoom_factor`` and fit it back into 28x28.

    A factor below 1.0 shrinks the digit, which is then zero-padded on all
    sides (any odd leftover pixel goes to the bottom/right).  Any other
    factor enlarges it, and the central 28x28 window is cropped out.
    Returns a flat array of 784 values.
    """
    original = image.reshape(28, 28)
    rows, cols = original.shape
    scaled = ndimage.zoom(original, zoom_factor, order=1)
    scaled_rows, scaled_cols = scaled.shape

    if not zoom_factor < 1.0:
        # Enlarged (or unchanged): keep only the centred window.
        top = (scaled_rows - rows) // 2
        left = (scaled_cols - cols) // 2
        return scaled[top:top + rows, left:left + cols].flatten()

    # Shrunk: centre the smaller image inside a zero frame.
    top = (rows - scaled_rows) // 2
    left = (cols - scaled_cols) // 2
    bottom = rows - scaled_rows - top
    right = cols - scaled_cols - left
    framed = np.pad(scaled, ((top, bottom), (left, right)), mode='constant')
    return framed.flatten()

def elastic_transform(image, alpha=8, sigma=3, random_state=None):
    """Apply a random smooth (elastic) distortion to a 28x28 digit.

    Two uniform random fields in [-1, 1) are smoothed with a Gaussian filter
    of width ``sigma`` and multiplied by ``alpha`` to obtain per-pixel
    displacements; the image is then resampled bilinearly at the displaced
    positions (positions outside the frame read as 0).

    Parameters
    ----------
    image : numpy.ndarray
        Array holding 784 values, either flat or shaped (28, 28).
    alpha : float, default 8
        Scale applied to the smoothed displacement fields.
    sigma : float, default 3
        Standard deviation of the Gaussian smoothing.
    random_state : None, int or numpy.random.RandomState, default None
        Source of randomness.  ``None`` keeps the previous behaviour (a fresh,
        unseeded generator on every call); an int seeds a new generator; an
        existing ``RandomState`` is used as-is, so repeated calls advance it.

    Returns
    -------
    numpy.ndarray
        Flat array of 784 values.
    """
    shape = (28, 28)
    image_2d = image.reshape(shape)

    if not isinstance(random_state, np.random.RandomState):
        random_state = np.random.RandomState(random_state)

    # Draw order (dx first, then dy) is kept so a given seed is stable.
    dx = gaussian_filter(random_state.rand(*shape) * 2 - 1, sigma) * alpha
    dy = gaussian_filter(random_state.rand(*shape) * 2 - 1, sigma) * alpha

    rows, cols = np.mgrid[0:shape[0], 0:shape[1]]
    coords = np.array([rows + dy, cols + dx])
    warped = map_coordinates(image_2d, coords, order=1, mode='constant', cval=0)
    return warped.flatten()

# 2. CACHE LOGIC
# Building the augmented training set is expensive, so it is written to disk
# once and reloaded on later runs.
# NOTE(review): the cache is identified by its path only; if X_train_deskewed
# or the augmentation list changes, delete the file to force a rebuild.
fast_data_path = "C:/mnist_data/mnist_svc_hog_max.joblib"
# exist_ok avoids the separate exists() check (and the race it implies).
os.makedirs(os.path.dirname(fast_data_path), exist_ok=True)

if os.path.exists(fast_data_path):
    print(">>> Laddar RIGID dataset från NVMe...")
    # joblib.load unpickles the file: only load caches you created yourself.
    X_train_augmented, y_train_augmented = joblib.load(fast_data_path)
else:
    print(">>> Skapar nytt rigid dataset...")
    # Start from the deskewed originals and append one transformed copy of the
    # whole training set per augmentation below.
    X_train_aug = [X_train_deskewed]
    y_train_aug = [y_train]

    # One-pixel shifts in the four axis directions.
    for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
        X_train_aug.append(np.apply_along_axis(shift_image, 1, X_train_deskewed, dx, dy))
        y_train_aug.append(y_train)

    # Small rotations in both directions.
    for angle in (4, -4, 8, -8, 12, -12):
        X_train_aug.append(np.apply_along_axis(rotate_image, 1, X_train_deskewed, angle))
        y_train_aug.append(y_train)

    # Slight shrink and enlargement.
    for z in (0.9, 1.1):
        X_train_aug.append(np.apply_along_axis(zoom_image, 1, X_train_deskewed, z))
        y_train_aug.append(y_train)

    X_train_augmented = np.concatenate(X_train_aug)
    y_train_augmented = np.concatenate(y_train_aug)

    # Shuffle images and labels with the same permutation.  A dedicated,
    # seeded generator makes a rebuilt cache identical from run to run and
    # leaves NumPy's global random state untouched.
    shuffle_idx = np.random.RandomState(42).permutation(len(X_train_augmented))
    X_train_augmented = X_train_augmented[shuffle_idx]
    y_train_augmented = y_train_augmented[shuffle_idx]
    joblib.dump((X_train_augmented, y_train_augmented), fast_data_path)

print(f"Klart! Rader: {len(X_train_augmented)}")

In [None]:
# --- CELL 5: PCA ---
# Stand-alone PCA: 140 whitened components, seeded for reproducibility.
# NOTE(review): none of the later cells visible here reference `pca`; the
# ensemble pipeline in cell 6 constructs its own PCA instance.
pca = PCA(n_components=140, whiten=True, random_state=42)


In [None]:
# --- CELL 6: SMART ENSEMBLE (reuses the settings of the saved SVC) ---

# 1. Try to load the existing, previously saved model.
# NOTE(review): the augmentation cache in cell 4 uses the same file name
# (under C:/mnist_data); if the notebook runs from that directory this path
# points at the dataset tuple rather than a model, hence the validation below.
model_path = 'mnist_svc_hog_max.joblib'

if os.path.exists(model_path):
    print(f">>> Hittade guld-modellen ({model_path}). Laddar in SVC-experten...")
    # joblib.load unpickles the file: only load models you saved yourself.
    loaded_pipe = joblib.load(model_path)
    # Take the 'svc' step out of the saved pipeline, if the loaded object
    # really is a pipeline that has one.
    svc_expert = getattr(loaded_pipe, 'named_steps', {}).get('svc')
    if svc_expert is None:
        print(f">>> VARNING: {model_path} saknar ett 'svc'-steg. Skapar en ny SVC (C=25).")
else:
    print(">>> VARNING: Hittade ingen sparad fil. Skapar en ny SVC (C=25).")
    svc_expert = None

if svc_expert is None:
    svc_expert = SVC(C=25, kernel='rbf', cache_size=2000)

# 2. Define the two additional members (KNN and RF).
# These are trained from scratch on the pipeline's features.
knn_expert = KNeighborsClassifier(n_neighbors=3, weights='distance', n_jobs=-1)
rf_expert = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)

# 3. Build the voting ensemble.
# VotingClassifier.fit clones each estimator and fits the clone, so a loaded
# SVC contributes its hyper-parameters only; it is retrained together with
# KNN and RF rather than reused in its fitted state.
voting_clf = VotingClassifier(
    estimators=[
        ('svc', svc_expert),
        ('knn', knn_expert),
        ('rf', rf_expert)
    ],
    voting='hard',
    n_jobs=-1  # fit the members in parallel
)

# 4. The complete pipeline (HOG + PCA with 140 components).
# NOTE(review): HogTransformer is not defined in any cell visible here; it
# must be defined or imported before this cell is run.
ensemble_pipeline = Pipeline([
    ('feature', FeatureUnion([
        # Seeded so that repeated runs produce the same components.
        ('pca', PCA(n_components=140, whiten=True, random_state=42)),
        ('hog', HogTransformer())
    ])),
    ('scaler', StandardScaler()),
    ('ensemble', voting_clf)
])

print("Ensemblen är nu redo med din vinnande SVC som ordförande!")

In [None]:
# --- CELL 7: TRAINING THE ENSEMBLE ---
# Wall-clock timer for the fit below.
t0 = time.time()
print(f"Startar träning av juryn på {len(X_train_augmented)} rader...")

# Fit the whole pipeline on the augmented training set. The VotingClassifier
# was created with n_jobs=-1, so SVC, RF and KNN are trained in parallel.
ensemble_pipeline.fit(X_train_augmented, y_train_augmented)

print(f">>> Träning klar! Tid: {time.time() - t0:.1f} sekunder")

In [None]:
# --- CELL 8: PLAIN EVALUATION (no test-time augmentation) ---
t0 = time.time()

print("Genomför slutgiltig prediktion med Master Ensemble...")
y_pred = ensemble_pipeline.predict(X_test_deskewed)

# Indices of the misclassified test samples; reused for the error count and
# for the plot below.
still_wrong = np.where(y_pred != y_test)[0]
final_acc = np.mean(y_pred == y_test)

print("\n" + "="*30)
print("RESULTAT UTAN TTA")
print("="*30)
print(f"Accuracy: {final_acc:.5f}")
# The test-set size is taken from the data instead of being hard-coded; it is
# printed with a space as thousands separator (e.g. "14 000").
print(f"Antal fel: {len(still_wrong)} av {format(len(y_test), ',').replace(',', ' ')}")
print("="*30)

# Show the first five misclassified test images with true and predicted label.
plt.figure(figsize=(15, 4))
for i, idx in enumerate(still_wrong[:5]):
    plt.subplot(1, 5, i + 1)
    plt.imshow(X_test_deskewed[idx].reshape(28, 28), cmap='gray')
    plt.title(f"Sann: {y_test[idx]}\nGissad: {y_pred[idx]}")
    plt.axis('off')
plt.show()