In [85]:

import os

traindir = r"C:\Users\Srija\Downloads\train\train"
cat_files = sorted([f for f in os.listdir(traindir) if f.startswith("cat")])
dog_files = sorted([f for f in os.listdir(traindir) if f.startswith("dog")])
print(f"{len(cat_files)} cat files, {len(dog_files)} dog files")


12500 cat files, 12500 dog files


In [47]:


# import numpy as np
# import cv2
# from sklearn.decomposition import PCA
# from sklearn.linear_model import SGDClassifier
# from sklearn.metrics import accuracy_score
# import joblib
# import os

# # ---------- PARAMETERS TO CHANGE FOR EACH NEW BATCH ---------------
# start_cat = 0        # set to 0, then 1000, then 2000, etc. for each run
# start_dog = 0
# batch_size = 1000
# img_size = 64
# traindir = r"C:\Users\Srija\Downloads\train\train"

# # Reload file list, so notebook can use result from Cell 2:
# cat_files = sorted([f for f in os.listdir(traindir) if f.startswith('cat')])
# dog_files = sorted([f for f in os.listdir(traindir) if f.startswith('dog')])

# # Select current batch
# cat_batch_files = cat_files[start_cat:start_cat+batch_size]
# dog_batch_files = dog_files[start_dog:start_dog+batch_size]

# # Helper, loads images given the sliced list
# def load_images(files, label):
#     data, labels = [], []
#     for file in files:
#         try:
#             img = cv2.imread(os.path.join(traindir, file), cv2.IMREAD_GRAYSCALE)
#             img = cv2.resize(img, (img_size, img_size))
#             data.append(img.flatten())
#             labels.append(label)
#         except Exception as e:
#             print(f"Error reading {file}: {e}")
#     return data, labels

# # Load actual images and labels
# x_cat, y_cat = load_images(cat_batch_files, 0)
# x_dog, y_dog = load_images(dog_batch_files, 1)

# x = np.array(x_cat + x_dog)
# y = np.array(y_cat + y_dog)
# print(f"Loaded batch: {len(x_cat)} cats, {len(x_dog)} dogs")

# # Reload PCA/make new if first time
# pca_path = "pca_catsdogs.joblib"
# if os.path.exists(pca_path):
#     pca = joblib.load(pca_path)
#     print("[INFO] Loaded PCA.")
# else:
#     # Fit PCA only ONCE on first batch, then use for all
#     pca = PCA(n_components=n_components, whiten=True)
#     pca.fit(x / 255.0)
#     joblib.dump(pca, pca_path)
#     print("[INFO] Fitted & saved PCA.")

# x_pca = pca.transform(x / 255.0)

# # Reload Checkpointed model or Create if first run
# model_path = "sgd_catsdogs.joblib"
# if os.path.exists(model_path):
#     sgd = joblib.load(model_path)
#     print("[INFO] Loaded model checkpoint.")
#     classes_supplied = False
# else:
#     sgd = SGDClassifier(loss='log_loss', max_iter=1, learning_rate='optimal', warm_start=True)
#     classes_supplied = True

# # Train on this batch
# if classes_supplied:
#     sgd.partial_fit(x_pca, y, classes=[0,1])
# else:
#     sgd.partial_fit(x_pca, y)

# joblib.dump(sgd, model_path)
# print("[INFO] Trained & saved model checkpoint for this batch.")



Loaded batch: 500 cats, 500 dogs
[INFO] Loaded PCA.
[INFO] Loaded model checkpoint.
[INFO] Trained & saved model checkpoint for this batch.


In [29]:
import os
import numpy as np
import cv2
import joblib
import random
from sklearn.decomposition import IncrementalPCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# -------- PARAMETERS ----------
start_index = 12500
batch_size = 1000
img_size = 64
n_components = 100  # for PCA
memory_size = 5000
epochs_per_batch = 3  # we'll retrain multiple times
traindir = r"C:\Users\Srija\Downloads\train\train"

model_path = "svc_catsdogs.joblib"
pca_path = "pca_incremental.joblib"
memory_path = "memory_data.npz"

# -------- Validation set ----------
val_cat = sorted([f for f in os.listdir(traindir) if f.startswith('cat')])[9000:9200]
val_dog = sorted([f for f in os.listdir(traindir) if f.startswith('dog')])[9000:9200]

def load_images(files, label):
    X, y = [], []
    for f in files:
        try:
            img = cv2.imread(os.path.join(traindir, f), cv2.IMREAD_GRAYSCALE)
            img = cv2.resize(img, (img_size, img_size))
            X.append(img.flatten())
            y.append(label)
        except:
            pass
    return X, y

x_val_cat, y_val_cat = load_images(val_cat, 0)
x_val_dog, y_val_dog = load_images(val_dog, 1)
x_val = np.array(x_val_cat + x_val_dog)
y_val = np.array(y_val_cat + y_val_dog)

# -------- Load current batch ----------
cat_files = sorted([f for f in os.listdir(traindir) if f.startswith('cat')])
dog_files = sorted([f for f in os.listdir(traindir) if f.startswith('dog')])

batch = list(zip(cat_files[start_index:start_index+batch_size], [0]*batch_size)) + \
        list(zip(dog_files[start_index:start_index+batch_size], [1]*batch_size))
random.shuffle(batch)
batch_files, batch_labels = zip(*batch)
x_batch, _ = load_images(batch_files, None)
y_batch = list(batch_labels)

# -------- Memory Replay ----------
if os.path.exists(memory_path):
    mem = np.load(memory_path, allow_pickle=True)
    memory_x = list(mem["x"])
    memory_y = list(mem["y"])
else:
    memory_x, memory_y = [], []

memory_x.extend(x_batch)
memory_y.extend(y_batch)
if len(memory_x) > memory_size:
    idx = np.random.choice(len(memory_x), memory_size, replace=False)
    memory_x = [memory_x[i] for i in idx]
    memory_y = [memory_y[i] for i in idx]

np.savez(memory_path, x=memory_x, y=memory_y)

x_train = np.array(memory_x)
y_train = np.array(memory_y)

# -------- PCA ----------
if os.path.exists(pca_path):
    pca = joblib.load(pca_path)
else:
    pca = IncrementalPCA(n_components=n_components, whiten=True)
    pca.partial_fit(x_train / 255.0)

pca.partial_fit(x_train / 255.0)
joblib.dump(pca, pca_path)

x_train_pca = pca.transform(x_train / 255.0)
x_val_pca = pca.transform(x_val / 255.0)

# -------- SVC Training ----------
svc = SVC(kernel='rbf', C=1.0, gamma='scale', probability=False)  # kernel SVC :contentReference[oaicite:1]{index=1}

# Repeated training for stability (optional).
for _ in range(epochs_per_batch):
    svc.fit(x_train_pca, y_train)

# -------- Validation ----------
y_pred = svc.predict(x_val_pca)
val_acc = accuracy_score(y_val, y_pred)
print(f"[INFO] Validation Accuracy after batch {start_index}: {val_acc:.4f}")

# -------- Save Model ----------
joblib.dump(svc, model_path)
print("[INFO] SVC model saved.")


ValueError: not enough values to unpack (expected 2, got 0)