# Product Similarity Search Engine - Training Pipeline

This notebook covers the data engineering and modeling phases:
1.  **Data Loading**: Loading images and labels.
2.  **Feature Extraction**: Using MobileNetV2 to extract features.
3.  **Dimensionality Reduction**: Using PCA.
4.  **Clustering**: K-Means with Elbow Method.
5.  **Classification**: Linear SVM.
6.  **Search Index**: Building a FAISS index.
7.  **Evaluation & Export**: Visualizing and measuring performance (inline in the steps above), then saving the fitted models and metadata.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pickle
import tensorflow as tf
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
from tensorflow.keras.preprocessing import image as keras_image
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import faiss

# ---------------------------------------------------------------------------
# Notebook configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------

# Input locations (relative to the notebook's working directory)
CSV_PATH = '../data/images_dataset.csv'
DATA_DIR = '../data/images'

# Output location for the fitted models and the search index
MODELS_DIR = '../models'

# Spatial size every image is resized to before feature extraction
IMG_SIZE = (224, 224)

# One seed shared by every library that draws random numbers
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)

## 1. Load Data

In [None]:
# Load Metadata
# Later cells read two columns from this table: 'image' (the image URL that the
# local filename is derived from) and 'main_category' (the classification label).
df = pd.read_csv(CSV_PATH)

# Extract filename from URL (assuming filenames in 'downloaded_images' match the end of the URL)
# Use a more robust check if needed. For now, let's assume standard Amazon format.
def extract_filename(url):
    """Return the last path segment of ``url``.

    Args:
        url: Image URL as read from the CSV. Missing cells arrive from pandas
            as NaN (a float) rather than a string.

    Returns:
        The text after the final '/' of the whitespace-stripped URL, or an
        empty string when ``url`` is not a string. An empty name never matches
        a file on disk, so such rows are simply dropped by the existence
        filter instead of aborting the whole ``apply`` with AttributeError.
    """
    if not isinstance(url, str):
        return ''
    # Stray whitespace/newlines around a CSV cell would otherwise end up in the filename.
    return url.strip().split('/')[-1]

df['filename'] = df['image'].apply(extract_filename)

# Remember the catalogue size before filtering, so the summary below does not
# need to parse the CSV a second time just to count its rows.
n_total = len(df)

# Filter for images that actually exist
existing_files = set(os.listdir(DATA_DIR))
df = df[df['filename'].isin(existing_files)].reset_index(drop=True)

print(f"Found {len(df)} images available locally out of {n_total}.")
df.head()

## 2. Feature Extraction (MobileNetV2)

In [None]:
# Load Pre-trained Model
# include_top=False with pooling='avg' drops the ImageNet classification head and
# returns one globally average-pooled feature vector per image.
# The input shape is derived from IMG_SIZE so the network always agrees with the
# size images are resized to when they are loaded.
mobilenet = MobileNetV2(weights='imagenet', include_top=False, pooling='avg', input_shape=(*IMG_SIZE, 3))
mobilenet.trainable = False  # the network is only used for inference in this notebook

def load_and_preprocess_image(filename):
    """Read one image from ``DATA_DIR`` and prepare it for MobileNetV2.

    Args:
        filename: Name of the image file inside ``DATA_DIR``.

    Returns:
        The image resized to ``IMG_SIZE``, converted to an array, given a
        leading batch axis of length 1 and passed through ``preprocess_input``.
    """
    resized = keras_image.load_img(os.path.join(DATA_DIR, filename), target_size=IMG_SIZE)
    pixels = keras_image.img_to_array(resized)
    # Prepend the batch axis first; the model-specific scaling is applied to the batch.
    return preprocess_input(pixels[np.newaxis, ...])

# Extract Features
# Images are embedded in mini-batches: one forward pass per BATCH_SIZE images is much
# cheaper than calling `predict` once per image.
BATCH_SIZE = 32

feature_batches = []   # one (n_images_in_batch, n_features) array per forward pass
valid_indices = []     # positions (into df) of the images that loaded successfully
pending = []           # preprocessed single-image arrays waiting for the next forward pass


def _embed_pending(batch):
    """Run one forward pass over a list of single-image arrays; returns one feature row per image."""
    stacked = np.concatenate(batch, axis=0)
    return mobilenet.predict(stacked, verbose=0).reshape(len(batch), -1)


print("Extracting features...")
# enumerate() yields positions, which is what `df.iloc` below expects regardless of df's index labels.
for pos, filename in enumerate(tqdm(df['filename'], total=len(df))):
    try:
        pending.append(load_and_preprocess_image(filename))
    except Exception as e:
        # Best effort: an unreadable or corrupt image is reported and skipped.
        print(f"Error processing {filename}: {e}")
        continue
    valid_indices.append(pos)
    if len(pending) == BATCH_SIZE:
        feature_batches.append(_embed_pending(pending))
        pending = []

# Embed whatever is left over after the last full batch.
if pending:
    feature_batches.append(_embed_pending(pending))

if not feature_batches:
    # Fail here with a clear message instead of letting PCA fail later on an empty array.
    raise ValueError(f"No image in {DATA_DIR} could be loaded; nothing to extract features from.")

features = np.concatenate(feature_batches, axis=0)
df_clean = df.iloc[valid_indices].reset_index(drop=True)
print(f"Feature matrix shape: {features.shape}")

## 3. PCA Compression & Visualization

In [None]:
# Fit PCA
# PCA cannot return more components than min(n_samples, n_features), so the requested
# size is capped for small datasets instead of failing.
N_COMPONENTS = min(50, *features.shape)
# random_state makes the projection repeatable when PCA picks a randomized SVD solver,
# so re-running this cell does not silently change every downstream result.
pca = PCA(n_components=N_COMPONENTS, random_state=SEED)
pca_features = pca.fit_transform(features)

# Visualization of Variance
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
fig, ax = plt.subplots(figsize=(10, 5))
# The x axis starts at 1 so each point reads "variance explained by the first k components".
ax.plot(np.arange(1, len(cumulative_variance) + 1), cumulative_variance)
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('PCA Explained Variance')
ax.grid(True)
plt.show()

print(f"Cumulative variance explained by {pca.n_components_} components: {np.sum(pca.explained_variance_ratio_):.4f}")

## 4. Clustering (K-Means) with Elbow Method

In [None]:
# Candidate cluster counts to compare
k_range = range(2, 16)

print("Running Elbow Method...")
# Within-cluster sum of squares (inertia) of a K-Means fit for every candidate K
wcss = [
    KMeans(n_clusters=k, random_state=SEED, n_init='auto').fit(pca_features).inertia_
    for k in k_range
]

# Plot Elbow
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(k_range, wcss, marker='o')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('WCSS')
ax.set_title('Elbow Method for Optimal K')
ax.grid(True)
plt.show()

In [None]:
# Train Final K-Means (Let's pick an optimal K, e.g., 6, or inspect the plot above first)
# For automation, let's assume K=6 is reasonable for this dataset size/diversity, or user sets it.
OPTIMAL_K = 6 # Update this based on the plot
kmeans = KMeans(n_clusters=OPTIMAL_K, random_state=SEED, n_init='auto')
clusters = kmeans.fit_predict(pca_features)

# One cluster id per row of df_clean (both are aligned with the rows of pca_features)
df_clean['cluster'] = clusters

# Visualize Clusters (2D PCA)
# This is a separate 2-component projection used only for plotting; the clustering itself
# ran on pca_features. random_state keeps the scatter identical between re-runs.
pca_2d = PCA(n_components=2, random_state=SEED)
features_2d = pca_2d.fit_transform(features)

fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(x=features_2d[:, 0], y=features_2d[:, 1], hue=clusters, palette='tab10', s=50, alpha=0.7, ax=ax)
ax.set_title('K-Means Clusters (2D PCA)')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.legend(title='Cluster')
plt.show()

## 5. Classification (Linear SVM)

In [None]:
# Prepare Data using CSV Labels
# Mapping category text to int
target_column = 'main_category' # Or 'name' if we want granular, but 'main_category' is likely better for SVM accuracy
le = LabelEncoder()
y = le.fit_transform(df_clean[target_column])

# Every encoded class id, in the same order as le.classes_. Passing this explicitly to the
# metrics below keeps the report rows and the confusion-matrix axes aligned with the class
# names even when a rare class has no sample (true or predicted) in the test split.
class_ids = np.arange(len(le.classes_))

# Stratified hold-out split so the class proportions are similar in train and test
X_train, X_test, y_train, y_test = train_test_split(pca_features, y, test_size=0.2, random_state=SEED, stratify=y)

# Train SVM
svm_model = LinearSVC(random_state=SEED, dual='auto')
svm_model.fit(X_train, y_train)

# Evaluate
y_pred = svm_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, labels=class_ids, target_names=le.classes_, zero_division=0),
)

# Confusion Matrix
# labels=class_ids guarantees an (n_classes x n_classes) matrix that matches the tick labels.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - SVM')
plt.show()

## 6. FAISS Retrieval

In [None]:
# Build Index
# One vector per image, in the same row order as pca_features, so a search hit's
# position can be used to look the image up again.
d = pca_features.shape[1]
index = faiss.IndexFlatL2(d)
# The index is fed float32 data; ascontiguousarray also guarantees a C-contiguous buffer.
index.add(np.ascontiguousarray(pca_features, dtype='float32'))

print(f"Number of vectors in index: {index.ntotal}")

# Save Index
# This is the first write into MODELS_DIR, so make sure the directory exists.
os.makedirs(MODELS_DIR, exist_ok=True)
faiss.write_index(index, os.path.join(MODELS_DIR, 'faiss_index.bin'))

## 7. Save Models

In [None]:
# Make sure the output directory exists before writing into it.
os.makedirs(MODELS_DIR, exist_ok=True)

# Fitted objects to persist, keyed by their file name inside MODELS_DIR.
artifacts = {
    'pca.pkl': pca,
    'kmeans.pkl': kmeans,
    'svm.pkl': svm_model,
    'label_encoder.pkl': le,
}
for artifact_name, fitted_obj in artifacts.items():
    with open(os.path.join(MODELS_DIR, artifact_name), 'wb') as f:
        pickle.dump(fitted_obj, f)

# Save processed metadata (optional, for app fast loading). df_clean holds the CSV columns
# plus 'filename' and 'cluster'; it does not contain the PCA vectors, which were added to
# the FAISS index instead.
# NOTE(review): this writes into DATA_DIR, the same folder the load cell lists for image
# files. Confirm that is the intended location before moving it.
df_clean.to_pickle(os.path.join(DATA_DIR, 'processed_metadata.pkl'))

print("All models and data saved!")