# Content-Based Image Recommender with Custom CNN (Keras)
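This notebook trains a small CNN to classify e-commerce product images by category, then reuses the activations of its 128-unit `embedding` layer as visual features and recommends similar products by cosine similarity between those image embeddings.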

In [None]:
# Step 1: Load CSV and Prepare Image Paths
import pandas as pd
import os

# Load the product catalogue.
df = pd.read_csv('/mnt/data/fashion.csv')

# A row is only usable if it has both a download URL and an image file name.
# Filter *before* building paths: os.path.join raises TypeError on a NaN name.
df = df.dropna(subset=['ImageURL', 'Image'])

# Renumber rows 0..n-1 so that index labels equal row positions. The
# recommendation cell takes a label from df.index and uses it to index a NumPy
# embedding matrix by position; gaps left by dropna would misalign the two.
df = df.reset_index(drop=True)

# Local path where each product image is (or will be) stored.
df['ImagePath'] = df['Image'].apply(lambda x: os.path.join('images', x))
df.head()

In [None]:
# Step 2: Download Images
import requests
from tqdm import tqdm
os.makedirs('images', exist_ok=True)

# Fetch every catalogue image that is not already present in images/.
# Downloading is best-effort: a failed URL is reported and skipped.
for i, row in tqdm(df.iterrows(), total=len(df)):
    image_path = os.path.join('images', row['Image'])
    if os.path.exists(image_path):
        continue  # already downloaded on a previous run
    try:
        r = requests.get(row['ImageURL'], timeout=5)
        # Without this check a 404/500 error page would be written to disk as
        # if it were the image, and then treated as "already downloaded".
        r.raise_for_status()
        with open(image_path, 'wb') as f:
            f.write(r.content)
    except (requests.RequestException, OSError) as exc:
        # Only network and file-system failures are swallowed; KeyboardInterrupt
        # and programming errors still propagate.
        print(f"Failed to download {row['ImageURL']}: {exc}")

In [None]:
# Step 3: Image Preprocessing
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# One generator config for both subsets: multiply pixel values by 1/255 and
# reserve 20% of the rows as the validation subset.
datagen = ImageDataGenerator(validation_split=0.2, rescale=1./255)

# Arguments shared by the training and validation iterators; only `subset`
# differs between the two calls below.
flow_kwargs = dict(
    dataframe=df,
    directory='images',
    x_col='Image',       # file name, resolved relative to `directory`
    y_col='Category',    # label column, one-hot encoded via class_mode
    target_size=(128, 128),
    class_mode='categorical',
    batch_size=32,
)

train_gen = datagen.flow_from_dataframe(subset='training', **flow_kwargs)

val_gen = datagen.flow_from_dataframe(subset='validation', **flow_kwargs)

In [None]:
# Step 4: Build Custom CNN Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Classifier layout: three conv + max-pool stages (32, 64, 128 filters), then a
# dense head. The 128-unit layer is named 'embedding' so that a later cell can
# look it up with get_layer() and use its output as the image feature vector.
cnn_layers = [
    Conv2D(filters=32, kernel_size=(3, 3), padding='same', activation='relu',
           input_shape=(128, 128, 3)),
    MaxPooling2D(pool_size=2, strides=2),
    Conv2D(filters=64, kernel_size=(3, 3), padding='same', activation='relu'),
    MaxPooling2D(pool_size=2, strides=2),
    Conv2D(filters=128, kernel_size=(3, 3), padding='same', activation='relu'),
    MaxPooling2D(pool_size=2, strides=2),
    Flatten(),
    Dense(units=256, activation='relu'),
    Dropout(rate=0.3),
    Dense(units=128, activation='relu', name='embedding'),
    # One softmax output per category discovered by the training generator.
    Dense(units=train_gen.num_classes, activation='softmax'),
]
model = Sequential(cnn_layers)

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Step 5: Train Model
# Train for 10 epochs on the training generator, evaluating on the validation
# generator; the object returned by fit() is kept in `history`.
history = model.fit(train_gen, epochs=10, validation_data=val_gen)

In [None]:
# Step 6: Extract Embeddings
from tensorflow.keras.models import Model
import numpy as np
from tensorflow.keras.preprocessing import image

# Sub-model that shares the trained weights but stops at the 'embedding' layer,
# so its output is the feature vector rather than the class probabilities.
feature_model = Model(inputs=model.input, outputs=model.get_layer('embedding').output)

EMBED_BATCH_SIZE = 64  # images sent to predict() per call
embedding_dim = model.get_layer('embedding').units
image_paths = df['ImagePath'].tolist()

# One row per product, in df order. Rows whose image cannot be loaded keep the
# all-zero vector (same fallback as before), which keeps row positions aligned
# with df.
embedding_rows = np.zeros((len(image_paths), embedding_dim), dtype='float32')
failed_paths = []

# Predict in batches: calling predict() once per image pays the call overhead
# for every row and dominates the runtime on larger catalogues.
for start in tqdm(range(0, len(image_paths), EMBED_BATCH_SIZE)):
    stop = min(start + EMBED_BATCH_SIZE, len(image_paths))
    batch_arrays, batch_positions = [], []
    for pos in range(start, stop):
        try:
            img = image.load_img(image_paths[pos], target_size=(128, 128))
        except (OSError, ValueError):
            # Missing or unreadable image file. Anything else (e.g. a model
            # error) is a real bug and should surface instead of becoming zeros.
            failed_paths.append(image_paths[pos])
            continue
        # Same scaling as the training generator (rescale=1./255).
        batch_arrays.append(image.img_to_array(img) / 255.0)
        batch_positions.append(pos)
    if batch_arrays:
        embedding_rows[batch_positions] = feature_model.predict(
            np.stack(batch_arrays), verbose=0
        )

if failed_paths:
    print(f"{len(failed_paths)} image(s) could not be loaded; their embeddings are all zeros")

# Keep `embeddings` as a list of 1-D arrays and store one vector per DataFrame row.
embeddings = list(embedding_rows)
df['embedding'] = embeddings

In [None]:

# Step 7: Recommend Similar Products using Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Stack the per-row embedding vectors into one 2-D array (one row per product,
# in the same order as df).
embedding_matrix = np.array(list(df['embedding']))

# Function to get top N similar products
def get_similar_products(product_id, top_n=5):
    """Return the `top_n` products whose image embeddings are closest to a product's.

    Similarity is the cosine similarity between the query product's row in the
    module-level `embedding_matrix` and every other row.

    Args:
        product_id: Value to look up in df['ProductId']. If several rows match,
            the first one is used as the query.
        top_n: Maximum number of similar products to return.

    Returns:
        DataFrame with columns ProductId, ProductTitle, Category and ImagePath,
        ordered from most to least similar. The query row itself is excluded.

    Raises:
        IndexError: If no row of df has the given ProductId.
    """
    # Work with row *positions*, not index labels: embedding_matrix is a plain
    # NumPy array and df.iloc is positional, while df.index may have gaps
    # (e.g. after dropna), in which case a label would select the wrong row.
    matches = np.flatnonzero((df['ProductId'] == product_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"ProductId {product_id!r} not found in df")
    query_pos = int(matches[0])

    query_vec = embedding_matrix[query_pos].reshape(1, -1)
    similarities = cosine_similarity(query_vec, embedding_matrix)[0]

    # Rank all rows by descending similarity, then remove the query explicitly.
    # Skipping "the first result" is not safe: with duplicate images or
    # all-zero fallback embeddings the query is not guaranteed to rank first.
    ranked = similarities.argsort()[::-1]
    similar_indices = ranked[ranked != query_pos][:top_n]
    return df.iloc[similar_indices][['ProductId', 'ProductTitle', 'Category', 'ImagePath']]

# Example: the 5 (default top_n) products most similar to ProductId 42419
get_similar_products(product_id=42419)
