# Product Recommendation System: Data Exploration and Model Training

This notebook demonstrates the process of building a content-based recommendation system using TF-IDF vectorization and cosine similarity.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import os

# Set style for visualizations.
# seaborn's 'whitegrid' theme is applied first and matplotlib's style sheet second,
# so the matplotlib style sheet has the final say on any setting both of them touch.
# NOTE(review): the 'seaborn-v0_8-*' style names are only available in matplotlib >= 3.6 —
# confirm the environment's matplotlib version.
sns.set_style('whitegrid')
plt.style.use('seaborn-v0_8-whitegrid')

## 1. Data Loading and Exploration

In [None]:
# Load the dataset.
# The path is relative, so it resolves against the kernel's current working directory.
# Later cells read the columns 'product_id', 'product_name', 'category' and 'description'
# from this frame.
data_path = '../data/generic_dataset.csv'
products_df = pd.read_csv(data_path)

# Print the (rows, columns) shape, then display the first few rows as the cell output
print(f"Dataset shape: {products_df.shape}")
products_df.head()

In [None]:
# Basic data exploration: print the column names, non-null counts and dtypes
products_df.info()

In [None]:
# Count the missing entries per column and print the tally under a heading
missing_data = products_df.isna().sum()
print("Missing values in each column:", missing_data, sep="\n")

In [None]:
# Tally how many products fall into each category (most frequent first)
category_counts = products_df['category'].value_counts()
print("Product categories:", category_counts, sep="\n")

# Draw the same tallies as a bar chart using the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=category_counts.index, y=category_counts.values, ax=ax)
ax.set_title('Distribution of Product Categories')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
# Tilt the category labels so that long names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

## 2. Data Preparation for Content-Based Filtering

In [None]:
# Combine relevant text features to create a content representation.
# Missing values are replaced with empty strings (and everything is cast to str) first:
# in pandas, `str + NaN` evaluates to NaN, so a single missing name/category/description
# would turn the whole 'content' cell into NaN, and TfidfVectorizer.fit_transform
# rejects NaN documents with a ValueError.
text_columns = ['product_name', 'category', 'description']
text_parts = products_df[text_columns].fillna('').astype(str)
products_df['content'] = (
    text_parts['product_name'] + ' ' + text_parts['category'] + ' ' + text_parts['description']
)

# Display the combined content field
products_df[['product_id', 'product_name', 'content']].head()

## 3. Create TF-IDF Vectors

In [None]:
# Build a TF-IDF vectorizer that drops English stop words, learn its vocabulary
# from the combined 'content' text, and keep the resulting document-term matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(products_df['content'])

# The learned vocabulary, one entry per matrix column
feature_names = tfidf.get_feature_names_out()

# Report the matrix dimensions, the vocabulary size and a small sample of terms
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Number of features (terms): {len(feature_names)}")
print("Sample features:", feature_names[:10])

## 4. Calculate Similarity Matrix

In [None]:
# Pairwise cosine similarity between all TF-IDF rows; with the second argument
# omitted, scikit-learn compares the matrix against itself
cosine_sim = cosine_similarity(tfidf_matrix)

# Report the dimensions of the similarity matrix
print(f"Cosine similarity matrix shape: {cosine_sim.shape}")

# Annotated heatmap of the top-left 5x5 corner as a quick visual sanity check
plt.figure(figsize=(10, 8))
sns.heatmap(cosine_sim[:5, :5], annot=True, cmap='YlGnBu')
plt.title('Cosine Similarity Between First 5 Products')
plt.show()

## 5. Test the Recommendation System

In [None]:
# Function to get recommendations
def get_recommendations(product_id, cosine_sim=cosine_sim, df=products_df, num_recommendations=5):
    """Return the products most similar to the product identified by ``product_id``.

    Parameters
    ----------
    product_id
        Value to look up in ``df['product_id']``. If several rows share the id,
        the first matching row is used.
    cosine_sim
        Square similarity matrix in which row/column ``i`` corresponds to the
        ``i``-th row of ``df`` *by position*. The default is bound when this
        function is defined, so re-run this cell after recomputing the matrix.
    df
        Product dataframe with ``product_id``, ``product_name`` and ``category``
        columns. The default is likewise bound at definition time.
    num_recommendations
        Maximum number of products to return; values <= 0 yield an empty frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_id``, ``product_name`` and ``category`` of the most
        similar products, ordered from most to least similar. The queried
        product itself is never included.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested ``product_id``.
    """
    # Locate the row by *position*, not by index label: the similarity matrix is
    # positional, so a filtered or re-indexed dataframe must not shift the lookup.
    matches = np.flatnonzero((df['product_id'] == product_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"product_id {product_id!r} not found in the product dataframe")
    idx = int(matches[0])

    # Similarity of the target product to every product (including itself)
    scores = np.asarray(cosine_sim[idx], dtype=float)

    # Stable descending sort, so tied scores keep their original row order
    order = np.argsort(-scores, kind='stable')

    # Exclude the target explicitly rather than assuming it sorted first: an exact
    # duplicate (also scoring 1.0) or an all-zero vector can put another row on top.
    order = order[order != idx][:max(num_recommendations, 0)]

    # Return recommended products
    return df.iloc[order][['product_id', 'product_name', 'category']]

In [None]:
# Test the recommendation function for a specific product.
# NOTE(review): the product name in the inline comment below is the author's label and is
# not checked against the dataset — the printed header shows the actual name.
test_product_id = 4  # Wireless Headphones
# Pull the matching row so its name and category can be shown in the header line
test_product = products_df[products_df['product_id'] == test_product_id].iloc[0]
print(f"Recommendations for {test_product['product_name']} (Category: {test_product['category']}):\n")

# Uses the default similarity matrix, dataframe and top-5 cutoff of get_recommendations
recommendations = get_recommendations(test_product_id)
recommendations

In [None]:
# Try another product.
# NOTE(review): the product name in the inline comment below is the author's label and is
# not checked against the dataset — the printed header shows the actual name.
test_product_id = 9  # Yoga Mat
# Pull the matching row so its name and category can be shown in the header line
test_product = products_df[products_df['product_id'] == test_product_id].iloc[0]
print(f"Recommendations for {test_product['product_name']} (Category: {test_product['category']}):\n")

# Uses the default similarity matrix, dataframe and top-5 cutoff of get_recommendations
recommendations = get_recommendations(test_product_id)
recommendations

## 6. Save the Model for Production Use

In [None]:
# Make sure the output directory exists, then derive the artifact's file path
model_dir = '../ml_models'
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, 'content_model.pkl')

# The persisted artifact is a 2-tuple: (TF-IDF matrix, product dataframe).
# NOTE(review): the fitted vectorizer `tfidf` is not part of this tuple, so text that
# was not in the dataset cannot be vectorized from the saved file alone.
model_data = (tfidf_matrix, products_df)

# Serialize with joblib and report where the file was written
joblib.dump(model_data, model_path)
print(f"Model saved to {model_path}")

## 7. Test Loading the Model

In [None]:
# Load the model from disk.
# joblib.load unpickles the file, which can execute arbitrary code — only load
# artifacts from a trusted source (here: the file written by the previous cell).
loaded_tfidf_matrix, loaded_products_df = joblib.load(model_path)

# Verify the loaded data
print(f"Loaded TF-IDF matrix shape: {loaded_tfidf_matrix.shape}")
print(f"Loaded products dataframe shape: {loaded_products_df.shape}")

# Make sure the shapes match.
# Only the dimensions are compared; the contents are not checked for equality.
assert tfidf_matrix.shape == loaded_tfidf_matrix.shape
assert products_df.shape == loaded_products_df.shape

print("Model successfully loaded and verified!")

## 8. Summary and Next Steps

In this notebook, we have:
1. Loaded and explored the product dataset
2. Created TF-IDF vectors from product text data
3. Calculated cosine similarity between products
4. Built a recommendation function to suggest similar products
5. Saved the model artifacts (the TF-IDF matrix and the product dataframe) for use in production

Next steps:
- Integrate this model with the FastAPI application
- Consider adding user interaction data for collaborative filtering
- Implement A/B testing to evaluate recommendation quality
- Add monitoring for recommendation performance