In [1]:
# 1. Loading and Exploring the Data
import pandas as pd

# Read the product catalogue into a DataFrame (path is relative to the
# notebook's working directory).
df = pd.read_csv('../data/sample_products.csv')

# Preview the leading rows to get a feel for the columns.
print(df.head())

# Report how many null entries each column contains.
print("Missing values per column:", df.isna().sum(), sep="\n")

# 2. Feature Engineering
# We'll process the text-based features: categories, tags, or descriptions

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Assuming 'category' is a column with categorical data and 'description' is a column with textual data
# Encode the categorical column as integer labels. NOTE: this overwrites the
# original string values in df['category']; use encoder.inverse_transform to
# recover them.
encoder = LabelEncoder()
df['category'] = encoder.fit_transform(df['category'])

# For text-based features (e.g., 'description', 'tags'), apply TF-IDF vectorization.
# English stop words are dropped and the vocabulary is capped at 5000 terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# TfidfVectorizer rejects NaN documents, so rows with a missing description
# are vectorized as empty text instead of aborting the whole pipeline.
# Replace 'description' with the relevant column if the dataset differs.
descriptions = df['description'].fillna('').astype(str)
tfidf_matrix = vectorizer.fit_transform(descriptions)

# 3. Content-Based Filtering with Cosine Similarity

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise product-to-product similarity over the TF-IDF rows. With the
# second argument omitted, the matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

# 4. Generating Recommendations
# Function to recommend top N similar products for a given product index
def recommend_products(product_index, cosine_sim=cosine_sim, top_n=10):
    """Return the ``top_n`` rows of ``df`` most similar to one product.

    Args:
        product_index: Positional row index of the query product in
            ``cosine_sim`` (and in ``df``). Negative values count from
            the end.
        cosine_sim: Pairwise similarity matrix indexed by product
            position. The default is the module-level matrix as it was
            when this function was defined; pass the matrix explicitly
            if it has been recomputed since.
        top_n: Maximum number of products to return. Values below zero
            are treated as zero.

    Returns:
        Rows of the module-level ``df`` ordered from most to least
        similar. The query product itself is never included.

    Raises:
        IndexError: If ``product_index`` is outside the matrix.
    """
    row = cosine_sim[product_index]
    # Resolve negative indices so the self-exclusion below is positional.
    self_position = product_index % len(row)

    # Exclude the query product by position rather than by rank: when other
    # products tie with it (duplicate descriptions, or an all-zero TF-IDF
    # row) it is not guaranteed to sort first, so slicing off element 0
    # could drop a real match and return the query product instead.
    candidates = [
        (position, score)
        for position, score in enumerate(row)
        if position != self_position
    ]
    # list.sort is stable, so tied scores keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    product_indices = [position for position, _ in candidates[:max(top_n, 0)]]
    return df.iloc[product_indices]

# Example: fetch the 10 products most similar to the first product (row 0)
recommended_products = recommend_products(
    product_index=0,
    cosine_sim=cosine_sim,
    top_n=10,
)
print("Recommended Products:", recommended_products, sep="\n")

# 5. Saving the Model using Joblib

import joblib

# Persist the fitted TF-IDF vectorizer and the precomputed similarity
# matrix to the current working directory.
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.joblib')
joblib.dump(value=cosine_sim, filename='cosine_similarity_matrix.joblib')

print("Model saved as 'tfidf_vectorizer.joblib' and 'cosine_similarity_matrix.joblib'")


   product_id                       name     category       brand   price  \
0           1  PageTurner Electronics #1  Electronics  PageTurner  439.23   
1           2       PageTurner Beauty #2       Beauty  PageTurner  292.59   
2           3      PageTurner Fashion #3      Fashion  PageTurner  373.81   
3           4            Silk&Co Home #4         Home     Silk&Co  402.76   
4           5       PageTurner Beauty #5       Beauty  PageTurner  337.18   

                                          tags  \
0                battery|wireless|bluetooth|4K   
1  anti-aging|SPF|vitamin-c|sensitive|skincare   
2                 summer|formal|casual|leather   
3                 eco-friendly|durable|minimal   
4  SPF|hydrating|anti-aging|skincare|vitamin-c   

                                         description  
0  PageTurner electronics item designed for batte...  
1  PageTurner beauty item designed for anti-aging...  
2  PageTurner fashion item designed for summer an...  
3  Silk&Co home 