In [1]:
# 03_train_recommender.ipynb

# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Step 2: Load Processed Data
# Load the processed features and labels (these were saved in the feature engineering notebook)
final_features = np.load('processed_features.npy')
labels = np.load('labels.npy')

# Step 3: Train-Test Split for Model Evaluation
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(final_features, labels, test_size=0.2, random_state=42)

# Step 4: Model Training
# We'll use Logistic Regression for a simple classification-based recommendation model.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 5: Model Evaluation
# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model performance
print("\nModel Evaluation - Classification Report:")
print(classification_report(y_test, y_pred))

print("\nAccuracy Score:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# Step 6: Save the Model
# Save the trained model to a file using joblib
joblib.dump(model, 'recommender_model.joblib')

# Step 7: Generate Product Recommendations Based on Similarity
# We will use cosine similarity to recommend products based on the trained model's prediction.

# Calculate the cosine similarity matrix using the final_features
cos_sim = cosine_similarity(final_features)

# Function to get product recommendations based on a product ID
def get_product_recommendations(product_id, top_n=5):
    # Get the index of the product in the dataset
    product_idx = product_data[product_data['product_id'] == product_id].index[0]
    
    # Get the pairwise similarity scores for the product with all other products
    sim_scores = list(enumerate(cos_sim[product_idx]))
    
    # Sort the products based on similarity scores (descending order)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the top n most similar products (excluding the product itself)
    sim_scores = sim_scores[1:top_n+1]
    
    # Get the product IDs of the most similar products
    recommended_product_ids = [product_data['product_id'][i[0]] for i in sim_scores]
    
    return recommended_product_ids

# Example: Get recommendations for a specific product (e.g., product_id = 14)
product_id = 14
recommended_products = get_product_recommendations(product_id, top_n=5)

print("\nRecommended products for product_id", product_id, ":")
print(recommended_products)

ValueError: Found input variables with inconsistent numbers of samples: [60, 800]

In [2]:
# Ensure the number of labels matches the number of features
print(f"final_features shape: {final_features.shape}")
print(f"labels shape: {labels.shape}")

# Check if they align by ensuring that the indices match
# If needed, align the data manually
if final_features.shape[0] != labels.shape[0]:
    # Example: Remove extra rows from final_features to match the number of labels
    final_features = final_features[:labels.shape[0]]

# Now try the train-test split again
X_train, X_test, y_train, y_test = train_test_split(final_features, labels, test_size=0.2, random_state=42)


final_features shape: (60, 48)
labels shape: (800,)


ValueError: Found input variables with inconsistent numbers of samples: [60, 800]

In [3]:
# Check the product ids for final_features
print(final_features.shape)  # (60, 48) -> 60 products

# Get the product IDs from final_features
product_ids_in_features = product_data['product_id'].head(60).values  # Adjust this based on your data

# Filter the interaction labels to only include the product_ids present in final_features
filtered_labels = interaction_data[interaction_data['product_id'].isin(product_ids_in_features)]

# Now make sure filtered_labels has the same number of rows as final_features
print(f"Filtered labels shape: {filtered_labels.shape}")

# If lengths match, proceed with train-test split
if final_features.shape[0] == filtered_labels.shape[0]:
    X_train, X_test, y_train, y_test = train_test_split(final_features, filtered_labels['interaction'], test_size=0.2, random_state=42)


(60, 48)


NameError: name 'product_data' is not defined

In [6]:
# Load product data if it was not already loaded
product_data = pd.read_csv('../data/sample_products.csv')

# Check the shape of final_features
print(final_features.shape)  # (60, 48) -> 60 products

# Get the product IDs from final_features (since it's for the first 60 products)
product_ids_in_features = product_data['product_id'].head(60).values  # Adjust this based on your data

# Filter the interaction labels to only include the product_ids present in final_features
filtered_labels = interaction_data[interaction_data['product_id'].isin(product_ids_in_features)]

# Now make sure filtered_labels has the same number of rows as final_features
print(f"Filtered labels shape: {filtered_labels.shape}")

# If lengths match, proceed with train-test split
if final_features.shape[0] == filtered_labels.shape[0]:
    X_train, X_test, y_train, y_test = train_test_split(final_features, filtered_labels['interaction'], test_size=0.2, random_state=42)
else:
    print("Error: The number of features and labels don't match.")



(60, 48)


NameError: name 'interaction_data' is not defined

In [7]:
# Load the necessary data files from the 'processed' directory
product_data = pd.read_csv('data/processed/sample_products.csv')
interaction_data = pd.read_csv('data/processed/sample_interactions.csv')

# Check the shape of final_features
print(final_features.shape)  # (60, 48) -> 60 products

# Get the product IDs from final_features (since it's for the first 60 products)
product_ids_in_features = product_data['product_id'].head(60).values  # Adjust this based on your data

# Filter the interaction labels to only include the product_ids present in final_features
filtered_labels = interaction_data[interaction_data['product_id'].isin(product_ids_in_features)]

# Now make sure filtered_labels has the same number of rows as final_features
print(f"Filtered labels shape: {filtered_labels.shape}")

# If lengths match, proceed with train-test split
if final_features.shape[0] == filtered_labels.shape[0]:
    X_train, X_test, y_train, y_test = train_test_split(final_features, filtered_labels['interaction'], test_size=0.2, random_state=42)
else:
    print("Error: The number of features and labels don't match.")


FileNotFoundError: [Errno 2] No such file or directory: 'data/processed/sample_products.csv'

In [8]:
# 03_train_recommender.ipynb

# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Step 2: Load Data
# Adjust the path based on the correct directory
product_data = pd.read_csv('../data/processed/sample_products.csv')  # Corrected path
interaction_data = pd.read_csv('../data/processed/sample_interactions.csv')  # Corrected path

# Check the shape of final_features
final_features = np.load('processed_features.npy')  # Assuming processed features are saved here
labels = np.load('labels.npy')  # Assuming labels are saved here

# Ensure that final_features and labels align
print(f"Shape of final_features: {final_features.shape}")  # e.g., (60, 48) -> 60 products
print(f"Shape of labels: {labels.shape}")  # Should match number of rows in final_features

# Step 3: Align Labels with Features
# Get the product IDs from final_features (since it's for the first 60 products)
product_ids_in_features = product_data['product_id'].head(final_features.shape[0]).values

# Filter the interaction labels to only include the product_ids present in final_features
filtered_labels = interaction_data[interaction_data['product_id'].isin(product_ids_in_features)]

# Ensure that filtered_labels has the same number of rows as final_features
print(f"Filtered labels shape: {filtered_labels.shape}")

# Step 4: Train-Test Split for Model Evaluation
# Ensure labels and features have the same number of rows
if final_features.shape[0] == filtered_labels.shape[0]:
    X_train, X_test, y_train, y_test = train_test_split(final_features, filtered_labels['event_type'], test_size=0.2, random_state=42)
else:
    print("Error: The number of features and labels don't match.")
    exit()

# Step 5: Model Training
# We'll use Logistic Regression for simplicity (you can replace this with other models if desired)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 6: Model Evaluation
# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model performance
print("\nModel Evaluation - Classification Report:")
print(classification_report(y_test, y_pred))

print("\nAccuracy Score:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# Step 7: Save the Model
# Save the trained model to a file using joblib
joblib.dump(model, 'recommender_model.joblib')

# Step 8: Generate Product Recommendations Based on Similarity
# We will use cosine similarity to recommend products based on the trained model's prediction.

from sklearn.metrics.pairwise import cosine_similarity

# Calculate the cosine similarity matrix using the final_features
cos_sim = cosine_similarity(final_features)

# Function to get product recommendations based on a product ID
def get_product_recommendations(product_id, top_n=5):
    # Get the index of the product in the dataset
    product_idx = product_data[product_data['product_id'] == product_id].index[0]
    
    # Get the pairwise similarity scores for the product with all other products
    sim_scores = list(enumerate(cos_sim[product_idx]))
    
    # Sort the products based on similarity scores (descending order)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the top n most similar products (excluding the product itself)
    sim_scores = sim_scores[1:top_n+1]
    
    # Get the product IDs of the most similar products
    recommended_product_ids = [product_data['product_id'][i[0]] for i in sim_scores]
    
    return recommended_product_ids

# Example: Get recommendations for a specific product (e.g., product_id = 14)
product_id = 14
recommended_products = get_product_recommendations(product_id, top_n=5)

print("\nRecommended products for product_id", product_id, ":")
print(recommended_products)


FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/sample_products.csv'