# Recommendation System Experiments

This notebook demonstrates how to experiment with different recommendation approaches for the Agri-Connect platform.

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix

from app.db import db
from app.config import settings

# Apply seaborn's 'whitegrid' style to the figures drawn in this notebook.
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## 1. Load Data

In [None]:
# Fetch the product catalogue through the project's database helper,
# report how many rows came back, and preview the first few.
products_df = db.get_products()
n_products_loaded = len(products_df)
print(f"Total products: {n_products_loaded}")
products_df.head()

In [None]:
# Fetch the user interaction events through the project's database helper,
# report how many rows came back, and preview the first few.
events_df = db.get_events()
n_events_loaded = len(events_df)
print(f"Total events: {n_events_loaded}")
events_df.head()

## 2. Content-Based Filtering

In [None]:
# Build one free-text field per product from title, description and category.
# Missing values are replaced by empty strings first so the concatenation
# never produces NaN.
title_text = products_df['title'].fillna('')
description_text = products_df['description'].fillna('')
category_text = products_df['category'].fillna('')
products_df['text'] = title_text + ' ' + description_text + ' ' + category_text

# Turn the combined text into TF-IDF vectors: English stop words are removed
# and the vocabulary is capped at 5000 terms.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(products_df['text'])

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

In [None]:
# Show the products most similar to one sample product, ranked by the cosine
# similarity of their TF-IDF vectors.
sample_idx = 0  # positional row in products_df / tfidf_matrix
sample_product = products_df.iloc[sample_idx]

# Similarity of the sample row against every row of the TF-IDF matrix.
cosine_similarities = cosine_similarity(tfidf_matrix[sample_idx], tfidf_matrix).flatten()

# Rank every product from most to least similar, then remove the sample itself
# by position. Dropping "the last argsort entry" is not reliable: an exact
# duplicate listing ties with the sample, and a sample whose TF-IDF vector is
# empty scores 0 against everything (itself included), so the sample is not
# guaranteed to sort last. A stable sort keeps tied products in row order.
ranked_indices = np.argsort(-cosine_similarities, kind='stable')
similar_indices = ranked_indices[ranked_indices != sample_idx][:5]

print(f"\nSimilar products to: {sample_product['title']}\n")
for idx in similar_indices:
    print(f"  {products_df.iloc[idx]['title']}: {cosine_similarities[idx]:.3f}")

## 3. Collaborative Filtering

In [None]:
# Build a sparse user x product matrix of interaction scores.
interactions_df = db.get_user_product_matrix()

if interactions_df.empty:
    print("No interaction data available")
else:
    # Collapse repeated (user, product) pairs into one row with the summed score.
    interactions_df = (
        interactions_df
        .groupby(['userId', 'productId'])['score']
        .sum()
        .reset_index()
    )

    # Give every distinct user and product a dense 0-based position.
    user_ids = interactions_df['userId'].unique()
    product_ids = interactions_df['productId'].unique()
    user_map = dict(zip(user_ids, range(len(user_ids))))
    product_map = dict(zip(product_ids, range(len(product_ids))))

    # (row, col, value) triplets for the sparse constructor.
    rows = interactions_df['userId'].map(user_map).values
    cols = interactions_df['productId'].map(product_map).values
    data = interactions_df['score'].values

    matrix_shape = (len(user_ids), len(product_ids))
    user_item_matrix = csr_matrix((data, (rows, cols)), shape=matrix_shape)

    # Sparsity = share of cells that hold no stored interaction.
    n_cells = user_item_matrix.shape[0] * user_item_matrix.shape[1]
    print(f"User-item matrix shape: {user_item_matrix.shape}")
    print(f"Sparsity: {1 - (user_item_matrix.nnz / n_cells):.4f}")

In [None]:
# Train an ALS matrix-factorisation model on the user-item matrix built in the
# previous cell. Skipped when there is no interaction data, because
# `user_item_matrix` is only defined in that case.
if interactions_df.empty:
    print("Cannot train ALS model without interaction data")
else:
    model = AlternatingLeastSquares(
        factors=50,
        regularization=0.01,
        iterations=15,
        # Fixed seed so Restart & Run All gives comparable factors between runs.
        random_state=42,
    )

    # NOTE(review): the matrix is passed as users x items; older releases of
    # `implicit` expected items x users here - confirm against the installed version.
    model.fit(user_item_matrix)
    print("ALS model trained successfully")

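## 4. Evaluation

The cells in this section currently plot basic distributions of the catalogue and event data; no model-quality metrics are computed yet.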

In [None]:
# Bar chart of the ten most frequent product categories.
top_category_counts = products_df['category'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(12, 6))
top_category_counts.plot(kind='bar', ax=ax)
ax.set_title('Top 10 Product Categories')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
# Tilt the category names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Bar chart of how often each event type occurs; skipped when no events were loaded.
if not events_df.empty:
    event_type_counts = events_df['type'].value_counts()

    fig, ax = plt.subplots(figsize=(10, 6))
    event_type_counts.plot(kind='bar', ax=ax)
    ax.set_title('User Event Distribution')
    ax.set_xlabel('Event Type')
    ax.set_ylabel('Count')
    # Tilt the event names so long labels do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

In [None]:
# Histogram of product prices using 50 bins.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(products_df['price'], bins=50, edgecolor='black')
ax.set_title('Product Price Distribution')
ax.set_xlabel('Price (₹)')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

## 5. Hybrid Recommendations

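Combine content-based and collaborative filtering scores into a single ranking. The collaborative component in the function below is still a placeholder, so the results are currently driven by content similarity alone.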

In [None]:
def hybrid_recommendations(product_id, user_id=None, n=10, alpha=0.5):
    """
    Recommend products similar to ``product_id`` using a hybrid score.

    The score blends TF-IDF cosine similarity (content-based) with a
    collaborative score. The collaborative part is still a placeholder of
    zeros, so the ranking is currently determined by content similarity alone.

    Reads the notebook-level ``products_df``, ``tfidf_matrix`` and
    ``interactions_df``.

    Parameters
    ----------
    product_id
        Value looked up in ``products_df['id']``; the first matching row is
        used as the query product.
    user_id : optional
        When not ``None`` and interaction data is available, the content score
        is blended with the collaborative score.
    n : int
        Maximum number of recommendations to return. Values <= 0 give an
        empty result.
    alpha : float
        Weight for content-based (1-alpha for collaborative).

    Returns
    -------
    pandas.DataFrame
        ``title``, ``category`` and ``price`` of up to ``n`` products, best
        match first. The query product itself is never included.

    Raises
    ------
    IndexError
        If ``product_id`` does not occur in ``products_df['id']``.
    """
    # Locate the query row by POSITION: tfidf_matrix rows and .iloc are
    # positional, while DataFrame.index holds labels that need not be 0..N-1.
    matches = np.flatnonzero((products_df['id'] == product_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"product_id {product_id!r} not found in products_df")
    idx = int(matches[0])

    # Content-based scores
    content_scores = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()

    # Combine with collaborative if available. `is not None` so that a
    # legitimate user id of 0 is not mistaken for "no user".
    if user_id is not None and not interactions_df.empty:
        # Get collaborative scores (simplified)
        collab_scores = np.zeros_like(content_scores)  # Placeholder

        # Hybrid
        final_scores = alpha * content_scores + (1 - alpha) * collab_scores
    else:
        final_scores = content_scores

    # Rank best-first with a stable sort (ties keep row order), then drop the
    # query product explicitly. It is not guaranteed to sort last: duplicates
    # tie with it, and an empty TF-IDF vector scores 0 against everything.
    ranked = np.argsort(-final_scores, kind='stable')
    top_indices = ranked[ranked != idx][:max(n, 0)]

    return products_df.iloc[top_indices][['title', 'category', 'price']]

# Example: recommendations for the first product in the catalogue, if any.
if len(products_df):
    first_product = products_df.iloc[0]
    sample_id = first_product['id']
    print(f"Recommendations for: {first_product['title']}\n")
    print(hybrid_recommendations(sample_id, n=5))

## 6. Save Insights

Collect useful summary statistics and print them (nothing is written to disk yet).

In [None]:
# Most frequent product category, or None when it cannot be determined.
# value_counts() ignores missing values, so the counts can be empty even when
# products exist (every category missing); indexing [0] would then raise.
top_category = None
if len(products_df) > 0:
    category_counts = products_df['category'].value_counts()
    if not category_counts.empty:
        top_category = category_counts.index[0]

# Headline statistics about the catalogue and the interaction log.
insights = {
    'total_products': len(products_df),
    'total_events': len(events_df),
    'unique_users': events_df['userId'].nunique() if not events_df.empty else 0,
    'avg_price': products_df['price'].mean(),
    'top_category': top_category
}

print("\nKey Insights:")
for key, value in insights.items():
    print(f"  {key}: {value}")