In [1]:
import sys
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pickle
import warnings
warnings.filterwarnings('ignore')

# Add src to path
ROOT = Path.cwd().parent
sys.path.append(str(ROOT / "src"))

from models.interaction_matrix import InteractionMatrixBuilder


In [4]:
# Check if features file exists and specify path explicitly
features_path = ROOT / "data" / "features" / "features_table_v2.csv"
if not features_path.exists():
    features_path = ROOT / "data" / "features" / "features_table.csv"

if not features_path.exists():
    raise FileNotFoundError(f"Features file not found. Checked: {features_path}")

print(f"Using features file: {features_path}")

# Initialize matrix builder with explicit path
builder = InteractionMatrixBuilder(features_path=str(features_path))

# Load matrices
matrices_dir = ROOT / "data" / "matrices"
confidence_matrix = builder.load_matrix('confidence', matrices_dir)
quantity_matrix = builder.load_matrix('quantity', matrices_dir)
binary_matrix = builder.load_matrix('binary', matrices_dir)

# Load encoders
encoders = np.load(matrices_dir / "encoders.npz")
user_classes = encoders['user_classes']
item_classes = encoders['item_classes']

print(f"Matrix shape: {confidence_matrix.shape}")
print(f"Users: {len(user_classes)}, Items: {len(item_classes)}")
print(f"Interactions: {confidence_matrix.nnz:,}")
print(f"Density: {confidence_matrix.nnz / (confidence_matrix.shape[0] * confidence_matrix.shape[1]) * 100:.2f}%")


2025-07-27 09:51:11,437 — INFO — Loading confidence matrix from c:\KAUST-Project\data\matrices\confidence_matrix.npz
2025-07-27 09:51:11,445 — INFO — Loading quantity matrix from c:\KAUST-Project\data\matrices\quantity_matrix.npz
2025-07-27 09:51:11,449 — INFO — Loading binary matrix from c:\KAUST-Project\data\matrices\binary_matrix.npz


Using features file: c:\KAUST-Project\data\features\features_table_v2.csv


ValueError: Object arrays cannot be loaded when allow_pickle=False

In [None]:
# Uncomment and run if needed
!pip install implicit scikit-learn matplotlib seaborn


In [None]:
import implicit
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('default')
sns.set_palette("husl")


In [None]:
def create_train_test_split(matrix, test_ratio=0.2, random_state=42):
    """
    Create train/test split by randomly holding out interactions per user.
    """
    np.random.seed(random_state)
    train_matrix = matrix.copy()
    test_interactions = []
    
    for user_id in range(matrix.shape[0]):
        user_items = matrix.getrow(user_id).indices
        if len(user_items) > 1:  # Only split if user has multiple interactions
            n_test = max(1, int(len(user_items) * test_ratio))
            test_items = np.random.choice(user_items, size=n_test, replace=False)
            
            for item_id in test_items:
                test_interactions.append((user_id, item_id, matrix[user_id, item_id]))
                train_matrix[user_id, item_id] = 0
    
    train_matrix.eliminate_zeros()
    return train_matrix, test_interactions

# Create splits
train_matrix, test_interactions = create_train_test_split(confidence_matrix)

print(f"Train interactions: {train_matrix.nnz:,}")
print(f"Test interactions: {len(test_interactions):,}")
print(f"Split ratio: {len(test_interactions) / (train_matrix.nnz + len(test_interactions)):.1%}")


In [None]:
# Train ALS model
model = implicit.als.AlternatingLeastSquares(
    factors=50,
    regularization=0.1,
    iterations=200,
    random_state=42
)

# Fit model (implicit expects item-user matrix)
model.fit(train_matrix.T)

print("Model training completed")


In [None]:
def precision_recall_at_k(model, test_interactions, train_matrix, k=10):
    """
    Calculate Precision@K and Recall@K for the model.
    """
    precisions = []
    recalls = []
    
    # Group test interactions by user
    user_test_items = {}
    for user_id, item_id, rating in test_interactions:
        if user_id not in user_test_items:
            user_test_items[user_id] = []
        user_test_items[user_id].append(item_id)
    
    for user_id, true_items in user_test_items.items():
        # Get recommendations
        recs = model.recommend(user_id, train_matrix[user_id], N=k, filter_already_liked_items=True)
        recommended_items = [item_id for item_id, score in recs]
        
        # Calculate metrics
        hits = len(set(recommended_items) & set(true_items))
        precision = hits / len(recommended_items) if recommended_items else 0
        recall = hits / len(true_items) if true_items else 0
        
        precisions.append(precision)
        recalls.append(recall)
    
    return np.mean(precisions), np.mean(recalls)

# Evaluate at different K values
k_values = [5, 10, 20, 50]
results = []

for k in k_values:
    precision, recall = precision_recall_at_k(model, test_interactions, train_matrix, k=k)
    results.append({'K': k, 'Precision': precision, 'Recall': recall})
    print(f"K={k:2d}: Precision@K={precision:.4f}, Recall@K={recall:.4f}")

results_df = pd.DataFrame(results)


In [None]:
# Test different hyperparameter combinations
param_grid = {
    'factors': [25, 50, 100],
    'regularization': [0.01, 0.1, 0.5],
    'iterations': [15, 20, 30]
}

best_score = 0
best_params = None
tuning_results = []

# Grid search (simplified - testing a subset)
test_configs = [
    {'factors': 25, 'regularization': 0.1, 'iterations': 15},
    {'factors': 50, 'regularization': 0.1, 'iterations': 20},
    {'factors': 100, 'regularization': 0.1, 'iterations': 20},
    {'factors': 50, 'regularization': 0.01, 'iterations': 20},
    {'factors': 50, 'regularization': 0.5, 'iterations': 20}
]

for params in test_configs:
    # Train model with current parameters
    temp_model = implicit.als.AlternatingLeastSquares(
        factors=params['factors'],
        regularization=params['regularization'],
        iterations=params['iterations'],
        random_state=42
    )
    temp_model.fit(train_matrix.T)
    
    # Evaluate
    precision, recall = precision_recall_at_k(temp_model, test_interactions, train_matrix, k=10)
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    result = {**params, 'Precision@10': precision, 'Recall@10': recall, 'F1@10': f1_score}
    tuning_results.append(result)
    
    if f1_score > best_score:
        best_score = f1_score
        best_params = params
        best_model = temp_model

tuning_df = pd.DataFrame(tuning_results)
print("Hyperparameter Tuning Results:")
print(tuning_df.round(4))
print(f"\nBest parameters: {best_params}")
print(f"Best F1@10 score: {best_score:.4f}")


In [None]:
def get_recommendations_for_user(user_idx, model, matrix, encoders, n_recommendations=10):
    """
    Get product recommendations for a specific user.
    """
    user_id = encoders['user_classes'][user_idx]
    
    # Get recommendations
    recs = model.recommend(user_idx, matrix[user_idx], N=n_recommendations, filter_already_liked_items=True)
    
    recommendations = []
    for item_idx, score in recs:
        sku = encoders['item_classes'][item_idx]
        recommendations.append({
            'CustomerID': user_id,
            'SKU': sku,
            'Score': score,
            'Rank': len(recommendations) + 1
        })
    
    return recommendations

# Generate recommendations for first 5 users
sample_recommendations = []
for user_idx in range(5):
    user_recs = get_recommendations_for_user(user_idx, best_model, train_matrix, encoders)
    sample_recommendations.extend(user_recs)

sample_df = pd.DataFrame(sample_recommendations)
print("Sample Recommendations:")
print(sample_df.head(20))


In [None]:
# Analyze user and item factors
user_factors = best_model.user_factors
item_factors = best_model.item_factors

print(f"User factors shape: {user_factors.shape}")
print(f"Item factors shape: {item_factors.shape}")

# Plot factor distributions
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# User factors distribution
axes[0].hist(user_factors.flatten(), bins=50, alpha=0.7)
axes[0].set_title('Distribution of User Factors')
axes[0].set_xlabel('Factor Value')
axes[0].set_ylabel('Frequency')

# Item factors distribution
axes[1].hist(item_factors.flatten(), bins=50, alpha=0.7)
axes[1].set_title('Distribution of Item Factors')
axes[1].set_xlabel('Factor Value')
axes[1].set_ylabel('Frequency')

plt.tight_layout()
plt.show()


In [None]:
# Most popular items (by interactions)
item_popularity = np.array(confidence_matrix.sum(axis=0)).flatten()
popular_items_idx = np.argsort(item_popularity)[::-1][:10]

print("Top 10 Most Popular Items:")
for i, item_idx in enumerate(popular_items_idx):
    sku = item_classes[item_idx]
    interactions = item_popularity[item_idx]
    print(f"{i+1:2d}. SKU {sku}: {interactions:.0f} interactions")


In [None]:
# Compare performance across different interaction matrices
matrices_to_test = {
    'Confidence': confidence_matrix,
    'Quantity': quantity_matrix,
    'Binary': binary_matrix
}

matrix_comparison = []

for matrix_name, matrix in matrices_to_test.items():
    # Create train/test split for this matrix
    train_mat, test_int = create_train_test_split(matrix)
    
    # Train model
    temp_model = implicit.als.AlternatingLeastSquares(
        factors=best_params['factors'],
        regularization=best_params['regularization'],
        iterations=best_params['iterations'],
        random_state=42
    )
    temp_model.fit(train_mat.T)
    
    # Evaluate
    precision, recall = precision_recall_at_k(temp_model, test_int, train_mat, k=10)
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    matrix_comparison.append({
        'Matrix': matrix_name,
        'Precision@10': precision,
        'Recall@10': recall,
        'F1@10': f1_score
    })

comparison_df = pd.DataFrame(matrix_comparison)
print("Matrix Comparison Results:")
print(comparison_df.round(4))


In [None]:
# Save the best model and metadata
models_dir = ROOT / "models" / "saved_models"
models_dir.mkdir(parents=True, exist_ok=True)

model_artifacts = {
    'model': best_model,
    'best_params': best_params,
    'user_encoder_classes': user_classes,
    'item_encoder_classes': item_classes,
    'evaluation_results': results_df,
    'hyperparameter_results': tuning_df,
    'matrix_comparison': comparison_df,
    'train_matrix_shape': train_matrix.shape,
    'test_interactions_count': len(test_interactions)
}

# Save model artifacts
with open(models_dir / "als_model_artifacts.pkl", 'wb') as f:
    pickle.dump(model_artifacts, f)

print(f"Model saved to {models_dir / 'als_model_artifacts.pkl'}")
print(f"Model can be loaded using: pickle.load(open(path, 'rb'))")


In [None]:
print("=" * 60)
print("MODEL TRAINING SUMMARY")
print("=" * 60)
print(f"Dataset: {confidence_matrix.shape[0]} users × {confidence_matrix.shape[1]} items")
print(f"Total interactions: {confidence_matrix.nnz:,}")
print(f"Matrix density: {confidence_matrix.nnz / (confidence_matrix.shape[0] * confidence_matrix.shape[1]) * 100:.2f}%")
print()
print("Best Model Configuration:")
for param, value in best_params.items():
    print(f"  {param}: {value}")
print()
print("Performance (Best Matrix):")
best_matrix_result = comparison_df.loc[comparison_df['F1@10'].idxmax()]
print(f"  Matrix Type: {best_matrix_result['Matrix']}")
print(f"  Precision@10: {best_matrix_result['Precision@10']:.4f}")
print(f"  Recall@10: {best_matrix_result['Recall@10']:.4f}")
print(f"  F1@10: {best_matrix_result['F1@10']:.4f}")
print()
print(f"Model artifacts saved to: {models_dir / 'als_model_artifacts.pkl'}")
print("=" * 60)
