In [4]:
# Import required libraries
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix, save_npz
from implicit.als import AlternatingLeastSquares
from sklearn.model_selection import train_test_split
import pickle
import os

print("Libraries imported successfully")

Libraries imported successfully


In [5]:
# Load your preprocessed data
# Adjust the path based on your actual file location
df = pd.read_csv('../../data/cf_interactions.csv')

print(f"Dataset loaded: {len(df):,} interactions")
print(f"Unique users: {df['user_id'].nunique():,}")
print(f"Unique items: {df['product_id'].nunique():,}")
print(f"\nFirst few rows:")
df.head()

Dataset loaded: 1,464 interactions
Unique users: 1,193
Unique items: 1,350

First few rows:


Unnamed: 0,user_id,product_id,rating
0,"AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBB...",B07JW9H4J1,4.2
1,"AECPFYFQVRUWC3KGNLJIOREFP5LQ,AGYYVPDD7YG7FYNBX...",B098NS6PVG,4.0
2,"AGU3BBQ2V2DDAMOAKGFAWDDQ6QHA,AESFLDV2PT363T2AQ...",B096MSW6CT,3.9
3,"AEWAZDZZJLQUYVOVGBEUKSLXHQ5A,AG5HTSFRRE6NL3M5S...",B08HDJ86NZ,4.2
4,"AE3Q6KSUK5P75D5HFYHCRAOLODSA,AFUGIFH5ZAFXRDSZH...",B08CF3B7N1,4.2


In [6]:
# Create categorical codes for users and items
df['user_code'] = df['user_id'].astype('category').cat.codes
df['item_code'] = df['product_id'].astype('category').cat.codes

# Save mappings for later use (to convert codes back to IDs)
user_mapping = df[['user_code', 'user_id']].drop_duplicates().set_index('user_code').sort_index()
item_mapping = df[['item_code', 'product_id']].drop_duplicates().set_index('item_code').sort_index()

print(f"Users encoded: {df['user_code'].nunique()}")
print(f"Items encoded: {df['item_code'].nunique()}")
print("\nExample mappings:")
print(df[['user_id', 'user_code', 'product_id', 'item_code']].head())

Users encoded: 1193
Items encoded: 1350

Example mappings:
                                             user_id  user_code  product_id  \
0  AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBB...        623  B07JW9H4J1   
1  AECPFYFQVRUWC3KGNLJIOREFP5LQ,AGYYVPDD7YG7FYNBX...         88  B098NS6PVG   
2  AGU3BBQ2V2DDAMOAKGFAWDDQ6QHA,AESFLDV2PT363T2AQ...        848  B096MSW6CT   
3  AEWAZDZZJLQUYVOVGBEUKSLXHQ5A,AG5HTSFRRE6NL3M5S...        254  B08HDJ86NZ   
4  AE3Q6KSUK5P75D5HFYHCRAOLODSA,AFUGIFH5ZAFXRDSZH...         17  B08CF3B7N1   

   item_code  
0        346  
1        847  
2        818  
3        643  
4        588  


In [7]:
# Split the data into train and test sets
train_df, test_df = train_test_split(
    df,
    test_size=0.2,      # 20% for testing
    random_state=42,    # For reproducibility
    stratify=None       # Can stratify by user if needed
)

print("="*50)
print("DATA SPLIT SUMMARY")
print("="*50)
print(f"Total interactions: {len(df):,}")
print(f"\nTrain set: {len(train_df):,} ({len(train_df)/len(df)*100:.1f}%)")
print(f"Test set:  {len(test_df):,} ({len(test_df)/len(df)*100:.1f}%)")
print(f"\nTrain users: {train_df['user_code'].nunique():,}")
print(f"Test users:  {test_df['user_code'].nunique():,}")
print(f"\nTrain items: {train_df['item_code'].nunique():,}")
print(f"Test items:  {test_df['item_code'].nunique():,}")
print("="*50)

DATA SPLIT SUMMARY
Total interactions: 1,464

Train set: 1,171 (80.0%)
Test set:  293 (20.0%)

Train users: 978
Test users:  273

Train items: 1,092
Test items:  288


In [8]:
# Get matrix dimensions (same for both train and test)
n_users = df['user_code'].max() + 1
n_items = df['item_code'].max() + 1

print(f"Matrix dimensions: {n_users:,} users × {n_items:,} items")
print(f"Total possible interactions: {n_users * n_items:,}")

# Determine what values to use
# If you have ratings, use them. Otherwise, use 1 (binary interaction)
if 'rating' in train_df.columns:
    print("\nUsing ratings as interaction values")
    train_values = train_df['rating'].values
    test_values = test_df['rating'].values
else:
    print("\nNo ratings found - using binary interactions (1 = interacted)")
    train_values = np.ones(len(train_df))
    test_values = np.ones(len(test_df))

Matrix dimensions: 1,193 users × 1,350 items
Total possible interactions: -27,850

Using ratings as interaction values


  print(f"Total possible interactions: {n_users * n_items:,}")


In [9]:
# Create TRAIN interaction matrix
train_matrix = coo_matrix(
    (train_values, (train_df['user_code'], train_df['item_code'])),
    shape=(n_users, n_items)
)

print("TRAIN MATRIX:")
print(f"  Shape: {train_matrix.shape}")
print(f"  Non-zero entries: {train_matrix.nnz:,}")
print(f"  Sparsity: {(1 - train_matrix.nnz / (n_users * n_items)) * 100:.4f}%")
print(f"  Memory: {train_matrix.data.nbytes / 1024 / 1024:.2f} MB")

TRAIN MATRIX:
  Shape: (1193, 1350)
  Non-zero entries: 1,171
  Sparsity: 104.2047%
  Memory: 0.01 MB


  print(f"  Sparsity: {(1 - train_matrix.nnz / (n_users * n_items)) * 100:.4f}%")


In [10]:
# Create TEST interaction matrix
test_matrix = coo_matrix(
    (test_values, (test_df['user_code'], test_df['item_code'])),
    shape=(n_users, n_items)
)

print("TEST MATRIX:")
print(f"  Shape: {test_matrix.shape}")
print(f"  Non-zero entries: {test_matrix.nnz:,}")
print(f"  Sparsity: {(1 - test_matrix.nnz / (n_users * n_items)) * 100:.4f}%")
print(f"  Memory: {test_matrix.data.nbytes / 1024 / 1024:.2f} MB")

TEST MATRIX:
  Shape: (1193, 1350)
  Non-zero entries: 293
  Sparsity: 101.0521%
  Memory: 0.00 MB


  print(f"  Sparsity: {(1 - test_matrix.nnz / (n_users * n_items)) * 100:.4f}%")


In [11]:
# Initialize the ALS model
model = AlternatingLeastSquares(
    factors=50,              # Number of latent factors
    regularization=0.1,      # L2 regularization
    iterations=20,           # Number of training iterations
    random_state=42,         # For reproducibility
    use_gpu=False            # Set to True if you have GPU
)

print("Model initialized with parameters:")
print(f"  Factors: {model.factors}")
print(f"  Regularization: {model.regularization}")
print(f"  Iterations: {model.iterations}")

Model initialized with parameters:
  Factors: 50
  Regularization: 0.1
  Iterations: 20


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

  check_blas_config()


In [12]:
# Train the model on TRAIN data only
print("\nTraining model...")
print("This may take a few minutes depending on data size...\n")

model.fit(train_matrix)

print("✓ Training complete!")
print(f"\nModel learned:")
print(f"  User factors shape: {model.user_factors.shape}")
print(f"  Item factors shape: {model.item_factors.shape}")




Training model...
This may take a few minutes depending on data size...



100%|██████████| 20/20 [00:00<00:00, 420.04it/s]

✓ Training complete!

Model learned:
  User factors shape: (1193, 50)
  Item factors shape: (1350, 50)





In [13]:
# Test recommendation for user 0
test_user = 0

# Get recommendations
recommendations = model.recommend(
    test_user,
    train_matrix.tocsr()[test_user],
    N=5,
    filter_already_liked_items=True
)

print(f"Sample recommendations for user {test_user}:")
print("\nItem Code | Score")
print("-" * 25)
for item_id, score in zip(recommendations[0], recommendations[1]):
    product_id = item_mapping.loc[item_id, 'product_id']
    print(f"{item_id:9d} | {score:.4f} | {product_id}")

print("\n✓ Model is working!")

Sample recommendations for user 0:

Item Code | Score
-------------------------
      589 | 0.1319 | B08CF3D7QR
     1282 | 0.1017 | B0BC9BW512
     1140 | 0.1017 | B0B1YZX72F
     1139 | 0.1017 | B0B1YZ9CB8
     1138 | 0.1017 | B0B1YY6JJL

✓ Model is working!


In [16]:
print("Saving model and data...\n")

# 1. Save the trained model
with open('../../ml_service/app/models/collaborative_filtering/als_model.pkl', 'wb') as f:
    pickle.dump(model, f)
print("✓ Saved: als_model.pkl")

# 2. Save train matrix (needed for making recommendations)
save_npz('../../ml_service/app/models/collaborative_filtering/train_matrix.npz', train_matrix.tocsr())
print("✓ Saved: train_matrix.npz")

# 3. Save test matrix (for evaluation)
save_npz('../../ml_service/app/models/collaborative_filtering/test_matrix.npz', test_matrix.tocsr())
print("✓ Saved: test_matrix.npz")

# 4. Save user mapping
user_mapping.to_csv('../../ml_service/app/models/collaborative_filtering/user_mapping.csv')
print("✓ Saved: user_mapping.csv")

# 5. Save item mapping
item_mapping.to_csv('../../ml_service/app/models/collaborative_filtering/item_mapping.csv')
print("✓ Saved: item_mapping.csv")

# 6. Save model metadata
metadata = {
    'n_users': n_users,
    'n_items': n_items,
    'train_size': len(train_df),
    'test_size': len(test_df),
    'factors': model.factors,
    'regularization': model.regularization,
    'iterations': model.iterations
}
with open('../../ml_service/app/models/collaborative_filtering/metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)
print("✓ Saved: metadata.pkl")

print("\n" + "="*50)
print("ALL FILES SAVED SUCCESSFULLY!")
print("="*50)
print("\nNext step: Run 02_cf_evaluation.ipynb to evaluate the model")

Saving model and data...

✓ Saved: als_model.pkl
✓ Saved: train_matrix.npz
✓ Saved: test_matrix.npz
✓ Saved: user_mapping.csv
✓ Saved: item_mapping.csv
✓ Saved: metadata.pkl

ALL FILES SAVED SUCCESSFULLY!

Next step: Run 02_cf_evaluation.ipynb to evaluate the model
