In [1]:
# =========================================================
# 1. IMPORTS
# =========================================================
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight


# =========================================================
# 2. LOAD DATA
# =========================================================
df = pd.read_parquet("train.parquet")

# Optional sampling for speed: keep every interaction of 5000 randomly drawn
# distinct users (random_state pins the draw).
sample_users = (
    df["user_id"]
    .drop_duplicates()
    .sample(5000, random_state=42)
)

df = df[df["user_id"].isin(sample_users)].copy()


# =========================================================
# 3. INTERACTION WEIGHTING
# =========================================================
# cart -> 1, purchase -> 5, then log1p to compress the gap between the two.
# NOTE: any event_type outside this mapping becomes NaN (Series.map semantics)
# and stays NaN after log1p.
df["interaction_weight"] = np.log1p(
    df["event_type"].map({"cart": 1, "purchase": 5})
)


# =========================================================
# 4. TRAIN / TEST SPLIT (Last Interaction)
# =========================================================
# Each user's most recent interaction (by timestamp) is held out as the test
# row; users with fewer than two interactions are dropped from both sets.
# This is done with vectorized pandas operations rather than a Python loop
# over every user group, which scales poorly with the number of users.
interactions_per_user = df["user_id"].map(df["user_id"].value_counts())
has_history = (interactions_per_user >= 2).to_numpy()  # NaN counts compare False

# Sorting by (user_id, timestamp) keeps users in sorted order and places each
# user's latest row last within that user's run of rows.
ordered = df[has_history].sort_values(["user_id", "timestamp"])
is_last = (~ordered["user_id"].duplicated(keep="last")).to_numpy()

# .copy() so the column assignments made later operate on independent frames.
train_df = ordered[~is_last].copy()
test_df = ordered[is_last].copy()

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)


# =========================================================
# 5. CREATE INDEX MAPPINGS
# =========================================================
user_ids = train_df["user_id"].unique()
product_ids = train_df["product_id"].unique()

# Dense 0-based positions, assigned in order of first appearance in train_df.
user_to_index = dict(zip(user_ids, range(len(user_ids))))
product_to_index = dict(zip(product_ids, range(len(product_ids))))
index_to_product = {idx: pid for pid, idx in product_to_index.items()}

train_df["user_index"] = train_df["user_id"].map(user_to_index)
train_df["product_index"] = train_df["product_id"].map(product_to_index)


# =========================================================
# 6. BUILD USER-ITEM MATRIX
# =========================================================
# Rows are users, columns are products. Repeated (user, product) pairs are
# summed by the COO-style csr_matrix constructor.
coo_values = train_df["interaction_weight"]
coo_coords = (train_df["user_index"], train_df["product_index"])
matrix_shape = (len(user_to_index), len(product_to_index))

user_item_matrix = csr_matrix(
    (coo_values, coo_coords), shape=matrix_shape
).astype("double")

print("Initial matrix shape:", user_item_matrix.shape)


# =========================================================
# 7. REMOVE EMPTY ITEM COLUMNS
# =========================================================
# Boolean mask over columns: True where at least one entry is stored.
nonzero_items = user_item_matrix.getnnz(axis=0) > 0
user_item_matrix = user_item_matrix[:, nonzero_items]

# Update product mappings so positions match the pruned matrix columns.
# NOTE: train_df["product_index"] is not refreshed here.
product_ids = product_ids[nonzero_items]
product_to_index = dict(zip(product_ids, range(len(product_ids))))
index_to_product = {idx: pid for pid, idx in product_to_index.items()}

print("Final matrix shape:", user_item_matrix.shape)


# =========================================================
# 8. APPLY BM25 WEIGHTING
# =========================================================
# BM25 re-weighting of the raw interaction weights before factorization.
confidence_matrix = bm25_weight(user_item_matrix, K1=100, B=0.8)


# =========================================================
# 9. TRAIN ALS (OPTIMIZED HYPERPARAMETERS)
# =========================================================
als_model = AlternatingLeastSquares(
    factors=128, regularization=0.01, iterations=40, random_state=42
)
als_model.fit(confidence_matrix)

# Sanity check: the model's item-factor rows should match the matrix columns.
print("Model items:", als_model.item_factors.shape[0])
print("Matrix items:", user_item_matrix.shape[1])


# =========================================================
# 10. POPULARITY BACKUP (BOOST RECALL)
# =========================================================
# Per-item sum of interaction weights, flattened to 1-D.
item_popularity = np.asarray(user_item_matrix.sum(axis=0)).ravel()
# Item positions ordered from most to least popular.
popular_items = (-item_popularity).argsort()


# =========================================================
# 11. RECOMMEND FUNCTION
# =========================================================
def recommend_als(user_id, n_products=10):
    """Recommend up to ``n_products`` products for ``user_id``.

    Known users get ALS recommendations (items already present in their row of
    ``user_item_matrix`` are filtered out by the model). If ALS returns fewer
    than ``n_products`` items, the list is topped up from ``popular_items``,
    skipping anything already recommended or already interacted with, so the
    result never contains duplicates.

    Parameters
    ----------
    user_id : hashable
        Key looked up in ``user_to_index``.
    n_products : int, default 10
        Maximum number of products to return.

    Returns
    -------
    list
        Values of ``index_to_product`` for the selected item positions.
        Unknown users receive the first ``n_products`` popular items.
    """
    user_index = user_to_index.get(user_id)

    # Unknown user, or an index that does not address a matrix row:
    # fall back to pure popularity.
    if user_index is None or user_index >= user_item_matrix.shape[0]:
        return [index_to_product[i] for i in popular_items[:n_products]]

    user_row = user_item_matrix[user_index:user_index + 1]

    item_indices, scores = als_model.recommend(
        userid=user_index,
        user_items=user_row,
        N=n_products,
        filter_already_liked_items=True
    )

    chosen = [int(i) for i in item_indices]

    # Backfill with popular items if needed. Items the user already has
    # (stored column indices of their row) and items ALS already returned are
    # excluded so the backfill cannot introduce repeats.
    if len(chosen) < n_products:
        excluded = set(chosen) | set(user_row.indices.tolist())
        for candidate in popular_items:
            if len(chosen) >= n_products:
                break
            candidate = int(candidate)
            if candidate not in excluded:
                chosen.append(candidate)
                excluded.add(candidate)

    return [index_to_product[i] for i in chosen]


# =========================================================
# 12. RECALL@K EVALUATION
# =========================================================
def recall_at_k(k=10):
    """Recall@k of ``recommend_als`` over the held-out rows in ``test_df``.

    For every user in ``test_df`` that is present in ``user_to_index``, counts
    how many held-out products appear in the top-``k`` recommendations, then
    divides by the number of held-out products. Returns 0 when no user
    qualifies.
    """
    matched = 0
    relevant = 0

    for uid, rows in test_df.groupby("user_id"):
        if uid in user_to_index:
            truth = set(rows["product_id"])
            suggestions = recommend_als(uid, n_products=k)
            matched += len(truth.intersection(suggestions))
            relevant += len(truth)

    if relevant > 0:
        return matched / relevant
    return 0


# =========================================================
# 13. EVALUATE
# =========================================================
# Report recall at both cut-offs; labels keep their original column alignment.
for label, cutoff in (("Recall@5 :", 5), ("Recall@10:", 10)):
    print(label, recall_at_k(cutoff))


Train shape: (17050, 20)
Test shape: (3099, 20)
Initial matrix shape: (3099, 4186)
Final matrix shape: (3099, 4186)


  check_blas_config()


  0%|          | 0/40 [00:00<?, ?it/s]

Model items: 4186
Matrix items: 4186
Recall@5 : 0.02968699580509842
Recall@10: 0.03807679896740884


In [2]:
# =========================================================
# 1. IMPORTS
# =========================================================
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares


# =========================================================
# 2. LOAD FULL DATASET (NO SAMPLING)
# =========================================================
df = pd.read_parquet("train.parquet")

print("Total interactions:", len(df))


# =========================================================
# 3. INTERACTION WEIGHTING
# =========================================================
# cart -> 1, purchase -> 5, then log1p to compress the gap between the two.
# NOTE: any event_type outside this mapping becomes NaN (Series.map semantics)
# and stays NaN after log1p.
df["interaction_weight"] = np.log1p(
    df["event_type"].map({"cart": 1, "purchase": 5})
)


# =========================================================
# 4. TRAIN / TEST SPLIT (LAST INTERACTION)
# =========================================================
# Each user's most recent interaction (by timestamp) is held out as the test
# row; users with fewer than two interactions are dropped from both sets.
# This runs on the unsampled dataset, so the split uses vectorized pandas
# operations instead of a Python loop over every user group.
interactions_per_user = df["user_id"].map(df["user_id"].value_counts())
has_history = (interactions_per_user >= 2).to_numpy()  # NaN counts compare False

# Sorting by (user_id, timestamp) keeps users in sorted order and places each
# user's latest row last within that user's run of rows.
ordered = df[has_history].sort_values(["user_id", "timestamp"])
is_last = (~ordered["user_id"].duplicated(keep="last")).to_numpy()

# .copy() so the column assignments made later operate on independent frames.
train_df = ordered[~is_last].copy()
test_df = ordered[is_last].copy()

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)


# =========================================================
# 5. CREATE INDEX MAPPINGS
# =========================================================
user_ids = train_df["user_id"].unique()
product_ids = train_df["product_id"].unique()

# Dense 0-based positions, assigned in order of first appearance in train_df.
user_to_index = dict(zip(user_ids, range(len(user_ids))))
product_to_index = dict(zip(product_ids, range(len(product_ids))))
index_to_product = {idx: pid for pid, idx in product_to_index.items()}

train_df["user_index"] = train_df["user_id"].map(user_to_index)
train_df["product_index"] = train_df["product_id"].map(product_to_index)


# =========================================================
# 6. BUILD USER-ITEM MATRIX
# =========================================================
# Rows are users, columns are products. Repeated (user, product) pairs are
# summed by the COO-style csr_matrix constructor.
coo_values = train_df["interaction_weight"]
coo_coords = (train_df["user_index"], train_df["product_index"])
matrix_shape = (len(user_to_index), len(product_to_index))

user_item_matrix = csr_matrix(
    (coo_values, coo_coords), shape=matrix_shape
).astype("double")

print("Initial matrix shape:", user_item_matrix.shape)


# =========================================================
# 7. REMOVE EMPTY ITEM COLUMNS
# =========================================================
# Boolean mask over columns: True where at least one entry is stored.
nonzero_items = user_item_matrix.getnnz(axis=0) > 0
user_item_matrix = user_item_matrix[:, nonzero_items]

# Update product mapping so positions match the pruned matrix columns.
# NOTE: train_df["product_index"] is not refreshed here.
product_ids = product_ids[nonzero_items]
product_to_index = dict(zip(product_ids, range(len(product_ids))))
index_to_product = {idx: pid for pid, idx in product_to_index.items()}

print("Final matrix shape:", user_item_matrix.shape)


# =========================================================
# 8. CONFIDENCE SCALING (BETTER FOR SPARSE DATA)
# =========================================================
# Linear confidence scaling: every stored interaction weight is multiplied by alpha.
alpha = 50
confidence_matrix = alpha * user_item_matrix


# =========================================================
# 9. TRAIN ALS (STRONGER CONFIG)
# =========================================================
als_model = AlternatingLeastSquares(
    factors=128, regularization=0.01, iterations=40, random_state=42
)
als_model.fit(confidence_matrix)

# Sanity check: the model's item-factor rows should match the matrix columns.
print("Model items:", als_model.item_factors.shape[0])
print("Matrix items:", user_item_matrix.shape[1])


# =========================================================
# 10. POPULARITY SCORES (FOR HYBRID BLEND)
# =========================================================
# Per-item sum of (unscaled) interaction weights, flattened to 1-D.
item_popularity = np.asarray(user_item_matrix.sum(axis=0)).ravel()
# Item positions ordered from most to least popular.
popular_items = (-item_popularity).argsort()


# =========================================================
# 11. HYBRID RECOMMEND FUNCTION (ALS + POPULARITY)
# =========================================================
def recommend_als(user_id, n_products=10, n_candidates=50,
                  als_weight=0.7, popularity_weight=0.3):
    """Hybrid recommendation: ALS candidates re-ranked with item popularity.

    A pool of ALS candidates is retrieved for the user, both the ALS scores
    and the candidates' popularity are min-max normalised within that pool,
    and the candidates are re-ordered by the weighted sum of the two.

    Parameters
    ----------
    user_id : hashable
        Key looked up in ``user_to_index``.
    n_products : int, default 10
        Number of products to return.
    n_candidates : int, default 50
        Minimum size of the ALS candidate pool. The pool is widened to
        ``n_products`` when more than ``n_candidates`` products are requested,
        so large requests are no longer silently truncated to the pool size.
    als_weight, popularity_weight : float, default 0.7 / 0.3
        Blend weights applied to the normalised ALS and popularity scores.

    Returns
    -------
    list
        Values of ``index_to_product`` for the selected item positions.
        Users missing from ``user_to_index`` (or for whom ALS returns nothing)
        receive the first ``n_products`` entries of ``popular_items``.
    """
    if user_id not in user_to_index:
        return [index_to_product[i] for i in popular_items[:n_products]]

    user_index = user_to_index[user_id]

    user_row = user_item_matrix[user_index:user_index + 1]

    item_indices, scores = als_model.recommend(
        userid=user_index,
        user_items=user_row,
        N=max(n_candidates, n_products),  # retrieve more candidates than needed
        filter_already_liked_items=True
    )

    if len(item_indices) == 0:
        return [index_to_product[i] for i in popular_items[:n_products]]

    # Normalize ALS scores to [0, 1]; the epsilon guards against a zero range.
    scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)

    # Popularity scores for the same items, normalised the same way.
    pop_scores = item_popularity[item_indices]
    pop_scores = (pop_scores - pop_scores.min()) / (pop_scores.max() - pop_scores.min() + 1e-8)

    # Blend scores
    final_scores = als_weight * scores + popularity_weight * pop_scores

    top_idx = np.argsort(-final_scores)[:n_products]

    return [index_to_product[item_indices[i]] for i in top_idx]


# =========================================================
# 12. RECALL@K
# =========================================================
def recall_at_k(k=10):
    """Recall@k of ``recommend_als`` over the held-out rows in ``test_df``.

    Only users present in ``user_to_index`` are scored. The result is the
    number of held-out products found in the top-``k`` recommendations divided
    by the number of held-out products, or 0 when no user qualifies.
    """
    found = 0
    expected = 0

    for uid, held_out in test_df.groupby("user_id"):
        if uid in user_to_index:
            targets = set(held_out["product_id"])
            top_k = recommend_als(uid, n_products=k)
            found += len(targets.intersection(top_k))
            expected += len(targets)

    if expected > 0:
        return found / expected
    return 0


# =========================================================
# 13. EVALUATE
# =========================================================
# Report recall at both cut-offs; labels keep their original column alignment.
for label, cutoff in (("Recall@5 :", 5), ("Recall@10:", 10)):
    print(label, recall_at_k(cutoff))


Total interactions: 11495242


KeyboardInterrupt: 

In [1]:
# =========================================================
# 1. IMPORTS
# =========================================================
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares


# =========================================================
# 2. LOAD DATA
# =========================================================
df = pd.read_parquet("train.parquet")

print("Total interactions (full):", len(df))

# =========================================================
# 3. SAMPLE 50K USERS (LAPTOP FRIENDLY)
# =========================================================
# Keep every interaction of 50000 randomly drawn distinct users
# (random_state pins the draw).
sample_users = (
    df["user_id"]
    .drop_duplicates()
    .sample(50000, random_state=42)
)

# .copy() detaches the filtered frame from the full one, so adding the
# interaction_weight column below is not a chained assignment on a slice.
df = df[df["user_id"].isin(sample_users)].copy()

print("Interactions after sampling:", len(df))
print("Users after sampling:", df["user_id"].nunique())
print("Items after sampling:", df["product_id"].nunique())


# =========================================================
# 4. INTERACTION WEIGHTING
# =========================================================
# cart -> 1, purchase -> 5, then log1p to compress the gap between the two.
# NOTE: any event_type outside this mapping becomes NaN (Series.map semantics)
# and stays NaN after log1p.
df["interaction_weight"] = np.log1p(
    df["event_type"].map({"cart": 1, "purchase": 5})
)


# =========================================================
# 5. TRAIN / TEST SPLIT (LAST INTERACTION)
# =========================================================
# Each user's most recent interaction (by timestamp) is held out as the test
# row; users with fewer than two interactions are dropped from both sets.
# This is done with vectorized pandas operations rather than a Python loop
# over every user group.
interactions_per_user = df["user_id"].map(df["user_id"].value_counts())
has_history = (interactions_per_user >= 2).to_numpy()  # NaN counts compare False

# Sorting by (user_id, timestamp) keeps users in sorted order and places each
# user's latest row last within that user's run of rows.
ordered = df[has_history].sort_values(["user_id", "timestamp"])
is_last = (~ordered["user_id"].duplicated(keep="last")).to_numpy()

# .copy() so the column assignments made later operate on independent frames.
train_df = ordered[~is_last].copy()
test_df = ordered[is_last].copy()

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)


# =========================================================
# 6. CREATE INDEX MAPPINGS
# =========================================================
user_ids = train_df["user_id"].unique()
product_ids = train_df["product_id"].unique()

# Dense 0-based positions, assigned in order of first appearance in train_df.
user_to_index = dict(zip(user_ids, range(len(user_ids))))
product_to_index = dict(zip(product_ids, range(len(product_ids))))
index_to_product = {idx: pid for pid, idx in product_to_index.items()}

train_df["user_index"] = train_df["user_id"].map(user_to_index)
train_df["product_index"] = train_df["product_id"].map(product_to_index)


# =========================================================
# 7. BUILD USER-ITEM MATRIX
# =========================================================
# Rows are users, columns are products. Repeated (user, product) pairs are
# summed by the COO-style csr_matrix constructor; values are stored as float32.
coo_values = train_df["interaction_weight"]
coo_coords = (train_df["user_index"], train_df["product_index"])
matrix_shape = (len(user_to_index), len(product_to_index))

user_item_matrix = csr_matrix(
    (coo_values, coo_coords), shape=matrix_shape
).astype("float32")

print("Matrix shape:", user_item_matrix.shape)


# =========================================================
# 8. REMOVE EMPTY ITEM COLUMNS
# =========================================================
# Boolean mask over columns: True where at least one entry is stored.
nonzero_items = user_item_matrix.getnnz(axis=0) > 0
user_item_matrix = user_item_matrix[:, nonzero_items]

# Rebuild the product mappings so positions match the pruned matrix columns.
# NOTE: train_df["product_index"] is not refreshed here.
product_ids = product_ids[nonzero_items]
product_to_index = dict(zip(product_ids, range(len(product_ids))))
index_to_product = {idx: pid for pid, idx in product_to_index.items()}

print("Final matrix shape:", user_item_matrix.shape)


# =========================================================
# 9. CONFIDENCE SCALING
# =========================================================
# Linear confidence scaling: every stored interaction weight is multiplied by alpha.
alpha = 40
confidence_matrix = alpha * user_item_matrix


# =========================================================
# 10. TRAIN ALS (FAST CONFIG)
# =========================================================
als_model = AlternatingLeastSquares(
    factors=32,          # reduced for speed
    regularization=0.1,
    iterations=10,       # fewer iterations
    random_state=42,
)
als_model.fit(confidence_matrix)

print("Model trained successfully.")


# =========================================================
# 11. POPULARITY BACKUP
# =========================================================
# Per-item sum of (unscaled) interaction weights, flattened to 1-D.
item_popularity = np.asarray(user_item_matrix.sum(axis=0)).ravel()
# Item positions ordered from most to least popular.
popular_items = (-item_popularity).argsort()


# =========================================================
# 12. RECOMMEND FUNCTION
# =========================================================
def recommend_als(user_id, n_products=10):
    """Return up to ``n_products`` recommended products for ``user_id``.

    Users found in ``user_to_index`` are served by ``als_model.recommend`` with
    their row of ``user_item_matrix`` (already-present items filtered out).
    Any other user gets the first ``n_products`` entries of ``popular_items``.
    Results are translated to products through ``index_to_product``.
    """
    if user_id in user_to_index:
        row_position = user_to_index[user_id]
        history = user_item_matrix[row_position:row_position + 1]

        recommended, _scores = als_model.recommend(
            userid=row_position,
            user_items=history,
            N=n_products,
            filter_already_liked_items=True,
        )
        return [index_to_product[item] for item in recommended]

    # Cold-start fallback: most popular items overall.
    return [index_to_product[item] for item in popular_items[:n_products]]


# =========================================================
# 13. RECALL@K
# =========================================================
def recall_at_k(k=10):
    """Recall@k of ``recommend_als`` over the held-out rows in ``test_df``.

    Users missing from ``user_to_index`` are skipped. Returns the share of
    held-out products that show up in the top-``k`` recommendations, or 0 when
    there is nothing to score.
    """
    n_hits = 0
    n_targets = 0

    for uid, held_out in test_df.groupby("user_id"):
        if uid in user_to_index:
            wanted = set(held_out["product_id"])
            offered = recommend_als(uid, n_products=k)
            n_hits += len(wanted.intersection(offered))
            n_targets += len(wanted)

    if n_targets > 0:
        return n_hits / n_targets
    return 0


# =========================================================
# 14. EVALUATE
# =========================================================
# Report recall at both cut-offs; labels keep their original column alignment.
for label, cutoff in (("Recall@5 :", 5), ("Recall@10:", 10)):
    print(label, recall_at_k(cutoff))


Total interactions (full): 11495242
Interactions after sampling: 223313
Users after sampling: 50000
Items after sampling: 27490
Train shape: (173313, 20)
Test shape: (31306, 20)
Matrix shape: (31306, 22773)
Final matrix shape: (31306, 22773)


  check_blas_config()


  0%|          | 0/10 [00:00<?, ?it/s]

Model trained successfully.
Recall@5 : 0.03999233373794161
Recall@10: 0.05803999233373794


In [2]:
# =========================================================
# 1. IMPORTS
# =========================================================
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares


# =========================================================
# 2. LOAD DATA
# =========================================================
df = pd.read_parquet("train.parquet")

print("Total interactions (full):", len(df))


# =========================================================
# 3. SAMPLE 50K USERS (FAST & STABLE)
# =========================================================
# Keep every interaction of 50000 randomly drawn distinct users
# (random_state pins the draw).
sample_users = (
    df["user_id"]
    .drop_duplicates()
    .sample(50000, random_state=42)
)

# .copy() detaches the filtered frame from the full one, so adding the
# interaction_weight column below is not a chained assignment on a slice.
df = df[df["user_id"].isin(sample_users)].copy()

print("Interactions after sampling:", len(df))
print("Users after sampling:", df["user_id"].nunique())
print("Items after sampling:", df["product_id"].nunique())


# =========================================================
# 4. INTERACTION WEIGHTING
# =========================================================
# cart -> 1, purchase -> 5, then log1p to compress the gap between the two.
# NOTE: any event_type outside this mapping becomes NaN (Series.map semantics)
# and stays NaN after log1p.
df["interaction_weight"] = np.log1p(
    df["event_type"].map({"cart": 1, "purchase": 5})
)


# =========================================================
# 5. TRAIN / TEST SPLIT (LAST INTERACTION)
# =========================================================
# Each user's most recent interaction (by timestamp) is held out as the test
# row; users with fewer than two interactions are dropped from both sets.
# This is done with vectorized pandas operations rather than a Python loop
# over every user group.
interactions_per_user = df["user_id"].map(df["user_id"].value_counts())
has_history = (interactions_per_user >= 2).to_numpy()  # NaN counts compare False

# Sorting by (user_id, timestamp) keeps users in sorted order and places each
# user's latest row last within that user's run of rows.
ordered = df[has_history].sort_values(["user_id", "timestamp"])
is_last = (~ordered["user_id"].duplicated(keep="last")).to_numpy()

# .copy() so the column assignments made later operate on independent frames.
train_df = ordered[~is_last].copy()
test_df = ordered[is_last].copy()

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)


# =========================================================
# 6. CREATE INDEX MAPPINGS
# =========================================================
user_ids = train_df["user_id"].unique()
product_ids = train_df["product_id"].unique()

# Dense 0-based positions, assigned in order of first appearance in train_df.
user_to_index = dict(zip(user_ids, range(len(user_ids))))
product_to_index = dict(zip(product_ids, range(len(product_ids))))
index_to_product = {idx: pid for pid, idx in product_to_index.items()}

train_df["user_index"] = train_df["user_id"].map(user_to_index)
train_df["product_index"] = train_df["product_id"].map(product_to_index)


# =========================================================
# 7. BUILD USER-ITEM MATRIX
# =========================================================
# Rows are users, columns are products. Repeated (user, product) pairs are
# summed by the COO-style csr_matrix constructor; values are stored as float32.
coo_values = train_df["interaction_weight"]
coo_coords = (train_df["user_index"], train_df["product_index"])
matrix_shape = (len(user_to_index), len(product_to_index))

user_item_matrix = csr_matrix(
    (coo_values, coo_coords), shape=matrix_shape
).astype("float32")

print("Matrix shape:", user_item_matrix.shape)


# =========================================================
# 8. REMOVE EMPTY ITEM COLUMNS
# =========================================================
# Boolean mask over columns: True where at least one entry is stored.
nonzero_items = user_item_matrix.getnnz(axis=0) > 0
user_item_matrix = user_item_matrix[:, nonzero_items]

# Rebuild the product mappings so positions match the pruned matrix columns.
# NOTE: train_df["product_index"] is not refreshed here.
product_ids = product_ids[nonzero_items]
product_to_index = dict(zip(product_ids, range(len(product_ids))))
index_to_product = {idx: pid for pid, idx in product_to_index.items()}

print("Final matrix shape:", user_item_matrix.shape)


# =========================================================
# 9. CONFIDENCE SCALING
# =========================================================
# Linear confidence scaling: every stored interaction weight is multiplied by alpha.
alpha = 40
confidence_matrix = alpha * user_item_matrix


# =========================================================
# 10. TRAIN ALS MODEL
# =========================================================
als_model = AlternatingLeastSquares(
    factors=32, regularization=0.1, iterations=10, random_state=42
)
als_model.fit(confidence_matrix)

print("Model trained successfully.")


# =========================================================
# 11. POPULARITY BASELINE
# =========================================================
# Per-item sum of (unscaled) interaction weights, flattened to 1-D.
item_popularity = np.asarray(user_item_matrix.sum(axis=0)).ravel()
# Item positions ordered from most to least popular.
popular_items = (-item_popularity).argsort()

def recommend_popular(k=10):
    """Return the products at the first ``k`` positions of ``popular_items``.

    Positions are translated to products through ``index_to_product``; the
    result is the same for every user.
    """
    top_products = []
    for position in popular_items[:k]:
        top_products.append(index_to_product[position])
    return top_products


# =========================================================
# 12. ALS RECOMMENDATION
# =========================================================
def recommend_als(user_id, k=10):
    """Return up to ``k`` ALS-recommended products for ``user_id``.

    Users found in ``user_to_index`` are served by ``als_model.recommend`` with
    their row of ``user_item_matrix`` (already-present items filtered out) and
    the result is translated through ``index_to_product``. Any other user gets
    ``recommend_popular(k)``.
    """
    if user_id in user_to_index:
        row_position = user_to_index[user_id]
        history = user_item_matrix[row_position:row_position + 1]

        recommended, _scores = als_model.recommend(
            userid=row_position,
            user_items=history,
            N=k,
            filter_already_liked_items=True,
        )
        return [index_to_product[item] for item in recommended]

    # Cold-start fallback.
    return recommend_popular(k)


# =========================================================
# 13. HYBRID RECOMMENDATION (BEST PRACTICAL MODEL)
# =========================================================
def recommend_hybrid(user_id, k=10, n_candidates=50,
                     als_weight=0.6, popularity_weight=0.4):
    """Hybrid recommendation: ALS candidates re-ranked with item popularity.

    A pool of ALS candidates is retrieved for the user, both the ALS scores
    and the candidates' popularity are min-max normalised within that pool,
    and the candidates are re-ordered by the weighted sum of the two.

    Parameters
    ----------
    user_id : hashable
        Key looked up in ``user_to_index``.
    k : int, default 10
        Number of products to return.
    n_candidates : int, default 50
        Minimum size of the ALS candidate pool. The pool is widened to ``k``
        when more than ``n_candidates`` products are requested, so large
        requests are not truncated to the pool size.
    als_weight, popularity_weight : float, default 0.6 / 0.4
        Blend weights applied to the normalised ALS and popularity scores.

    Returns
    -------
    list
        Values of ``index_to_product`` for the selected item positions.
        Users missing from ``user_to_index`` (or for whom ALS returns nothing)
        receive ``recommend_popular(k)``, which is only computed when that
        fallback is actually needed.
    """
    if user_id not in user_to_index:
        return recommend_popular(k)

    user_index = user_to_index[user_id]
    user_row = user_item_matrix[user_index:user_index + 1]

    item_indices, scores = als_model.recommend(
        userid=user_index,
        user_items=user_row,
        N=max(n_candidates, k),
        filter_already_liked_items=True
    )

    if len(item_indices) == 0:
        return recommend_popular(k)

    # Normalize ALS scores to [0, 1]; the epsilon guards against a zero range.
    scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)

    # Popularity scores for the same items, normalised the same way.
    pop_scores = item_popularity[item_indices]
    pop_scores = (pop_scores - pop_scores.min()) / (pop_scores.max() - pop_scores.min() + 1e-8)

    # Weighted blend
    final_scores = als_weight * scores + popularity_weight * pop_scores

    top_idx = np.argsort(-final_scores)[:k]

    return [index_to_product[item_indices[i]] for i in top_idx]


# =========================================================
# 14. EVALUATION METRICS
# =========================================================
def recall_at_k(model="hybrid", k=10):
    """Recall@k over the held-out rows in ``test_df`` for the chosen recommender.

    ``model`` selects the recommender: ``"als"`` uses ``recommend_als``,
    ``"popular"`` uses ``recommend_popular``, and any other value uses
    ``recommend_hybrid``. Returns hits divided by the number of held-out
    products, or 0 when ``test_df`` yields no users.
    """
    # Resolve the recommender once; each callable takes the user id.
    if model == "als":
        recommend = lambda uid: recommend_als(uid, k)
    elif model == "popular":
        recommend = lambda uid: recommend_popular(k)
    else:
        recommend = lambda uid: recommend_hybrid(uid, k)

    n_hits = 0
    n_targets = 0

    for uid, held_out in test_df.groupby("user_id"):
        wanted = set(held_out["product_id"])
        n_hits += len(wanted.intersection(recommend(uid)))
        n_targets += len(wanted)

    if n_targets > 0:
        return n_hits / n_targets
    return 0


def precision_at_k(model="hybrid", k=10):
    """Mean precision@k over the users in ``test_df`` for the chosen recommender.

    ``model`` selects the recommender: ``"als"`` uses ``recommend_als``,
    ``"popular"`` uses ``recommend_popular``, and any other value uses
    ``recommend_hybrid``. Each user's precision is the number of held-out
    products found among the recommendations divided by ``k``; the result is
    the average over users, or 0 when ``test_df`` yields no users.
    """
    # Resolve the recommender once; each callable takes the user id.
    if model == "als":
        recommend = lambda uid: recommend_als(uid, k)
    elif model == "popular":
        recommend = lambda uid: recommend_popular(k)
    else:
        recommend = lambda uid: recommend_hybrid(uid, k)

    precision_sum = 0
    n_users = 0

    for uid, held_out in test_df.groupby("user_id"):
        wanted = set(held_out["product_id"])
        overlap = wanted.intersection(recommend(uid))
        precision_sum += len(overlap) / k
        n_users += 1

    if n_users > 0:
        return precision_sum / n_users
    return 0


# =========================================================
# 15. FINAL RESULTS
# =========================================================
# Print Recall@10 and Precision@10 for each recommender, one section per model.
for heading, model_name in (
    ("\n===== ALS MODEL =====", "als"),
    ("\n===== POPULARITY BASELINE =====", "popular"),
    ("\n===== HYBRID MODEL =====", "hybrid"),
):
    print(heading)
    print("Recall@10:", recall_at_k(model_name, 10))
    print("Precision@10:", precision_at_k(model_name, 10))


Total interactions (full): 11495242
Interactions after sampling: 223313
Users after sampling: 50000
Items after sampling: 27490
Train shape: (173313, 20)
Test shape: (31306, 20)
Matrix shape: (31306, 22773)
Final matrix shape: (31306, 22773)


  check_blas_config()


  0%|          | 0/10 [00:00<?, ?it/s]

Model trained successfully.

===== ALS MODEL =====
Recall@10: 0.05803999233373794
Precision@10: 0.0058039992333736015

===== POPULARITY BASELINE =====
Recall@10: 0.1490129687599821
Precision@10: 0.014901296875999412

===== HYBRID MODEL =====
Recall@10: 0.060755126812751546
Precision@10: 0.006075512681274947


In [3]:
import pickle

# Persist the trained model and the lookup structures needed to serve
# recommendations. Each file is opened in a `with` block so the handle is
# flushed and closed as soon as the dump finishes.
# NOTE: pickle files can execute arbitrary code when loaded; only load these
# artifacts from a trusted location.
artifacts = {
    "als_model.pkl": als_model,
    "user_item_matrix.pkl": user_item_matrix,
    "user_to_index.pkl": user_to_index,
    "index_to_product.pkl": index_to_product,
}

for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)


In [4]:
# Peek at the first ten user ids (dict keys, in insertion order).
list(user_to_index)[:10]


['110760953',
 '251333420',
 '310119844',
 '339009312',
 '402839293',
 '406827257',
 '407602302',
 '415514618',
 '415987845',
 '417810135']

In [1]:
import pandas as pd

# Reload the raw interactions and list the available columns.
df = pd.read_parquet(path="train.parquet")
print(df.columns)


Index(['event_time', 'event_type', 'product_id', 'brand', 'price', 'user_id',
       'user_session', 'target', 'cat_0', 'cat_1', 'cat_2', 'cat_3',
       'timestamp', 'ts_hour', 'ts_minute', 'ts_weekday', 'ts_day', 'ts_month',
       'ts_year'],
      dtype='object')
