In [None]:

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares



df = pd.read_parquet("train.parquet")

print("Total interactions (full):", len(df))



sample_users = (
    df["user_id"]
    .drop_duplicates()
    .sample(50000, random_state=42)
)

df = df[df["user_id"].isin(sample_users)]

print("Interactions after sampling:", len(df))
print("Users after sampling:", df["user_id"].nunique())
print("Items after sampling:", df["product_id"].nunique())


df["interaction_weight"] = df["event_type"].map({
    "cart": 1,
    "purchase": 5
})

df["interaction_weight"] = np.log1p(df["interaction_weight"])



train_list = []
test_list = []

for user_id, group in df.groupby("user_id"):
    if len(group) < 2:
        continue

    group = group.sort_values("timestamp")
    train_list.append(group.iloc[:-1])
    test_list.append(group.iloc[-1:])

train_df = pd.concat(train_list)
test_df = pd.concat(test_list)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)



user_ids = train_df["user_id"].unique()
product_ids = train_df["product_id"].unique()

user_to_index = {u: i for i, u in enumerate(user_ids)}
product_to_index = {p: i for i, p in enumerate(product_ids)}
index_to_product = {i: p for p, i in product_to_index.items()}

train_df["user_index"] = train_df["user_id"].map(user_to_index)
train_df["product_index"] = train_df["product_id"].map(product_to_index)



user_item_matrix = csr_matrix(
    (
        train_df["interaction_weight"],
        (train_df["user_index"], train_df["product_index"])
    ),
    shape=(len(user_to_index), len(product_to_index))
).astype("float32")

print("Matrix shape:", user_item_matrix.shape)



nonzero_items = user_item_matrix.getnnz(axis=0) > 0
user_item_matrix = user_item_matrix[:, nonzero_items]

product_ids = product_ids[nonzero_items]
product_to_index = {p: i for i, p in enumerate(product_ids)}
index_to_product = {i: p for p, i in product_to_index.items()}

print("Final matrix shape:", user_item_matrix.shape)



alpha = 40
confidence_matrix = user_item_matrix * alpha



als_model = AlternatingLeastSquares(
    factors=32,
    regularization=0.1,
    iterations=10,
    random_state=42
)

als_model.fit(confidence_matrix)

print("Model trained successfully.")


item_popularity = np.array(user_item_matrix.sum(axis=0)).ravel()
popular_items = np.argsort(-item_popularity)

def recommend_popular(k=10):
    return [index_to_product[i] for i in popular_items[:k]]



def recommend_als(user_id, k=10):

    if user_id not in user_to_index:
        return recommend_popular(k)

    user_index = user_to_index[user_id]
    user_row = user_item_matrix[user_index:user_index + 1]

    item_indices, scores = als_model.recommend(
        userid=user_index,
        user_items=user_row,
        N=k,
        filter_already_liked_items=True
    )

    return [index_to_product[i] for i in item_indices]



def recommend_hybrid(user_id, k=10):

    popular_recs = [index_to_product[i] for i in popular_items[:50]]

    if user_id not in user_to_index:
        return popular_recs[:k]

    user_index = user_to_index[user_id]
    user_row = user_item_matrix[user_index:user_index + 1]

    item_indices, scores = als_model.recommend(
        userid=user_index,
        user_items=user_row,
        N=50,
        filter_already_liked_items=True
    )

    if len(item_indices) == 0:
        return popular_recs[:k]

    # Normalize ALS scores
    scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)

    # Popularity scores
    pop_scores = item_popularity[item_indices]
    pop_scores = (pop_scores - pop_scores.min()) / (pop_scores.max() - pop_scores.min() + 1e-8)

    # Weighted blend
    final_scores = 0.6 * scores + 0.4 * pop_scores

    top_idx = np.argsort(-final_scores)[:k]

    return [index_to_product[item_indices[i]] for i in top_idx]



def recall_at_k(model="hybrid", k=10):

    hits = 0
    total = 0

    for user_id, group in test_df.groupby("user_id"):

        actual_items = set(group["product_id"])

        if model == "als":
            recommended = set(recommend_als(user_id, k))
        elif model == "popular":
            recommended = set(recommend_popular(k))
        else:
            recommended = set(recommend_hybrid(user_id, k))

        hits += len(actual_items & recommended)
        total += len(actual_items)

    return hits / total if total > 0 else 0


def precision_at_k(model="hybrid", k=10):

    total_precision = 0
    user_count = 0

    for user_id, group in test_df.groupby("user_id"):

        actual_items = set(group["product_id"])

        if model == "als":
            recommended = set(recommend_als(user_id, k))
        elif model == "popular":
            recommended = set(recommend_popular(k))
        else:
            recommended = set(recommend_hybrid(user_id, k))

        total_precision += len(actual_items & recommended) / k
        user_count += 1

    return total_precision / user_count if user_count > 0 else 0



print("\n===== ALS MODEL =====")
print("Recall@10:", recall_at_k("als", 10))
print("Precision@10:", precision_at_k("als", 10))

print("\n===== POPULARITY BASELINE =====")
print("Recall@10:", recall_at_k("popular", 10))
print("Precision@10:", precision_at_k("popular", 10))

print("\n===== HYBRID MODEL =====")
print("Recall@10:", recall_at_k("hybrid", 10))
print("Precision@10:", precision_at_k("hybrid", 10))


Total interactions (full): 11495242
Interactions after sampling: 223313
Users after sampling: 50000
Items after sampling: 27490
Train shape: (173313, 20)
Test shape: (31306, 20)
Matrix shape: (31306, 22773)
Final matrix shape: (31306, 22773)


  check_blas_config()


  0%|          | 0/10 [00:00<?, ?it/s]

Model trained successfully.

===== ALS MODEL =====
Recall@10: 0.05803999233373794
Precision@10: 0.0058039992333736015

===== POPULARITY BASELINE =====
Recall@10: 0.1490129687599821
Precision@10: 0.014901296875999412

===== HYBRID MODEL =====
Recall@10: 0.060755126812751546
Precision@10: 0.006075512681274947


In [3]:
import pickle

pickle.dump(als_model, open("als_model.pkl", "wb"))
pickle.dump(user_item_matrix, open("user_item_matrix.pkl", "wb"))
pickle.dump(user_to_index, open("user_to_index.pkl", "wb"))
pickle.dump(index_to_product, open("index_to_product.pkl", "wb"))


In [4]:
list(user_to_index.keys())[:10]


['110760953',
 '251333420',
 '310119844',
 '339009312',
 '402839293',
 '406827257',
 '407602302',
 '415514618',
 '415987845',
 '417810135']

In [1]:
import pandas as pd

df = pd.read_parquet("train.parquet")
print(df.columns)


Index(['event_time', 'event_type', 'product_id', 'brand', 'price', 'user_id',
       'user_session', 'target', 'cat_0', 'cat_1', 'cat_2', 'cat_3',
       'timestamp', 'ts_hour', 'ts_minute', 'ts_weekday', 'ts_day', 'ts_month',
       'ts_year'],
      dtype='object')


In [4]:
import os
os.listdir()


['.git',
 '.gitignore',
 'als_model.pkl',
 'app.py',
 'index_to_item.pkl',
 'index_to_product.pkl',
 'popular_products.pkl',
 'product_names.pkl',
 'project.ipynb',
 'README.md',
 'recommender.py',
 'requirements.txt',
 'train_model.py',
 'user_item_matrix.pkl',
 'user_to_index.pkl',
 '__pycache__']

In [None]:
import pandas as pd

train_df = pd.read_parquet("train.parquet")


train_df["price"] = pd.to_numeric(train_df["price"], errors="coerce")


product_metadata = train_df.groupby("product_id").agg({
    "brand": "first",
    "cat_0": "first",
    "price": "mean"
}).reset_index()

product_metadata["price"] = product_metadata["price"].fillna(0)

product_metadata.to_csv("product_metadata.csv", index=False)

purchase_counts = (
    train_df["product_id"]
    .value_counts()
    .reset_index()
)

purchase_counts.columns = ["product_id", "purchase_count"]

purchase_counts.to_csv("product_popularity.csv", index=False)

print("Files created successfully.")


Files created successfully.


In [7]:
import pandas as pd

df = pd.read_parquet("train.parquet")

df["price"] = pd.to_numeric(df["price"], errors="coerce")

product_metadata = (
    df.groupby("product_id")
    .agg({
        "brand": "first",
        "cat_0": "first",
        "price": "mean"
    })
    .reset_index()
)

product_metadata["price"] = product_metadata["price"].fillna(0)

product_metadata.to_csv("product_metadata.csv", index=False)

print("Metadata saved:", len(product_metadata))


Metadata saved: 164453


In [8]:
import pandas as pd

train_df = pd.read_parquet("train.parquet")

train_df["price"] = pd.to_numeric(train_df["price"], errors="coerce")

product_metadata = (
    train_df
    .groupby("product_id")
    .agg({
        "brand": "first",
        "cat_0": "first",
        "price": "mean"
    })
    .reset_index()
)

product_metadata["price"] = product_metadata["price"].fillna(0)

product_metadata.to_csv("product_metadata.csv", index=False)


In [9]:
product_popularity = (
    train_df
    .groupby("product_id")
    .size()
    .reset_index(name="purchase_count")
)

product_popularity.to_csv("product_popularity.csv", index=False)


In [11]:
import pickle
import pandas as pd

index_to_product = pickle.load(open("index_to_product.pkl", "rb"))

product_metadata = pd.read_csv("product_metadata.csv")


In [12]:
print(type(list(index_to_product.values())[0]))
print(product_metadata["product_id"].dtype)


<class 'str'>
int64
