In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
raw_df = pd.read_excel("CC_Synthetic_Training_Data.xlsx", header=None)
raw_df.shape
raw_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,Aaron Bechtelar,85,"Sherylfort, ND",Soccer,Adidas,Adidas Soccer Jersey,3,945.16
1,Aaron Bechtelar,85,"Sherylfort, ND",Football,Nike,Nike Football Jersey,3,925.27
2,Aaron Bechtelar,84,"Sherylfort, ND",Football,Riddell,Riddell Football Helmet,3,586.16
3,Aaron Bechtelar,83,"Sherylfort, ND",Swimming,TYR,TYR Swimming Goggles,3,602.34
4,Aaron Bechtelar,82,"Sherylfort, ND",Baseball,Wilson,Wilson Baseball Bat,3,722.47


In [5]:
raw_df.columns = ["customer_name", "customer_age", "location", "sport", "brand", "product_name", "quantity", "order_amount",]
df = raw_df.copy()
df.head()

Unnamed: 0,customer_name,customer_age,location,sport,brand,product_name,quantity,order_amount
0,Aaron Bechtelar,85,"Sherylfort, ND",Soccer,Adidas,Adidas Soccer Jersey,3,945.16
1,Aaron Bechtelar,85,"Sherylfort, ND",Football,Nike,Nike Football Jersey,3,925.27
2,Aaron Bechtelar,84,"Sherylfort, ND",Football,Riddell,Riddell Football Helmet,3,586.16
3,Aaron Bechtelar,83,"Sherylfort, ND",Swimming,TYR,TYR Swimming Goggles,3,602.34
4,Aaron Bechtelar,82,"Sherylfort, ND",Baseball,Wilson,Wilson Baseball Bat,3,722.47


In [6]:
df["user_id"], user_index = pd.factorize(df["customer_name"])
df["item_id"], item_index = pd.factorize(df["product_name"])

print("Number of unique users:", df["user_id"].nunique())
print("Number of unique items:", df["item_id"].nunique())

df[["customer_name", "product_name", "user_id", "item_id"]].head(11)

Number of unique users: 1000
Number of unique items: 215


Unnamed: 0,customer_name,product_name,user_id,item_id
0,Aaron Bechtelar,Adidas Soccer Jersey,0,0
1,Aaron Bechtelar,Nike Football Jersey,0,1
2,Aaron Bechtelar,Riddell Football Helmet,0,2
3,Aaron Bechtelar,TYR Swimming Goggles,0,3
4,Aaron Bechtelar,Wilson Baseball Bat,0,4
5,Adaline Kuhic,Nike Weight Lifting Towel,1,5
6,Adaline Kuhic,Leon Paul Fencing Mask,1,6
7,Adaline Kuhic,Under Armour Basketball Hoop,1,7
8,Adaline Kuhic,Fencing Post Fencing Sword,1,8
9,Adaline Kuhic,TYR Swimming Goggles,1,3


In [7]:
item_id_to_sport = df.drop_duplicates("item_id").set_index("item_id")["sport"]

In [8]:

interaction_df = (df.groupby(["user_id", "item_id"])["quantity"].sum().reset_index())

interaction_df.head(11)

Unnamed: 0,user_id,item_id,quantity
0,0,0,3
1,0,1,3
2,0,2,3
3,0,3,3
4,0,4,3
5,1,3,3
6,1,5,4
7,1,6,4
8,1,7,2
9,1,8,1


In [9]:
user_item_matrix = interaction_df.pivot_table(index="user_id", columns="item_id", values="quantity", aggfunc="sum", fill_value=0)

user_item_matrix.shape

(1000, 215)

In [10]:
item_sim_matrix = cosine_similarity(user_item_matrix.T)

item_sim_df = pd.DataFrame(item_sim_matrix, index=user_item_matrix.columns, columns=user_item_matrix.columns)

item_sim_df.iloc[:5, :5]

item_id,0,1,2,3,4
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1.0,0.06557,0.057242,0.035258,0.061754
1,0.06557,1.0,0.094529,0.092473,0.053989
2,0.057242,0.094529,1.0,0.050829,0.047133
3,0.035258,0.092473,0.050829,1.0,0.029031
4,0.061754,0.053989,0.047133,0.029031,1.0


In [11]:
item_popularity = (df.groupby("item_id")["quantity"].sum().sort_values(ascending=False))

item_popularity.head()

item_id
22     185
27     176
72     168
125    161
112    158
Name: quantity, dtype: int64

In [12]:
def get_user_sport_preferences(user_id):
    """Return a frequency map of sports the user has purchased from."""
    user_rows = df[df["user_id"] == user_id]
    sport_counts = user_rows.groupby("sport")["quantity"].sum().sort_values(ascending=False)
    return sport_counts

In [13]:
def recommend_for_user_id(user_id, top_n=5, user_item=None, sim_df=None):
    """
    Recommend top_n items for a given integer user_id.
    
    Returns a list of dicts:
        [{ "item_id": ..., "product_name": ..., "brand": ..., "sport": ... }, ...]
    """
    if user_item is None:
        user_item = user_item_matrix
    if sim_df is None:
        sim_df = item_sim_df

    if user_id not in user_item.index:
        print(f"user_id {user_id} not found; returning empty list.")
        return []

    # Vector of that user's interactions
    user_vector = user_item.loc[user_id]

    # Items the user has already interacted with
    purchased_items = user_vector[user_vector > 0].index.tolist()

    # If the user has no history → cold-start: return most popular items
    if not purchased_items:
        candidate_item_ids = [
            item_id for item_id in item_popularity.index
            if item_id not in purchased_items
        ][:top_n]
    else:
        # Start with zero scores for each item
        num_items = len(item_index)
        scores = np.zeros(num_items, dtype=float)

        # For each purchased item, add its similarity score to all items
        for item_id in purchased_items:
            scores += sim_df.loc[item_id].values

        # Exclude items the user already has
        scores[purchased_items] = -5.0  # So they won't be recommended
        
        #Same Sport Boost
        SPORT_BOOST_WEIGHT = 0.15   # Tune between 0.1–1.0

        sport_pref = get_user_sport_preferences(user_id)
        if len(sport_pref) > 0:
            top_sports = sport_pref.index.tolist()
    
            for iid in range(len(scores)):
                item_sport = item_id_to_sport.get(iid, None)
                if item_sport in top_sports:
                    rank = top_sports.index(item_sport)   # 0 = most preferred
                    boost = SPORT_BOOST_WEIGHT / (rank + 1)
                    scores[iid] += boost
                    
            # -------- TEMPORARY CATEGORY PENALTY --------
        CATEGORY_PENALTY = 1.5   # tune (1–5). Larger → stronger blocking.

        # Extract purchased item categories
        purchased_info = df[df["item_id"].isin(purchased_items)][["item_id", "sport", "product_name"]]
        purchased_categories = set(
            (row["sport"], row["product_name"].split()[-1])   # (Sport, Category)
            for _, row in purchased_info.iterrows()
        )

        # Apply penalty to items matching both sport and category
        for iid in range(num_items):
            row = df[df["item_id"] == iid].iloc[0]
            candidate_sport = row["sport"]
            candidate_category = row["product_name"].split()[-1]

            if (candidate_sport, candidate_category) in purchased_categories:
                scores[iid] -= CATEGORY_PENALTY   # penalize similar equipment

        # Get top_n item IDs by score
        candidate_item_ids = scores.argsort()[::-1][:top_n]  # descending

    # Build readable output (product name, brand, sport, etc.)
    # We'll use the original df to look up metadata
    recommendations = []
    for iid in candidate_item_ids:
        product_name = item_index[iid]
        # Grab one sample row for metadata
        row = df[df["item_id"] == iid].iloc[0]
        recommendations.append({"item_id": int(iid), "product_name": product_name, "brand": row["brand"], "sport": row["sport"]})

    return recommendations


def recommend_for_customer_name(customer_name, top_n=5):
    """
    Wrapper that takes the human-readable customer name.
    We find the corresponding user_id (first match) and call recommend_for_user_id.
    """
    user_ids = df.loc[df["customer_name"] == customer_name, "user_id"].unique()
    if len(user_ids) == 0:
        print(f"No user found with name '{customer_name}'")
        return []
    user_id = user_ids[0]
    return recommend_for_user_id(user_id, top_n=top_n)

In [14]:
random_user_id = np.random.choice(df["user_id"].unique())
print("Random user_id:", random_user_id)

sample_recs = recommend_for_user_id(random_user_id, top_n=5)
sample_recs

Random user_id: 735


[{'item_id': 35,
  'product_name': 'Puma Soccer Shin Guards',
  'brand': 'Puma',
  'sport': 'Soccer'},
 {'item_id': 186,
  'product_name': 'Pioneer Weight Lifting Barbell',
  'brand': 'Pioneer',
  'sport': 'Weight Lifting'},
 {'item_id': 29,
  'product_name': 'New Balance Soccer Jersey',
  'brand': 'New Balance',
  'sport': 'Soccer'},
 {'item_id': 111,
  'product_name': 'Nike Basketball Hoop',
  'brand': 'Nike',
  'sport': 'Basketball'},
 {'item_id': 209,
  'product_name': 'Adidas Weight Lifting Bag',
  'brand': 'Adidas',
  'sport': 'Weight Lifting'}]

In [15]:
some_name = df["customer_name"].iloc[0]
print("Customer name:", some_name)

recs_by_name = recommend_for_customer_name(some_name, top_n=5)
recs_by_name

Customer name: Aaron Bechtelar


[{'item_id': 100,
  'product_name': 'TaylorMade Golf Gloves',
  'brand': 'TaylorMade',
  'sport': 'Golf'},
 {'item_id': 77,
  'product_name': 'Wilson Football Ball',
  'brand': 'Wilson',
  'sport': 'Football'},
 {'item_id': 55,
  'product_name': 'Rawlings Baseball Gloves',
  'brand': 'Rawlings',
  'sport': 'Baseball'},
 {'item_id': 117,
  'product_name': 'Rawlings Baseball Baseball',
  'brand': 'Rawlings',
  'sport': 'Baseball'},
 {'item_id': 145,
  'product_name': 'Under Armour Basketball Shoes',
  'brand': 'Under Armour',
  'sport': 'Basketball'}]

In [16]:
def simple_holdout_eval(top_n=5, max_users=200):
    # Prepare a list of per-user interactions
    user_items_series = df.groupby("user_id")["item_id"].apply(list)

    # Restrict to users with at least 2 interactions
    user_items_series = user_items_series[user_items_series.str.len() >= 2]

    hits = 0
    total = 0

    for user_id, items in user_items_series.iloc[:max_users].items():
        items = list(items)
        # Hold out the last item as "test"
        test_item = items[-1]
        train_items = items[:-1]

        # Build a mini user vector for this user based only on train items
        temp_user_vector = np.zeros(len(item_index))
        for iid in train_items:
            temp_user_vector[iid] += 1

        # Build a temporary user-item matrix with just this user
        temp_user_item = pd.DataFrame(
            [temp_user_vector],
            index=[user_id],
            columns=user_item_matrix.columns
        )

        # Reuse the same item similarity (item_sim_df)
        recs = recommend_for_user_id(
            user_id,
            top_n=top_n,
            user_item=temp_user_item,
            sim_df=item_sim_df
        )

        recommended_item_ids = [r["item_id"] for r in recs]
        if test_item in recommended_item_ids:
            hits += 1
        total += 1

    hit_rate = hits / total if total > 0 else 0
    print(f"Hit rate@{top_n}: {hit_rate:.3f} (on {total} users)")
    return hit_rate

_ = simple_holdout_eval(top_n=5, max_users=200)

Hit rate@5: 0.490 (on 200 users)


In [17]:
# ============================================================
# PICKLE EXPORTER: Save trained artifacts for Recommender.py
# ============================================================

import pickle
import os

# Ensure your user_item_matrix and item_sim_df are already computed in the notebook
# user_index, item_index, item_id_to_sport, item_popularity should also exist

# Directory to save pickle
MODEL_DIR = "model_artifacts"
os.makedirs(MODEL_DIR, exist_ok=True)

# Bundle all artifacts
artifacts = {
    "item_similarity": item_sim_df,      # Item-item cosine similarity matrix
    "user_index": user_index,            # Mapping user_id -> customer_name
    "item_index": item_index,            # Mapping item_id -> product_name
    "item_id_to_sport": item_id_to_sport, # Mapping item_id -> sport category
    "user_item_matrix": user_item_matrix, # User-item pivot table
    "item_popularity": item_popularity   # For cold start fallback
}

# Save pickle
pickle_path = os.path.join(MODEL_DIR, "recommender_artifacts.pkl")
with open(pickle_path, "wb") as f:
    pickle.dump(artifacts, f)

print(f"✅ Pickle saved at: {pickle_path}")
print("Contains keys:", list(artifacts.keys()))

✅ Pickle saved at: model_artifacts\recommender_artifacts.pkl
Contains keys: ['item_similarity', 'user_index', 'item_index', 'item_id_to_sport', 'user_item_matrix', 'item_popularity']
