# 1. Feature Engineering for the Two-Tower Model

In [1]:
from sklearn.preprocessing import LabelEncoder
import joblib


In [2]:
import numpy as np
import pandas as pd

# Read the balanced training interactions from parquet, then preview
# the first five rows as the cell's rich output.
train = pd.read_parquet(path="../datasets/train_balanced.parquet")
train.head(5)

Unnamed: 0,rating,title_x,text,images_x,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,...,description,price,images_y,videos,store,categories,details,user_id_enc,asin_enc,label
0,5.0,Great Clips!,These clips surprisingly hold up your hair ver...,[],B000X1YING,B000X1YING,AELSOLRGGMBFVURYNN3V22KOMDDQ,1417722859000,0,True,...,['Large butterfly clamps which is perfect for ...,3.0,{'hi_res': array(['https://m.media-amazon.com/...,"{'title': array([], dtype=object), 'url': arra...",Soft 'N Style,[],"{""Brand"": ""Soft 'N Style"", ""Hair Type"": ""All"",...",23924,270,1
1,1.0,Not the Essie brand I know!,"If you knew me, you’ve known I’ve been looking...","[{'attachment_type': 'IMAGE', 'large_image_url...",B01DNS649G,B01DNS649G,AHBNLFLA2MV6JB5CN6TMW4T2666Q,1563110981138,1,True,...,['Product Description'\n 'Why create so many c...,12.23,{'hi_res': array(['https://m.media-amazon.com/...,"{'title': array([], dtype=object), 'url': arra...",essie,[],"{""Brand"": ""Essie"", ""Item Form"": ""Liquid"", ""Col...",139069,5770,0
2,5.0,Light spray that holds,Light spray for fine hair that dosn't blow awa...,[],B079V2XBS9,B0B6ZM9XJN,AE2Q7VG3C4X2OKHMQX7OZC5A6ZQQ,1628344950779,0,True,...,[],50.49,{'hi_res': array(['https://m.media-amazon.com/...,{'title': array(['Watch me use the viral TikTo...,TRESemmé,[],"{""Scent"": ""Natural"", ""Liquid Volume"": ""5.5 Flu...",908,10148,1
3,1.0,Too Big,Very big for the face and does not fit because...,[],B091L8XGHF,B091KRGHP8,AHIGSNLM7CLY3D36VTXMWDC2RSDA,1684331185179,0,True,...,[],8.99,{'hi_res': array(['https://m.media-amazon.com/...,{'title': array(['Cute Reusable Cooling Eye Ge...,ZNÖCUETÖD,[],"{""Brand"": ""ZN\u00d6CUET\u00d6D"", ""Unit Count"":...",148235,17342,0
4,5.0,I will keep buying this one,Just like the original. Cuts great and lasts. ...,[],B0019WZHXI,B0019WZHXI,AERBDZBVPURHA7FWMIAIMIJLOMAA,1505828211250,0,True,...,['A'],19.39,{'hi_res': array(['https://m.media-amazon.com/...,"{'title': array(['Fast Shipping', 'Wahl Brand ...",Wahl Professional,[],"{""Recommended Uses For Product"": ""Professional...",31322,367,1


In [3]:
# Display the column index of the loaded frame so the names used by the
# feature-engineering step below can be checked against it.
train.columns

Index(['rating', 'title_x', 'text', 'images_x', 'asin', 'parent_asin',
       'user_id', 'timestamp', 'helpful_vote', 'verified_purchase',
       'main_category', 'title_y', 'average_rating', 'rating_number',
       'features', 'description', 'price', 'images_y', 'videos', 'store',
       'categories', 'details', 'user_id_enc', 'asin_enc', 'label'],
      dtype='object')

In [4]:
def prepare_features_for_two_tower(df):
    """
    Create user and item features for two tower models.

    Aggregates the interaction-level frame once per user and once per item,
    label-encodes the ``store`` and ``main_category`` columns, and saves both
    fitted encoders to disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction-level frame. The columns read here are ``user_id_enc``,
        ``asin_enc``, ``rating``, ``verified_purchase``, ``helpful_vote``,
        ``price``, ``label``, ``average_rating``, ``rating_number``,
        ``store`` and ``main_category``.

    Returns
    -------
    user_features : pandas.DataFrame
        One row per ``user_id_enc`` with the columns ``user_avg_rating``,
        ``user_rating_std``, ``user_review_count``, ``user_verified_ratio``,
        ``user_helpful_avg``, ``user_price_mean``, ``user_price_median`` and
        ``user_positivity_rate``.
    item_features : pandas.DataFrame
        One row per ``asin_enc`` with the columns ``item_avg_rating``,
        ``item_review_count``, ``product_rating``, ``total_reviews``,
        ``price``, ``item_verified_ratio`` and ``item_helful_avg``.
    df : pandas.DataFrame
        The very same object that was passed in, with ``store_enc`` and
        ``category_enc`` columns added.

    Side effects
    ------------
    * ``df`` is modified in place (two columns are added / overwritten).
    * The fitted encoders are written to ``../models/store_encoder.joblib``
      and ``../models/category_encoder.joblib``.
    """

    # user features -> user tower
    # Named aggregation ties every output column name to the exact
    # (source column, function) pair, so adding or reordering an entry can
    # never silently shift the labels the way a positional rename can.
    user_features = df.groupby("user_id_enc").agg(
        user_avg_rating=("rating", "mean"),              # user's rating behaviour
        user_rating_std=("rating", "std"),
        user_review_count=("rating", "count"),
        user_verified_ratio=("verified_purchase", "mean"),  # verified purchase ratio
        user_helpful_avg=("helpful_vote", "mean"),       # mean helpful votes
        user_price_mean=("price", "mean"),               # price preference
        user_price_median=("price", "median"),
        # NOTE(review): derived from the target column `label`; confirm this
        # does not leak the label into the user tower.
        user_positivity_rate=("label", "mean"),          # positivity rate
    ).reset_index()

    # The sample standard deviation is undefined for a user with a single
    # rating and comes back as NaN; treat that as "no spread" so the feature
    # does not inject NaNs downstream.
    user_features["user_rating_std"] = user_features["user_rating_std"].fillna(0.0)

    # item features -> item tower
    item_features = df.groupby("asin_enc").agg(
        item_avg_rating=("rating", "mean"),
        item_review_count=("rating", "count"),
        product_rating=("average_rating", "first"),
        total_reviews=("rating_number", "first"),
        price=("price", "first"),
        item_verified_ratio=("verified_purchase", "mean"),
        # Spelling ("helful") kept as-is: renaming the column would break any
        # later code that already refers to it.
        item_helful_avg=("helpful_vote", "mean"),
    ).reset_index()

    ## adding categorical encodings
    store_encoder = LabelEncoder()
    main_cat_encoder = LabelEncoder()

    # These assignments write into the caller's frame (see "Side effects").
    df['store_enc'] = store_encoder.fit_transform(df['store'])
    df['category_enc'] = main_cat_encoder.fit_transform(df['main_category'])

    # saving encoder models
    joblib.dump(store_encoder,"../models/store_encoder.joblib")
    joblib.dump(main_cat_encoder,"../models/category_encoder.joblib")

    return user_features,item_features,df

# Build both feature tables (the third return value is the interaction frame
# carrying the encoded store / category columns) and report their shapes.
user_features, item_features, train_bal = prepare_features_for_two_tower(train)
print(
    f"User features shape: {user_features.shape}",
    f"Item features shape: {item_features.shape}",
    sep="\n",
)
    

User features shape: (68253, 9)
Item features shape: (13201, 8)


# 2. Negative Sampling

In [5]:
class NegativeSampler:
    """
    Sample negative items for a user.

    A candidate negative is any item seen in ``train_df`` that is not in the
    exclusion set; by default the exclusion set is the user's positively
    labelled items (rows with ``label == 1``).

    Attributes
    ----------
    n_negatives : int
        Number of negatives drawn per call.
    user_items : dict
        Maps ``user_id_enc`` to the set of ``asin_enc`` with ``label == 1``.
    all_items : set
        Every distinct ``asin_enc`` in ``train_df``.
    item_weights : dict
        Maps ``asin_enc`` to its number of rows in ``train_df``; used as the
        unnormalised sampling weight for ``method="popularity"``.
    """

    def __init__(self,train_df,n_negatives = 4):
        """
        Parameters
        ----------
        train_df : pandas.DataFrame
            Must contain the columns ``user_id_enc``, ``asin_enc`` and ``label``.
        n_negatives : int, default 4
            How many negatives ``sample_negatives`` returns.
        """
        self.n_negatives = n_negatives

        # user-item interaction matrix (positives only)
        positives = train_df[train_df['label']==1]
        self.user_items = positives.groupby('user_id_enc')['asin_enc'].apply(set).to_dict()
        self.all_items = set(train_df['asin_enc'].unique())

        # interaction count per item, used by the "popularity" strategy
        self.item_weights = train_df['asin_enc'].value_counts().to_dict()


    def sample_negatives(self, user_id,exclude_items= None,method="uniform"):
        """
        Draw ``n_negatives`` distinct negative items for ``user_id``.

        Parameters
        ----------
        user_id
            Encoded user id; only used to look up the default exclusion set.
        exclude_items : iterable, optional
            Items that must not be returned. Defaults to the user's positive
            items (an empty set for an unknown user).
        method : {"uniform", "popularity"}
            ``"uniform"`` draws every candidate with equal probability;
            ``"popularity"`` draws proportionally to ``item_weights``.

        Returns
        -------
        list or numpy.ndarray
            All candidates as a list when fewer than ``n_negatives`` exist,
            otherwise an array of ``n_negatives`` items drawn without
            replacement.

        Raises
        ------
        ValueError
            If ``method`` is not one of the supported strategies.
        """
        if method not in ("uniform", "popularity"):
            raise ValueError(
                f"Unknown sampling method {method!r}; expected 'uniform' or 'popularity'"
            )

        if exclude_items is None:
            exclude_items = self.user_items.get(user_id,set())

        # set(...) lets callers pass any iterable (list, array, ...)
        candidate_items = list(self.all_items - set(exclude_items))

        # not enough candidates to sample from: hand back everything there is
        if len(candidate_items) < self.n_negatives:
            return candidate_items

        if method=="uniform":
            return np.random.choice(candidate_items, self.n_negatives, replace=False)

        # method == "popularity"
        weights = [self.item_weights.get(item,1e-6) for item in candidate_items]
        weights = np.array(weights, dtype=float)
        # np.random.choice needs probabilities that sum to 1
        probabilities = weights / weights.sum()
        return np.random.choice(
            candidate_items, self.n_negatives, replace=False, p=probabilities
        )

        
        

        