# Ranking Data Generation
Interaction data provides insight into the likelihood that a customer will engage with an item (e.g., through clicks or purchases). However, that dataset is limited to customer and product attributes: it does not leverage transaction-level features such as purchase date (which may capture seasonality), price paid, or sales channel, all of which could influence customer behaviour. To incorporate these variables, a separate ranking model must be developed and trained on a dataset specifically designed to include these transaction characteristics.

### Load Sampled Transaction, Customer, and Articles Data

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Locate the data directory (one level up) and its 'processed' sub-folder
data_path = Path('..') / 'data'
processed_data_path = data_path.joinpath('processed')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the three pickled tables from the processed-data folder.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
transactions_df: pd.DataFrame
customers_df: pd.DataFrame
all_articles_df: pd.DataFrame
transactions_df, customers_df, all_articles_df = (
    pd.read_pickle(processed_data_path / pickle_name)
    for pickle_name in (
        'sampled_transactions.pkl',
        'sampled_customers.pkl',
        'articles.pkl',
    )
)

In [3]:
# Transactions: keep just the (customer_id, article_id) pair
transactions_df = transactions_df.loc[:, ['customer_id', 'article_id']]

# Articles: keep every column apart from the description and the embeddings
articles_df = all_articles_df.drop(
    ['article_description', 'embeddings'], axis='columns'
)

# Customers: keep just the id and the age
customers_df = customers_df.loc[:, ['customer_id', 'age']]

In [4]:
# Left-join the article columns, then the customer columns, onto each transaction
df = pd.merge(transactions_df, articles_df, how='left', on='article_id')
df = pd.merge(df, customers_df, how='left', on='customer_id')

df.head(n=5)

Unnamed: 0,customer_id,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,age
0,16501613daf68dad9dc9bd34880b12efe4f473196b9f33...,637549005,637549,Bama,252,Sweater,Garment Upper body,1010010,Melange,93,...,Knitwear,A,Ladieswear,1,Ladieswear,15,Womens Everyday Collection,1003,Knitwear,27.0
1,16501613daf68dad9dc9bd34880b12efe4f473196b9f33...,673677003,673677,Henry polo (1),252,Sweater,Garment Upper body,1010016,Solid,73,...,Knitwear,A,Ladieswear,1,Ladieswear,11,Womens Tailoring,1003,Knitwear,27.0
2,16501613daf68dad9dc9bd34880b12efe4f473196b9f33...,500435003,500435,Cool Bree bandana,74,Hair/alice band,Accessories,1010001,All over pattern,9,...,Hair Accessories,C,Ladies Accessories,1,Ladieswear,66,Womens Small accessories,1019,Accessories,27.0
3,16501613daf68dad9dc9bd34880b12efe4f473196b9f33...,685687004,685687,W YODA KNIT OL OFFER,252,Sweater,Garment Upper body,1010010,Melange,93,...,Campaigns,A,Ladieswear,1,Ladieswear,15,Womens Everyday Collection,1023,Special Offers,27.0
4,2a9b247a2bbee014c862efef7ec5927dd4aa93e43f8397...,663327002,663327,Lasagne,265,Dress,Garment Full body,1010022,Jacquard,10,...,Dress,A,Ladieswear,1,Ladieswear,15,Womens Everyday Collection,1013,Dresses Ladies,64.0


### Create Query Features

In [5]:
# Columns kept for each observed transaction: customer id, customer age, article id
query_features = ['customer_id', 'age', 'article_id']
df = df.loc[:, query_features]

# Every observed transaction row becomes a positive pair
positive_pairs = df.copy(deep=True)

Then, we create the negative pairs by randomly pairing customers with articles.

In [6]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Draw ten random (customer, article) candidates for every positive pair
n_negative = len(positive_pairs) * 10

# Sample article ids uniformly from the distinct ids in the articles table
article_ids = np.random.choice(
    articles_df['article_id'].unique(), replace=True, size=n_negative
)

# Sample customers by row position so that every drawn customer_id keeps the
# age stored on the same row of customers_df. Drawing the two columns
# independently would attach ages to customers they do not belong to.
# NOTE(review): assumes customers_df['age'] has no missing values - verify
# with an isnull() check on the resulting frame.
customer_rows = np.random.choice(len(customers_df), replace=True, size=n_negative)
customer_ids = customers_df['customer_id'].to_numpy()[customer_rows]
other_features = customers_df['age'].to_numpy()[customer_rows]

# Construct negative pairs DataFrame
negative_pairs = pd.DataFrame(
    {'article_id': article_ids, 'customer_id': customer_ids, 'age': other_features}
)

# Discard sampled pairs that are real purchases, so the same
# (customer, article) pair never appears as both positive and negative.
# The result may therefore hold slightly fewer than n_negative rows.
observed_pairs = positive_pairs[['customer_id', 'article_id']].drop_duplicates()
negative_pairs = negative_pairs.merge(
    observed_pairs, on=['customer_id', 'article_id'], how='left', indicator=True
)
negative_pairs = (
    negative_pairs.loc[negative_pairs['_merge'] == 'left_only']
    .drop(columns='_merge')
    .reset_index(drop=True)
)

Next, we label the positive pairs with 1 and the negative pairs with 0, then concatenate them vertically into a single DataFrame.

In [7]:
# Tag each frame with its target value: 1 for positive pairs, 0 for negative pairs
for frame, label in ((positive_pairs, 1), (negative_pairs, 0)):
    frame['label'] = label

# Stack the positives on top of the negatives
ranking_df = pd.concat([positive_pairs, negative_pairs], axis=0)

ranking_df.head(n=5)

Unnamed: 0,customer_id,age,article_id,label
0,16501613daf68dad9dc9bd34880b12efe4f473196b9f33...,27.0,637549005,1
1,16501613daf68dad9dc9bd34880b12efe4f473196b9f33...,27.0,673677003,1
2,16501613daf68dad9dc9bd34880b12efe4f473196b9f33...,27.0,500435003,1
3,16501613daf68dad9dc9bd34880b12efe4f473196b9f33...,27.0,685687004,1
4,2a9b247a2bbee014c862efef7ec5927dd4aa93e43f8397...,64.0,663327002,1


### Create Item Features

In [8]:
# Article columns used as item-side features; article_id is the join key
item_features = [
    'article_id',
    # product descriptors
    'product_type_name',
    'product_group_name',
    'graphical_appearance_name',
    # colour descriptors
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    # catalogue hierarchy
    'department_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
]

# Independent copy of just the selected columns from the full articles table
item_df = all_articles_df.loc[:, item_features].copy()

### Create Ranking Dataset

In [9]:
# Attach the item features to every (customer, article) row via a left join
ranking_df = pd.merge(ranking_df, item_df, how='left', on='article_id')

In [10]:
# Preview the first five rows of the merged ranking frame
ranking_df.head(n=5)

Unnamed: 0,customer_id,age,article_id,label,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,garment_group_name
0,16501613daf68dad9dc9bd34880b12efe4f473196b9f33...,27.0,637549005,1,Sweater,Garment Upper body,Melange,Dark Green,Dark,Green,Knitwear,Ladieswear,Ladieswear,Womens Everyday Collection,Knitwear
1,16501613daf68dad9dc9bd34880b12efe4f473196b9f33...,27.0,673677003,1,Sweater,Garment Upper body,Solid,Dark Blue,Dark,Blue,Knitwear,Ladieswear,Ladieswear,Womens Tailoring,Knitwear
2,16501613daf68dad9dc9bd34880b12efe4f473196b9f33...,27.0,500435003,1,Hair/alice band,Accessories,All over pattern,Black,Dark,Black,Hair Accessories,Ladies Accessories,Ladieswear,Womens Small accessories,Accessories
3,16501613daf68dad9dc9bd34880b12efe4f473196b9f33...,27.0,685687004,1,Sweater,Garment Upper body,Melange,Dark Green,Dark,Green,Campaigns,Ladieswear,Ladieswear,Womens Everyday Collection,Special Offers
4,2a9b247a2bbee014c862efef7ec5927dd4aa93e43f8397...,64.0,663327002,1,Dress,Garment Full body,Jacquard,White,Light,White,Dress,Ladieswear,Ladieswear,Womens Everyday Collection,Dresses Ladies


In [11]:
# Count missing values per column
ranking_df.isna().sum()

customer_id                     0
age                             0
article_id                      0
label                           0
product_type_name               0
product_group_name              0
graphical_appearance_name       0
colour_group_name               0
perceived_colour_value_name     0
perceived_colour_master_name    0
department_name                 0
index_name                      0
index_group_name                0
section_name                    0
garment_group_name              0
dtype: int64

### Save the Engineered Features
Lastly, we save the generated ranking dataset as a pickle file.

In [12]:
# Write the ranking dataset to the processed-data folder
ranking_output_path = processed_data_path / 'ranking.pkl'
ranking_df.to_pickle(ranking_output_path)