As mentioned in this [thread](https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/307288), one way to approach this problem is to generate candidates with different models and then rank them using item features and user features. This notebook provides basic `item features` that you can use using ranking models, for example [this](https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/309220).

My previous notebook with [user features](https://www.kaggle.com/alexvishnevskiy/ranking-user-features/edit/run/88745460).

In [34]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Union

In [35]:
data_path = Path('../input/h-and-m-personalized-fashion-recommendations')
transactions_train = pd.read_csv(data_path/'transactions_train.csv')
transactions_train['t_dat'] = pd.to_datetime(transactions_train['t_dat'])
customers_df = pd.read_csv(data_path/'customers.csv')
articles_df = pd.read_csv(data_path/'articles.csv')

In [36]:
articles_df.shape

(105542, 25)

In [37]:
articles_df['article_id'].unique().shape

(105542,)

Basically, I am using abstraction like this below. Every class should have `get` method and should output pandas DataFrame. Then, collect all features using another class `UserFeaturesCollector`.

In [38]:
class ItemFeatures(ABC):
    @abstractmethod
    def get(self, *args, **kwargs) -> pd.DataFrame:
        """
        article_id -> features
        """
        pass

In [39]:
class CategoryTransform(ItemFeatures):
    """
    factorize all articles columns
    """
    def __init__(self, articles_df: pd.DataFrame):
        self.articles_df = articles_df

    def get(self):
        self.__feature_columns = list(filter(lambda x: 'name' in x, self.articles_df.columns))[1:]
        filtered_articles = self.articles_df[self.__feature_columns]
        filtered_articles = filtered_articles.apply(lambda x: pd.factorize(x)[0])
        filtered_articles['article_id'] = self.articles_df['article_id']

        features = filtered_articles.set_index('article_id').astype('int8')
        return features

    def get_columns(self):
        return self.__feature_columns

In [40]:
class AggrTransform(ItemFeatures):
    """
    aggregation transactions features : mean, max and etc...
    """
    def __init__(self, articles_df: pd.DataFrame, transactions_df: pd.DataFrame):
        self.articles_df = articles_df
        self.transactions_df = transactions_df

    def get(self):
        stats = self._get_stats()
        return stats

    def _get_stats(self):
        transactions_more = self.transactions_df.merge(self.articles_df, on = ('article_id'))
        grouped = (
            transactions_more.
            groupby('article_id')
        )

        counts = (
            grouped['article_id']
            .count()
            .to_frame()
            .rename(columns = {'article_id': 'count'})
            .astype('int16')
            .reset_index()
            .set_index('article_id')
        )
        sums = (
            grouped['price']
            .sum()
            .to_frame()
            .astype('float32')
            .rename(columns = {
                'price': 'sum_price'
            })
        )
        means = (
            grouped['price']
            .mean()
            .to_frame()
            .astype('float32')
            .rename(columns = {
                'price': 'mean_price'
            })
        )
        mins = (
            grouped['price']
            .min()
            .to_frame()
            .astype('float32')
            .rename(columns = {
               'price': 'min_price' 
            })
        )
        maxs = (
            grouped['price']
            .max()
            .to_frame()
            .astype('float32')
            .rename(columns = {
                'price': 'max_price'
            })
        )
        
        output_df = (
            counts
            .merge(sums, on = ('article_id'))
            .merge(means, on = ('article_id'))
            .merge(mins, on = ('article_id'))
            .merge(maxs, on = ('article_id'))
        )
        return output_df

In [41]:
class TopTransforms(ItemFeatures):
    """
    whether category appears in top categories
    """
    def __init__(self, articles_df: pd.DataFrame, topk = 3):
        self.articles_df = articles_df
        self.topk = topk
    
    def get(self):
        name_cols = list(filter(lambda x: 'name' in x, self.articles_df.columns))  
        
        value_counts = self._get_value_counts(name_cols)
        value_counts = {
            f'{k}_{self.topk}': self.articles_df[k].isin(v).astype('int8') for k, v in value_counts.items()
        }
        
        output_df = self.articles_df.assign(**value_counts)
        output_df = output_df[['article_id'] + list(value_counts.keys())].set_index('article_id')
        return output_df
        
    def _get_value_counts(self, name_cols: List[str]):
        value_counts = self.articles_df[name_cols].apply(pd.Series.value_counts)
        get_index = lambda x: value_counts.sort_values(x, ascending = False)[x][:self.topk].index  
        value_counts = dict(zip(name_cols, map(lambda x: get_index(x), name_cols)))
        return value_counts

In [42]:
class ItemFeaturesCollector:
    @staticmethod
    def collect(features: Union[List[ItemFeatures], List[str]], **kwargs) -> pd.DataFrame:
        output_df = None

        for feature in tqdm(features):
            if isinstance(feature, ItemFeatures):
                feature_out = feature.get(**kwargs)
            if isinstance(feature, str):
                try:
                    feature_out = pd.read_csv(feature)
                except:
                    feature_out = pd.read_parquet(feature)

            if output_df is None:
                output_df = feature_out
            else:
                output_df = output_df.merge(feature_out, on = ('article_id'))
        return output_df

For simplicity let's take only first 100k transactions.

In [43]:
item_features = ItemFeaturesCollector.collect([
    CategoryTransform(articles_df),
    AggrTransform(articles_df, transactions_train.iloc[:100_000]),
    TopTransforms(articles_df)
])

100%|██████████| 3/3 [00:01<00:00,  2.21it/s]


In [44]:
item_features.head()

Unnamed: 0_level_0,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,...,product_group_name_3,graphical_appearance_name_3,colour_group_name_3,perceived_colour_value_name_3,perceived_colour_master_name_3,department_name_3,index_name_3,index_group_name_3,section_name_3,garment_group_name_3
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
108775015,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,1,1,0,1
108775044,0,0,0,1,1,1,0,0,0,0,...,1,1,1,1,1,0,1,1,0,1
108775051,0,0,1,2,2,1,0,0,0,0,...,1,0,0,1,1,0,1,1,0,1
110065001,1,1,0,0,0,0,1,1,0,1,...,0,1,1,1,1,0,0,1,0,0
110065002,1,1,0,1,1,1,1,1,0,1,...,0,1,1,1,1,0,0,1,0,0


In [45]:
item_features.shape

(15581, 28)

In [46]:
item_features.to_parquet('item_features.parquet')

In [47]:
class UserFeatures(ABC):
    @abstractmethod
    def get(self) -> pd.DataFrame:
        """
        customer_id -> features
        """
        pass

In [48]:
class AggrFeatures(UserFeatures):
    """
    basic aggregation features(min, max, mean and etc...)
    """
    def __init__(self, transactions_df):
        self.groupby_df = transactions_df.groupby('customer_id', as_index = False)

    def get(self):
        output_df = (
            self.groupby_df['price']
            .agg({
                'mean_transactions': 'mean',
                'max_transactions': 'max',
                'min_transactions': 'min',
                'median_transactions': 'median',
                'sum_transactions': 'sum',
                'max_minus_min_transactions': lambda x: x.max()-x.min()
            })
            .set_index('customer_id')
            .astype('float32')
        )
        return output_df

In [49]:
class CountFeatures(UserFeatures):
    """
    basic features connected with transactions
    """
    def __init__(self, transactions_df, topk = 10):
        self.transactions_df = transactions_df
        self.topk = topk

    def get(self):
        grouped = self.transactions_df.groupby('customer_id', as_index = False)
        #number of transactions, number of online articles,
        #number of transactions bigger than mean price of transactions
        a = (
            grouped
            .agg({
                'article_id': 'count',
                'price': lambda x: sum(np.array(x) > x.mean()),
                'sales_channel_id': lambda x: sum(x == 2),
            })
            .rename(columns = {
                'article_id': 'n_transactions',
                'price': 'n_transactions_bigger_mean',
                'sales_channel_id': 'n_online_articles'
            })
            .set_index('customer_id')
            .astype('int8')
        )
        #number of unique articles, number of store articles
        b = (
            grouped
            .agg({
                'article_id': 'nunique',
                'sales_channel_id': lambda x: sum(x == 1),
            })
            .rename(columns = {
                'article_id': 'n_unique_articles',
                'sales_channel_id': 'n_store_articles',
            })
            .set_index('customer_id')
            .astype('int8')
        )
        #number of transactions that are in top
        topk_articles = self.transactions_df['article_id'].value_counts()[:self.topk].index
        c = (
            grouped['article_id']
            .agg({
               f'top_article_{i}':  lambda x: sum(x == k) for i, k in enumerate(topk_articles)
            }
            )
            .set_index('customer_id')
            .astype('int8')
        )
        
        output_df = a.merge(b, on = ('customer_id')).merge(c, on = ('customer_id'))
        return output_df

In [50]:
class CustomerFeatures(UserFeatures):
    """
    All columns from customers dataframe
    """
    def __init__(self, customers_df):
        self.customers_df = self._prepare_customers(customers_df)
    
    def _prepare_customers(self, customers_df):
        customers_df['FN'] = customers_df['FN'].fillna(0).astype('int8')
        customers_df['Active'] = customers_df['Active'].fillna(0).astype('int8')
        customers_df['club_member_status'] = customers_df['club_member_status'].fillna('UNKNOWN')
        customers_df['age'] = customers_df['age'].fillna(customers_df['age'].mean()).astype('int8')
        customers_df['fashion_news_frequency'] = (
            customers_df['fashion_news_frequency']
            .replace('None', 'NONE')
            .replace(np.nan, 'NONE')
        )
        return customers_df

    def get(self):
        output = (
            self.customers_df[filter(lambda x: x != 'postal_code', customers_df.columns)]
            .set_index('customer_id')
        )
        return output

In [51]:
class ArticlesFeatures(UserFeatures):
    """
    returns article features: whether category appears in top categories
    """
    def __init__(self, transactions_df, articles, topk = 10):
        self.merged_df = transactions_df.merge(articles, on = ('article_id'))
        self.articles = articles
        self.topk = topk
    
    def get(self):
        output_df = None

        for col in tqdm(self.articles.columns, desc = 'extracting features'):
            if 'name' in col:
                if output_df is None:
                    output_df = self.aggregate_topk(self.merged_df, col, self.topk)
                else:
                    intermediate_out = self.aggregate_topk(self.merged_df, col, self.topk)
                    output_df = output_df.merge(intermediate_out, on = ('customer_id'))
        return output_df

    def return_value_counts(self, df, column_name, k):
        value_counts = df[column_name].value_counts()[:k].index
        value_counts = list(map(lambda x: x[1], value_counts))
        return value_counts

    def aggregate_topk(self, merged_df, column_name, k):
        grouped_df_indx = merged_df.groupby('customer_id')
        grouped_df = merged_df.groupby('customer_id', as_index = False)
        
        topk_values = self.return_value_counts(grouped_df_indx, column_name, k)
        #how many transactions appears in top category(column)
        n_top_k = (
            grouped_df[column_name]
            .agg({
                f'top_{column_name}_{i}': lambda x: sum(x == k) for i, k in enumerate(topk_values)
            })
            .set_index('customer_id')
            .astype('int16')
        )
        return n_top_k

In [52]:
class UserFeaturesCollector:
    """
    collect all features and aggregate them
    """
    @staticmethod
    def collect(features: Union[List[UserFeatures], List[str]], **kwargs) -> pd.DataFrame:
        output_df = None

        for feature in tqdm(features):
            if isinstance(feature, UserFeatures):
                feature_out = feature.get(**kwargs)
            if isinstance(feature, str):
                try:
                    feature_out = pd.read_csv(feature)
                except:
                    feature_out = pd.read_parquet(feature)

            if output_df is None:
                output_df = feature_out
            else:
                output_df = output_df.merge(feature_out, on = ('customer_id'))
        return output_df

In [56]:
user_features = UserFeaturesCollector.collect([
    AggrFeatures(transactions_train.iloc[:]),
    CountFeatures(transactions_train.iloc[:], 3),
    CustomerFeatures(customers_df),
    ArticlesFeatures(transactions_train.iloc[:], articles_df, 3),
])

 75%|███████▌  | 3/4 [19:54<06:10, 370.58s/it]
extracting features:   0%|          | 0/25 [00:00<?, ?it/s][A
extracting features:  12%|█▏        | 3/25 [10:40<1:18:18, 213.57s/it][A
extracting features:  20%|██        | 5/25 [21:04<1:27:11, 261.57s/it][A
extracting features:  24%|██▍       | 6/25 [31:26<1:52:20, 354.79s/it][A
extracting features:  32%|███▏      | 8/25 [41:48<1:35:11, 335.99s/it][A
extracting features:  40%|████      | 10/25 [52:11<1:21:41, 326.77s/it][A
extracting features:  48%|████▊     | 12/25 [1:02:37<1:09:45, 321.95s/it][A
extracting features:  56%|█████▌    | 14/25 [1:13:00<58:23, 318.53s/it]  [A
extracting features:  64%|██████▍   | 16/25 [1:23:20<47:22, 315.81s/it][A
extracting features:  72%|███████▏  | 18/25 [1:33:38<36:34, 313.55s/it][A
extracting features:  80%|████████  | 20/25 [1:43:54<25:59, 311.87s/it][A
extracting features:  88%|████████▊ | 22/25 [1:54:12<15:32, 310.98s/it][A
extracting features: 100%|██████████| 25/25 [2:04:28<00:00, 298.7

In [54]:
user_features.head()


Unnamed: 0_level_0,mean_transactions,max_transactions,min_transactions,median_transactions,sum_transactions,max_minus_min_transactions,n_transactions,n_transactions_bigger_mean,n_online_articles,n_unique_articles,...,top_index_name_2,top_index_group_name_0,top_index_group_name_1,top_index_group_name_2,top_section_name_0,top_section_name_1,top_section_name_2,top_garment_group_name_0,top_garment_group_name_1,top_garment_group_name_2
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,0.040661,0.050831,0.030492,0.040661,0.081322,0.020339,2,1,2,2,...,0,0,0,0,0,0,0,0,0,0
00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,0.017271,0.020322,0.015237,0.016932,0.086356,0.005085,5,1,5,5,...,2,2,2,2,2,2,2,2,2,2
00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4c73235dccbbc132280,0.038119,0.053373,0.030492,0.030492,0.190593,0.022881,5,2,1,5,...,0,0,0,0,0,0,0,0,0,0
0008968c0d451dbc5a9968da03196fe20051965edde7413775c4eb3be9abe9c2,0.021424,0.022525,0.020322,0.021424,0.042847,0.002203,2,1,2,2,...,2,2,2,2,0,0,0,2,2,2
000aa7f0dc06cd7174389e76c9e132a67860c5f65f970699daccc14425ac31a8,0.023768,0.042356,0.008458,0.016932,0.713051,0.033898,30,14,30,15,...,18,18,18,18,2,2,2,2,2,2


In [55]:
user_features.shape

(2954, 55)