In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import csr_matrix
from scipy.stats import logistic
from typing import Iterable

In [84]:
class ModelShit():
    """Item-to-item co-occurrence recommender for shopping baskets.

    The model stores a square item x item table (``freq``) whose cell
    ``[a, b]`` counts the receipts that contain both ``a`` and ``b``
    (the diagonal is zeroed).  ``predict`` scores every item by how often
    it was bought together with the items already in a basket.

    Typical use: ``fill_data(df)`` then ``fit()``, or ``fill_freq(table)``
    to reuse a previously computed table, followed by ``predict(basket)``.
    """

    def __init__(self):
        # Raw purchase rows; stays None until fill_data() is called.
        self.data = None
        # Item x item co-occurrence counts; empty until fit() / fill_freq().
        self.freq = pd.DataFrame()

    def fill_data(self, data: pd.DataFrame):
        """Store the purchase rows that ``fit`` will aggregate.

        ``data`` must provide the columns ``receipt_id`` and ``item_id``.
        """
        self.data = data

    def fill_freq(self, freq: pd.DataFrame):
        """Install a ready-made co-occurrence table instead of calling ``fit``.

        ``predict`` selects by label on both axes, so the table is expected
        to carry the same item labels on its index and its columns.
        """
        self.freq = freq

    def __get_i2i_matrix(self) -> pd.DataFrame:
        """Build the item x item co-occurrence table from ``self.data``.

        Each receipt counts an item at most once, however many rows repeat
        it.  Rows and columns are labelled with the sorted unique item ids.
        """
        # One row per (receipt, item) pair: presence, not quantity.
        pairs = self.data[['receipt_id', 'item_id']].dropna().drop_duplicates()

        receipt_codes, receipts = pd.factorize(pairs['receipt_id'].values)
        item_codes, items = pd.factorize(pairs['item_id'].values, sort=True)

        # Sparse receipts x items incidence matrix, built directly so the
        # dense receipts x items array is never materialised.
        incidence = csr_matrix(
            (np.ones(len(pairs), dtype=np.int64), (receipt_codes, item_codes)),
            shape=(len(receipts), len(items)),
        )

        item_to_item_matrix = incidence.T.dot(incidence)
        # An item trivially co-occurs with itself; that must not be scored.
        item_to_item_matrix.setdiag(0)

        return pd.DataFrame(item_to_item_matrix.toarray(), columns=items, index=items)

    def fit(self):
        """Compute ``self.freq`` from the rows given to ``fill_data``.

        Raises:
            ValueError: if ``fill_data`` has not been called yet.
        """
        if self.data is None:
            raise ValueError('No training data: call fill_data() before fit()')
        self.freq = self.__get_i2i_matrix()

    def update(self):
        """Placeholder; currently does nothing."""
        pass

    def predict(self, basket: Iterable):
        """Recommend the item most often bought together with ``basket``.

        Args:
            basket: item ids already in the basket.  Any iterable of
                hashable ids is accepted; ids unknown to the model are
                ignored.

        Returns:
            ``(best_offer, ranking)`` where ``ranking`` is a list of
            ``(item_id, score)`` pairs for every item (basket items
            included), sorted by descending score, and ``best_offer`` is the
            highest ranked item that is not in the basket.
            ``np.nan`` (after printing a short message) when the basket is
            not iterable, contains no known item, or already contains every
            known item.
        """
        try:
            basket = set(basket).intersection(self.freq.index)
        except TypeError:
            # Not iterable (e.g. a NaN left behind by a merge) or unhashable ids.
            print('Empy basket')
            return np.nan

        if len(basket) == 0:
            print('No such product')
            return np.nan

        in_basket = list(basket)
        outside = ~self.freq.columns.isin(in_basket)
        if not outside.any():
            # Every known item is already in the basket: nothing to offer.
            print('No product left to offer')
            return np.nan

        # Co-purchases of each item with the basket items, laid out in
        # column order like the original result frame.
        numerator = self.freq[in_basket].sum(axis=1).reindex(self.freq.columns)
        # Co-purchases of the basket items with everything outside the basket.
        denominator = self.freq.loc[in_basket, outside].values.sum()

        if denominator > 0:
            proba = numerator / denominator
        else:
            # No co-purchase evidence at all: score 0 instead of NaN / inf.
            proba = numerator * 0.0

        # Stable sort so that tied scores are ranked reproducibly.
        proba = proba.sort_values(ascending=False, kind='mergesort')
        best_offer = proba.index[~proba.index.isin(in_basket)][0]

        return best_offer, list(zip(proba.index, proba))

In [86]:
# Load the training receipts and fit the co-occurrence model on them.
data = pd.read_csv(r'./data/cosmetic_train.tsv', sep='\t')

model = ModelShit()
model.fill_data(data)
model.fit()

# Alternative to fit(): load an already computed co-occurrence table.
# matrix_freq = pd.read_table(r'./data/item_to_item_matrix.tsv', index_col = 0)
# matrix_freq.columns = matrix_freq.columns.astype(int)
# model.fill_freq(matrix_freq)

Unnamed: 0,200000,200001,200002,200003,200004,200005,200006,200007,200008,200009,...,200666,200667,200668,200669,200670,200671,200672,200673,200674,200675
200000,0,0,1,0,4,0,0,0,1,4,...,0,0,0,0,0,0,0,0,0,0
200001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200002,1,0,0,0,4,0,0,0,2,3,...,6,0,0,0,0,0,0,0,0,0
200003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200004,4,0,4,0,0,0,0,0,2,4,...,2,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200671,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
200672,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200673,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200674,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
# Read the train / validation / validation-target tables (tab-separated).
cosmetics_train, cosmetics_valid, cosmetics_val_target = (
    pd.read_table(tsv_path)
    for tsv_path in (
        './data/cosmetic_train.tsv',
        './data/cosmetic_val.tsv',
        './data/cosmetic_val_target.tsv',
    )
)

In [88]:
# One row per validation receipt: the set of items in the basket, joined
# with that receipt's row(s) from the target table.
valid_receipts = (
    cosmetics_valid
    .groupby('receipt_id')
    .item_id
    .apply(set)
    .reset_index()
    .rename(columns={'item_id': 'item_ids'})
    .merge(cosmetics_val_target, on='receipt_id', how='left')
)
# valid_receipts.head()

In [89]:
# Run the model once per validation basket, keeping the receipts' order.
y_pred = pd.Series([model.predict(basket) for basket in valid_receipts.item_ids])

In [90]:
# Display the raw predictions: one predict() return value per validation receipt.
y_pred

0        (200588, [(200588, 0.034970857618651124), (200...
1        (200559, [(200559, 0.060136506179671645), (200...
2        (200389, [(200389, 0.10016420361247948), (2003...
3        (200221, [(200221, 0.030285941750149273), (200...
4        (200629, [(200629, 0.03121927236971485), (2002...
                               ...                        
22756    (200234, [(200234, 0.08913733192325124), (2006...
22757    (200478, [(200478, 0.09685137333716146), (2002...
22758    (200530, [(200530, 0.04948244617133028), (2004...
22759    (200049, [(200049, 0.0521909318255953), (20022...
22760    (200232, [(200220, 0.027750959832955387), (200...
Length: 22761, dtype: object

In [91]:
# predict() returns NaN rather than a tuple when it cannot score a basket,
# so only unpack real predictions; NaN rows then simply count as misses
# instead of raising "float is not subscriptable".
valid_receipts['item_id_pred'] = y_pred.apply(
    lambda pred: pred[0] if isinstance(pred, tuple) else np.nan
)
print('Accuracy: {:.2%}'.format((valid_receipts.item_id == valid_receipts.item_id_pred).mean()))

Accuracy: 15.86%
