In [24]:
import pandas as pd
import yaml

In [25]:
pd.__version__

'1.3.5'

#### Loading config parameters

In [26]:
# Location of the parameter file; the path is relative, so it is resolved
# against the working directory the notebook kernel was started in.
params_path = "../config/argentina_parameters.json"

In [27]:
# The file has a .json extension but is parsed with the YAML safe loader.
# NOTE(review): presumably relies on the content being YAML-compatible JSON - confirm.
with open(params_path) as params_file:
    params = yaml.safe_load(params_file)

In [5]:
params

{'dataset': {'country': 'argentina',
  'sales_org': 'AR00',
  'grouping_cols': ['cust_id',
   'city',
   'customer_classification',
   'subsector',
   'category',
   'brand',
   'product_key'],
  'product_features_cols': ['product_key', 'subsector', 'category', 'brand'],
  'customer_features_cols': ['cust_id', 'city', 'customer_classification'],
  'columns': [],
  'column_to_predict': 'target',
  'columns_with_attr': [],
  'test_size': 0.2,
  'nfolds': 3,
  'num_samples': 5,
  'num_threads': 1,
  'recency': 180},
 'models': {}}

#### Read joined dataset

In [28]:
# Read the joined Argentina order dataset straight from GCS.
# NOTE(review): the service-account key path below is an absolute, machine-specific
# path; consider sourcing it from configuration or the environment.
df_joined = pd.read_csv(
    "gs://pg-explore/data/sample_dataset/intermediate/joined/df_joined_Argentina.csv",
    storage_options={"token": "/mnt/d/PnG/keys/tiger-mle-8c54fa5ce18f.json"},
)

In [29]:
df_joined.head()

Unnamed: 0.1,Unnamed: 0,sales_org,cust_id,sales_order_number,sales_document_date,product_key,order_qty,net_value,total,order_uom,...,assortment_groups,customer_classification,subsector,subsector_id,category,category_id,brand,brand_id,item_gtin,prod_name
0,0,Argentina,63000659,1037305840,2019-08-31,39144,1579,198261,34059,value_one,...,574,16,1,1,44,44,22,22,994,994
1,1,Argentina,63000269,1037344309,2021-03-26,39144,1166,114493,28226,value_one,...,184,4,1,1,44,44,22,22,994,994
2,2,Argentina,63000678,1037343942,2020-06-29,39144,1711,137972,38747,value_one,...,593,17,1,1,44,44,22,22,994,994
3,3,Argentina,63000282,1037307772,2019-12-26,39144,2193,152942,11309,value_one,...,197,17,1,1,44,44,22,22,994,994
4,4,Argentina,63000699,1037329902,2020-03-10,39144,2095,52454,27578,value_one,...,614,2,1,1,44,44,22,22,994,994


### Preprocessing logic

In [30]:
def aggregate_per_grouping_col(df_joined, params):
    """Aggregate the joined order lines per grouping-column combination.

    Rows of ``df_joined`` are grouped by ``params["dataset"]["grouping_cols"]``.
    For every group the number of rows is reported as ``order_freq`` and the
    ``net_value`` column is summed.

    Args:
        df_joined: DataFrame holding the grouping columns plus ``order_qty``
            and ``net_value``.
        params: Configuration dict; only ``params["dataset"]["grouping_cols"]``
            is read here.

    Returns:
        DataFrame with the grouping columns as regular columns followed by
        ``order_freq`` and ``net_value``.
    """
    group_keys = params["dataset"]["grouping_cols"]
    # "size" counts the rows in each group, so the values stored in
    # ``order_qty`` themselves do not influence ``order_freq``.
    per_group = df_joined.groupby(group_keys).agg(
        {"order_qty": "size", "net_value": "sum"}
    )
    flattened = per_group.reset_index()
    return flattened.rename(columns={"order_qty": "order_freq"})


def get_rating_sku_storemaster_data(df_prod, params):
    """Split the aggregated frame into rating, SKU-master and store-master frames.

    Args:
        df_prod: DataFrame containing ``cust_id``, ``product_key``,
            ``order_freq`` and every column named in the two feature lists
            read from ``params``.
        params: Configuration dict; ``params["dataset"]["product_features_cols"]``
            and ``params["dataset"]["customer_features_cols"]`` are read.

    Returns:
        Tuple ``(df_rating, df_skumaster, df_storemaster)``:
        - ``df_rating`` has columns ``store``, ``sku`` and ``rating``
          (renamed from ``cust_id``, ``product_key`` and ``order_freq``).
        - ``df_skumaster`` holds the product feature columns indexed by
          ``product_key``.
        - ``df_storemaster`` holds the customer feature columns indexed by
          ``cust_id``.

    Note:
        Both master frames keep one row per row of ``df_prod``; no
        de-duplication happens here, so their indexes can repeat values.
    """
    dataset_cfg = params["dataset"]

    rating_columns = {"cust_id": "store", "product_key": "sku", "order_freq": "rating"}
    df_rating = df_prod[list(rating_columns)].rename(columns=rating_columns)

    product_features = df_prod[dataset_cfg["product_features_cols"]]
    df_skumaster = product_features.set_index("product_key")

    customer_features = df_prod[dataset_cfg["customer_features_cols"]]
    df_storemaster = customer_features.set_index("cust_id")

    return df_rating, df_skumaster, df_storemaster


def compute_positive_ratings(df_rating):
    """Run the store/SKU pairs through ``ProcessRatings.threshold_purchases``.

    Only the ``store`` and ``sku`` columns of ``df_rating`` are handed to
    ``ProcessRatings`` (constructed with ``rating=False``).

    Args:
        df_rating: DataFrame with at least ``store`` and ``sku`` columns.

    Returns:
        The ``df_rating`` attribute of the object returned by
        ``threshold_purchases()``.
        NOTE(review): the exact thresholding rule lives in ``do_it.nbsku`` and
        is not visible here - confirm against that module.
    """
    from do_it.nbsku import ProcessRatings

    store_sku_pairs = df_rating[["store", "sku"]]
    processor = ProcessRatings(store_sku_pairs, rating=False)
    thresholded = processor.threshold_purchases()
    return thresholded.df_rating


def build_inputs(df_rating, df_storemaster, df_skumaster):
    """Build the dataset and feature matrices via ``InputsBuilder``.

    The master frames are first restricted to the stores / SKUs that occur in
    ``df_rating`` and then passed, together with ``df_rating``, to
    ``InputsBuilder(...).build()``.

    Args:
        df_rating: DataFrame with ``store`` and ``sku`` columns.
        df_storemaster: Store features; its index is matched against
            ``df_rating["store"]``.
        df_skumaster: SKU features; its index is matched against
            ``df_rating["sku"]``.

    Returns:
        Tuple ``(df_dataset, user_feature_matrix, item_feature_matrix)`` as
        produced by ``InputsBuilder.build()``.
    """
    from do_it.nbsku import InputsBuilder

    builder = InputsBuilder(
        df_rating,
        # Keep only master rows whose index value appears in the ratings.
        df_storemaster=df_storemaster[df_storemaster.index.isin(df_rating["store"])],
        df_skumaster=df_skumaster[df_skumaster.index.isin(df_rating["sku"])],
    )
    dataset, user_features, item_features = builder.build()
    return dataset, user_features, item_features

In [31]:
# One row per grouping-column combination, with order frequency and summed net value.
df_prod = aggregate_per_grouping_col(df_joined=df_joined, params=params)

In [45]:
df_prod.head()

Unnamed: 0,cust_id,city,customer_classification,subsector,category,brand,product_key,order_freq,net_value
0,63000086,1,1,1,1,1,38151,1,189620
1,63000086,1,1,1,1,77,38551,1,136636
2,63000086,1,1,1,1,101,38251,1,165103
3,63000086,1,1,1,1,115,38751,1,125417
4,63000086,1,1,1,1,139,38451,1,175216


In [33]:
# Split the aggregated frame into ratings plus SKU and store master tables.
df_rating, df_skumaster, df_storemaster = get_rating_sku_storemaster_data(
    df_prod=df_prod,
    params=params,
)

In [48]:
df_rating

Unnamed: 0,store,sku,rating
0,63000086,38151,1
1,63000086,38551,1
2,63000086,38251,1
3,63000086,38751,1
4,63000086,38451,1
...,...,...,...
286161,63000872,38175,1
286162,63000872,39120,1
286163,63000872,38829,1
286164,63000872,38344,1


In [51]:
# Keep a single row per (store, sku) pair; when a pair repeats, the first
# occurrence (and therefore its rating) is the one retained.
df_rating = df_rating.drop_duplicates(subset=["store", "sku"], keep="first")

In [36]:
df_rating.count()

store     286166
sku       286166
rating    286166
dtype: int64

In [52]:
# Thresholded store/SKU pairs derived from the de-duplicated ratings.
df_positive = compute_positive_ratings(df_rating=df_rating)

In [54]:
# Build the model inputs from the ratings and the two master tables.
df_dataset, user_feature_matrix, item_feature_matrix = build_inputs(
    df_rating=df_rating,
    df_storemaster=df_storemaster,
    df_skumaster=df_skumaster,
)

In [56]:
df_dataset

<lightfm.data.Dataset at 0x7f69adb6b700>

In [57]:
user_feature_matrix

<525x527 sparse matrix of type '<class 'numpy.float32'>'
	with 1575 stored elements in Compressed Sparse Row format>

In [58]:
item_feature_matrix

<824x827 sparse matrix of type '<class 'numpy.float32'>'
	with 3296 stored elements in Compressed Sparse Row format>

#### Saving interim dataset for training

In [59]:
# Key under which this notebook's artefacts are stored in each container.
# NOTE(review): params["dataset"]["country"] is the lowercase "argentina" -
# confirm which spelling downstream consumers expect.
country = "Argentina"

# One empty container per artefact; each is filled per country further down.
dict_prod, dict_positive, dict_dataset = {}, {}, {}
dict_user_feature_matrix, dict_item_feature_matrix = {}, {}

In [61]:
# File every artefact produced above under the current country key.
dict_prod.update({country: df_prod})
dict_positive.update({country: df_positive})
dict_dataset.update({country: df_dataset})
dict_user_feature_matrix.update({country: user_feature_matrix})
dict_item_feature_matrix.update({country: item_feature_matrix})

In [None]:

# NOTE(review): `customers_fs` and `customer_overview_path` are not defined
# anywhere in the visible notebook, and this cell carries no execution count.
# It looks like a leftover that would raise NameError on a fresh
# "Restart & Run All" - confirm and remove, or define the missing names.
with customers_fs.open(customer_overview_path) as customers:
    df_customer_overview = pd.read_csv(customers)

In [70]:
import pickle
import gcsfs

def register_interim_dataset(
    data,
    dataset_name,
    project="tiger-mle",
    token="/mnt/d/PnG/keys/tiger-mle-8c54fa5ce18f.json",
    base_uri="gs://pg-explore/data/magento/interim",
):
    """Pickle ``data`` to ``<base_uri>/<dataset_name>.pkl`` on Google Cloud Storage.

    Only ``dict`` and ``list`` payloads are written. Any other type is skipped,
    and a warning is emitted so that the skipped save is visible to the caller
    instead of passing silently.

    Args:
        data: Object to persist; must be a ``dict`` or ``list`` to be written.
        dataset_name: File stem of the pickle (``.pkl`` is appended).
        project: GCP project passed to ``gcsfs.GCSFileSystem``.
        token: Credentials argument passed to ``gcsfs.GCSFileSystem``.
            NOTE(review): the default is an absolute, machine-specific key-file
            path kept for backward compatibility - prefer passing it in from
            configuration or the environment.
        base_uri: Destination prefix (no trailing slash required).

    Returns:
        None.
    """
    import warnings

    if not isinstance(data, (dict, list)):
        # Previously this case was a silent no-op, which hid failed saves.
        warnings.warn(
            f"register_interim_dataset: skipped {dataset_name!r}; expected a "
            f"dict or list but got {type(data).__name__}",
            stacklevel=2,
        )
        return

    interim_fs = gcsfs.GCSFileSystem(project=project, token=token)
    pickle_file = f"{base_uri.rstrip('/')}/{dataset_name}.pkl"
    # Pickle files must only ever be loaded by trusted consumers.
    with interim_fs.open(pickle_file, "wb") as interim_dataset_pickle:
        pickle.dump(data, interim_dataset_pickle)

In [71]:
# Persist every per-country container under its own name, in a fixed order.
for interim_name, interim_payload in (
    ("dict_positive", dict_positive),
    ("dict_prod", dict_prod),
    ("dict_dataset", dict_dataset),
    ("dict_user_feature_matrix", dict_user_feature_matrix),
    ("dict_item_feature_matrix", dict_item_feature_matrix),
):
    register_interim_dataset(interim_payload, interim_name)