In [8]:
import pandas as pd
import datetime

# Retrieve data

Download the **transactions data** here: https://storage.cloud.google.com/cerebra_sales_data/uniqlo/parquet_files/consolidated/transformed_transaction_lines.parquet

Download the **inventory data**; use the file with the latest date suffix (it will be used for other purposes later):
https://console.cloud.google.com/storage/browser/cerebra_sales_data/uniqlo/metrics/invent[…]B%255D%22))&prefix=&forceOnObjectsSortingFiltering=false

# Preprocess data

In [17]:
def get_processed_data(df, number="100k"):
    """Aggregate transaction lines per (customer, item) and optionally subsample.

    Rows whose ``customer_id`` is missing (NaN or the literal string ``'nan'``)
    are dropped, then the remaining rows are collapsed to one row per
    (``customer_id``, ``variant_id``) pair holding the summed ``quantity`` and
    the most recent ``order_date``.

    Unless ``number == "all"``, the result is then restricted to a subset of
    customers chosen as follows:

    * the customer has at least one aggregated row dated after 2021-01-01;
    * the customer has between 10 and 300 aggregated rows (inclusive);
    * customers are taken in decreasing order of row count for as long as the
      running total of rows stays strictly below the target size.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction lines with at least the columns ``customer_id``,
        ``variant_id``, ``quantity`` and ``order_date``. The input frame is
        not modified.
    number : {"100k", "1M", "all"}, default "100k"
        Target dataset size label. ``"all"`` disables subsampling.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id``, ``variant_id``, ``quantity``, ``order_date``.

    Raises
    ------
    ValueError
        If ``number`` is not one of the supported labels.
    """
    COL_USER = "customer_id"
    COL_ITEM = "variant_id"
    COL_TIME = "order_date"
    date_limit = datetime.datetime(2021, 1, 1)

    # Target row count per label; None means "keep every customer".
    size_by_label = {"100k": 100000, "1M": 1000000, "all": None}
    if number not in size_by_label:
        # Fail before any heavy work instead of hitting an unbound DATA_SIZE later.
        raise ValueError(
            f"Unsupported number {number!r}; expected one of {sorted(size_by_label)}"
        )
    DATA_SIZE = size_by_label[number]

    # remove rows with no customer_id
    df = df[(~df[COL_USER].isna()) & (df[COL_USER] != 'nan')].copy()
    # one row per (customer, item): total quantity and latest order date
    df = df.groupby([COL_USER, COL_ITEM]).agg({"quantity": 'sum', COL_TIME: 'max'}).reset_index()

    if DATA_SIZE is not None:
        # number of aggregated rows per customer, largest first
        number_transaction = df.groupby(COL_USER).agg({COL_ITEM: 'count'}).reset_index()
        number_transaction.columns = [COL_USER, 'nb_transaction']
        number_transaction = number_transaction.sort_values(by=['nb_transaction'], ascending=False)
        # keep customers with at least one aggregated row after the date limit
        customer_after_date_limit = df[df[COL_TIME] > date_limit][COL_USER]
        number_transaction = number_transaction[number_transaction[COL_USER].isin(customer_after_date_limit)]
        # keep customers whose row count lies in [10, 300]
        number_transaction = number_transaction[(number_transaction["nb_transaction"] <= 300)]
        number_transaction = number_transaction[(number_transaction["nb_transaction"] >= 10)]
        # take customers while the running row total stays below the target size
        number_transaction["nb_transaction_cumsum"] = number_transaction["nb_transaction"].cumsum()
        number_transaction = number_transaction[number_transaction["nb_transaction_cumsum"] < DATA_SIZE]
        # restrict the aggregated frame to the selected customers
        df = df[df[COL_USER].isin(number_transaction[COL_USER])]

        # warn when the selection falls well short of the requested size
        if DATA_SIZE - len(df) > 10000:
            print("Modify the filter on nb_transaction. The number of data in df is not correct")

    return df

# Load and save preprocessed data

In [15]:
######## TO MODIFY ###########
# Local path of the downloaded transactions parquet file.
path_to_transactions_df = 'uniqlo_parquet_files_consolidated_transformed_transaction_lines.parquet'
#############################

# Read the raw transaction lines once; every dataset below is derived from this frame.
transactions_df = pd.read_parquet(path_to_transactions_df, engine="pyarrow")

# Build one preprocessed dataset per size label and pickle it in the working directory.
for number in ("100k", "1M", "all"):
    df = get_processed_data(transactions_df, number=number)
    output_path = f'transaction_{number}_df.pkl'
    df.to_pickle(output_path)