### Get full user data frame with registration date added

In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG (this does not seed the stdlib `random` module).
np.random.seed(42)

# Base user table; registration dates are stored separately in CSV shards.
users_df = pd.read_json('./user_data/total_user_data.json')

# Flatten every value of the 13 registration shards (files 0..12) into one list.
shuffled_registration_dates_list = []
for i in range(13):
    df = pd.read_csv(f'./user_registration_data/user_registrations_data_{i}.csv', index_col=False)
    shuffled_registration_dates_list += df.values.ravel().tolist()

# Expose the row index as `id`, attach one date per user (surplus dates are
# dropped by the slice) and order the users by registration date.
users_df = users_df.assign(
    id=users_df.index,
    registration_date=shuffled_registration_dates_list[:len(users_df)],
).sort_values(by='registration_date')

### Get likelihood of buying in different situations

In [2]:
# Likelihood-of-buying lookup tables: one CSV per (gender group, purchase stage).
# get_probability_by_age_gender reads the 'Age' and 'Probability' columns of these tables.
# File suffix: _1 = first purchase, _2 = second purchase, _3 = third purchase and beyond.

# Likelihood of buying - male clients

men_ft_df = pd.read_csv('./likelihood_to_buy/likelihood_men_1.csv') # first time buyers
men_st_df = pd.read_csv('./likelihood_to_buy/likelihood_men_2.csv') # second time buyers
men_td_df = pd.read_csv('./likelihood_to_buy/likelihood_men_3.csv') # third time and beyond buyers

# Likelihood of buying - female clients

women_ft_df = pd.read_csv('./likelihood_to_buy/likelihood_women_1.csv') # first time buyers
women_st_df = pd.read_csv('./likelihood_to_buy/likelihood_women_2.csv') # second time buyers
women_td_df = pd.read_csv('./likelihood_to_buy/likelihood_women_3.csv') # third time and beyond buyers

# Likelihood of buying - unspecified gender clients

unspecified_ft_df = pd.read_csv('./likelihood_to_buy/likelihood_not_1.csv') # first time buyers
unspecified_st_df = pd.read_csv('./likelihood_to_buy/likelihood_not_2.csv') # second time buyers
unspecified_td_df = pd.read_csv('./likelihood_to_buy/likelihood_not_3.csv') # third time and beyond buyers

In [3]:
# Empty buyer pool with the same columns as users_df; passed to create_orders as its starting pool.
buyers_df = pd.DataFrame(columns=users_df.columns)
# Empty orders table; create_orders fills 'date' and 'userId', 'productId' is assigned in a later cell.
orders_df = pd.DataFrame(columns=['date', 'userId', 'productId'])

# Order schedule; create_orders reads the 'date' and 'number_of_orders' columns of each row.
orders_date_df = pd.read_csv('./boxing_gloves_order_data/gloves_order_date_data.csv')

import bisect
import random
from enum import Enum

import warnings
# Suppresses every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation / chained-assignment warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

class BuyerType(Enum):
    """Purchase stage of a buyer.

    The string values are what this notebook stores in the ``buyer_type``
    column and uses as keys when selecting a likelihood table.
    """
    FIRST_TIME = "1st_time"    # selects the *_ft_df tables
    SECOND_TIME = "2nd_time"   # selects the *_st_df tables
    THIRD_TIME = "3rd_time"    # selects the *_td_df tables (third purchase and beyond)

class Gender(Enum):
    """Gender groups used to select a likelihood table.

    The string values are matched against each user's ``gender`` field, so
    they presumably mirror the spelling used in the user data — TODO confirm.
    """
    MALE = "MALE"
    FEMALE = "FEMALE"
    NOT_SPECIFIED = "NOT_MENTIONED"  # note: the value differs from the member name

def find_users_registered_on_date(date):
    """Return the rows of ``users_df`` whose ``registration_date`` equals ``date``.

    The lookup is a binary search, so it relies on ``users_df`` being sorted
    ascending by ``registration_date``. The search runs directly on the column
    instead of copying it into a Python list on every call.

    Args:
        date: Value to look for; must be comparable with the entries of the
            ``registration_date`` column.

    Returns:
        A positional slice of ``users_df`` (empty when no user registered on
        ``date``).
    """
    dates = users_df['registration_date']

    # [left, right) is the run of rows equal to `date` in the sorted column.
    left = dates.searchsorted(date, side='left')
    right = dates.searchsorted(date, side='right')

    return users_df.iloc[left:right]

import itertools

def get_probability_by_age_gender(age, gender, buyerType):
    """Look up a buying probability and the stage that follows ``buyerType``.

    Args:
        age: Buyer age; converted with ``int`` before matching the ``Age`` column.
        gender: One of the ``Gender`` enum *values* (a string).
        buyerType: One of the ``BuyerType`` enum *values* (a string).

    Returns:
        A ``(probability, next_buyer_type)`` tuple. ``probability`` is the first
        ``Probability`` entry for ``age`` in the table selected by ``gender``
        and ``buyerType``. ``next_buyer_type`` is the second-time value when
        ``buyerType`` is the first-time value, otherwise the third-time value.
    """
    first = BuyerType.FIRST_TIME.value
    second = BuyerType.SECOND_TIME.value
    third = BuyerType.THIRD_TIME.value

    tables_by_gender = {
        Gender.MALE.value: {first: men_ft_df, second: men_st_df, third: men_td_df},
        Gender.FEMALE.value: {first: women_ft_df, second: women_st_df, third: women_td_df},
        Gender.NOT_SPECIFIED.value: {first: unspecified_ft_df, second: unspecified_st_df, third: unspecified_td_df},
    }
    table = tables_by_gender.get(gender).get(buyerType)

    # Stages only move forward: first -> second, everything else -> third.
    if buyerType == first:
        next_buyer_type = second
    else:
        next_buyer_type = third

    age_rows = table['Age'] == int(age)
    probability = table.loc[age_rows, 'Probability'].values[0]
    return probability, next_buyer_type

def assign_probability_to_first_time_buyers(date, _firstTimeBuyers_df):
    """Add the users registered on ``date`` to the buyer pool as first-time buyers.

    Each newly registered user gets a ``probability`` (looked up by age and
    gender in the first-time tables) and a ``buyer_type`` of
    ``BuyerType.FIRST_TIME.value``.

    Args:
        date: Registration date to look up.
        _firstTimeBuyers_df: Buyer pool accumulated so far.

    Returns:
        The pool with the new users appended. When nobody registered on
        ``date`` the pool is returned unchanged (previously the function
        returned ``None``, which made the caller lose the accumulated pool).
    """
    buyers_on_date = find_users_registered_on_date(date)

    if buyers_on_date.empty:
        return _firstTimeBuyers_df

    # Work on a copy so the new columns are not written onto a slice of users_df.
    buyers_on_date = buyers_on_date.copy()

    # Row-wise lookup (not vectorized): one table query per new user.
    buyers_on_date['probability'] = buyers_on_date.apply(
        lambda buyer: get_probability_by_age_gender(buyer['age'], buyer['gender'], BuyerType.FIRST_TIME.value)[0], axis=1
    )
    buyers_on_date['buyer_type'] = BuyerType.FIRST_TIME.value
    return pd.concat([_firstTimeBuyers_df, buyers_on_date])

def pick_based_on_probability(probabilities):
    """Draw a position at random, weighted by ``probabilities``.

    The weights do not need to sum to 1: a number is drawn uniformly between 0
    and their total, and the first position whose running total reaches that
    number is returned. Randomness comes from the stdlib ``random`` module, so
    ``np.random.seed`` has no effect on the draw.

    Args:
        probabilities: Non-empty iterable of numeric weights.

    Returns:
        The selected position as an ``int``.
    """
    running_totals = [*itertools.accumulate(probabilities)]
    threshold = random.uniform(0, running_totals[-1])
    return bisect.bisect_left(running_totals, threshold)

def create_orders(buyers_df, orders_df):
    """Simulate orders for every row of ``orders_date_df``.

    For each date the users registered on that date join the buyer pool, then
    ``number_of_orders`` buyers are drawn from the pool weighted by their
    ``probability``. After each purchase the buyer advances to the next buyer
    stage and receives the probability of that next stage.

    Changes compared with the previous implementation:
      * the accumulated pool is kept when a date has no new registrations
        (a ``None`` result from the assignment helper no longer replaces it);
      * the probability stored after a purchase comes from the table of the
        buyer's *new* stage, so ``probability`` and ``buyer_type`` stay in sync
        (before, the probability lagged one stage behind);
      * orders are collected in a list and concatenated once instead of
        calling ``pd.concat`` per order.

    Args:
        buyers_df: Initial buyer pool (may be empty).
        orders_df: Existing orders to append to.

    Returns:
        ``orders_df`` followed by one row per generated order with ``date``
        and ``userId`` filled in; the result has a fresh 0..n-1 index.
    """
    new_orders = []

    for _, row in orders_date_df.iterrows():
        order_date = row['date']

        updated_pool = assign_probability_to_first_time_buyers(order_date, buyers_df)
        if updated_pool is not None:
            buyers_df = updated_pool

        if buyers_df is None or buyers_df.empty:
            continue

        probability_col = buyers_df.columns.get_loc('probability')
        buyer_type_col = buyers_df.columns.get_loc('buyer_type')

        for _ in range(int(row['number_of_orders'])):
            user_pos = pick_based_on_probability(buyers_df['probability'].values)
            picked_buyer = buyers_df.iloc[user_pos]
            new_orders.append({'date': order_date, 'userId': picked_buyer['id']})

            # First call only tells us the next stage; the second fetches the
            # probability that belongs to that next stage.
            _, next_buyer_type = get_probability_by_age_gender(
                picked_buyer['age'], picked_buyer['gender'], picked_buyer['buyer_type']
            )
            next_probability, _ = get_probability_by_age_gender(
                picked_buyer['age'], picked_buyer['gender'], next_buyer_type
            )

            buyers_df.iloc[user_pos, probability_col] = next_probability
            buyers_df.iloc[user_pos, buyer_type_col] = next_buyer_type

    if not new_orders:
        return orders_df

    generated = pd.DataFrame(new_orders, columns=['date', 'userId'])
    return pd.concat([orders_df, generated], ignore_index=True)

# Run the simulation, starting from the empty buyer pool and empty orders table created above.
orders_df = create_orders(buyers_df=buyers_df, orders_df=orders_df)

In [4]:
# Every generated order is assigned product id 0.
orders_df['productId'] = 0

# NOTE(review): not referenced by any code visible here — confirm whether it is still needed.
orders_list = []

def dataframe_to_json_chunks(df, chunk_size=5000):
    """Serialize ``df`` as a list of JSON strings of at most ``chunk_size`` rows.

    Args:
        df: DataFrame to serialize.
        chunk_size: Maximum number of rows per JSON string.

    Returns:
        A list of JSON arrays (``orient='records'``), in row order; empty when
        ``df`` has no rows.
    """
    serialized = []
    for start in range(0, len(df), chunk_size):
        window = df.iloc[start:start + chunk_size]
        serialized.append(window.to_json(orient='records'))
    return serialized

# Serialize all generated orders into JSON strings, using the function's
# default chunk size of 5000 records per string.
json_chunks = dataframe_to_json_chunks(orders_df)

# Saving each chunk as a separate file, numbered in chunk order
for i, chunk in enumerate(json_chunks):
    with open(f'./boxing_gloves_orders/boxing_gloves_orders_chunk_{i}.json', 'w') as f:
        f.write(chunk)