In [3]:
import pandas as pd
import random

# random.seed()  # Reseed the generator for more randomness

# Weight applied to the normalized purchase frequency in the combined score.
alpha = 0.4
# Weight applied to the normalized recency in the combined score.
beta = 0.6
# Passed through to build_bundle, which currently never reads it.
purchasing_power = 100.0
# Maximum number of products in a bundle, anchor included.
k = 7
# NOTE(review): not referenced by the script below; each consultant's
# 'average_purchasing_power' value is used as the budget instead.
C_max = 100.0
# Multiplier on the budget that gives the per-product price ceiling.
theta = 1
# Weight of the category score in a candidate's selection score.
gamma = 0.25
# Weight of the business score in a candidate's selection score.
delta = 0.75
# Maximum number of distinct categories allowed per bundle.
n_categories = 7
# Number of bundles attempted for each consultant.
num_bundles = 2
# CSV file the product rows are read from.
input_csv = "../src/data/files/processed_data_1000_C.csv"
# CSV file the generated bundle rows are written to.
output_csv = "../src/data/files/bundle_data_1000_C.csv"

def read_data_from_csv(csv_path):
    """Load the product table found at ``csv_path`` into a DataFrame."""
    product_frame = pd.read_csv(csv_path)
    return product_frame

def save_bundle_to_csv(bundle, output_path):
    """Write the bundle rows to ``output_path`` as CSV, without the index column."""
    pd.DataFrame(bundle).to_csv(output_path, index=False)

def normalize(value, min_value, max_value):
    """Min-max scale ``value`` relative to the range [min_value, max_value].

    Returns 0 when the range is degenerate (both bounds equal), which avoids
    a division by zero.
    """
    return 0 if max_value == min_value else (value - min_value) / (max_value - min_value)

def calculate_combined_score(df, alpha, beta):
    """Add min-max normalized frequency/recency and their weighted sum.

    Args:
        df: DataFrame with numeric 'purchase_frequency' and 'recency' columns.
        alpha: Weight applied to the normalized purchase frequency.
        beta: Weight applied to the normalized recency.

    Returns:
        A copy of ``df`` with three extra columns: 'normalized_f',
        'normalized_r' and 'combined_score'
        (``alpha * normalized_f + beta * normalized_r``). The input frame is
        not modified.
    """
    def _min_max_scale(column):
        # Scale a whole column at once instead of calling a Python function
        # per row.
        lowest = column.min()
        highest = column.max()
        if lowest == highest:
            # Degenerate range: every value maps to 0 (avoids dividing by 0).
            return pd.Series(0.0, index=column.index)
        return (column - lowest) / (highest - lowest)

    df = df.copy()  # Never write the helper columns into the caller's frame
    df['normalized_f'] = _min_max_scale(df['purchase_frequency'])
    df['normalized_r'] = _min_max_scale(df['recency'])

    df['combined_score'] = (alpha * df['normalized_f']) + (beta * df['normalized_r'])
    return df

def select_anchor_product(df):
    """Return the row of ``df`` whose 'combined_score' is the largest."""
    top_label = df['combined_score'].idxmax()
    anchor_row = df.loc[top_label]
    return anchor_row

def generate_candidates(bundle, df):
    """Return the rows of ``df`` whose product_id is not used by any bundle item."""
    taken_ids = set()
    for item in bundle:
        taken_ids.add(item['product_id'])

    already_taken = df['product_id'].isin(taken_ids)
    return df[~already_taken]

def score_candidates(candidates, bundle, gamma, delta):
    """Attach category, business and overall selection scores to candidates.

    Args:
        candidates: DataFrame with at least 'product_id' and 'category'.
        bundle: Products already in the bundle; each item is indexable by
            'category'.
        gamma: Weight of the category score.
        delta: Weight of the business score.

    Returns:
        A copy of ``candidates`` with three extra columns:
        'category_score' (1 when the candidate's category is not yet in the
        bundle, else 0), 'business_score' (random integer in [0, 3]) and
        'score' (``gamma * category_score + delta * business_score``).
        The input frame is not modified.
    """
    # Collect the bundle's categories once rather than rebuilding the list
    # for every candidate row.
    bundle_categories = {b['category'] for b in bundle}

    candidates = candidates.copy()
    candidates['category_score'] = (
        ~candidates['category'].isin(bundle_categories)
    ).astype(int)

    # The business score is a random placeholder. It is drawn row by row, in
    # frame order, so a seeded generator yields the same sequence as before.
    # An explicit loop (rather than a row-wise apply) also stays well-defined
    # when there are no candidates at all.
    business_scores = []
    for product_id, category in zip(candidates['product_id'], candidates['category']):
        val = random.randint(0, 3)
        print(f"Business Score Debug: product_id={product_id} category={category} score={val}")
        business_scores.append(val)
    candidates['business_score'] = business_scores

    candidates['score'] = gamma * candidates['category_score'] + delta * candidates['business_score']

    print("Candidate Categories:", candidates['category'].unique())
    print(candidates[['product_id', 'category', 'category_score', 'business_score', 'score']])
    return candidates


def build_bundle(df, anchor_product, purchasing_power, alpha, beta, k, C_max, theta, gamma, delta, n_categories):
    """Greedily build a bundle around ``anchor_product``.

    Candidates are every row of ``df`` whose product differs from the anchor.
    They are scored once, filtered by a per-product price ceiling, and then
    added in descending score order while the bundle stays within its size,
    budget and category limits.

    Args:
        df: Scored product rows with 'product_id', 'category' and 'price'.
        anchor_product: Row that seeds the bundle; it is always the first item.
        purchasing_power: Unused; kept so existing callers keep working.
        alpha: Unused; kept so existing callers keep working.
        beta: Unused; kept so existing callers keep working.
        k: Maximum number of products in the bundle, anchor included.
        C_max: Budget for the total price of the bundle.
        theta: Multiplier on ``C_max`` giving the per-product price ceiling.
        gamma: Weight of the category score when scoring candidates.
        delta: Weight of the business score when scoring candidates.
        n_categories: Maximum number of distinct categories in the bundle.

    Returns:
        A list of product rows starting with the anchor, or an empty list
        when no second product could be added.
    """
    bundle = [anchor_product]
    current_total_cost = anchor_product['price']
    categories_in_bundle = {anchor_product['category']}
    # The candidate list is filtered against the bundle only once (when it
    # holds just the anchor), so the ids added later are tracked here to keep
    # duplicate rows of the same product out of the bundle.
    product_ids_in_bundle = {anchor_product['product_id']}

    # Generate and score candidates
    candidates = generate_candidates(bundle, df)
    candidates = score_candidates(candidates, bundle, gamma, delta)

    # Drop products that are individually too expensive
    candidates = candidates[candidates['price'] <= theta * C_max]

    # Best-scoring candidates are considered first
    candidates = candidates.sort_values(by='score', ascending=False)

    for _, candidate in candidates.iterrows():
        if len(bundle) >= k:
            break  # Bundle is full

        product_id = candidate['product_id']
        if product_id in product_ids_in_bundle:
            continue  # Same product appears on more than one input row

        candidate_category = candidate['category']
        price = candidate['price']

        # A new category is accepted only while the category limit allows it
        opens_new_category = candidate_category not in categories_in_bundle
        if opens_new_category and len(categories_in_bundle) >= n_categories:
            continue

        # Written as a positive test so a NaN price is rejected, not accepted
        if current_total_cost + price <= C_max:
            bundle.append(candidate)
            current_total_cost += price
            categories_in_bundle.add(candidate_category)
            product_ids_in_bundle.add(product_id)

    # Ensure that bundles with a single product are not allowed
    if len(bundle) < 2:
        return []

    return bundle

# Read data from CSV
df = read_data_from_csv(input_csv)

# Flat list of product rows; each row is tagged with the bundle it belongs to
bundles = []

# Iterate over each consultant
for consultant_id, group in df.groupby('consultant_id'):
    print(f"Generating bundles for Consultant {consultant_id}")

    # Use the average_purchasing_power column as C_max
    consultant_C_max = group['average_purchasing_power'].iloc[0]
    print(f"Average spending - {consultant_C_max}")

    # The combined score and the anchor depend only on the consultant's rows,
    # so compute them once instead of once per bundle.
    # NOTE(review): as a result every bundle of a consultant shares the same
    # anchor; bundles differ only through the random business score. Confirm
    # this is intended.
    group_scored = calculate_combined_score(group, alpha, beta)
    anchor_product = select_anchor_product(group_scored)

    for bundle_id in range(num_bundles):  # Generate 'num_bundles' bundles for each consultant
        unique_bundle_id = f"{consultant_id}_Bundle_{bundle_id + 1}"
        print(f"unique_bundle_id - {unique_bundle_id}")

        # Build the bundle using average spending as C_max
        bundle = build_bundle(group_scored, anchor_product, purchasing_power, alpha, beta, k, consultant_C_max, theta, gamma, delta, n_categories)

        # Skip empty bundles or bundles with a single product
        if len(bundle) < 2:
            continue

        # Bundle score is the mean combined score of its products
        bundle_score = sum(product['combined_score'] for product in bundle) / len(bundle)

        # Save bundle details
        for idx, product in enumerate(bundle):
            # The copy already carries any category/business scores the row has
            product_copy = product.copy()
            product_copy['consultant_id'] = consultant_id
            product_copy['bundle_id'] = unique_bundle_id
            product_copy['is_anchor'] = 1 if idx == 0 else 0
            product_copy['bundle_score'] = bundle_score  # Same value on every row of the bundle

            # Expose the selection score under its own name; the anchor row
            # was never scored as a candidate and therefore has none.
            if 'score' in product:
                product_copy['selection_score'] = product['score']

            bundles.append(product_copy)

# Save the final bundles to CSV
save_bundle_to_csv(bundles, output_csv)


Generating bundles for Consultant 3441296
Average spending - 430.195
unique_bundle_id - 3441296_Bundle_1
Business Score Debug: product_id=200090349 category=CUIDADO PERSONAL score=1
Business Score Debug: product_id=200095864 category=FRAGANCIAS score=3
Business Score Debug: product_id=200107183 category=MAQUILLAJE score=0
Business Score Debug: product_id=200108045 category=TRATAMIENTO CORPORAL score=1
Business Score Debug: product_id=200095159 category=CUIDADO PERSONAL score=0
Business Score Debug: product_id=200106440 category=MAQUILLAJE score=0
Business Score Debug: product_id=200107144 category=MAQUILLAJE score=3
Business Score Debug: product_id=200108160 category=FRAGANCIAS score=3
Business Score Debug: product_id=200100407 category=TRATAMIENTO CORPORAL score=3
Business Score Debug: product_id=200096220 category=TRATAMIENTO CORPORAL score=3
Business Score Debug: product_id=200095159 category=CUIDADO PERSONAL score=0
Business Score Debug: product_id=200106488 category=TRATAMIENTO FA