# In [7]:
import pandas as pd
import random

# Original data from the case study.
# Written one product per row (name, price, units sold in stores A/B/C) and
# transposed into the column-oriented mapping that pd.DataFrame consumes.
data = {
    column: list(values)
    for column, values in zip(
        ("Product", "Prices", "Store A", "Store B", "Store C"),
        zip(
            ("Budweiser Can (Pack of 6)", 600, 20, 65, 163),
            ("Budweiser Bottle (Pack of 6)", 660, 6, 35, 82),
            ("Budweiser Can (Pack of 12)", 1100, 8, 24, 74),
            ("Stella Artois Can (Pack of 6)", 660, 123, 163, 32),
            ("Stella Artois Bottle (Pack of 6)", 720, 108, 105, 10),
            ("Stella Artois Can (Pack of 12)", 1200, 78, 28, 5),
            ("Competitor Can (Pack of 6)", 540, 68, 274, 11),
            ("Competitor Bottle (Pack of 6)", 600, 37, 143, 8),
            ("Competitor Can (Pack of 12)", 1000, 12, 53, 8),
        ),
    )
}

# Bias configuration
# Relative weights used when drawing each generated customer's gender.
gender_distribution = dict(Male=0.7, Female=0.3)

# Per-location brand weights, listed in the order
# (Stella Artois, Budweiser, Competitor).
regional_bias = {
    location: dict(zip(("Stella Artois", "Budweiser", "Competitor"), shares))
    for location, shares in (
        ("Near Shop A", (0.6, 0.3, 0.1)),
        ("Near Shop B", (0.5, 0.4, 0.1)),
        ("Near Shop C", (0.2, 0.4, 0.4)),
    )
}

# Look up the regional brand weight that applies to a product
def get_product_bias(product, location):
    """Return the ``regional_bias`` weight for ``product`` at ``location``.

    The brand is taken from the product name: names containing
    "Stella Artois" or "Budweiser" (checked in that order) map to that brand,
    and every other name falls back to the "Competitor" entry.

    Raises:
        KeyError: if ``location`` is not a key of ``regional_bias``.
    """
    brand = next(
        (name for name in ("Stella Artois", "Budweiser") if name in product),
        "Competitor",
    )
    return regional_bias[location][brand]

# Generate customer-level data
def generate_dataset(df, store_column, location_label, rng=None, gender_weights=None):
    """Expand per-product sales counts into one synthetic record per customer.

    For every row of ``df``, as many records are produced as the value in
    ``store_column``. Each record gets a randomly drawn gender, a random age
    (21-35 when the product name contains "Can", 36-65 otherwise) and the
    row's product name and price.

    Args:
        df: DataFrame with "Product" and "Prices" columns plus ``store_column``.
        store_column: Name of the column holding the number of customers to
            generate for each product.
        location_label: Value stored in every record's "Location" field.
        rng: Optional ``random.Random`` instance to draw from, e.g.
            ``random.Random(42)`` for reproducible output. Defaults to the
            shared module-level ``random`` generator.
        gender_weights: Optional mapping of gender label to relative weight.
            Defaults to the module-level ``gender_distribution``.

    Returns:
        A list of dicts with the keys "Customer Name", "Age", "Gender",
        "Location", "Choice of Beer" and "Prices". Customer names are
        numbered from 1 within a single call.
    """
    # The random module exposes the same choices/randint API as a Random
    # instance, so it can stand in when no generator is supplied.
    rng = random if rng is None else rng
    if gender_weights is None:
        gender_weights = gender_distribution
    genders = list(gender_weights)
    weights = list(gender_weights.values())

    dataset = []
    rows = zip(df["Product"], df["Prices"], df[store_column])
    for product, price, num_customers in rows:
        # The age band depends only on the product, so pick it once per row.
        age_low, age_high = (21, 35) if "Can" in product else (36, 65)
        for _ in range(num_customers):
            # Gender is drawn before age so a seeded generator yields the
            # same sequence of records on every run.
            gender = rng.choices(genders, weights=weights)[0]
            age = rng.randint(age_low, age_high)
            dataset.append({
                "Customer Name": f"Customer {len(dataset) + 1}",
                "Age": age,
                "Gender": gender,
                "Location": location_label,
                "Choice of Beer": product,
                "Prices": price,
            })
    return dataset

# Convert the case-study table to a DataFrame (one row per product)
df_original = pd.DataFrame(data)

# Generate customer-level records for each store
dataset_a = generate_dataset(df_original, "Store A", "Near Shop A")
dataset_b = generate_dataset(df_original, "Store B", "Near Shop B")
dataset_c = generate_dataset(df_original, "Store C", "Near Shop C")

# Combine all datasets
full_dataset = pd.DataFrame(dataset_a + dataset_b + dataset_c)

# generate_dataset numbers customers from 1 on every call, so the combined
# table would otherwise hold "Customer 1", "Customer 2", ... once per store.
# Renumber across the whole table so every customer name is unique.
full_dataset["Customer Name"] = [
    f"Customer {number}" for number in range(1, len(full_dataset) + 1)
]

# Save to a CSV file (without the DataFrame index column)
full_dataset.to_csv("Customer_Data.csv", index=False)
