In [1]:
import pandas as pd
import numpy as np
from random import randrange

In [2]:
# Pool of topping names that choose_toppings() samples from.
# Source: https://www.ranker.com/list/the-tastiest-pizza-toppings/chef-jen
# The entries are runtime data: they end up verbatim in the generated rows.
pizza_toppings = [
    'Alfredo Sauce',
    'Anchovy',
    'Artichoke Hearts',
    'Artichoke',
    'Arugula',
    'Asiago Cheese',
    'Bacon',
    'Banana Peppers',
    'Basil',
    'BBQ Pulled Pork',
    'BBQ Sauce',
    'Beef',
    'Black Olives',
    'Broccoli',
    'Buffalo Chicken Strips',
    'Buffalo Mozzarella',
    'Canadian Bacon',
    'Capers',
    'Capicola',
    'Capsicum',
    'Caramelised Onions',
    'Cherry Tomatoes',
    'Chicken',
    'Chorizo',
    'Crab Meat',
    'Crackerbread',
    'Crushed Red Pepper',
    'Duck',
    'Egg',
    'Eggplant',
    'Emmental',
    'Falafel',
    'Feta Cheese',
    'Fried clams',
    'Garlic',
    'Goat cheese',
    'Gorgonzola',
    'Green Bell Peppers',
    'Green Olives',
    'Gyro Meat',
    'Habanero Peppers',
    'Ham',
    'Hamburger',
    'Hot dog',
    'Italian Sausage',
    'Italian Sweet Pepper',
    'Jalapeño',
    'Meatballs',
    'Mozzarella',
    'Mushroom',
    'Onion',
    'Oregano',
    'Oysters',
    'Parmigiano Reggiano',
    'Peperoncini',
    'Pepperoni',
    'Pesto',
    'Philly Steak',
    'Pineapple',
    'Prosciutto',
    'Provolone',
    'Red Bell Pepper',
    'Red Onion',
    'Ricotta Cheese',
    'Salami',
    'Sauerkraut',
    'Sausage',
    'Seafood',
    'Shellfish',
    'Shrimps',
    'Spinach',
    'Squid',
    'Sun-Dried Tomatoes',
    'Swedish Meatballs',
    'Sweetcorn',
    'Tomato',
    'Tuna',
    'Venison'
]

In [3]:
def how_many():
    """Draw how many toppings a single pizza gets.

    Returns:
        One integer sampled uniformly from 0 through 4 (inclusive) using
        NumPy's global random state.
    """
    # randint's upper bound is exclusive, so 5 yields values in {0, ..., 4}.
    (count,) = np.random.randint(5, size=1)
    return count

def choose_toppings():
    """Pick a random set of toppings for one pizza.

    The number of toppings comes from ``how_many()``; the names are drawn
    from ``pizza_toppings``.

    Returns:
        A list of distinct topping names as built-in ``str`` objects, or
        ``np.nan`` when the pizza gets no toppings.
    """
    n_toppings = how_many()
    if n_toppings == 0:
        # NaN (rather than an empty list) marks a pizza without toppings.
        return np.nan
    # replace=False: the same topping must not be listed twice on one pizza.
    # This requires n_toppings <= len(pizza_toppings).
    toppings = np.random.choice(pizza_toppings, n_toppings, replace=False)
    # tolist() converts NumPy string scalars to plain str. list(ndarray) would
    # keep np.str_ items, whose repr under NumPy >= 2 is "np.str_('Bacon')",
    # and that repr is what appears when the list is written out as text.
    return toppings.tolist()

def choose_date(start, end):
    """Return a random moment between ``start`` and ``end``.

    Args:
        start: Lower bound (inclusive), e.g. a ``pd.Timestamp``.
        end: Upper bound (exclusive).

    Returns:
        ``start`` shifted forward by a whole number of seconds chosen with
        ``random.randrange``; sub-second parts of the span are ignored.

    Raises:
        ValueError: From ``randrange`` when the span contains no whole second.
    """
    span = end - start
    seconds_per_day = 24 * 60 * 60
    span_in_seconds = span.days * seconds_per_day + span.seconds
    offset = randrange(span_in_seconds)
    return start + pd.Timedelta(seconds=offset)

def count_toppings(toppings):
    """Count the toppings on a pizza.

    Args:
        toppings: A sized collection of topping names, or a value without a
            length (such as ``np.nan``) meaning "no toppings".

    Returns:
        ``len(toppings)``, or 0 when ``toppings`` has no length.
    """
    try:
        return len(toppings)
    except TypeError:
        # Only "object has no len()" means no toppings; anything else
        # (including KeyboardInterrupt) should propagate instead of being hidden.
        return 0

def add_calories(toppings):
    """Compute the extra calories contributed by a pizza's toppings.

    A single per-topping calorie value is drawn from a normal distribution
    (mean 20, standard deviation 5), truncated to an int, and multiplied by
    the number of toppings reported by ``count_toppings``.

    Args:
        toppings: Whatever ``count_toppings`` accepts.

    Returns:
        The integer number of extra calories (0 when there are no toppings).
    """
    topping_count = count_toppings(toppings)
    # One draw is shared by every topping on this pizza.
    calories_each = int(np.random.normal(20, 5))
    return topping_count * calories_each

In [4]:
# Parameters of the simulated base-calorie distributions
pepe_mean, modern_mean = 250, 300  # mean passed to np.random.normal per pizzeria
base_std = 15                      # standard deviation shared by both pizzerias
sample_size = 100                  # number of values drawn per pizzeria

In [25]:
# Create base data: normally distributed base calories per pizzeria,
# truncated to whole numbers.
pepe_samples = np.random.normal(pepe_mean, base_std, sample_size).astype(int)
modern_samples = np.random.normal(modern_mean, base_std, sample_size).astype(int)

base_calories = np.concatenate([pepe_samples, modern_samples])
pizzeria_labels = ["Pepe's"] * len(pepe_samples) + ['Modern'] * len(modern_samples)

# Draw one random topping selection per pizza.
toppings_col = [choose_toppings() for _ in range(len(base_calories))]

# Build the frame in one step rather than concatenating partial frames.
df = pd.DataFrame({
    'base_calories': base_calories,
    'pizzeria': pizzeria_labels,
    'toppings': toppings_col,
})

# Add extra calories: assign the whole column at once. Filling a new column
# cell by cell through .loc inside iterrows() is slow and depends on pandas
# upcasting the column while it is being enlarged.
extra_calories = np.array([add_calories(toppings) for toppings in df['toppings']])
df['calories'] = (df['base_calories'] + extra_calories).astype(int)

# Add dates: one random timestamp per row within [start, end).
start = pd.Timestamp('2006-01-01')
end = pd.Timestamp('2007-01-01')
df['date'] = [choose_date(start, end) for _ in range(len(df))]

df = df.sort_values('date')

In [26]:
# Write the selected columns to CSV, leaving out the row index.
data = df.loc[:, ['date', 'pizzeria', 'toppings', 'calories']]
data.to_csv('pizza-data.csv', index=False)

In [7]:
# Size of the mystery sample. It has its own name so the shared `sample_size`
# is not overwritten: re-running the main data cell after this one would
# otherwise silently draw 10 rows per pizzeria.
mystery_sample_size = 10

# Randomly pick which pizzeria's distribution the mystery pizzas come from
# (0 -> pepe_mean, 1 -> modern_mean).
rand_choice = np.random.randint(2)
mystery_mean = pepe_mean if rand_choice == 0 else modern_mean
mystery_samples = np.random.normal(mystery_mean, base_std, mystery_sample_size).astype(int)

# Draw one random topping selection per pizza.
toppings_col = [choose_toppings() for _ in range(len(mystery_samples))]

# Build the frame in one step rather than concatenating partial frames.
df = pd.DataFrame({
    'base_calories': mystery_samples,
    'toppings': toppings_col,
})

# Add extra calories: assign the whole column at once instead of filling it
# cell by cell through .loc inside iterrows().
extra_calories = np.array([add_calories(toppings) for toppings in df['toppings']])
df['calories'] = (df['base_calories'] + extra_calories).astype(int)

In [8]:
# Write the selected columns to CSV, leaving out the row index.
data = df.loc[:, ['toppings', 'calories']]
data.to_csv('mystery-pizza-data.csv', index=False)