# This notebook is used to generate synthetic data 

In [4]:
import seaborn as sns
import pandas as pd
import numpy as np

# Load the original Seaborn 'tips' dataset
tips = sns.load_dataset('tips')

# Display the first few rows of the original dataset
tips.head()


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## B4 has three use-cases:

  - B4.1 The waiter kept collecting data; we want to append this data to our existing dataset
  - B4.2 Another waiter in the same restaurant collected data. But used a different sequence of the columns.
  - B4.3 In two other restaurants similar data was collected. We want to merge all this data into one dataset.

### B4.1 create 500 new entries and export as csv

In [5]:
# Define a function to generate a single random tip record
def generate_random_tip():
    total_bill = np.random.normal(loc=tips['total_bill'].mean(), scale=tips['total_bill'].std())
    tip = np.random.normal(loc=tips['tip'].mean(), scale=tips['tip'].std())
    sex = np.random.choice(['Male', 'Female'])
    smoker = np.random.choice(['Yes', 'No'])
    day = np.random.choice(['Thur', 'Fri', 'Sat', 'Sun'])
    time = np.random.choice(['Lunch', 'Dinner'])
    size = np.random.choice([1, 2, 3, 4, 5, 6], p=[0.15, 0.3, 0.3, 0.15, 0.05, 0.05])
    
    return [total_bill, tip, sex, smoker, day, time, size]

# Generate 500 random tip records
random_tips = [generate_random_tip() for _ in range(500)]

# Create a DataFrame with the generated data
random_tips_df = pd.DataFrame(random_tips, columns=tips.columns)

# Display the first few rows of the generated dataset
print(random_tips_df.head())

# Save the generated dataset to a CSV file
random_tips_df.to_csv('B4.1tips.csv', index=False)

   total_bill       tip     sex smoker  day    time  size
0   27.710399  4.821066    Male     No  Sat  Dinner     1
1   23.910417  2.727648  Female    Yes  Fri   Lunch     2
2   26.748398  2.943932    Male    Yes  Sun   Lunch     5
3   20.627826  5.729807  Female    Yes  Fri  Dinner     1
4   19.055699  2.999827  Female    Yes  Fri  Dinner     2


### B4.2 Different Waiter
- 400 records (to make it easily distinguishable)
- is better because gets on avg $2 more
- while recrods the same columns, the order is different

In [6]:
# Define a function to generate a single random tip record
def generate_better_waiter_tip():
    total_bill = np.random.normal(loc=tips['total_bill'].mean(), scale=tips['total_bill'].std())
    original_tip = np.random.normal(loc=tips['tip'].mean(), scale=tips['tip'].std())
    adjusted_tip = original_tip + 2
    sex = np.random.choice(['Male', 'Female'])
    smoker = np.random.choice(['Yes', 'No'])
    day = np.random.choice(['Thur', 'Fri', 'Sat', 'Sun'])
    time = np.random.choice(['Lunch', 'Dinner'])
    size = np.random.choice([1, 2, 3, 4, 5, 6], p=[0.15, 0.3, 0.3, 0.15, 0.05, 0.05])
    
    return {
        'total_bill': total_bill,
        'tip': adjusted_tip,
        'sex': sex,
        'smoker': smoker,
        'day': day,
        'time': time,
        'size': size
    }
    
    # return [total_bill, tip, sex, smoker, day, time, size]

# Generate 500 random tip records
waiter2_tips = [generate_better_waiter_tip() for _ in range(400)]

# Create a DataFrame with the generated data
waiter2_tips_df = pd.DataFrame(waiter2_tips, columns=tips.columns)


# Shuffle the columns of the DataFrame
shuffled_waiter2_tips_df = waiter2_tips_df.sample(frac=1, axis=1)

# Display the first few rows of the shuffled DataFrame
shuffled_waiter2_tips_df.head()

# Save the shuffled DataFrame to a CSV file
shuffled_waiter2_tips_df.to_csv('B4.2tips.csv', index=False)


In [7]:
random_tips_df.describe()


Unnamed: 0,total_bill,tip,size
count,500.0,500.0,500.0
mean,19.351315,2.937514,2.838
std,8.71423,1.412871,1.289605
min,-3.33384,-0.604297,1.0
25%,13.414918,2.039147,2.0
50%,19.038793,2.843053,3.0
75%,25.317973,3.872428,4.0
max,44.148825,7.426728,6.0


In [8]:
shuffled_waiter2_tips_df.describe()

Unnamed: 0,tip,size,total_bill
count,400.0,400.0,400.0
mean,5.046623,2.7575,19.244289
std,1.349415,1.295333,8.954652
min,1.25363,1.0,-7.495987
25%,4.164131,2.0,13.186137
50%,5.040953,3.0,19.228136
75%,5.9869,3.0,24.680867
max,9.04363,6.0,48.825816


## B4.3 In two other restaurants similar data was collected

- the biggest party is 5 not 6
- the bill is twice as high
- the tip is 1.5 times as high 


In [11]:
# Define a function to generate a single random tip record with custom specifications
def generate_other_restaurant_tip():
    total_bill = np.random.normal(loc=tips['total_bill'].mean() * 2, scale=tips['total_bill'].std())
    original_tip = np.random.normal(loc=tips['tip'].mean(), scale=tips['tip'].std())
    adjusted_tip = original_tip * 1.5
    sex = np.random.choice(['Male', 'Female'], p=[0.1, 0.9])
    smoker = np.random.choice(['Yes', 'No'])
    day = np.random.choice(['Thur', 'Fri', 'Sat', 'Sun'])
    time = np.random.choice(['Lunch', 'Dinner'])
    size = np.random.choice([1, 2, 3, 4, 5], p=[0.15, 0.3, 0.3, 0.15, 0.1])
    
    return {
        'total_bill': total_bill,
        'tip': adjusted_tip,
        'sex': sex,
        'smoker': smoker,
        'day': day,
        'time': time,
        'size': size
    }

# Generate 700 random tip records
restaurant2_tips = [generate_other_restaurant_tip() for _ in range(700)]

# Create a DataFrame with the generated data
restaurant2_tips_df = pd.DataFrame(restaurant2_tips, columns=tips.columns)

# Display the first few rows of the shuffled DataFrame
restaurant2_tips_df.head()

# Save the shuffled DataFrame to a CSV file
restaurant2_tips_df.to_csv('B4.3tips.csv', index=False)
