In [2]:
import plotly.express as px
import numpy as np
import pandas as pd

# Tipping data
## Description
One waiter recorded information about each tip he received over a period of a few months working in one restaurant. He collected several variables:

## Details
tip in dollars,  
bill in dollars,  
sex of the bill payer,  
whether there were smokers in the party,  
day of the week,  
time of day,  
size of the party,  
number of the table.

In [None]:
# Read the waiter's records from the Excel workbook (relative path, resolved
# against the current working directory) and preview the first ten rows.
raw_tip_data = pd.read_excel(io="tip_data.xlsx")
raw_tip_data.head(n=10)

Unnamed: 0,bill,tip,sex,smoker,day,time,size,number
0,16.99,1.01,Female,No,Sun,Dinner,2,1.0
1,10.34,1.66,Male,No,Sun,Dinner,3,
2,21.01,3.5,Male,No,Sun,Dinner,3,
3,23.68,3.31,Male,No,Sun,Dinner,2,
4,24.59,3.61,Female,No,Sun,Dinner,4,
5,25.29,4.71,Male,No,Sun,Dinner,4,
6,8.77,2.0,Male,No,Sun,Dinner,2,
7,26.88,3.12,Male,No,Sun,Dinner,4,
8,15.04,,Male,No,Sun,Dinner,2,
9,14.78,3.23,Male,No,Sun,Dinner,2,2.0


In [None]:
# Dimensions of the loaded table as a (rows, columns) tuple.
raw_tip_data.shape

(244, 8)

In [None]:
# Column labels of the loaded table.
raw_tip_data.columns

Index(['bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size', 'number'], dtype='object')

In [None]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
raw_tip_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   bill    240 non-null    float64
 1   tip     241 non-null    float64
 2   sex     244 non-null    object 
 3   smoker  244 non-null    object 
 4   day     244 non-null    object 
 5   time    244 non-null    object 
 6   size    244 non-null    int64  
 7   number  10 non-null     float64
dtypes: float64(3), int64(1), object(4)
memory usage: 15.4+ KB


In [None]:
# Number of missing values in each column, before any cleaning.
raw_tip_data.isna().sum()

bill        4
tip         3
sex         0
smoker      0
day         0
time        0
size        0
number    234
dtype: int64

In [None]:
# The `number` column is missing in 234 of the 244 rows and is dropped here.
# Rebinding the name (instead of `inplace=True`) together with
# `errors='ignore'` keeps this cell safe to re-run: a second execution no
# longer raises KeyError because the column is already gone.
raw_tip_data = raw_tip_data.drop(columns=['number'], errors='ignore')

In [None]:
# Fill the gaps in the two monetary columns with each column's own mean.
for money_col in ('bill', 'tip'):
    column_mean = raw_tip_data[money_col].mean()
    raw_tip_data[money_col] = raw_tip_data[money_col].fillna(column_mean)

In [None]:
# Re-check missing values after dropping `number` and imputing `bill`/`tip`.
# The assertion makes a Restart & Run All fail loudly if any NaN survived,
# instead of relying on someone reading the table below.
missing_per_column = raw_tip_data.isna().sum()
assert missing_per_column.sum() == 0, "unexpected missing values remain after cleaning"
missing_per_column

bill      0
tip       0
sex       0
smoker    0
day       0
time      0
size      0
dtype: int64

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
raw_tip_data.describe()

Unnamed: 0,bill,tip,size
count,244.0,244.0,244.0
mean,19.77925,3.003776,2.569672
std,8.892165,1.381587,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.815,3.0,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [None]:
# Preview the first five rows of the cleaned frame.
raw_tip_data.head()

Unnamed: 0,bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [None]:
# Total amount paid per row: the bill plus the tip left on top of it.
raw_tip_data['total_bill'] = raw_tip_data['bill'].add(raw_tip_data['tip'])

In [None]:
# Tip divided by the total paid (`total_bill`). Despite the column name, the
# stored value is a ratio (e.g. 0.14), not a number of percent.
# NOTE(review): tip rates are often quoted relative to the pre-tip bill —
# confirm that dividing by `total_bill` is the intended definition.
raw_tip_data['tip%'] = raw_tip_data['tip'].div(raw_tip_data['total_bill'])
raw_tip_data.head()

Unnamed: 0,bill,tip,sex,smoker,day,time,size,total_bill,tip%
0,16.99,1.01,Female,No,Sun,Dinner,2,18.0,0.056111
1,10.34,1.66,Male,No,Sun,Dinner,3,12.0,0.138333
2,21.01,3.5,Male,No,Sun,Dinner,3,24.51,0.142799
3,23.68,3.31,Male,No,Sun,Dinner,2,26.99,0.122638
4,24.59,3.61,Female,No,Sun,Dinner,4,28.2,0.128014


In [None]:
# Analysis frame: the columns in reading order, with numeric values rounded to
# two decimals (text columns pass through `round` unchanged).
# NOTE(review): the plots below read the rounded `tip%`, so their resolution is
# limited to 0.01 — confirm that is acceptable.
analysis_columns = [
    'total_bill', 'bill', 'tip', 'tip%',
    'size', 'day', 'time', 'sex', 'smoker',
]
tip_data = raw_tip_data[analysis_columns].round(2)
tip_data

Unnamed: 0,total_bill,bill,tip,tip%,size,day,time,sex,smoker
0,18.00,16.99,1.01,0.06,2,Sun,Dinner,Female,No
1,12.00,10.34,1.66,0.14,3,Sun,Dinner,Male,No
2,24.51,21.01,3.50,0.14,3,Sun,Dinner,Male,No
3,26.99,23.68,3.31,0.12,2,Sun,Dinner,Male,No
4,28.20,24.59,3.61,0.13,4,Sun,Dinner,Female,No
...,...,...,...,...,...,...,...,...,...
239,34.95,29.03,5.92,0.17,3,Sat,Dinner,Male,No
240,29.18,27.18,2.00,0.07,2,Sat,Dinner,Female,Yes
241,21.78,19.78,2.00,0.09,2,Sat,Dinner,Male,Yes
242,19.57,17.82,1.75,0.09,2,Sat,Dinner,Male,No


In [None]:
# Number of rows recorded for each value of `time`.
tip_data['time'].value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

In [None]:
# Number of rows per day, ordered from the least to the most frequent day.
tip_data['day'].value_counts(ascending=True)

day
Fri     19
Thur    62
Sun     76
Sat     87
Name: count, dtype: int64

In [None]:
# Share of the total tip money collected on each day: rows with the same
# `day` label are grouped into one sector sized by their summed `tip`.
# A title is set so the figure can be read on its own.
fig = px.pie(
    tip_data,
    values='tip',
    names='day',
    title="Share of total tips by day",
)
fig.update_layout(legend_title="day")
fig.show()

In [None]:
# Distribution of the tip fraction, split by the sex of the bill payer, with
# the count printed on each bar (`text_auto=True`). The frame is passed
# together with column names, a title is set so the figure stands alone, and
# all layout tweaks are applied in a single `update_layout` call.
fig = px.histogram(
    tip_data,
    x='tip%',
    color='sex',
    text_auto=True,
    title="Distribution of tip fraction by sex of the bill payer",
)
fig.update_layout(
    bargap=0.2,
    xaxis_title="tip % bins",
    yaxis_title="count",
    legend_title="sex",
)
fig.show()

In [None]:
# Tip fraction against the total paid, one marker per recorded tip: colour
# encodes the day and marker size follows the tip amount in dollars.
# The frame is passed together with column names, and a title is set so the
# figure can be read on its own.
fig = px.scatter(
    tip_data,
    x='total_bill',
    y='tip%',
    color='day',
    size='tip',
    title="Tip fraction vs. total bill by day (marker size = tip)",
)
fig.update_layout(
    xaxis_title="total bill",
    yaxis_title="tip %",
    legend_title="day",
)
fig.show()

2222222222
