In [1]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [2]:
import warnings

# Turn OFF (suppress) all warnings for the rest of the session.
# NOTE: this also silences pandas/seaborn deprecation and chained-assignment
# warnings, so remove or narrow this filter while debugging.
warnings.filterwarnings("ignore")

In [3]:
# Fetch seaborn's bundled "tips" example dataset as a DataFrame.
df = sns.load_dataset(name="tips")

In [4]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [5]:
# Plain-text preview of the first five rows.
print(df.head(n=5))

   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4


In [6]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it must not be wrapped in print() (that would append a stray "None").
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB
None


# 1. Check for Missing Values

In [7]:
# Count missing entries per column (isna is the canonical name for isnull).
print(df.isna().sum())

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64


In [8]:
# Artificially blank out one numeric and one categorical cell so the
# missing-value handling below has something to work on.
for row_label, column in ((3, "tip"), (5, "smoker")):
    df.loc[row_label, column] = None

# Confirm that exactly those two cells are now reported as missing.
print(df.isna().sum())

total_bill    0
tip           1
sex           0
smoker        1
day           0
time          0
size          0
dtype: int64


# 2. Handle Missing Values

In [9]:
# Replace missing tips with the column median (the median skips NaN, and the
# right-hand side is evaluated before any cell is overwritten).
df.loc[df["tip"].isna(), "tip"] = df["tip"].median()

In [10]:
# Replace missing smoker flags with the most frequent category
# (mode() returns a Series; its first entry is taken).
df.loc[df["smoker"].isna(), "smoker"] = df["smoker"].mode().iloc[0]

In [11]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,2.88,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


# 3. Remove Duplicates

In [12]:
# Remove fully identical rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep="first")

In [13]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,2.88,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


# 4. Fix Data Types

In [14]:
# Check datatypes: print the dtype of every column so any column needing
# conversion can be spotted before the casts below.
print(df.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object


In [15]:
# Example dtype conversion: store the party size as a 64-bit float.
df["size"] = df["size"].astype("float64")

In [16]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2.0
1,10.34,1.66,Male,No,Sun,Dinner,3.0
2,21.01,3.50,Male,No,Sun,Dinner,3.0
3,23.68,2.88,Male,No,Sun,Dinner,2.0
4,24.59,3.61,Female,No,Sun,Dinner,4.0
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3.0
240,27.18,2.00,Female,Yes,Sat,Dinner,2.0
241,22.67,2.00,Male,Yes,Sat,Dinner,2.0
242,17.82,1.75,Male,No,Sat,Dinner,2.0


# 5. Handle Outliers (IQR method on total_bill)

In [17]:
# Tukey's rule: keep only rows whose total_bill lies within 1.5 * IQR of
# the first and third quartiles.
Q1 = df["total_bill"].quantile(0.25)
Q3 = df["total_bill"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# between() is inclusive on both ends. The explicit .copy() makes the filtered
# result an independent frame, so the column assignments in later cells write
# to it directly instead of to a slice of the unfiltered frame
# (avoids SettingWithCopyWarning / chained-assignment ambiguity).
df = df[df["total_bill"].between(lower_bound, upper_bound)].copy()


In [18]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2.0
1,10.34,1.66,Male,No,Sun,Dinner,3.0
2,21.01,3.50,Male,No,Sun,Dinner,3.0
3,23.68,2.88,Male,No,Sun,Dinner,2.0
4,24.59,3.61,Female,No,Sun,Dinner,4.0
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3.0
240,27.18,2.00,Female,Yes,Sat,Dinner,2.0
241,22.67,2.00,Male,Yes,Sat,Dinner,2.0
242,17.82,1.75,Male,No,Sat,Dinner,2.0


# 6. Clean Strings (Formatting)

In [19]:
# Normalise the 'sex' labels: trim surrounding whitespace, then lowercase.
df["sex"] = (
    df["sex"]
    .str.strip()
    .str.lower()
)
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,female,No,Sun,Dinner,2.0
1,10.34,1.66,male,No,Sun,Dinner,3.0
2,21.01,3.50,male,No,Sun,Dinner,3.0
3,23.68,2.88,male,No,Sun,Dinner,2.0
4,24.59,3.61,female,No,Sun,Dinner,4.0
...,...,...,...,...,...,...,...
239,29.03,5.92,male,No,Sat,Dinner,3.0
240,27.18,2.00,female,Yes,Sat,Dinner,2.0
241,22.67,2.00,male,Yes,Sat,Dinner,2.0
242,17.82,1.75,male,No,Sat,Dinner,2.0


In [20]:
# Title-case the 'smoker' labels (e.g. "yes" -> "Yes"), then show the frame.
smoker_labels = df["smoker"]
df["smoker"] = smoker_labels.str.title()
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,female,No,Sun,Dinner,2.0
1,10.34,1.66,male,No,Sun,Dinner,3.0
2,21.01,3.50,male,No,Sun,Dinner,3.0
3,23.68,2.88,male,No,Sun,Dinner,2.0
4,24.59,3.61,female,No,Sun,Dinner,4.0
...,...,...,...,...,...,...,...
239,29.03,5.92,male,No,Sat,Dinner,3.0
240,27.18,2.00,female,Yes,Sat,Dinner,2.0
241,22.67,2.00,male,Yes,Sat,Dinner,2.0
242,17.82,1.75,male,No,Sat,Dinner,2.0


# 7. Rename Columns

In [21]:
# Give the numeric columns descriptive CamelCase names.
df = df.rename(
    columns=dict(total_bill="TotalBill", tip="TipAmount", size="PartySize")
)

In [22]:
df

Unnamed: 0,TotalBill,TipAmount,sex,smoker,day,time,PartySize
0,16.99,1.01,female,No,Sun,Dinner,2.0
1,10.34,1.66,male,No,Sun,Dinner,3.0
2,21.01,3.50,male,No,Sun,Dinner,3.0
3,23.68,2.88,male,No,Sun,Dinner,2.0
4,24.59,3.61,female,No,Sun,Dinner,4.0
...,...,...,...,...,...,...,...
239,29.03,5.92,male,No,Sat,Dinner,3.0
240,27.18,2.00,female,Yes,Sat,Dinner,2.0
241,22.67,2.00,male,Yes,Sat,Dinner,2.0
242,17.82,1.75,male,No,Sat,Dinner,2.0


# 8. Encode Categorical Variables

In [23]:
# One-hot encode the categorical columns; drop_first omits the first level of
# each column so the indicator columns are not redundant.
df = pd.get_dummies(
    data=df,
    columns=["sex", "smoker", "day", "time"],
    drop_first=True,
)

In [24]:
df

Unnamed: 0,TotalBill,TipAmount,PartySize,sex_male,smoker_Yes,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,1.01,2.0,False,False,False,False,True,True
1,10.34,1.66,3.0,True,False,False,False,True,True
2,21.01,3.50,3.0,True,False,False,False,True,True
3,23.68,2.88,2.0,True,False,False,False,True,True
4,24.59,3.61,4.0,False,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3.0,True,False,False,True,False,True
240,27.18,2.00,2.0,False,True,False,True,False,True
241,22.67,2.00,2.0,True,True,False,True,False,True
242,17.82,1.75,2.0,True,False,False,True,False,True


# 9. Standardization (Scaling Numeric Data)

In [25]:
# Standardise the continuous columns to zero mean and unit variance.
# StandardScaler is imported with the other dependencies in the first cell.
# NOTE(review): the scaler is fitted on the full dataset; if this frame is later
# split for modelling, fit on the training portion only to avoid leakage.
numeric_columns = ["TotalBill", "TipAmount", "PartySize"]

scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

In [26]:
df

Unnamed: 0,TotalBill,TipAmount,PartySize,sex_male,smoker_Yes,day_Fri,day_Sat,day_Sun,time_Dinner
0,-0.246806,-1.547054,-0.576779,False,False,False,False,True,True
1,-1.141975,-1.015983,0.520508,True,False,False,False,True,True
2,0.294334,0.487356,0.520508,True,False,False,False,True,True
3,0.653748,-0.019204,-0.576779,True,False,False,False,True,True
4,0.776245,0.577230,1.617794,False,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...
239,1.373922,2.464575,0.520508,True,False,False,True,False,True
240,1.124890,-0.738192,-0.576779,False,True,False,True,False,True
241,0.517790,-0.738192,-0.576779,True,True,False,True,False,True
242,-0.135078,-0.942450,-0.576779,True,False,False,True,False,True


# 10. Save Cleaned Dataset

In [27]:
# Write the cleaned frame to disk without the index column, then confirm.
df.to_csv(path_or_buf="cleaned_tips.csv", index=False)
print("✅ Tips dataset cleaned and saved!")

✅ Tips dataset cleaned and saved!
