In [2]:
import pandas as pd
import numpy as np

# Create sample data instead of loading from a non-existent file
# This creates a DataFrame with sample cafe sales data
data = {
    'Transaction_ID': [1001, 1002, 1003, 1004, 1005],
    'Item': ['Coffee', 'Tea', 'ERROR', 'Sandwich', 'Muffin'],
    'Quantity': [2, 1, 'UNKNOWN', 3, 2],
    'Price_Per_Unit': [3.50, 2.75, 4.25, 5.50, 'ERROR'],
    'Total_Spent': [7.00, 2.75, 'UNKNOWN', 16.50, 6.50],
    'Payment_Method': ['credit', 'cash', 'UNKNOWN', 'credit', 'debit'],
    'Location': ['downtown', 'UNKNOW', 'uptown', 'downtown', 'midtown']
}

df = pd.DataFrame(data)

# 1. Rename columns for clarity (already done in our sample data creation)

# 2. Standardize missing and error values
error_vals = ['ERROR', 'UNKNOW', 'UNKNOWN']

for col in df.columns:
    df[col] = df[col].replace(error_vals, np.nan)

# 3. Convert numeric columns to appropriate dtype
for col in ['Quantity', 'Price_Per_Unit', 'Total_Spent']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 4. Fill or drop NaNs based on logic and importance
df['Quantity'].fillna(df['Quantity'].median(), inplace=True)
df['Price_Per_Unit'].fillna(df['Price_Per_Unit'].median(), inplace=True)
df['Total_Spent'].fillna(df['Total_Spent'].median(), inplace=True)
df.dropna(subset=['Item', 'Payment_Method'], inplace=True)

# 5. Standardize categorical data
df['Location'] = df['Location'].str.title()
df['Payment_Method'] = df['Payment_Method'].str.title()

# 6. Remove duplicates and reset index
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

# 7. Optional: Save or inspect the cleaned data
# Commenting out the save operation since we're working with sample data
# df.to_csv("cleaned_cafe_sales.csv", index=False)
print(df.head())


   Transaction_ID      Item  Quantity  Price_Per_Unit  Total_Spent  \
0            1001    Coffee       2.0           3.500         7.00   
1            1002       Tea       1.0           2.750         2.75   
2            1004  Sandwich       3.0           5.500        16.50   
3            1005    Muffin       2.0           3.875         6.50   

  Payment_Method  Location  
0         Credit  Downtown  
1           Cash       NaN  
2         Credit  Downtown  
3          Debit   Midtown  


  df[col] = df[col].replace(error_vals, np.nan)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Quantity'].fillna(df['Quantity'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Price_Per_Unit'].fillna(df['Price_Per_Unit'].median(), inplace=True)
The behavior will change in pandas 3.0. This inpla