# 📊 Cafe Sales Data Cleaning and Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

import warnings
warnings.filterwarnings('ignore')


### Load the dataset

In [2]:
df = pd.read_csv('dirty_cafe_sales.csv')  
df.head()


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11


#### Drop duplicate records to ensure data integrity

In [3]:
df.drop_duplicates(inplace=True)

### Initial Data Cleaning
#### Replace 'ERROR' in 'Total Spent' with np.nan

In [4]:
df['Total Spent'] = pd.to_numeric(df['Total Spent'], errors='coerce')


#### Replace 'UNKNOWN' with np.nan for consistency

In [5]:
df.replace('UNKNOWN', np.nan, inplace=True)


#### Convert 'Transaction Date' to datetime format


In [6]:
df['Transaction Date'] = pd.to_datetime(df['Transaction Date'], errors='coerce')


#### Check for missing values


In [7]:
df.isnull().sum()

Transaction ID         0
Item                 677
Quantity             309
Price Per Unit       343
Total Spent          502
Payment Method      2872
Location            3603
Transaction Date     460
dtype: int64

## Fill missing values using simple imputation strategies
### Mode imputation for categorical column

In [8]:
df['Payment Method'].fillna(df['Payment Method'].mode()[0], inplace=True)
df['Location'].fillna(df['Location'].mode()[0], inplace=True)

### Convert numeric columns explicitly


In [9]:
df['Quantity'] = pd.to_numeric(df['Quantity'], errors='coerce')
df['Price Per Unit'] = pd.to_numeric(df['Price Per Unit'], errors='coerce')
df['Total Spent'] = pd.to_numeric(df['Total Spent'], errors='coerce')

### Drop any rows with remaining missing values


In [10]:
df.dropna(inplace=True)


### Confirm all missing values handled


In [11]:
df.isnull().sum()

Transaction ID      0
Item                0
Quantity            0
Price Per Unit      0
Total Spent         0
Payment Method      0
Location            0
Transaction Date    0
dtype: int64

## Extract temporal features from 'Transaction Date'


In [12]:
df['Transaction Date'] = pd.to_datetime(df['Transaction Date'])
df['year'] = df['Transaction Date'].dt.year
df['month'] = df['Transaction Date'].dt.month
df['day'] = df['Transaction Date'].dt.day
df['dayofweek'] = df['Transaction Date'].dt.dayofweek
df['hour'] = df['Transaction Date'].dt.hour


## Encode categorical variables with LabelEncoder


In [13]:
label_encoders = {}
categorical_cols = df.select_dtypes(include=['object']).columns

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


## Store encoders for Payment Method, Location, and Item


In [14]:
payment_encoder = LabelEncoder()
location_encoder = LabelEncoder()
item_encoder = LabelEncoder()

df['Payment Method'] = payment_encoder.fit_transform(df['Payment Method'])
df['Location'] = location_encoder.fit_transform(df['Location'])


### Encode target column (Item)


In [15]:
df['Item'] = item_encoder.fit_transform(df['Item'])


### Show encoded classes


In [16]:
print("Item label classes:", item_encoder.classes_)

Item label classes: [0 1 2 3 4 5 6 7 8]


## Define features and target


In [17]:
X = df[['Quantity', 'Price Per Unit', 'Payment Method', 'Location']]
y = df['Item']

# Split data for training and testing


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Feature Scaling


In [19]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

### Train Random Forest Classifier


In [20]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

## Save the trained model and encoders


In [21]:
# Save Model & Encoders
joblib.dump(model, "final_rf_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(payment_encoder, "payment_encoder.pkl")
joblib.dump(location_encoder, "location_encoder.pkl")
joblib.dump(item_encoder, "item_label_encoder.pkl")

print("\u2705 All assets saved successfully!")

✅ All assets saved successfully!
