In [37]:
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, median_absolute_error, mean_squared_error
import numpy as np

In [7]:
def preprocess_data(data):
    """
    Preprocesses data by handling missing values and applying one-hot encoding to categorical columns.
    """

    num_cols = [col for col in data.columns if data[col].dtype != 'object'] 
    data[num_cols].fillna(data[num_cols].mean(), inplace=True)

    categorical_cols = [col for col in data.columns if data[col].dtype == 'object']
    data = pd.get_dummies(data, columns=categorical_cols)

    return data

In [9]:
# Load the dataset 
data = pd.read_csv("supermarket_sales.csv") 

In [46]:
data.head(20)

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,3/8/2019,10:29,Cash,76.4,4.761905,3.82,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,1/27/2019,20:33,Ewallet,465.76,4.761905,23.288,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3
5,699-14-3026,C,Naypyitaw,Normal,Male,Electronic accessories,85.39,7,29.8865,627.6165,3/25/2019,18:30,Ewallet,597.73,4.761905,29.8865,4.1
6,355-53-5943,A,Yangon,Member,Female,Electronic accessories,68.84,6,20.652,433.692,2/25/2019,14:36,Ewallet,413.04,4.761905,20.652,5.8
7,315-22-5665,C,Naypyitaw,Normal,Female,Home and lifestyle,73.56,10,36.78,772.38,2/24/2019,11:38,Ewallet,735.6,4.761905,36.78,8.0
8,665-32-9167,A,Yangon,Member,Female,Health and beauty,36.26,2,3.626,76.146,1/10/2019,17:15,Credit card,72.52,4.761905,3.626,7.2
9,692-92-5582,B,Mandalay,Member,Female,Food and beverages,54.84,3,8.226,172.746,2/20/2019,13:27,Credit card,164.52,4.761905,8.226,5.9


In [10]:
# Preprocess the data
preprocessed_data = preprocess_data(data.copy()) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[num_cols].fillna(data[num_cols].mean(), inplace=True)


In [11]:
# Assuming 'Total' is your target variable for sales prediction
X = preprocessed_data.drop("Total", axis=1) 
y = preprocessed_data["Total"]

In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

In [26]:
model = keras.Sequential(
    [
        keras.layers.Dense(units=128, activation="relu", input_shape=(X_train.shape[1],)),  # Deeper network
        keras.layers.Dense(units=64, activation="relu"),
        keras.layers.Dense(units=32, activation="relu"),
        keras.layers.Dense(units=1),
    ]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [27]:
# Compile the model
model.compile(loss="mse", optimizer="adam")

In [28]:
# Train the model
model.fit(X_train, y_train, epochs=100)

Epoch 1/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 147425.3281   
Epoch 2/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 41948.2461 
Epoch 3/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2583.4031 
Epoch 4/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 495.9294 
Epoch 5/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 322.5365 
Epoch 6/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 242.4600 
Epoch 7/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 184.8192 
Epoch 8/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 108.0239 
Epoch 9/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 34.1703 
Epoch 10/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x7fb9e87d9d80>

In [29]:
predictions = model.predict(X_test)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 


In [43]:
loss = model.evaluate(X_test, y_test)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1864 


In [39]:
# Evaluate on testing data
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
medae = median_absolute_error(y_test, predictions)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 895us/step - loss: 0.1864


In [40]:
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)
print("Median Absolute Error (MedAE):", medae)
print("R-Squared:", r2) 

Mean Squared Error (MSE): 0.19038912381729595
Mean Absolute Error (MAE): 0.34534157779693603
Root Mean Squared Error (RMSE): 0.4363360216820243
Median Absolute Error (MedAE): 0.29972186279296764
R-Squared: 0.9999970735931191


In [41]:
# Calculate metrics on testing data 
mse_test = mean_squared_error(y_test, predictions)
mae_test = mean_absolute_error(y_test, predictions)
rmse_test = np.sqrt(mse_test)
medae_test = median_absolute_error(y_test, predictions)
r2_test  = r2_score(y_test, predictions) 

In [42]:
print("Testing Set Results:")
print("Mean Squared Error (MSE):", mse_test)
print("Mean Absolute Error (MAE):", mae_test)
print("Root Mean Squared Error (RMSE):", rmse_test)
print("Median Absolute Error (MedAE):", medae_test)
print("R-Squared:", r2_test) 

Testing Set Results:
Mean Squared Error (MSE): 0.19038912381729595
Mean Absolute Error (MAE): 0.34534157779693603
Root Mean Squared Error (RMSE): 0.4363360216820243
Median Absolute Error (MedAE): 0.29972186279296764
R-Squared: 0.9999970735931191
