In [50]:
import pandas as pd
# Load the cleaned loss-rate dataset; the path is relative to the notebook's working directory
data = pd.read_csv(filepath_or_buffer='../../datasets/cleaned_loss_rate_dataset.csv')
# Preview the first five rows
data.head(n=5)

Unnamed: 0,Month,Category Name,Item Name,Quantity Sold (kilo),Unit Selling Price (RMB/kg),Wholesale Price (RMB/kg),Loss Rate (%),total_sales,Sale or Return,Discount (Yes/No)
0,1,capsicum,Green Hot Peppers,0.475929,8.287794,5.049097,6.72,28.38,sale,False
1,1,capsicum,Green Hot Peppers,0.6,8.287794,5.049097,6.72,15.48,sale,False
2,1,capsicum,Green Hot Peppers,0.2,8.287794,5.049097,6.72,5.16,sale,False
3,1,capsicum,Green Hot Peppers,0.3,8.287794,5.049097,6.72,7.74,sale,False
4,1,capsicum,Green Hot Peppers,0.3,8.287794,5.049097,6.72,7.74,sale,False


In [51]:
# (rows, columns) of the loaded dataset
data.shape

(475428, 10)

In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Column groups used as model inputs
categorical_columns = ['Category Name', 'Item Name', 'Sale or Return', 'Discount (Yes/No)']
numerical_columns = ['Month', 'Quantity Sold (kilo)', 'Unit Selling Price (RMB/kg)', 'Wholesale Price (RMB/kg)', 'total_sales']

# Target variable: the loss-rate column
y = data.loc[:, 'Loss Rate (%)']

# Numerical inputs are passed to the model unchanged
X_numerical = data.loc[:, numerical_columns]

# Categorical inputs are one-hot encoded; drop_first=True omits the first level of each
# column so the dummy columns are not perfectly collinear with the intercept
X_categorical = data.loc[:, categorical_columns]
X_categorical_encoded = pd.get_dummies(data=X_categorical, columns=categorical_columns, drop_first=True)

# Final design matrix: numerical columns followed by the dummy columns, aligned on the row index
# NOTE(review): the preview rows show a single loss rate for a given item, and 'Item Name' is
# one-hot encoded here, so the model may simply be memorising a per-item lookup - confirm
# that this is the intended use before relying on the near-zero error metrics.
X = pd.concat(objs=[X_numerical, X_categorical_encoded], axis='columns')

In [53]:
# Hold out 25% of the rows as a test set (75% training);
# random_state=42 fixes the shuffle so the split is reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [54]:
# Fit an ordinary least-squares linear regression on the training split
# (fit() returns the estimator itself, so construction and fitting are chained)
model = LinearRegression().fit(X_train, y_train)
# Predict the target for the held-out test rows
y_pred = model.predict(X_test)

In [55]:
# Evaluate the model on the held-out test set.
# MSE is expressed in squared units of the target column ('Loss Rate (%)').
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.0004425320828728834


In [56]:
import numpy as np
# Variance of the test-set predictions (np.var defaults to ddof=0, i.e. population variance).
# It is in the same squared units as the MSE, so it gives a sense of scale for that error.
variance_y_pred = np.var(y_pred)
print(f"Variance of y_pred: {variance_y_pred}")

Variance of y_pred: 20.99007474218773


In [57]:
# Root Mean Squared Error (RMSE): the square root of the MSE computed earlier,
# which puts the error back in the target's own units
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 0.021036446536259003


In [58]:
from sklearn.metrics import mean_absolute_error
# Mean Absolute Error (MAE) on the held-out test set
mae = mean_absolute_error(y_test, y_pred)

# Round to 4 decimal places for display only; `mae` itself keeps full precision
rounded_mae = round(mae, 4)

print(f"Mean Absolute Error (MAE): {rounded_mae}")

Mean Absolute Error (MAE): 0.0001


In [59]:
# Predict the loss rate for the first five rows of the test split
X_test_subset = X_test.head()
predictions = model.predict(X_test_subset)

# Show each prediction next to the Month of its source row. The test rows keep their
# original index labels, so the Month values are looked up in `data` by label.
result_df = data.loc[X_test_subset.index, ['Month']].assign(
    **{'Predicted Loss Rate (%)': predictions}
)
result_df.head()

Unnamed: 0,Month,Predicted Loss Rate (%)
33771,2,10.8
11108,1,13.82
317832,4,5.7
371715,7,24.05
256082,12,5.54


In [60]:
# Actual loss rates of the first five test rows, for comparison with the predictions
y_test[:5]

33771     10.80
11108     13.82
317832     5.70
371715    24.05
256082     5.54
Name: Loss Rate (%), dtype: float64

In [61]:
# Predicted loss rates for the first five test rows
predictions[:5]

array([10.8 , 13.82,  5.7 , 24.05,  5.54])

In [62]:
# Number of rows in the test set
y_test.shape[0]

118857

In [63]:
import pickle

# Serialise the fitted model to disk in binary mode; the context manager closes the file.
# Only load this pickle back from a trusted location: unpickling can execute arbitrary code.
with open('lossRatemodel.pickle', mode='wb') as model_file:
    pickle.dump(model, model_file)