In [10]:
import pandas as pd

# Location of the cleaned loss-rate dataset, relative to this notebook.
DATASET_PATH = '../../datasets/cleaned_loss_rate_dataset.csv'

# Load the CSV into a DataFrame and preview its first rows.
data = pd.read_csv(DATASET_PATH)
data.head()

Unnamed: 0,Month,category_name,item_name,quantity_sold_kg,unit_selling_price_rmb/kg,wholesale_price_(rmb/kg),loss_rate_(%),total_sales,sale_or_return,discount
0,1,capsicum,Green Hot Peppers,0.475929,8.287794,5.049097,6.72,28.38,sale,False
1,1,capsicum,Green Hot Peppers,0.6,8.287794,5.049097,6.72,15.48,sale,False
2,1,capsicum,Green Hot Peppers,0.2,8.287794,5.049097,6.72,5.16,sale,False
3,1,capsicum,Green Hot Peppers,0.3,8.287794,5.049097,6.72,7.74,sale,False
4,1,capsicum,Green Hot Peppers,0.3,8.287794,5.049097,6.72,7.74,sale,False


In [11]:
# Report the (rows, columns) dimensions of the loaded DataFrame.
data.shape

(475428, 10)

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Column groups that feed the model.
categorical_columns = ['category_name', 'item_name', 'sale_or_return', 'discount']
numerical_columns = ['Month', 'quantity_sold_kg', 'unit_selling_price_rmb/kg', 'wholesale_price_(rmb/kg)', 'total_sales']

# Regression target: the loss-rate column.
y = data['loss_rate_(%)']

# Slice the raw frame into its numerical and categorical parts.
X_numerical = data.loc[:, numerical_columns]
X_categorical = data.loc[:, categorical_columns]

# One-hot encode every categorical column; drop_first=True omits the first
# level of each column so the dummies are not perfectly collinear.
X_categorical_encoded = pd.get_dummies(
    X_categorical,
    columns=categorical_columns,
    drop_first=True,
)

# Final design matrix: numerical columns followed by the dummy columns.
X = pd.concat(
    [X_numerical, X_categorical_encoded],
    axis=1,
)

In [13]:
# Hold out 25% of the rows for evaluation (75% remain for training).
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
split_options = {'test_size': 0.25, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [14]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split; fit() returns the estimator
# itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out rows.
y_pred = model.predict(X_test)

In [15]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the linear model on the held-out rows.
# NOTE(review): an error very close to zero would suggest the target is fully
# determined by one of the features (possibly the one-hot item_name columns)
# rather than genuinely predicted - confirm before relying on this model.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

# Report the score to four decimal places.
rounded_mae = round(mae, ndigits=4)
print(f"Mean Absolute Error (MAE): {rounded_mae}")

Mean Absolute Error (MAE): 0.0001


In [16]:
from sklearn.linear_model import Lasso

# Lasso baseline with scikit-learn's default settings, trained and scored on
# the same train/test split as the linear regression model.
lasso_model = Lasso().fit(X_train, y_train)
lasso_predictions = lasso_model.predict(X_test)

# Mean absolute error on the held-out rows, printed to four decimal places.
lasso_mae = mean_absolute_error(y_true=y_test, y_pred=lasso_predictions)
print(f"MAE for Lasso Regression: {lasso_mae:.4f}")

MAE for Lasso Regression: 3.6523


In [17]:
from sklearn.tree import DecisionTreeRegressor

# Decision-tree baseline, trained and scored on the same split as the other models.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split (even with the default "best" splitter), so ties
# between equally good splits could otherwise be resolved differently on each
# run and make the reported MAE non-reproducible. 42 matches the split seed.
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)

# Mean absolute error on the held-out rows, printed to four decimal places.
dt_mae = mean_absolute_error(y_test, dt_predictions)
print(f"MAE for Decision Tree Regressor: {dt_mae:.4f}")

MAE for Decision Tree Regressor: 0.0004


In [18]:
# Score the first five rows of the test split with the linear regression model.
X_test_subset = X_test.head()
predictions = model.predict(X_test_subset)

# Pair each prediction with the Month of its source row. The month is looked up
# in the original frame through the sampled rows' index labels; only Month is
# included here (no item, category or discount columns).
result_df = data.loc[X_test_subset.index, ['Month']].assign(
    **{'Predicted Loss Rate (%)': predictions}
)
result_df.head()

Unnamed: 0,Month,Predicted Loss Rate (%)
33771,2,10.8
11108,1,13.82
317832,4,5.7
371715,7,24.05
256082,12,5.54


In [19]:
# Actual target values for the first five rows of the test split.
y_test.head(5)

33771     10.80
11108     13.82
317832     5.70
371715    24.05
256082     5.54
Name: loss_rate_(%), dtype: float64

In [20]:
# Show the raw predicted values; `predictions` was computed for the five-row
# test subset, so this slice displays all of them.
predictions[:5]

array([10.8 , 13.82,  5.7 , 24.05,  5.54])

In [21]:
# Number of rows in the held-out test split.
len(y_test)

118857

In [22]:
import pickle

# Destination file for the serialized linear regression model.
MODEL_PATH = 'lossRatemodel.pickle'

# Persist the fitted estimator. Only the model object is written: the one-hot
# column layout of X is not stored alongside it, so any consumer has to rebuild
# the same feature columns before calling predict().
# NOTE: pickle files should only ever be loaded from trusted sources.
with open(MODEL_PATH, mode='wb') as model_file:
    pickle.dump(model, model_file)