In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder

# Load the data
item_info = pd.read_csv('../data/item_info.csv')
transactions = pd.read_csv('../data/transactions_info.csv')
merged_data = pd.read_csv('../data/processed_data.csv')
item_features = pd.read_csv('../data/item_features.csv')
sales_features = pd.read_csv('../data/sales_features.csv')
target_data = pd.read_csv('../data/target_variable.csv')
time_features = pd.read_csv('../data/time_features.csv')

# Merge data to create the master table
master_table = pd.merge(merged_data, item_features, on='item_code')
master_table = pd.merge(master_table, sales_features, on=['item_code', 'invoice_time'])
master_table = pd.merge(master_table, target_data, on=['item_code', 'invoice_time'])
master_table = pd.merge(master_table, time_features, on='Primary Key')


In [2]:
# Identify non-numeric columns
non_numeric_columns = master_table.select_dtypes(exclude=['number']).columns

# Encode non-numeric columns if necessary (e.g., 'item_num')
le = LabelEncoder()
master_table['invoice_num'] = le.fit_transform(master_table['invoice_num'])
# Drop specified columns from the master_table
columns_to_drop = ['item_sub_segment_x', 'item_segment_x', 'item_category', 'item_sub_department_x', 'item_department_x']
master_table.drop(columns=columns_to_drop, inplace=True)


In [3]:
columns_to_drop = ['item_sub_segment_y', 'item_segment_y', 'item_sub_department_y', 'item_department_y']
master_table.drop(columns=columns_to_drop, inplace=True)

In [4]:
master_table.columns

Index(['item_code', 'invoice_num', 'invoice_time', 'item_qty', 'Primary Key',
       'item_cat_Accessories', 'item_cat_Acne Preparations',
       'item_cat_Adult Diapers', 'item_cat_After Shave Care',
       'item_cat_Agro Chemicals',
       ...
       'item_cat_Wrist/Ankle Ware', 'item_cat_XMas Decorations',
       'item_cat_living', 'mean_sales', 'max_sales', 'min_sales', 'sales',
       'hour_of_day', 'day_of_week', 'month'],
      dtype='object', length=746)

In [5]:
master_table.head()

Unnamed: 0,item_code,invoice_num,invoice_time,item_qty,Primary Key,item_cat_Accessories,item_cat_Acne Preparations,item_cat_Adult Diapers,item_cat_After Shave Care,item_cat_Agro Chemicals,...,item_cat_Wrist/Ankle Ware,item_cat_XMas Decorations,item_cat_living,mean_sales,max_sales,min_sales,sales,hour_of_day,day_of_week,month
0,1080988,51247,2022-03-11 09:37:02+00:00,1.0,1080988_2022-03-11_9,0,0,0,0,0,...,0,0,0,1.0,1.0,1.0,1.0,9,4,3
1,1080988,15514,2022-03-11 18:56:29+00:00,1.0,1080988_2022-03-11_18,0,0,0,0,0,...,0,0,0,1.0,1.0,1.0,1.0,18,4,3
2,1080988,15514,2022-03-11 18:56:29+00:00,1.0,1080988_2022-03-11_18,0,0,0,0,0,...,0,0,0,1.0,1.0,1.0,1.0,18,4,3
3,1080988,15514,2022-03-11 18:56:29+00:00,1.0,1080988_2022-03-11_18,0,0,0,0,0,...,0,0,0,1.0,1.0,1.0,1.0,18,4,3
4,1080988,15514,2022-03-11 18:56:29+00:00,1.0,1080988_2022-03-11_18,0,0,0,0,0,...,0,0,0,1.0,1.0,1.0,1.0,18,4,3


In [6]:
# Define features and target variable
X = master_table.drop(columns=['sales', 'Primary Key', 'invoice_time'])
y = master_table['sales']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a simple model (e.g., Random Forest Regressor)
model = RandomForestRegressor()

In [7]:
# Fit the model
model.fit(X_train, y_train)

In [8]:
predictions = model.predict(X_test)

# Calculate MAPE
mape = mean_absolute_error(y_test, predictions) / y_test.mean() * 100
print(f"MAPE: {mape:.2f}%")

MAPE: 0.05%
