In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [42]:
data = pd.read_csv("data_set_final2.csv", low_memory=False)

In [43]:
data.drop(columns=['rolling_avg_DESCIDA_PRECO','rolling_avg_AUMENTO_PRECO','rolling_avg_SKUS_DOWN','rolling_avg_items','rolling_avg_sales','rolling_avg_SKUS_UP'], inplace=True)

In [44]:
data['DATA_VENDA'] = pd.to_datetime(data['DATA_VENDA'])

# Calculate day of the week and median sales
data['day_of_week'] = data['DATA_VENDA'].dt.day_name()
median_sales = data.groupby(['LOJA', 'day_of_week'])['VALOR_VENDA'].median().reset_index()
median_sales.rename(columns={'VALOR_VENDA': 'median_sales'}, inplace=True)
data = pd.merge(data, median_sales, on=['LOJA', 'day_of_week'], how='left')

# Define the target categories based on median sales
def classify_sales(row):
    if row['VALOR_VENDA'] > row['median_sales'] * 1.1:
        return 'above'
    elif row['VALOR_VENDA'] < row['median_sales'] * 0.9:
        return 'below'
    else:
        return 'similar'

data['sales_category'] = data.apply(classify_sales, axis=1)

# Create lag and rolling mean features
data['lag_7'] = data.groupby('LOJA')['VALOR_VENDA'].shift(7)
data['rolling_mean_15'] = data.groupby('LOJA')['VALOR_VENDA'].shift(1).rolling(window=15).mean()

# Handle missing values
data['lag_7'].fillna(0, inplace=True)
data['rolling_mean_15'].fillna(data['rolling_mean_15'].mean(), inplace=True)

# Select specific features to include
selected_categorical_cols = ['LOJA','day_of_week']  # Example: Include only 'day_of_week' and 'LOJA'
numeric_cols = ['TOTAL_COLABORADORES', 'SKUS','PRODUTIVIDADE_HORA','SKUS_UP','SKUS_DOWN','ITEMS','SELF_CHECKOUT','lag_7', 'rolling_mean_15']  # Example selected numeric features

# Setup the ColumnTransformer with OneHotEncoder for selected categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), selected_categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ])

# Create a pipeline with preprocessing and the classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Prepare features and target
features = selected_categorical_cols + numeric_cols
X = data[features]
y = data['sales_category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model using the pipeline
pipeline.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print('Accuracy:', accuracy)
print('Classification Report:')
print(class_report)

Accuracy: 0.7629589095612377
Classification Report:
              precision    recall  f1-score   support

       above       0.81      0.70      0.75      5437
       below       0.80      0.67      0.73      4674
     similar       0.73      0.84      0.78      9991

    accuracy                           0.76     20102
   macro avg       0.78      0.74      0.75     20102
weighted avg       0.77      0.76      0.76     20102



In [10]:
X_train

Unnamed: 0,CIDADE,REGIAO,PRODUTIVIDADE_HORA,TOTAL_COLABORADORES,SKUS,CAIXAS_TRADICIONAIS,SELF_CHECKOUT,ABERTURA_LOJA,FECHO_LOJA,ITEMS,...,FIM_EVENTO,TEMPO_ABERTURA,HORAS_ABERTURA,day_of_week,rolling_avg_sales,rolling_avg_items,rolling_avg_SKUS_UP,rolling_avg_SKUS_DOWN,rolling_avg_AUMENTO_PRECO,rolling_avg_DESCIDA_PRECO
26983,Guimarães,Região Norte,153.0,31.0,19248.0,13.0,7.0,1900-01-01 08:00:00,1900-01-01 21:00:00,19158.0,...,,0 days 13:00:00,13.0,Monday,106933.087500,20382.00,2484.250000,2829.500000,16.967500,15.735000
87213,Vila Franca de Xira,Lisboa Norte,0.0,11.0,2912.0,17.0,0.0,1900-01-01 10:00:00,1900-01-01 20:00:00,2642.0,...,,0 days 10:00:00,10.0,Monday,3684.816667,2467.00,493.333333,424.333333,16.226667,12.983333
59188,Lisboa,Lisboa Central,162.0,13.0,3224.0,17.0,0.0,1900-01-01 09:00:00,1900-01-01 21:00:00,3138.0,...,,0 days 12:00:00,12.0,Thursday,5493.442500,3242.75,490.500000,481.750000,15.142500,14.427500
54291,Lisboa,Lisboa Central,0.0,13.0,3172.0,15.0,0.0,1900-01-01 09:00:00,1900-01-01 21:00:00,2716.0,...,,0 days 12:00:00,12.0,Sunday,5007.850000,2851.00,440.000000,477.750000,12.967500,13.607500
59788,Lisboa,Lisboa Central,162.0,13.0,3224.0,17.0,0.0,1900-01-01 09:00:00,1900-01-01 21:00:00,3731.0,...,,0 days 12:00:00,12.0,Tuesday,7041.350000,3676.25,212.750000,1159.250000,6.530000,39.870000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,Caldas da Rainha,Lisboa Norte,136.0,7.0,2760.0,4.0,4.0,1900-01-01 07:00:00,1900-01-01 20:00:00,4640.0,...,,0 days 13:00:00,13.0,Wednesday,9693.142500,4965.25,417.500000,505.000000,13.537500,14.230000
54886,Lisboa,Lisboa Central,0.0,13.0,3172.0,15.0,0.0,1900-01-01 09:00:00,1900-01-01 21:00:00,2441.0,...,,0 days 12:00:00,12.0,Sunday,4811.582500,2550.25,510.000000,526.750000,14.502500,17.387500
76820,Almada,Margem Sul,225.0,16.0,3059.0,17.0,0.0,1900-01-01 08:00:00,1900-01-01 20:00:00,3175.0,...,,0 days 12:00:00,12.0,Sunday,5734.230000,2902.00,513.250000,452.000000,13.712500,15.645000
860,Alfragide,Lisboa Central,269.0,85.0,35052.0,26.0,8.0,1900-01-01 08:00:00,1900-01-01 21:00:00,49489.0,...,,0 days 13:00:00,13.0,Wednesday,155368.105000,48719.75,4779.750000,4854.000000,12.407500,13.570000


In [11]:
y

0           above
1         similar
2         similar
3         similar
4         similar
           ...   
100502      above
100503      above
100504      above
100505      above
100506      above
Name: sales_category, Length: 100507, dtype: object