In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
data = pd.read_csv("data_set_final2.csv", low_memory=False)

In [3]:
data.drop(columns=['rolling_avg_DESCIDA_PRECO','rolling_avg_AUMENTO_PRECO','rolling_avg_SKUS_DOWN','rolling_avg_items','rolling_avg_sales','rolling_avg_SKUS_UP'], inplace=True)

In [4]:
data['DATA_VENDA'] = pd.to_datetime(data['DATA_VENDA'])

# Calculate day of the week and median sales
data['day_of_week'] = data['DATA_VENDA'].dt.day_name()
median_sales = data.groupby(['LOJA', 'day_of_week'])['VALOR_VENDA'].median().reset_index()
median_sales.rename(columns={'VALOR_VENDA': 'median_sales'}, inplace=True)
data = pd.merge(data, median_sales, on=['LOJA', 'day_of_week'], how='left')

# Define the target categories based on median sales
def classify_sales(row):
    if row['VALOR_VENDA'] > row['median_sales'] * 1.1:
        return 'above'
    elif row['VALOR_VENDA'] < row['median_sales'] * 0.9:
        return 'below'
    else:
        return 'similar'

data['sales_category'] = data.apply(classify_sales, axis=1)

# Create lag and rolling mean features
data['lag_7'] = data.groupby('LOJA')['VALOR_VENDA'].shift(7)
data['rolling_mean_15'] = data.groupby('LOJA')['VALOR_VENDA'].shift(1).rolling(window=15).mean()

# Handle missing values
data['lag_7'].fillna(0, inplace=True)
data['rolling_mean_15'].fillna(data['rolling_mean_15'].mean(), inplace=True)

# Select specific features to include
selected_categorical_cols = ['LOJA','day_of_week']  # Example: Include only 'day_of_week' and 'LOJA'
numeric_cols = ['TOTAL_COLABORADORES', 'SKUS','PRODUTIVIDADE_HORA','SKUS_UP','SKUS_DOWN','ITEMS','SELF_CHECKOUT','lag_7', 'rolling_mean_15']  # Example selected numeric features

# Setup the ColumnTransformer with OneHotEncoder for selected categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), selected_categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ])

# Create a pipeline with preprocessing and the classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Prepare features and target
features = selected_categorical_cols + numeric_cols
X = data[features]
y = data['sales_category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model using the pipeline
pipeline.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print('Accuracy:', accuracy)
print('Classification Report:')
print(class_report)

Accuracy: 0.7629589095612377
Classification Report:
              precision    recall  f1-score   support

       above       0.81      0.70      0.75      5437
       below       0.80      0.67      0.73      4674
     similar       0.73      0.84      0.78      9991

    accuracy                           0.76     20102
   macro avg       0.78      0.74      0.75     20102
weighted avg       0.77      0.76      0.76     20102

