In [93]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [94]:
data = pd.read_csv("data_set_final2.csv", low_memory=False)

In [95]:
data.drop(columns=['rolling_avg_DESCIDA_PRECO','rolling_avg_AUMENTO_PRECO','rolling_avg_SKUS_DOWN','rolling_avg_items','rolling_avg_sales','rolling_avg_SKUS_UP'], inplace=True)

In [102]:
# Ensure 'DATA_VENDA' is parsed as datetime
data['DATA_VENDA'] = pd.to_datetime(data['DATA_VENDA'])

# Create lag and rolling mean features
data['lag_7'] = data.groupby('LOJA')['VALOR_VENDA'].shift(7)
data['rolling_mean_15'] = data.groupby('LOJA')['VALOR_VENDA'].shift(1).rolling(window=15).mean()

# Handle missing values
data['lag_7'].fillna(0, inplace=True)
data['rolling_mean_15'].fillna(data['rolling_mean_15'].mean(), inplace=True)

# Identify categorical and numerical columns 
categorical_cols = ['LOJA','day_of_week']  # Modify as necessary
numeric_cols = ['TOTAL_COLABORADORES', 'SKUS','PRODUTIVIDADE_HORA','SKUS_UP','SKUS_DOWN','ITEMS','SELF_CHECKOUT','lag_7', 'rolling_mean_15']

# Preprocessing for categorical data: OneHotEncoding
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Setup the pipeline 
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# Define features and target
features = numeric_cols + categorical_cols  # Make sure this is updated if features change
target = 'VALOR_VENDA'

# Split the data
X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.2, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')

Mean Squared Error: 814271493.9323468
Mean Absolute Error: 3570.624270087065
R-squared: 0.7885619855300134


In [103]:
def forecast_and_compare(store_name, model, data, features): 
    # Filter data for the specified store and sort by date
    store_data = data[data['LOJA'] == store_name]
    last_year = store_data['DATA_VENDA'].max().year - 1
    historical_data = store_data[store_data['DATA_VENDA'].dt.year == last_year]

    # Calculate historical weekly averages
    weekly_averages = historical_data.groupby(historical_data['DATA_VENDA'].dt.day_name())['VALOR_VENDA'].mean().to_dict()

    # Prepare forecast for each day of the week in the next 7 days
    predictions = {}
    current_date = pd.Timestamp.today()  # Assuming forecasting starts from today
    for i in range(7):
        forecast_date = current_date + pd.Timedelta(days=i)
        day_of_week = forecast_date.day_name()
        
        # Simulate the necessary input for the model (ensure to match the preprocessor's expected format)
        # For simplicity, let's assume last available data for other features
        input_data = store_data.iloc[-1][features].copy()
        input_data['DATA_VENDA'] = forecast_date  # Update the date
        
        # Adjust input format to match model's expectation
        formatted_input = pd.DataFrame([input_data])
        prediction = model.predict(formatted_input)[0]
        
        # Store prediction
        predictions[day_of_week] = predictions.get(day_of_week, []) + [prediction]

    # Average predictions for each day if multiple predictions exist
    for day, values in predictions.items():
        predictions[day] = sum(values) / len(values)

    # Calculate differences from historical averages
    differences = {day: predictions[day] - weekly_averages.get(day, 0) for day in predictions}

    return predictions, weekly_averages, differences

# Example usage
store_name = input("Enter the store name: ")
predictions, averages, differences = forecast_and_compare(store_name, model, data, features)
print("Predictions by day of the week:", predictions)
print("Historical averages by day of the week:", averages)
print("Differences from averages:", differences)

Enter the store name: Gaia
Predictions by day of the week: {'Friday': 69664.83591508378, 'Saturday': 69664.83591508378, 'Sunday': 69664.83591508378, 'Monday': 69664.83591508378, 'Tuesday': 69664.83591508378, 'Wednesday': 69664.83591508378, 'Thursday': 69664.83591508378}
Historical averages by day of the week: {'Friday': 105794.19788461538, 'Monday': 93089.48211538464, 'Saturday': 134454.23088050314, 'Sunday': 99980.61012820514, 'Thursday': 93888.86512820514, 'Tuesday': 88534.497948718, 'Wednesday': 90208.81576923077}
Differences from averages: {'Friday': -36129.3619695316, 'Saturday': -64789.39496541936, 'Sunday': -30315.77421312136, 'Monday': -23424.646200300864, 'Tuesday': -18869.662033634217, 'Wednesday': -20543.979854146994, 'Thursday': -24224.029213121365}


