# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Time series
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose

# ML
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

import warnings
warnings.filterwarnings('ignore')

# Loading data

In [4]:
# Load sales and market data.
# Both files are read relative to the notebook so the cell runs on any machine;
# the market file was previously read from an absolute path in one user's home directory.
# NOTE(review): assumes '../Data' resolves to the repository's Data folder, as it
# already does for the sales file — confirm.
DATA_DIR = '../Data'

# The sales export is semicolon-separated.
sales_df = pd.read_csv(f'{DATA_DIR}/Case2_Sales data.csv', sep=";")

# header=[0, 1, 2]: the first three spreadsheet rows become a three-level column header.
market_df = pd.read_excel(f'{DATA_DIR}/Case2_Market data.xlsx', header=[0, 1, 2])

# Preprocessing sales data

In [8]:
# Normalise Sales_EUR to a float column in three steps:
#   1. treat commas as decimal separators,
#   2. strip every character that is not a digit, a dot or a minus sign,
#   3. parse the result; anything still unparseable becomes NaN (errors='coerce').
sales_text = sales_df['Sales_EUR'].replace(',', '.', regex=True)
sales_text = sales_text.replace(r'[^0-9\.\-]', '', regex=True)
sales_df['Sales_EUR'] = pd.to_numeric(sales_text, errors='coerce')

In [9]:
# Confirm that Sales_EUR was converted to a numeric dtype
sales_df['Sales_EUR'].dtype

dtype('float64')

The `DATE` column is stored as `object` (strings) and must be converted to `datetime`.

In [10]:
# Parse the DATE strings (day.month.year) into datetime values
sales_df['DATE'] = sales_df['DATE'].pipe(pd.to_datetime, format='%d.%m.%Y')

In [11]:
# Check column dtypes and non-null counts after the conversions
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9802 entries, 0 to 9801
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   DATE        9802 non-null   datetime64[ns]
 1   Mapped_GCK  9802 non-null   object        
 2   Sales_EUR   9795 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 229.9+ KB


In [None]:
# Preview the first rows of the sales data.
# NOTE(review): the saved output of this cell shows DATE as dd.mm.yyyy strings, i.e. it
# was produced before the DATE conversion ran — re-run the notebook top to bottom to refresh it.
sales_df.head()

Unnamed: 0,DATE,Mapped_GCK,Sales_EUR
0,01.10.2018,#1,0
1,02.10.2018,#1,0
2,03.10.2018,#1,0
3,04.10.2018,#1,0
4,05.10.2018,#1,0


In [13]:
# Tag every row with its calendar month (stored as a monthly Period)
sales_df['YearMonth'] = sales_df['DATE'].dt.to_period('M')

# Total sales per month and product group; the month key is then turned
# back from a Period into a timestamp so it can be used as a datetime column.
monthly_sales = (
    sales_df
    .groupby(['YearMonth', 'Mapped_GCK'])['Sales_EUR']
    .sum()
    .reset_index()
    .assign(YearMonth=lambda frame: frame['YearMonth'].dt.to_timestamp())
)

In [14]:
# Show a random sample of rows; the fixed seed makes the displayed rows
# identical on every Restart & Run All.
sales_df.sample(10, random_state=42)

Unnamed: 0,DATE,Mapped_GCK,Sales_EUR,YearMonth
1390,2022-03-17,#11,0.0,2022-03
2452,2019-06-10,#8,68958.37,2019-06
8149,2019-11-28,#6,0.0,2019-11
4132,2020-03-16,#3,0.0,2020-03
9005,2019-07-30,#6,0.0,2019-07
6594,2019-12-09,#8,0.0,2019-12
4362,2019-07-12,#4,72365.29,2019-07
6771,2020-06-04,#6,0.0,2020-06
233,2018-12-06,#1,0.0,2018-12
5435,2019-07-22,#4,0.0,2019-07


# Preprocessing market data

In [16]:
# Inspect the market data. Because it was read with header=[0, 1, 2], every column
# label is a 3-level tuple, and the period column (last level 'date') holds strings
# such as '2004m2' rather than datetimes.
market_df

Unnamed: 0_level_0,Unnamed: 0_level_0,China,China,France,France,Germany,Germany,Italy,Italy,Japan,...,production index,production index,production index,production index,production index,production index,production index,production index,production index,production index
Unnamed: 0_level_1,Index 2010=100 (if not otherwise noted),Production Index Machinery & Electricals,Shipments Index Machinery & Electricals,Production Index Machinery & Electricals,Shipments Index Machinery & Electricals,Production Index Machinery & Electricals,Shipments Index Machinery & Electricals,Production Index Machinery & Electricals,Shipments Index Machinery & Electricals,Production Index Machinery & Electricals,...,France: Machinery and equipment n.e.c.,Germany: Machinery and equipment n.e.c.,United States: Electrical equipment,World: Electrical equipment,Switzerland: Electrical equipment,United Kingdom: Electrical equipment,Italy: Electrical equipment,Japan: Electrical equipment,France: Electrical equipment,Germany: Electrical equipment
Unnamed: 0_level_2,date,MAB_ELE_PRO156,MAB_ELE_SHP156,MAB_ELE_PRO250,MAB_ELE_SHP250,MAB_ELE_PRO276,MAB_ELE_SHP276,MAB_ELE_PRO380,MAB_ELE_SHP380,MAB_ELE_PRO392,...,PRO28250_org,PRO28276_org,PRO27840_org,PRO271000_org,PRO27756_org,PRO27826_org,PRO27380_org,PRO27392_org,PRO27250_org,PRO27276_org
0,2004m2,16.940704,16.940704,112.091273,83.458866,82.623037,79.452532,124.289603,86.560493,109.334010,...,118.274109,80.829010,117.723991,,81.100000,120.706516,141.510864,106.161262,102.077057,85.913200
1,2004m3,23.711852,23.711852,136.327976,106.168192,100.556582,97.012918,143.411662,106.344544,140.884616,...,148.121841,102.130104,119.220779,,76.690307,138.309550,152.880234,140.288741,117.225685,97.670815
2,2004m4,24.435235,24.435235,117.791806,92.007646,89.653203,84.932358,129.083828,95.579673,105.853579,...,125.482231,90.961426,117.441124,,71.552403,115.557330,137.796875,106.271197,105.335777,87.253983
3,2004m5,23.708115,23.708115,109.002541,85.696486,86.880571,82.372794,135.590391,100.087039,101.864777,...,116.649750,88.082901,117.899216,,66.414500,119.269534,143.860535,101.608710,96.616508,84.675552
4,2004m6,27.009138,27.009138,133.785737,106.641482,99.010814,95.108740,136.424935,110.889719,120.332920,...,143.248734,100.978699,119.499107,,61.276596,128.849416,144.315308,116.655248,118.458710,95.401802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,2021m12,310.763183,310.763183,100.565744,134.589504,118.103281,149.364286,94.006826,150.482735,127.771735,...,112.791885,129.188248,109.624107,132.281006,114.326241,121.065762,72.915611,109.005151,80.763306,97.773956
215,2022m1,235.956129,235.956129,85.743503,108.156320,94.550610,120.353403,86.851008,101.258277,110.460181,...,90.558372,92.343117,111.364670,122.236023,108.999212,112.324119,74.355736,95.369065,77.944954,98.599052
216,2022m2,235.956129,235.956129,90.603540,117.715770,103.987916,129.383676,106.583758,120.956538,117.879631,...,97.766502,102.820961,114.688400,127.373421,103.672183,115.557330,91.182419,103.950687,79.001831,106.128059
217,2022m3,329.413367,329.413367,107.843548,136.858720,121.308119,151.201314,124.637966,153.645142,152.000561,...,114.720810,122.049515,115.164093,152.452942,98.345154,145.254965,102.475998,133.743932,96.704582,119.948433


# Merge market data

In [None]:
# Build a flat, month-keyed copy of the market data and join it to the monthly sales.
#
# market_df was read with header=[0, 1, 2], so its column labels are 3-tuples and
# there is no 'Date' column: the period lives in the column whose last header
# level is 'date', as strings such as '2004m2'.
market_monthly = market_df.copy()

# Keep only the last header level (the series code, e.g. 'MAB_ELE_PRO156') as column name.
# NOTE(review): assumes these codes are unique across columns — confirm.
market_monthly.columns = [str(col[-1]).strip() for col in market_monthly.columns]

# '2004m2' -> 2004-02-01, so the key lines up with monthly_sales['YearMonth']
market_monthly['YearMonth'] = pd.to_datetime(market_monthly['date'], format='%Ym%m')
market_monthly = market_monthly.drop(columns='date')

# Merge with sales; the left join keeps every product/month row even when
# no market data exists for that month.
full_df = pd.merge(monthly_sales, market_monthly, on='YearMonth', how='left')

# Sort for time series modeling
full_df = full_df.sort_values(['Mapped_GCK', 'YearMonth']).reset_index(drop=True)

# Feature Engineering

In [None]:
# Add lag and rolling-mean features of the target for each product group.
# The target column is 'Sales_EUR' (with an underscore); 'Sales EUR' does not exist
# and raised a KeyError.
# Shifting within each Mapped_GCK group relies on full_df being ordered by month
# inside each group.
for lag in [1, 2, 3]:
    # Sales of the same product `lag` rows (months) earlier
    full_df[f'lag_{lag}'] = full_df.groupby('Mapped_GCK')['Sales_EUR'].shift(lag)

for window in [3, 6]:
    # shift(1) before rolling so the mean only uses months strictly before the current one
    full_df[f'rolling_mean_{window}'] = full_df.groupby('Mapped_GCK')['Sales_EUR'].transform(
        lambda x: x.shift(1).rolling(window).mean()
    )

# Model with XGBoost (ML Approach)

In [None]:
# Train one XGBoost model per product group, score it on a hold-out period and
# write that period's predictions to predictions_<product>.csv.
TARGET = 'Sales_EUR'        # 'Sales EUR' (with a space) does not exist in full_df
TEST_START = '2022-05-01'   # first month of the hold-out period
# Columns that must never be used as model inputs. Using the correct target name
# here matters: with the misspelled name the target itself ended up in `features`.
NON_FEATURES = {TARGET, 'Date', 'YearMonth', 'Mapped_GCK'}

results = []

# Loop through each product
for gck in full_df['Mapped_GCK'].unique():
    # Drop rows with any missing value (e.g. months whose lag features are undefined)
    gck_df = full_df[full_df['Mapped_GCK'] == gck].dropna()

    train = gck_df[gck_df['YearMonth'] < TEST_START]
    # .copy() so the prediction column added below is not written into a slice of full_df
    test = gck_df[gck_df['YearMonth'] >= TEST_START].copy()

    # Fitting or scoring on an empty frame raises; report and move on instead.
    # NOTE(review): if the sales history ends before TEST_START every product is
    # skipped — confirm the intended hold-out window.
    if train.empty or test.empty:
        print(f"Skipping {gck}: {len(train)} training rows, {len(test)} test rows")
        continue

    # Numeric columns only, excluding the target and identifier columns
    features = [
        col for col in gck_df.select_dtypes(include='number').columns
        if col not in NON_FEATURES
    ]

    # Fixed seed so repeated runs give the same model
    model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    model.fit(train[features], train[TARGET])

    preds = model.predict(test[features])

    rmse = np.sqrt(mean_squared_error(test[TARGET], preds))
    results.append((gck, rmse))

    # Save predictions for submission
    test['Predicted_Sales'] = preds
    test[['YearMonth', 'Mapped_GCK', 'Predicted_Sales']].to_csv(f'predictions_{gck}.csv', index=False)

# Model with SARIMAX (Statistical Approach)

In [15]:
# SARIMAX example for a single product group, with the market series as
# exogenous regressors.
TARGET = 'Sales_EUR'   # 'Sales EUR' (with a space) does not exist in full_df
# Product ids in Mapped_GCK look like '#1', '#11', ...; the placeholder 'Product_X'
# matched no rows.
gck = '#1'

gck_df = (
    full_df[full_df['Mapped_GCK'] == gck]
    .set_index('YearMonth')
    .sort_index()   # date-label slicing below needs a chronologically ordered index
)

# Exogenous regressors are taken from full_df itself: every numeric column except
# the target and the engineered lag / rolling-mean features. (market_df's own column
# labels are 3-level tuples and do not match full_df's columns.)
exog_cols = [
    col for col in gck_df.select_dtypes(include='number').columns
    if col != TARGET and not str(col).startswith(('lag_', 'rolling_mean_'))
]

# SARIMAX does not accept missing values in exog, so keep only months where
# the target and all regressors are observed.
# NOTE(review): if this drops months in the middle of the series the index is no
# longer regularly spaced — check before relying on date-based prediction.
gck_df = gck_df.dropna(subset=[TARGET] + exog_cols)

endog = gck_df[TARGET]
exog = gck_df[exog_cols]

# Train/test split
train_endog = endog[:'2022-04']
test_endog = endog['2022-05':]

train_exog = exog.loc[:'2022-04']
test_exog = exog.loc['2022-05':]

# NOTE(review): using every market series as a regressor may give the model more
# parameters than monthly observations — consider selecting a subset.
model = SARIMAX(train_endog, exog=train_exog, order=(1,1,1), seasonal_order=(1,1,1,12))
sarimax_res = model.fit()

if test_endog.empty:
    # test_endog.index[0] would raise IndexError on an empty hold-out period
    print(f"No observations from 2022-05 onwards for {gck}; skipping the hold-out evaluation")
else:
    forecast = sarimax_res.predict(start=test_endog.index[0], end=test_endog.index[-1], exog=test_exog)
    rmse = np.sqrt(mean_squared_error(test_endog, forecast))
    print(f"RMSE for {gck}: {rmse}")

NameError: name 'full_df' is not defined