# Imports

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Time series
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose

# ML
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb


import warnings
warnings.filterwarnings('ignore')

# Configurations
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [61]:
# Formulas

def missing(df):
    """
    Displays only the columns in the DataFrame with missing values,
    along with the count and percentage of missing entries.
    """
    missing = df.isnull().sum()
    missing = missing[missing > 0]
    if missing.empty:
        print("No missing values in the dataset.")
    else:
        percent = (missing / len(df)) * 100
        missing_df = pd.DataFrame({
            'Missing Values': missing,
            'Percentage (%)': percent.round(2)
        })
        print("Columns with missing values:")
        display(missing_df)

# Loading data

In [62]:
# Load sales and market data
sales_df = pd.read_csv('../Data/Case2_Sales data.csv', sep=";")
market_df = pd.read_excel('/Users/diogocarvalho/Documents/GitHub/Case-2--Siemens-Sales-Forecast/Data/Case2_Market data.xlsx', header=[0,1,2])

# Preprocessing sales data

> Sales_EUR is object and must be numeric

In [63]:
# fix the type of Sales_EUR column
sales_df['Sales_EUR'] = (
    sales_df['Sales_EUR']
    .replace(',','.', regex=True)        
    .replace(r'[^0-9\.\-]', '', regex=True) 
    .pipe(pd.to_numeric, errors='coerce')
)

In [64]:
sales_df['Sales_EUR'].dtype

dtype('float64')

> DATE is object, and must be converted to datetime

In [65]:
# Convert the DATE column to datetime
sales_df['DATE'] = pd.to_datetime(sales_df['DATE'], format='%d.%m.%Y')

In [66]:
# Aggregate to monthly level
sales_df['YearMonth'] = sales_df['DATE'].dt.to_period('M')
monthly_sales = sales_df.groupby(['YearMonth', 'Mapped_GCK'])['Sales_EUR'].sum().reset_index()

# Convert YearMonth to datetime
monthly_sales['YearMonth'] = monthly_sales['YearMonth'].dt.to_timestamp()

In [67]:
# make DATE index of sales_df
# sales_df = sales_df.set_index('DATE')

In [68]:
sales_df.head()

Unnamed: 0,DATE,Mapped_GCK,Sales_EUR,YearMonth
0,2018-10-01,#1,0.0,2018-10
1,2018-10-02,#1,0.0,2018-10
2,2018-10-03,#1,0.0,2018-10
3,2018-10-04,#1,0.0,2018-10
4,2018-10-05,#1,0.0,2018-10


> Aggregating sales per product based on month and year

Since we will be doing monthly analysis, it's no use for us to have the data in days.

In [69]:
# Aggregate sales data by month for each product
monthly_sales_data = sales_df.groupby(['YearMonth', 'Mapped_GCK'], as_index=False).agg({
    'Sales_EUR': 'sum'  # Sum sales revenue
})

In [70]:
# Display the first few rows to check the transformation
monthly_sales_data.head(25)

Unnamed: 0,YearMonth,Mapped_GCK,Sales_EUR
0,2018-10,#1,36098918.79
1,2018-10,#11,1021303.5
2,2018-10,#12,28686.33
3,2018-10,#13,27666.1
4,2018-10,#14,5770.0
5,2018-10,#16,333196.87
6,2018-10,#20,4563.14
7,2018-10,#3,8089465.96
8,2018-10,#36,6474.6
9,2018-10,#4,397760.69


In [71]:
monthly_sales_data.shape

(602, 3)

### Separate products into different datasets

We will predcit procuts individually, so separating them will be useful

In [72]:
# replace the "#" with "Product" in the product names
monthly_sales_data['Mapped_GCK'] = monthly_sales_data['Mapped_GCK'].str.replace('#', 'Product')

In [73]:
# Get the unique products
unique_products = monthly_sales_data['Mapped_GCK' ].unique()
unique_products

array(['Product1', 'Product11', 'Product12', 'Product13', 'Product14',
       'Product16', 'Product20', 'Product3', 'Product36', 'Product4',
       'Product5', 'Product6', 'Product8', 'Product9'], dtype=object)

In [74]:
# Create a dictionary to store the DataFrames
product_dataframes = {product: monthly_sales_data[monthly_sales_data['Mapped_GCK'] == product] for product in unique_products}

In [75]:
# Loop through each product and assign it to a dynamically named variable
for i, product in enumerate(product_dataframes.keys(), start=1):
    globals()[f"product{i}"] = product_dataframes[product]

In [76]:
product1.head()

Unnamed: 0,YearMonth,Mapped_GCK,Sales_EUR
0,2018-10,Product1,36098918.79
14,2018-11,Product1,5140760.0
28,2018-12,Product1,37889612.12
42,2019-01,Product1,27728148.35
56,2019-02,Product1,34793163.53


In [77]:
# Check the shape of each product DataFrame
for i in range(1, len(product_dataframes) + 1):
    print(f"product{i} shape: {globals()[f'product{i}'].shape}")

product1 shape: (43, 3)
product2 shape: (43, 3)
product3 shape: (43, 3)
product4 shape: (43, 3)
product5 shape: (43, 3)
product6 shape: (43, 3)
product7 shape: (43, 3)
product8 shape: (43, 3)
product9 shape: (43, 3)
product10 shape: (43, 3)
product11 shape: (43, 3)
product12 shape: (43, 3)
product13 shape: (43, 3)
product14 shape: (43, 3)


In [78]:
for i in range(1, 15):
    df = globals()[f'product{i}']
    
    # Convert Period to Timestamp (datetime type)
    df['DATE'] = df['YearMonth'].dt.to_timestamp()
    
    globals()[f'product{i}'] = df

In [79]:
# check if the conversion was successful
# for i in range(1, len(product_dataframes) + 1):
#     print(f"product{i} first 5 rows:{globals()[f'product{i}'].info()}")

# Preprocessing market data

In [80]:
market_df.head()

Unnamed: 0_level_0,Unnamed: 0_level_0,China,China,France,France,Germany,Germany,Italy,Italy,Japan,Japan,Switzerland,Switzerland,United Kingdom,United Kingdom,United States,United States,Europe,Europe,Europe,Europe,Europe,Europe,Europe,Europe,Europe,Producer Prices,Producer Prices,Producer Prices,Producer Prices,Producer Prices,Producer Prices,production index,production index,production index,production index,production index,production index,production index,production index,production index,production index,production index,production index,production index,production index,production index,production index
Unnamed: 0_level_1,Index 2010=100 (if not otherwise noted),Production Index Machinery & Electricals,Shipments Index Machinery & Electricals,Production Index Machinery & Electricals,Shipments Index Machinery & Electricals,Production Index Machinery & Electricals,Shipments Index Machinery & Electricals,Production Index Machinery & Electricals,Shipments Index Machinery & Electricals,Production Index Machinery & Electricals,Shipments Index Machinery & Electricals,Production Index Machinery & Electricals,Shipments Index Machinery & Electricals,Production Index Machinery & Electricals,Shipments Index Machinery & Electricals,Production Index Machinery & Electricals,Shipments Index Machinery & Electricals,Production Index Machinery & Electricals,Shipments Index Machinery & Electricals,World: Price of Base Metals,World: Price of Energy,World: Price of Metals & Minerals,World: Price of Natural gas index,"World: Price of Crude oil, average",World: Price of Copper,United States: EUR in LCU,United States: Electrical equipment,United Kingdom: Electrical equipment,Italy: Electrical equipment,France: Electrical equipment,Germany: Electrical equipment,China: Electrical equipment,United States: Machinery and equipment n.e.c.,World: Machinery and equipment n.e.c.,Switzerland: Machinery and equipment n.e.c.,United Kingdom: Machinery and equipment n.e.c.,Italy: Machinery and equipment n.e.c.,Japan: Machinery and equipment n.e.c.,France: Machinery and equipment n.e.c.,Germany: Machinery and equipment n.e.c.,United States: Electrical equipment,World: Electrical equipment,Switzerland: Electrical equipment,United Kingdom: Electrical equipment,Italy: Electrical equipment,Japan: Electrical equipment,France: Electrical equipment,Germany: Electrical equipment
Unnamed: 0_level_2,date,MAB_ELE_PRO156,MAB_ELE_SHP156,MAB_ELE_PRO250,MAB_ELE_SHP250,MAB_ELE_PRO276,MAB_ELE_SHP276,MAB_ELE_PRO380,MAB_ELE_SHP380,MAB_ELE_PRO392,MAB_ELE_SHP392,MAB_ELE_PRO756,MAB_ELE_SHP756,MAB_ELE_PRO826,MAB_ELE_SHP826,MAB_ELE_PRO840,MAB_ELE_SHP840,MAB_ELE_PRO1100,MAB_ELE_SHP1100,RohiBASEMET1000_org,RohiENERGY1000_org,RohiMETMIN1000_org,RohiNATGAS1000_org,RohCRUDE_PETRO1000_org,RohCOPPER1000_org,WKLWEUR840_org,PRI27840_org,PRI27826_org,PRI27380_org,PRI27250_org,PRI27276_org,PRI27156_org,PRO28840_org,PRO281000_org,PRO28756_org,PRO28826_org,PRO28380_org,PRO28392_org,PRO28250_org,PRO28276_org,PRO27840_org,PRO271000_org,PRO27756_org,PRO27826_org,PRO27380_org,PRO27392_org,PRO27250_org,PRO27276_org
0,2004m2,16.940704,16.940704,112.091273,83.458866,82.623037,79.452532,124.289603,86.560493,109.33401,110.495272,91.221862,89.987275,111.353812,73.601265,107.6014,79.24023,97.122911,80.09853,54.039811,44.123338,48.747945,87.076974,39.639458,36.623832,1.2646,78.969864,80.757423,93.020027,,93.230453,,102.491722,97.597374,97.1,106.191977,116.790276,110.890034,118.274109,80.82901,117.723991,,81.1,120.706516,141.510864,106.161262,102.077057,85.9132
1,2004m3,23.711852,23.711852,136.327976,106.168192,100.556582,97.012918,143.411662,106.344544,140.884616,144.686166,85.866287,79.883583,127.558608,84.047595,110.187364,98.619024,113.783904,96.015929,54.666162,47.588957,49.256157,87.192705,42.592034,39.931055,1.2262,79.673569,80.962135,93.540268,,93.335678,,105.62748,113.224892,91.195116,121.625075,139.288391,141.176853,148.121841,102.130104,119.220779,,76.690307,138.30955,152.880234,140.288741,117.225685,97.670815
2,2004m4,24.435235,24.435235,117.791806,92.007646,89.653203,84.932358,129.083828,95.579673,105.853579,102.655769,85.622508,79.740802,108.732297,73.026027,108.166564,89.774031,101.715199,85.167236,54.872715,47.779013,49.423751,91.379923,42.650637,39.134854,1.1985,80.337639,80.757423,93.852425,,93.440903,,103.484955,100.16909,93.793535,104.965505,125.289566,105.648765,125.482231,90.961426,117.441124,,71.552403,115.55733,137.796875,106.271197,105.335777,87.253983
3,2004m5,23.708115,23.708115,109.002541,85.696486,86.880571,82.372794,135.590391,100.087039,101.864777,100.305285,85.378729,79.598021,110.6452,74.591883,108.425887,87.463813,101.275727,84.485767,51.230356,53.590898,46.468392,99.04452,47.517121,36.278433,1.2007,80.798828,80.757423,93.852425,,93.546127,,103.643944,99.581436,96.391954,105.885359,131.988998,101.990361,116.64975,88.082901,117.899216,,66.4145,119.269534,143.860535,101.60871,96.616508,84.675552
4,2004m6,27.009138,27.009138,133.785737,106.641482,99.010814,95.10874,136.424935,110.889719,120.33292,119.61638,85.13495,79.455239,122.02096,82.343346,110.569933,97.364496,112.057197,96.963294,52.876331,50.799575,47.803913,98.636267,44.967605,35.65738,1.2138,80.91349,80.552711,93.956467,,93.440903,,106.062668,109.27771,98.990373,118.252278,132.988922,122.136575,143.248734,100.978699,119.499107,,61.276596,128.849416,144.315308,116.655248,118.45871,95.401802


In [81]:
# drop the second header row
market_df.columns = market_df.columns.droplevel(1)

In [82]:
# Join the header levels
market_df.columns = [
    '_'.join([str(level) for level in col_tuple]).strip()
    for col_tuple in market_df.columns
]


In [83]:
# change first column name to "DATE"
market_df.rename(columns={'Unnamed: 0_level_0_date': 'DATE'}, inplace=True)

In [84]:
# Replace the "m" with "-" in DATE column of market_df
market_df['DATE'] = market_df['DATE'].str.replace('m', '-')

In [85]:
# Convert the DATE column to datetime
market_df['DATE'] = pd.to_datetime(market_df['DATE'], infer_datetime_format=True)

In [86]:
# Create the YearMonth column, like done before in the sales_df
market_df['YearMonth'] = market_df['DATE'].dt.to_period('M')

In [87]:
# Convert YearMonth to match sales_df
market_df['YearMonth'] = market_df['YearMonth'].dt.strftime('%Y-%m')

# Data Cleaning

### Checking for missing values

In [88]:
# Check missing values for features with at least one missing value
missing(market_df)

Columns with missing values:


Unnamed: 0,Missing Values,Percentage (%)
Switzerland_MAB_ELE_PRO756,1,0.46
Switzerland_MAB_ELE_SHP756,1,0.46
United Kingdom_MAB_ELE_SHP826,18,8.22
United States_MAB_ELE_SHP840,1,0.46
Producer Prices_PRI27826_org,18,8.22
Producer Prices_PRI27250_org,35,15.98
Producer Prices_PRI27156_org,23,10.5
production index_PRO28756_org,1,0.46
production index_PRO271000_org,11,5.02
production index_PRO27756_org,1,0.46


In [89]:
market_df.head()

Unnamed: 0,DATE,China_MAB_ELE_PRO156,China_MAB_ELE_SHP156,France_MAB_ELE_PRO250,France_MAB_ELE_SHP250,Germany_MAB_ELE_PRO276,Germany_MAB_ELE_SHP276,Italy_MAB_ELE_PRO380,Italy_MAB_ELE_SHP380,Japan_MAB_ELE_PRO392,Japan_MAB_ELE_SHP392,Switzerland_MAB_ELE_PRO756,Switzerland_MAB_ELE_SHP756,United Kingdom_MAB_ELE_PRO826,United Kingdom_MAB_ELE_SHP826,United States_MAB_ELE_PRO840,United States_MAB_ELE_SHP840,Europe_MAB_ELE_PRO1100,Europe_MAB_ELE_SHP1100,Europe_RohiBASEMET1000_org,Europe_RohiENERGY1000_org,Europe_RohiMETMIN1000_org,Europe_RohiNATGAS1000_org,Europe_RohCRUDE_PETRO1000_org,Europe_RohCOPPER1000_org,Europe_WKLWEUR840_org,Producer Prices_PRI27840_org,Producer Prices_PRI27826_org,Producer Prices_PRI27380_org,Producer Prices_PRI27250_org,Producer Prices_PRI27276_org,Producer Prices_PRI27156_org,production index_PRO28840_org,production index_PRO281000_org,production index_PRO28756_org,production index_PRO28826_org,production index_PRO28380_org,production index_PRO28392_org,production index_PRO28250_org,production index_PRO28276_org,production index_PRO27840_org,production index_PRO271000_org,production index_PRO27756_org,production index_PRO27826_org,production index_PRO27380_org,production index_PRO27392_org,production index_PRO27250_org,production index_PRO27276_org,YearMonth
0,2004-02-01,16.940704,16.940704,112.091273,83.458866,82.623037,79.452532,124.289603,86.560493,109.33401,110.495272,91.221862,89.987275,111.353812,73.601265,107.6014,79.24023,97.122911,80.09853,54.039811,44.123338,48.747945,87.076974,39.639458,36.623832,1.2646,78.969864,80.757423,93.020027,,93.230453,,102.491722,97.597374,97.1,106.191977,116.790276,110.890034,118.274109,80.82901,117.723991,,81.1,120.706516,141.510864,106.161262,102.077057,85.9132,2004-02
1,2004-03-01,23.711852,23.711852,136.327976,106.168192,100.556582,97.012918,143.411662,106.344544,140.884616,144.686166,85.866287,79.883583,127.558608,84.047595,110.187364,98.619024,113.783904,96.015929,54.666162,47.588957,49.256157,87.192705,42.592034,39.931055,1.2262,79.673569,80.962135,93.540268,,93.335678,,105.62748,113.224892,91.195116,121.625075,139.288391,141.176853,148.121841,102.130104,119.220779,,76.690307,138.30955,152.880234,140.288741,117.225685,97.670815,2004-03
2,2004-04-01,24.435235,24.435235,117.791806,92.007646,89.653203,84.932358,129.083828,95.579673,105.853579,102.655769,85.622508,79.740802,108.732297,73.026027,108.166564,89.774031,101.715199,85.167236,54.872715,47.779013,49.423751,91.379923,42.650637,39.134854,1.1985,80.337639,80.757423,93.852425,,93.440903,,103.484955,100.16909,93.793535,104.965505,125.289566,105.648765,125.482231,90.961426,117.441124,,71.552403,115.55733,137.796875,106.271197,105.335777,87.253983,2004-04
3,2004-05-01,23.708115,23.708115,109.002541,85.696486,86.880571,82.372794,135.590391,100.087039,101.864777,100.305285,85.378729,79.598021,110.6452,74.591883,108.425887,87.463813,101.275727,84.485767,51.230356,53.590898,46.468392,99.04452,47.517121,36.278433,1.2007,80.798828,80.757423,93.852425,,93.546127,,103.643944,99.581436,96.391954,105.885359,131.988998,101.990361,116.64975,88.082901,117.899216,,66.4145,119.269534,143.860535,101.60871,96.616508,84.675552,2004-05
4,2004-06-01,27.009138,27.009138,133.785737,106.641482,99.010814,95.10874,136.424935,110.889719,120.33292,119.61638,85.13495,79.455239,122.02096,82.343346,110.569933,97.364496,112.057197,96.963294,52.876331,50.799575,47.803913,98.636267,44.967605,35.65738,1.2138,80.91349,80.552711,93.956467,,93.440903,,106.062668,109.27771,98.990373,118.252278,132.988922,122.136575,143.248734,100.978699,119.499107,,61.276596,128.849416,144.315308,116.655248,118.45871,95.401802,2004-06


In [90]:
market_df.columns

Index(['DATE', 'China_MAB_ELE_PRO156', 'China_MAB_ELE_SHP156',
       'France_MAB_ELE_PRO250', 'France_MAB_ELE_SHP250',
       'Germany_MAB_ELE_PRO276', 'Germany_MAB_ELE_SHP276',
       'Italy_MAB_ELE_PRO380', 'Italy_MAB_ELE_SHP380', 'Japan_MAB_ELE_PRO392',
       'Japan_MAB_ELE_SHP392', 'Switzerland_MAB_ELE_PRO756',
       'Switzerland_MAB_ELE_SHP756', 'United Kingdom_MAB_ELE_PRO826',
       'United Kingdom_MAB_ELE_SHP826', 'United States_MAB_ELE_PRO840',
       'United States_MAB_ELE_SHP840', 'Europe_MAB_ELE_PRO1100',
       'Europe_MAB_ELE_SHP1100', 'Europe_RohiBASEMET1000_org',
       'Europe_RohiENERGY1000_org', 'Europe_RohiMETMIN1000_org',
       'Europe_RohiNATGAS1000_org', 'Europe_RohCRUDE_PETRO1000_org',
       'Europe_RohCOPPER1000_org', 'Europe_WKLWEUR840_org',
       'Producer Prices_PRI27840_org', 'Producer Prices_PRI27826_org',
       'Producer Prices_PRI27380_org', 'Producer Prices_PRI27250_org',
       'Producer Prices_PRI27276_org', 'Producer Prices_PRI27156_org',
    

### Fix missing values

> Switzerland_MAB_ELE_PRO75

Filled the one missing value using forward fill (uses the previous known value)

In [91]:
# check missing values in Switzerland_MAB_ELE_PRO756
market_df['Switzerland_MAB_ELE_PRO756'].isnull().sum()

1

In [92]:
market_df[market_df['Switzerland_MAB_ELE_PRO756'].isna()]

Unnamed: 0,DATE,China_MAB_ELE_PRO156,China_MAB_ELE_SHP156,France_MAB_ELE_PRO250,France_MAB_ELE_SHP250,Germany_MAB_ELE_PRO276,Germany_MAB_ELE_SHP276,Italy_MAB_ELE_PRO380,Italy_MAB_ELE_SHP380,Japan_MAB_ELE_PRO392,Japan_MAB_ELE_SHP392,Switzerland_MAB_ELE_PRO756,Switzerland_MAB_ELE_SHP756,United Kingdom_MAB_ELE_PRO826,United Kingdom_MAB_ELE_SHP826,United States_MAB_ELE_PRO840,United States_MAB_ELE_SHP840,Europe_MAB_ELE_PRO1100,Europe_MAB_ELE_SHP1100,Europe_RohiBASEMET1000_org,Europe_RohiENERGY1000_org,Europe_RohiMETMIN1000_org,Europe_RohiNATGAS1000_org,Europe_RohCRUDE_PETRO1000_org,Europe_RohCOPPER1000_org,Europe_WKLWEUR840_org,Producer Prices_PRI27840_org,Producer Prices_PRI27826_org,Producer Prices_PRI27380_org,Producer Prices_PRI27250_org,Producer Prices_PRI27276_org,Producer Prices_PRI27156_org,production index_PRO28840_org,production index_PRO281000_org,production index_PRO28756_org,production index_PRO28826_org,production index_PRO28380_org,production index_PRO28392_org,production index_PRO28250_org,production index_PRO28276_org,production index_PRO27840_org,production index_PRO271000_org,production index_PRO27756_org,production index_PRO27826_org,production index_PRO27380_org,production index_PRO27392_org,production index_PRO27250_org,production index_PRO27276_org,YearMonth
218,2022-04-01,267.373145,267.373145,87.69811,116.528738,99.522205,127.022869,103.55669,128.733305,114.262328,115.012049,,,95.266502,,116.961047,,112.902215,134.93551,146.090998,153.188945,138.094143,243.43603,130.83543,134.859685,1.0819,137.531616,,118.408043,113.280655,121.220627,98.857087,119.385483,128.285706,,84.728728,111.090744,120.09881,91.979698,98.675873,112.158089,134.843353,,114.359844,86.255684,102.36168,80.763306,101.074341,2022-04


In [93]:
# Fill missing values in Switzerland_MAB_ELE_PRO756 using forward fill
market_df['Switzerland_MAB_ELE_PRO756'].fillna(method='ffill', inplace=True)

> Switzerland_MAB_ELE_PRO756

Filled the one missing value using forward fill (uses the previous known value)

In [94]:
missing(market_df)

Columns with missing values:


Unnamed: 0,Missing Values,Percentage (%)
Switzerland_MAB_ELE_SHP756,1,0.46
United Kingdom_MAB_ELE_SHP826,18,8.22
United States_MAB_ELE_SHP840,1,0.46
Producer Prices_PRI27826_org,18,8.22
Producer Prices_PRI27250_org,35,15.98
Producer Prices_PRI27156_org,23,10.5
production index_PRO28756_org,1,0.46
production index_PRO271000_org,11,5.02
production index_PRO27756_org,1,0.46


In [95]:
# Fill missing values in Switzerland_MAB_ELE_SHP756 using forward fill
market_df['Switzerland_MAB_ELE_SHP756'].fillna(method='ffill', inplace=True)

In [96]:
missing(market_df)

Columns with missing values:


Unnamed: 0,Missing Values,Percentage (%)
United Kingdom_MAB_ELE_SHP826,18,8.22
United States_MAB_ELE_SHP840,1,0.46
Producer Prices_PRI27826_org,18,8.22
Producer Prices_PRI27250_org,35,15.98
Producer Prices_PRI27156_org,23,10.5
production index_PRO28756_org,1,0.46
production index_PRO271000_org,11,5.02
production index_PRO27756_org,1,0.46


> States_MAB_ELE_SHP840


In [97]:
# Fill missing values in United States_MAB_ELE_SHP840 using forward fill
market_df['United States_MAB_ELE_SHP840'].fillna(method='ffill', inplace=True)

In [98]:
missing(market_df)

Columns with missing values:


Unnamed: 0,Missing Values,Percentage (%)
United Kingdom_MAB_ELE_SHP826,18,8.22
Producer Prices_PRI27826_org,18,8.22
Producer Prices_PRI27250_org,35,15.98
Producer Prices_PRI27156_org,23,10.5
production index_PRO28756_org,1,0.46
production index_PRO271000_org,11,5.02
production index_PRO27756_org,1,0.46


> production index_PRO28756_org	


In [99]:
# Fill missing values in production index_PRO28756_org using forward fill
market_df['production index_PRO28756_org'].fillna(method='ffill', inplace=True)

In [100]:
missing(market_df)

Columns with missing values:


Unnamed: 0,Missing Values,Percentage (%)
United Kingdom_MAB_ELE_SHP826,18,8.22
Producer Prices_PRI27826_org,18,8.22
Producer Prices_PRI27250_org,35,15.98
Producer Prices_PRI27156_org,23,10.5
production index_PRO271000_org,11,5.02
production index_PRO27756_org,1,0.46


> production index_PRO27756_org

In [101]:
# Fill missing values in production index_PRO27756_org using forward fill
market_df['production index_PRO27756_org'].fillna(method='ffill', inplace=True)

In [102]:
missing(market_df)

Columns with missing values:


Unnamed: 0,Missing Values,Percentage (%)
United Kingdom_MAB_ELE_SHP826,18,8.22
Producer Prices_PRI27826_org,18,8.22
Producer Prices_PRI27250_org,35,15.98
Producer Prices_PRI27156_org,23,10.5
production index_PRO271000_org,11,5.02


>United Kingdom_MAB_ELE_SHP826, Producer Prices_PRI27826_org

This variables have missing data from november 2020 untill april 2022.



In [103]:
# Check missing values in United Kingdom_MAB_ELE_SHP826
market_df[market_df['United Kingdom_MAB_ELE_SHP826'].isnull()]

Unnamed: 0,DATE,China_MAB_ELE_PRO156,China_MAB_ELE_SHP156,France_MAB_ELE_PRO250,France_MAB_ELE_SHP250,Germany_MAB_ELE_PRO276,Germany_MAB_ELE_SHP276,Italy_MAB_ELE_PRO380,Italy_MAB_ELE_SHP380,Japan_MAB_ELE_PRO392,Japan_MAB_ELE_SHP392,Switzerland_MAB_ELE_PRO756,Switzerland_MAB_ELE_SHP756,United Kingdom_MAB_ELE_PRO826,United Kingdom_MAB_ELE_SHP826,United States_MAB_ELE_PRO840,United States_MAB_ELE_SHP840,Europe_MAB_ELE_PRO1100,Europe_MAB_ELE_SHP1100,Europe_RohiBASEMET1000_org,Europe_RohiENERGY1000_org,Europe_RohiMETMIN1000_org,Europe_RohiNATGAS1000_org,Europe_RohCRUDE_PETRO1000_org,Europe_RohCOPPER1000_org,Europe_WKLWEUR840_org,Producer Prices_PRI27840_org,Producer Prices_PRI27826_org,Producer Prices_PRI27380_org,Producer Prices_PRI27250_org,Producer Prices_PRI27276_org,Producer Prices_PRI27156_org,production index_PRO28840_org,production index_PRO281000_org,production index_PRO28756_org,production index_PRO28826_org,production index_PRO28380_org,production index_PRO28392_org,production index_PRO28250_org,production index_PRO28276_org,production index_PRO27840_org,production index_PRO271000_org,production index_PRO27756_org,production index_PRO27826_org,production index_PRO27380_org,production index_PRO27392_org,production index_PRO27250_org,production index_PRO27276_org,YearMonth
201,2020-11-01,274.401951,274.401951,94.021859,116.619203,112.279605,129.325685,103.453886,120.110839,107.8768,108.831086,94.450154,90.768739,106.183797,,102.842865,115.023808,115.623883,129.531565,91.446941,55.301816,90.279614,59.062273,53.520901,93.817077,1.1838,113.15876,,105.193787,103.084373,110.908455,91.126147,100.440186,123.780357,87.062691,93.211822,109.490875,111.299889,100.304573,114.565346,107.602722,135.364349,107.171001,129.687656,89.590698,100.897088,83.845871,108.087662,2020-11
202,2020-12-01,296.243834,296.243834,94.067274,120.031794,112.755519,135.720643,88.323974,132.213934,116.53076,118.433251,96.511966,92.848468,98.839148,,100.701243,127.778181,113.323302,136.69085,98.044626,63.746684,99.650513,63.858433,61.647509,103.151524,1.217,113.538193,,105.401886,103.18634,110.908455,91.602428,98.205009,123.597504,87.814041,92.189762,95.892014,119.901755,105.380714,124.467476,105.646439,126.429832,111.489362,110.887137,70.944923,109.657266,75.743118,91.276321,2020-12
203,2021-01-01,214.843002,214.843002,84.129571,97.831165,90.608216,103.206701,89.617695,92.884582,102.486579,103.404191,93.962599,90.451946,88.236915,,107.055814,114.206245,96.647822,105.778193,99.711652,69.882566,102.839302,73.021479,67.81732,105.804688,1.2171,113.347328,,105.922134,103.594193,111.434586,92.022632,106.065178,109.42234,86.749628,77.983136,93.39222,105.466748,89.03553,89.119171,109.018326,116.305664,106.382979,106.815687,80.949974,96.409984,76.183487,93.339058,2021-01
204,2021-02-01,214.843002,214.843002,88.827948,107.84733,102.520103,117.656513,103.66083,107.781307,112.067771,112.963085,91.413233,88.055424,98.93732,,107.707957,121.292151,107.957109,119.189639,105.019764,80.381943,106.392356,97.478025,76.496385,112.424523,1.2098,113.84111,,106.546425,104.002045,111.645035,92.256222,107.852631,116.34584,85.685216,86.261818,109.490875,116.299057,93.807106,102.014969,107.421349,120.062042,101.276596,121.904002,90.272865,103.440134,80.763306,103.446495,2021-02
205,2021-03-01,309.574752,309.574752,109.451042,129.230949,123.58159,147.238328,117.789173,134.255081,143.412762,147.823226,88.863867,85.658902,125.183283,,112.193648,144.457633,129.677284,147.139591,109.040858,80.765237,110.221389,65.61845,80.75579,119.290152,1.1899,115.685043,,106.962624,104.511856,111.750267,92.803706,113.301704,142.993607,84.620803,116.616983,124.589615,147.738341,118.680206,125.618881,109.99852,145.655823,96.170213,140.704521,102.172813,134.592862,94.502747,119.845291,2021-03
206,2021-04-01,282.420831,282.420831,93.530969,115.198554,108.016734,125.173968,109.508032,116.72687,116.215743,115.628078,89.111656,86.280605,104.652101,,112.572205,135.097535,114.127744,127.187722,113.471814,80.801718,115.323053,71.27745,79.642441,123.757038,1.1979,117.270813,,107.795021,104.613815,111.960716,92.910796,113.559624,129.624741,87.062691,96.584618,117.29023,120.147103,100.0,109.383995,110.616066,133.578918,92.639874,119.269534,91.637192,108.199661,83.053207,105.509239,2021-04
207,2021-05-01,291.521687,291.521687,86.443078,103.008082,102.878699,122.563573,111.914939,127.351245,102.790421,104.42544,89.359446,86.902308,103.071975,,112.187736,135.07879,112.073688,126.496868,121.931208,86.284861,125.793313,83.467233,84.007277,134.867515,1.2146,118.790665,,108.419319,104.817749,112.171173,95.187973,113.255424,125.129715,89.504579,95.255941,119.590034,105.767591,92.994919,104.087509,110.072578,128.480606,89.109535,117.233809,94.290047,96.719942,75.831192,100.661797,2021-05
208,2021-06-01,321.742425,321.742425,107.274337,135.650631,115.558297,139.043711,111.887888,134.819939,128.91902,131.821521,89.607236,87.524012,116.740429,,112.808948,148.629304,125.123157,144.396426,118.988358,93.918953,124.27319,94.730656,90.843411,127.82723,1.2047,120.752991,,109.355759,105.531479,112.381622,96.697182,114.939987,139.292435,91.946466,109.462567,118.990089,133.326069,115.431473,117.213593,108.587227,141.034653,85.579196,129.927153,95.578575,119.933003,94.062386,112.52256,2021-06
209,2021-07-01,264.032744,264.032744,95.324444,101.023707,110.680184,130.895496,113.565085,131.02265,119.578788,121.681405,93.286137,91.141378,105.426099,,107.927668,133.574009,116.188229,128.83682,119.339499,98.289417,124.520601,112.678743,92.715863,125.429283,1.1822,121.894318,,110.084099,105.837372,113.223434,97.717431,107.521271,129.356079,94.357048,95.869176,122.189819,123.965424,103.553299,112.26252,108.732765,134.241028,91.44208,122.742241,93.759468,110.634392,81.99633,107.778252,2021-07
210,2021-08-01,253.723069,253.723069,63.856268,90.607333,100.880844,119.089013,61.338702,80.672925,103.195873,107.638835,96.965037,94.758744,99.283383,,110.153555,141.828254,98.242703,110.683212,120.873867,96.323599,119.044083,129.818583,87.128029,124.358515,1.1772,122.707123,,110.604347,106.653076,113.960014,97.822692,110.545776,119.849464,96.767629,89.839026,66.694443,111.770463,65.380714,99.481873,109.376541,125.740089,97.304965,116.395569,49.039917,85.712195,61.387154,103.446495,2021-08


In [104]:
# check missing values in Producer Prices_PRI27826_org
market_df[market_df['Producer Prices_PRI27826_org'].isnull()]

Unnamed: 0,DATE,China_MAB_ELE_PRO156,China_MAB_ELE_SHP156,France_MAB_ELE_PRO250,France_MAB_ELE_SHP250,Germany_MAB_ELE_PRO276,Germany_MAB_ELE_SHP276,Italy_MAB_ELE_PRO380,Italy_MAB_ELE_SHP380,Japan_MAB_ELE_PRO392,Japan_MAB_ELE_SHP392,Switzerland_MAB_ELE_PRO756,Switzerland_MAB_ELE_SHP756,United Kingdom_MAB_ELE_PRO826,United Kingdom_MAB_ELE_SHP826,United States_MAB_ELE_PRO840,United States_MAB_ELE_SHP840,Europe_MAB_ELE_PRO1100,Europe_MAB_ELE_SHP1100,Europe_RohiBASEMET1000_org,Europe_RohiENERGY1000_org,Europe_RohiMETMIN1000_org,Europe_RohiNATGAS1000_org,Europe_RohCRUDE_PETRO1000_org,Europe_RohCOPPER1000_org,Europe_WKLWEUR840_org,Producer Prices_PRI27840_org,Producer Prices_PRI27826_org,Producer Prices_PRI27380_org,Producer Prices_PRI27250_org,Producer Prices_PRI27276_org,Producer Prices_PRI27156_org,production index_PRO28840_org,production index_PRO281000_org,production index_PRO28756_org,production index_PRO28826_org,production index_PRO28380_org,production index_PRO28392_org,production index_PRO28250_org,production index_PRO28276_org,production index_PRO27840_org,production index_PRO271000_org,production index_PRO27756_org,production index_PRO27826_org,production index_PRO27380_org,production index_PRO27392_org,production index_PRO27250_org,production index_PRO27276_org,YearMonth
201,2020-11-01,274.401951,274.401951,94.021859,116.619203,112.279605,129.325685,103.453886,120.110839,107.8768,108.831086,94.450154,90.768739,106.183797,,102.842865,115.023808,115.623883,129.531565,91.446941,55.301816,90.279614,59.062273,53.520901,93.817077,1.1838,113.15876,,105.193787,103.084373,110.908455,91.126147,100.440186,123.780357,87.062691,93.211822,109.490875,111.299889,100.304573,114.565346,107.602722,135.364349,107.171001,129.687656,89.590698,100.897088,83.845871,108.087662,2020-11
202,2020-12-01,296.243834,296.243834,94.067274,120.031794,112.755519,135.720643,88.323974,132.213934,116.53076,118.433251,96.511966,92.848468,98.839148,,100.701243,127.778181,113.323302,136.69085,98.044626,63.746684,99.650513,63.858433,61.647509,103.151524,1.217,113.538193,,105.401886,103.18634,110.908455,91.602428,98.205009,123.597504,87.814041,92.189762,95.892014,119.901755,105.380714,124.467476,105.646439,126.429832,111.489362,110.887137,70.944923,109.657266,75.743118,91.276321,2020-12
203,2021-01-01,214.843002,214.843002,84.129571,97.831165,90.608216,103.206701,89.617695,92.884582,102.486579,103.404191,93.962599,90.451946,88.236915,,107.055814,114.206245,96.647822,105.778193,99.711652,69.882566,102.839302,73.021479,67.81732,105.804688,1.2171,113.347328,,105.922134,103.594193,111.434586,92.022632,106.065178,109.42234,86.749628,77.983136,93.39222,105.466748,89.03553,89.119171,109.018326,116.305664,106.382979,106.815687,80.949974,96.409984,76.183487,93.339058,2021-01
204,2021-02-01,214.843002,214.843002,88.827948,107.84733,102.520103,117.656513,103.66083,107.781307,112.067771,112.963085,91.413233,88.055424,98.93732,,107.707957,121.292151,107.957109,119.189639,105.019764,80.381943,106.392356,97.478025,76.496385,112.424523,1.2098,113.84111,,106.546425,104.002045,111.645035,92.256222,107.852631,116.34584,85.685216,86.261818,109.490875,116.299057,93.807106,102.014969,107.421349,120.062042,101.276596,121.904002,90.272865,103.440134,80.763306,103.446495,2021-02
205,2021-03-01,309.574752,309.574752,109.451042,129.230949,123.58159,147.238328,117.789173,134.255081,143.412762,147.823226,88.863867,85.658902,125.183283,,112.193648,144.457633,129.677284,147.139591,109.040858,80.765237,110.221389,65.61845,80.75579,119.290152,1.1899,115.685043,,106.962624,104.511856,111.750267,92.803706,113.301704,142.993607,84.620803,116.616983,124.589615,147.738341,118.680206,125.618881,109.99852,145.655823,96.170213,140.704521,102.172813,134.592862,94.502747,119.845291,2021-03
206,2021-04-01,282.420831,282.420831,93.530969,115.198554,108.016734,125.173968,109.508032,116.72687,116.215743,115.628078,89.111656,86.280605,104.652101,,112.572205,135.097535,114.127744,127.187722,113.471814,80.801718,115.323053,71.27745,79.642441,123.757038,1.1979,117.270813,,107.795021,104.613815,111.960716,92.910796,113.559624,129.624741,87.062691,96.584618,117.29023,120.147103,100.0,109.383995,110.616066,133.578918,92.639874,119.269534,91.637192,108.199661,83.053207,105.509239,2021-04
207,2021-05-01,291.521687,291.521687,86.443078,103.008082,102.878699,122.563573,111.914939,127.351245,102.790421,104.42544,89.359446,86.902308,103.071975,,112.187736,135.07879,112.073688,126.496868,121.931208,86.284861,125.793313,83.467233,84.007277,134.867515,1.2146,118.790665,,108.419319,104.817749,112.171173,95.187973,113.255424,125.129715,89.504579,95.255941,119.590034,105.767591,92.994919,104.087509,110.072578,128.480606,89.109535,117.233809,94.290047,96.719942,75.831192,100.661797,2021-05
208,2021-06-01,321.742425,321.742425,107.274337,135.650631,115.558297,139.043711,111.887888,134.819939,128.91902,131.821521,89.607236,87.524012,116.740429,,112.808948,148.629304,125.123157,144.396426,118.988358,93.918953,124.27319,94.730656,90.843411,127.82723,1.2047,120.752991,,109.355759,105.531479,112.381622,96.697182,114.939987,139.292435,91.946466,109.462567,118.990089,133.326069,115.431473,117.213593,108.587227,141.034653,85.579196,129.927153,95.578575,119.933003,94.062386,112.52256,2021-06
209,2021-07-01,264.032744,264.032744,95.324444,101.023707,110.680184,130.895496,113.565085,131.02265,119.578788,121.681405,93.286137,91.141378,105.426099,,107.927668,133.574009,116.188229,128.83682,119.339499,98.289417,124.520601,112.678743,92.715863,125.429283,1.1822,121.894318,,110.084099,105.837372,113.223434,97.717431,107.521271,129.356079,94.357048,95.869176,122.189819,123.965424,103.553299,112.26252,108.732765,134.241028,91.44208,122.742241,93.759468,110.634392,81.99633,107.778252,2021-07
210,2021-08-01,253.723069,253.723069,63.856268,90.607333,100.880844,119.089013,61.338702,80.672925,103.195873,107.638835,96.965037,94.758744,99.283383,,110.153555,141.828254,98.242703,110.683212,120.873867,96.323599,119.044083,129.818583,87.128029,124.358515,1.1772,122.707123,,110.604347,106.653076,113.960014,97.822692,110.545776,119.849464,96.767629,89.839026,66.694443,111.770463,65.380714,99.481873,109.376541,125.740089,97.304965,116.395569,49.039917,85.712195,61.387154,103.446495,2021-08


In [105]:
#fill missing values in United Kingdom_MAB_ELE_SHP826 using mean
market_df['United Kingdom_MAB_ELE_SHP826'].fillna(market_df['United Kingdom_MAB_ELE_SHP826'].mean(), inplace=True)
market_df['Producer Prices_PRI27826_org'].fillna(market_df['Producer Prices_PRI27826_org'].mean(), inplace=True)

In [106]:
missing(market_df)

Columns with missing values:


Unnamed: 0,Missing Values,Percentage (%)
Producer Prices_PRI27250_org,35,15.98
Producer Prices_PRI27156_org,23,10.5
production index_PRO271000_org,11,5.02


> production index_PRO271000_org
Has missing data from january 2004 until december 2004. <br>
We will input the missing values with zero, considering this variable was not being tracked since the very beginning of the dataset.

In [107]:
# check missing values in production index_PRO271000_org
market_df[market_df['production index_PRO271000_org'].isnull()]

Unnamed: 0,DATE,China_MAB_ELE_PRO156,China_MAB_ELE_SHP156,France_MAB_ELE_PRO250,France_MAB_ELE_SHP250,Germany_MAB_ELE_PRO276,Germany_MAB_ELE_SHP276,Italy_MAB_ELE_PRO380,Italy_MAB_ELE_SHP380,Japan_MAB_ELE_PRO392,Japan_MAB_ELE_SHP392,Switzerland_MAB_ELE_PRO756,Switzerland_MAB_ELE_SHP756,United Kingdom_MAB_ELE_PRO826,United Kingdom_MAB_ELE_SHP826,United States_MAB_ELE_PRO840,United States_MAB_ELE_SHP840,Europe_MAB_ELE_PRO1100,Europe_MAB_ELE_SHP1100,Europe_RohiBASEMET1000_org,Europe_RohiENERGY1000_org,Europe_RohiMETMIN1000_org,Europe_RohiNATGAS1000_org,Europe_RohCRUDE_PETRO1000_org,Europe_RohCOPPER1000_org,Europe_WKLWEUR840_org,Producer Prices_PRI27840_org,Producer Prices_PRI27826_org,Producer Prices_PRI27380_org,Producer Prices_PRI27250_org,Producer Prices_PRI27276_org,Producer Prices_PRI27156_org,production index_PRO28840_org,production index_PRO281000_org,production index_PRO28756_org,production index_PRO28826_org,production index_PRO28380_org,production index_PRO28392_org,production index_PRO28250_org,production index_PRO28276_org,production index_PRO27840_org,production index_PRO271000_org,production index_PRO27756_org,production index_PRO27826_org,production index_PRO27380_org,production index_PRO27392_org,production index_PRO27250_org,production index_PRO27276_org,YearMonth
0,2004-02-01,16.940704,16.940704,112.091273,83.458866,82.623037,79.452532,124.289603,86.560493,109.33401,110.495272,91.221862,89.987275,111.353812,73.601265,107.6014,79.24023,97.122911,80.09853,54.039811,44.123338,48.747945,87.076974,39.639458,36.623832,1.2646,78.969864,80.757423,93.020027,,93.230453,,102.491722,97.597374,97.1,106.191977,116.790276,110.890034,118.274109,80.82901,117.723991,,81.1,120.706516,141.510864,106.161262,102.077057,85.9132,2004-02
1,2004-03-01,23.711852,23.711852,136.327976,106.168192,100.556582,97.012918,143.411662,106.344544,140.884616,144.686166,85.866287,79.883583,127.558608,84.047595,110.187364,98.619024,113.783904,96.015929,54.666162,47.588957,49.256157,87.192705,42.592034,39.931055,1.2262,79.673569,80.962135,93.540268,,93.335678,,105.62748,113.224892,91.195116,121.625075,139.288391,141.176853,148.121841,102.130104,119.220779,,76.690307,138.30955,152.880234,140.288741,117.225685,97.670815,2004-03
2,2004-04-01,24.435235,24.435235,117.791806,92.007646,89.653203,84.932358,129.083828,95.579673,105.853579,102.655769,85.622508,79.740802,108.732297,73.026027,108.166564,89.774031,101.715199,85.167236,54.872715,47.779013,49.423751,91.379923,42.650637,39.134854,1.1985,80.337639,80.757423,93.852425,,93.440903,,103.484955,100.16909,93.793535,104.965505,125.289566,105.648765,125.482231,90.961426,117.441124,,71.552403,115.55733,137.796875,106.271197,105.335777,87.253983,2004-04
3,2004-05-01,23.708115,23.708115,109.002541,85.696486,86.880571,82.372794,135.590391,100.087039,101.864777,100.305285,85.378729,79.598021,110.6452,74.591883,108.425887,87.463813,101.275727,84.485767,51.230356,53.590898,46.468392,99.04452,47.517121,36.278433,1.2007,80.798828,80.757423,93.852425,,93.546127,,103.643944,99.581436,96.391954,105.885359,131.988998,101.990361,116.64975,88.082901,117.899216,,66.4145,119.269534,143.860535,101.60871,96.616508,84.675552,2004-05
4,2004-06-01,27.009138,27.009138,133.785737,106.641482,99.010814,95.10874,136.424935,110.889719,120.33292,119.61638,85.13495,79.455239,122.02096,82.343346,110.569933,97.364496,112.057197,96.963294,52.876331,50.799575,47.803913,98.636267,44.967605,35.65738,1.2138,80.91349,80.552711,93.956467,,93.440903,,106.062668,109.27771,98.990373,118.252278,132.988922,122.136575,143.248734,100.978699,119.499107,,61.276596,128.849416,144.315308,116.655248,118.45871,95.401802,2004-06
5,2004-07-01,23.590959,23.590959,117.405904,89.302781,94.200318,89.646239,140.351467,106.133436,118.140547,116.438941,84.324925,78.702987,112.620947,77.109447,104.538671,84.691023,104.089388,88.061626,54.771829,53.953416,49.341893,95.775539,47.939813,37.272915,1.2266,80.974838,80.552711,94.060524,,93.440903,,99.118011,102.110336,96.391954,108.951537,135.588715,118.880825,122.030464,95.682213,115.277336,,63.546099,119.269534,151.288528,116.631115,109.915596,91.48259,2004-07
6,2004-08-01,24.280597,24.280597,79.137033,71.798962,82.804844,80.197268,54.397252,49.046004,106.243773,107.615706,83.514901,77.950734,102.724312,70.849033,107.164223,88.294735,80.476626,71.158884,54.087599,58.14188,48.786719,89.649886,53.243495,37.772771,1.2176,81.121933,80.962135,94.268616,,93.440903,,100.78627,90.885391,93.793535,97.095648,53.495544,109.858584,79.492393,83.246979,119.799347,,65.815603,112.922862,56.467911,98.873136,78.561462,81.993988,2004-08
7,2004-09-01,25.989192,25.989192,130.487955,98.896817,102.039931,101.18697,130.661571,104.558892,124.929188,128.391232,82.704877,77.198482,120.37799,83.96576,109.027976,97.232273,111.213045,97.167301,54.69557,58.052276,49.280018,87.046949,52.627049,38.420022,1.2218,81.402054,81.576263,94.372665,,93.546127,,102.90654,108.077499,91.195116,114.777276,120.589951,129.960671,136.446701,103.972374,121.154922,,68.085106,130.525896,153.78978,114.669943,120.836693,98.495918,2004-09
8,2004-10-01,24.895962,24.895962,119.707613,99.60374,93.519358,89.045549,133.231484,95.903054,104.922286,104.343529,85.664089,80.08275,114.478921,80.901243,110.445124,91.011662,105.69441,89.526169,57.610933,66.835461,51.645502,104.581039,59.312567,39.977785,1.249,81.674583,81.576263,94.580765,,93.546127,,103.907288,101.639977,98.051186,108.13389,126.589455,108.53441,120.913712,93.955093,123.396988,,64.334121,125.975452,148.48407,97.557126,117.75412,92.720238,2004-10
9,2004-11-01,26.774785,26.774785,114.509588,86.511165,97.130763,93.852657,130.111098,105.53936,112.828512,112.273727,88.6233,82.967019,119.455503,84.270833,109.607827,86.753335,107.465868,92.995787,58.167131,59.107614,52.096793,101.189309,53.295969,41.445074,1.2991,81.607956,81.883324,94.580765,,93.546127,,101.965965,103.843315,104.907255,112.222128,124.189651,117.230886,117.563454,96.948761,124.746834,,60.583136,132.561621,143.708954,103.852026,109.563301,97.464546,2004-11


In [108]:
# fill missing values in production index_PRO271000_org with zero
market_df['production index_PRO271000_org'].fillna(0, inplace=True)

In [109]:
missing(market_df)

Columns with missing values:


Unnamed: 0,Missing Values,Percentage (%)
Producer Prices_PRI27250_org,35,15.98
Producer Prices_PRI27156_org,23,10.5


> Producer Prices_PRI27250_org

This feature has missing values since jannuary 2004 until december 2006.

We will input with zero, as we considered it has a not being tracked at the time, so starting at zero will be clear in the analysis.

In [110]:
# Check missing values in Producer Prices_PRI27250_org
market_df[market_df['Producer Prices_PRI27250_org'].isnull()]

Unnamed: 0,DATE,China_MAB_ELE_PRO156,China_MAB_ELE_SHP156,France_MAB_ELE_PRO250,France_MAB_ELE_SHP250,Germany_MAB_ELE_PRO276,Germany_MAB_ELE_SHP276,Italy_MAB_ELE_PRO380,Italy_MAB_ELE_SHP380,Japan_MAB_ELE_PRO392,Japan_MAB_ELE_SHP392,Switzerland_MAB_ELE_PRO756,Switzerland_MAB_ELE_SHP756,United Kingdom_MAB_ELE_PRO826,United Kingdom_MAB_ELE_SHP826,United States_MAB_ELE_PRO840,United States_MAB_ELE_SHP840,Europe_MAB_ELE_PRO1100,Europe_MAB_ELE_SHP1100,Europe_RohiBASEMET1000_org,Europe_RohiENERGY1000_org,Europe_RohiMETMIN1000_org,Europe_RohiNATGAS1000_org,Europe_RohCRUDE_PETRO1000_org,Europe_RohCOPPER1000_org,Europe_WKLWEUR840_org,Producer Prices_PRI27840_org,Producer Prices_PRI27826_org,Producer Prices_PRI27380_org,Producer Prices_PRI27250_org,Producer Prices_PRI27276_org,Producer Prices_PRI27156_org,production index_PRO28840_org,production index_PRO281000_org,production index_PRO28756_org,production index_PRO28826_org,production index_PRO28380_org,production index_PRO28392_org,production index_PRO28250_org,production index_PRO28276_org,production index_PRO27840_org,production index_PRO271000_org,production index_PRO27756_org,production index_PRO27826_org,production index_PRO27380_org,production index_PRO27392_org,production index_PRO27250_org,production index_PRO27276_org,YearMonth
0,2004-02-01,16.940704,16.940704,112.091273,83.458866,82.623037,79.452532,124.289603,86.560493,109.33401,110.495272,91.221862,89.987275,111.353812,73.601265,107.6014,79.24023,97.122911,80.09853,54.039811,44.123338,48.747945,87.076974,39.639458,36.623832,1.2646,78.969864,80.757423,93.020027,,93.230453,,102.491722,97.597374,97.1,106.191977,116.790276,110.890034,118.274109,80.82901,117.723991,0.0,81.1,120.706516,141.510864,106.161262,102.077057,85.9132,2004-02
1,2004-03-01,23.711852,23.711852,136.327976,106.168192,100.556582,97.012918,143.411662,106.344544,140.884616,144.686166,85.866287,79.883583,127.558608,84.047595,110.187364,98.619024,113.783904,96.015929,54.666162,47.588957,49.256157,87.192705,42.592034,39.931055,1.2262,79.673569,80.962135,93.540268,,93.335678,,105.62748,113.224892,91.195116,121.625075,139.288391,141.176853,148.121841,102.130104,119.220779,0.0,76.690307,138.30955,152.880234,140.288741,117.225685,97.670815,2004-03
2,2004-04-01,24.435235,24.435235,117.791806,92.007646,89.653203,84.932358,129.083828,95.579673,105.853579,102.655769,85.622508,79.740802,108.732297,73.026027,108.166564,89.774031,101.715199,85.167236,54.872715,47.779013,49.423751,91.379923,42.650637,39.134854,1.1985,80.337639,80.757423,93.852425,,93.440903,,103.484955,100.16909,93.793535,104.965505,125.289566,105.648765,125.482231,90.961426,117.441124,0.0,71.552403,115.55733,137.796875,106.271197,105.335777,87.253983,2004-04
3,2004-05-01,23.708115,23.708115,109.002541,85.696486,86.880571,82.372794,135.590391,100.087039,101.864777,100.305285,85.378729,79.598021,110.6452,74.591883,108.425887,87.463813,101.275727,84.485767,51.230356,53.590898,46.468392,99.04452,47.517121,36.278433,1.2007,80.798828,80.757423,93.852425,,93.546127,,103.643944,99.581436,96.391954,105.885359,131.988998,101.990361,116.64975,88.082901,117.899216,0.0,66.4145,119.269534,143.860535,101.60871,96.616508,84.675552,2004-05
4,2004-06-01,27.009138,27.009138,133.785737,106.641482,99.010814,95.10874,136.424935,110.889719,120.33292,119.61638,85.13495,79.455239,122.02096,82.343346,110.569933,97.364496,112.057197,96.963294,52.876331,50.799575,47.803913,98.636267,44.967605,35.65738,1.2138,80.91349,80.552711,93.956467,,93.440903,,106.062668,109.27771,98.990373,118.252278,132.988922,122.136575,143.248734,100.978699,119.499107,0.0,61.276596,128.849416,144.315308,116.655248,118.45871,95.401802,2004-06
5,2004-07-01,23.590959,23.590959,117.405904,89.302781,94.200318,89.646239,140.351467,106.133436,118.140547,116.438941,84.324925,78.702987,112.620947,77.109447,104.538671,84.691023,104.089388,88.061626,54.771829,53.953416,49.341893,95.775539,47.939813,37.272915,1.2266,80.974838,80.552711,94.060524,,93.440903,,99.118011,102.110336,96.391954,108.951537,135.588715,118.880825,122.030464,95.682213,115.277336,0.0,63.546099,119.269534,151.288528,116.631115,109.915596,91.48259,2004-07
6,2004-08-01,24.280597,24.280597,79.137033,71.798962,82.804844,80.197268,54.397252,49.046004,106.243773,107.615706,83.514901,77.950734,102.724312,70.849033,107.164223,88.294735,80.476626,71.158884,54.087599,58.14188,48.786719,89.649886,53.243495,37.772771,1.2176,81.121933,80.962135,94.268616,,93.440903,,100.78627,90.885391,93.793535,97.095648,53.495544,109.858584,79.492393,83.246979,119.799347,0.0,65.815603,112.922862,56.467911,98.873136,78.561462,81.993988,2004-08
7,2004-09-01,25.989192,25.989192,130.487955,98.896817,102.039931,101.18697,130.661571,104.558892,124.929188,128.391232,82.704877,77.198482,120.37799,83.96576,109.027976,97.232273,111.213045,97.167301,54.69557,58.052276,49.280018,87.046949,52.627049,38.420022,1.2218,81.402054,81.576263,94.372665,,93.546127,,102.90654,108.077499,91.195116,114.777276,120.589951,129.960671,136.446701,103.972374,121.154922,0.0,68.085106,130.525896,153.78978,114.669943,120.836693,98.495918,2004-09
8,2004-10-01,24.895962,24.895962,119.707613,99.60374,93.519358,89.045549,133.231484,95.903054,104.922286,104.343529,85.664089,80.08275,114.478921,80.901243,110.445124,91.011662,105.69441,89.526169,57.610933,66.835461,51.645502,104.581039,59.312567,39.977785,1.249,81.674583,81.576263,94.580765,,93.546127,,103.907288,101.639977,98.051186,108.13389,126.589455,108.53441,120.913712,93.955093,123.396988,0.0,64.334121,125.975452,148.48407,97.557126,117.75412,92.720238,2004-10
9,2004-11-01,26.774785,26.774785,114.509588,86.511165,97.130763,93.852657,130.111098,105.53936,112.828512,112.273727,88.6233,82.967019,119.455503,84.270833,109.607827,86.753335,107.465868,92.995787,58.167131,59.107614,52.096793,101.189309,53.295969,41.445074,1.2991,81.607956,81.883324,94.580765,,93.546127,,101.965965,103.843315,104.907255,112.222128,124.189651,117.230886,117.563454,96.948761,124.746834,0.0,60.583136,132.561621,143.708954,103.852026,109.563301,97.464546,2004-11


In [111]:
# fill missing values in Producer Prices_PRI27250_org with zero
market_df['Producer Prices_PRI27250_org'].fillna(0, inplace=True)

In [112]:
missing(market_df)

Columns with missing values:


Unnamed: 0,Missing Values,Percentage (%)
Producer Prices_PRI27156_org,23,10.5


> Producer Prices_PRI27156_org

This variable is missing the whole 2004 and 2005 years. (from january 2004 to december 2005)

In [113]:
# check missing values in Producer Prices_PRI27156_org
market_df[market_df['Producer Prices_PRI27156_org'].isnull()]

Unnamed: 0,DATE,China_MAB_ELE_PRO156,China_MAB_ELE_SHP156,France_MAB_ELE_PRO250,France_MAB_ELE_SHP250,Germany_MAB_ELE_PRO276,Germany_MAB_ELE_SHP276,Italy_MAB_ELE_PRO380,Italy_MAB_ELE_SHP380,Japan_MAB_ELE_PRO392,Japan_MAB_ELE_SHP392,Switzerland_MAB_ELE_PRO756,Switzerland_MAB_ELE_SHP756,United Kingdom_MAB_ELE_PRO826,United Kingdom_MAB_ELE_SHP826,United States_MAB_ELE_PRO840,United States_MAB_ELE_SHP840,Europe_MAB_ELE_PRO1100,Europe_MAB_ELE_SHP1100,Europe_RohiBASEMET1000_org,Europe_RohiENERGY1000_org,Europe_RohiMETMIN1000_org,Europe_RohiNATGAS1000_org,Europe_RohCRUDE_PETRO1000_org,Europe_RohCOPPER1000_org,Europe_WKLWEUR840_org,Producer Prices_PRI27840_org,Producer Prices_PRI27826_org,Producer Prices_PRI27380_org,Producer Prices_PRI27250_org,Producer Prices_PRI27276_org,Producer Prices_PRI27156_org,production index_PRO28840_org,production index_PRO281000_org,production index_PRO28756_org,production index_PRO28826_org,production index_PRO28380_org,production index_PRO28392_org,production index_PRO28250_org,production index_PRO28276_org,production index_PRO27840_org,production index_PRO271000_org,production index_PRO27756_org,production index_PRO27826_org,production index_PRO27380_org,production index_PRO27392_org,production index_PRO27250_org,production index_PRO27276_org,YearMonth
0,2004-02-01,16.940704,16.940704,112.091273,83.458866,82.623037,79.452532,124.289603,86.560493,109.33401,110.495272,91.221862,89.987275,111.353812,73.601265,107.6014,79.24023,97.122911,80.09853,54.039811,44.123338,48.747945,87.076974,39.639458,36.623832,1.2646,78.969864,80.757423,93.020027,0.0,93.230453,,102.491722,97.597374,97.1,106.191977,116.790276,110.890034,118.274109,80.82901,117.723991,0.0,81.1,120.706516,141.510864,106.161262,102.077057,85.9132,2004-02
1,2004-03-01,23.711852,23.711852,136.327976,106.168192,100.556582,97.012918,143.411662,106.344544,140.884616,144.686166,85.866287,79.883583,127.558608,84.047595,110.187364,98.619024,113.783904,96.015929,54.666162,47.588957,49.256157,87.192705,42.592034,39.931055,1.2262,79.673569,80.962135,93.540268,0.0,93.335678,,105.62748,113.224892,91.195116,121.625075,139.288391,141.176853,148.121841,102.130104,119.220779,0.0,76.690307,138.30955,152.880234,140.288741,117.225685,97.670815,2004-03
2,2004-04-01,24.435235,24.435235,117.791806,92.007646,89.653203,84.932358,129.083828,95.579673,105.853579,102.655769,85.622508,79.740802,108.732297,73.026027,108.166564,89.774031,101.715199,85.167236,54.872715,47.779013,49.423751,91.379923,42.650637,39.134854,1.1985,80.337639,80.757423,93.852425,0.0,93.440903,,103.484955,100.16909,93.793535,104.965505,125.289566,105.648765,125.482231,90.961426,117.441124,0.0,71.552403,115.55733,137.796875,106.271197,105.335777,87.253983,2004-04
3,2004-05-01,23.708115,23.708115,109.002541,85.696486,86.880571,82.372794,135.590391,100.087039,101.864777,100.305285,85.378729,79.598021,110.6452,74.591883,108.425887,87.463813,101.275727,84.485767,51.230356,53.590898,46.468392,99.04452,47.517121,36.278433,1.2007,80.798828,80.757423,93.852425,0.0,93.546127,,103.643944,99.581436,96.391954,105.885359,131.988998,101.990361,116.64975,88.082901,117.899216,0.0,66.4145,119.269534,143.860535,101.60871,96.616508,84.675552,2004-05
4,2004-06-01,27.009138,27.009138,133.785737,106.641482,99.010814,95.10874,136.424935,110.889719,120.33292,119.61638,85.13495,79.455239,122.02096,82.343346,110.569933,97.364496,112.057197,96.963294,52.876331,50.799575,47.803913,98.636267,44.967605,35.65738,1.2138,80.91349,80.552711,93.956467,0.0,93.440903,,106.062668,109.27771,98.990373,118.252278,132.988922,122.136575,143.248734,100.978699,119.499107,0.0,61.276596,128.849416,144.315308,116.655248,118.45871,95.401802,2004-06
5,2004-07-01,23.590959,23.590959,117.405904,89.302781,94.200318,89.646239,140.351467,106.133436,118.140547,116.438941,84.324925,78.702987,112.620947,77.109447,104.538671,84.691023,104.089388,88.061626,54.771829,53.953416,49.341893,95.775539,47.939813,37.272915,1.2266,80.974838,80.552711,94.060524,0.0,93.440903,,99.118011,102.110336,96.391954,108.951537,135.588715,118.880825,122.030464,95.682213,115.277336,0.0,63.546099,119.269534,151.288528,116.631115,109.915596,91.48259,2004-07
6,2004-08-01,24.280597,24.280597,79.137033,71.798962,82.804844,80.197268,54.397252,49.046004,106.243773,107.615706,83.514901,77.950734,102.724312,70.849033,107.164223,88.294735,80.476626,71.158884,54.087599,58.14188,48.786719,89.649886,53.243495,37.772771,1.2176,81.121933,80.962135,94.268616,0.0,93.440903,,100.78627,90.885391,93.793535,97.095648,53.495544,109.858584,79.492393,83.246979,119.799347,0.0,65.815603,112.922862,56.467911,98.873136,78.561462,81.993988,2004-08
7,2004-09-01,25.989192,25.989192,130.487955,98.896817,102.039931,101.18697,130.661571,104.558892,124.929188,128.391232,82.704877,77.198482,120.37799,83.96576,109.027976,97.232273,111.213045,97.167301,54.69557,58.052276,49.280018,87.046949,52.627049,38.420022,1.2218,81.402054,81.576263,94.372665,0.0,93.546127,,102.90654,108.077499,91.195116,114.777276,120.589951,129.960671,136.446701,103.972374,121.154922,0.0,68.085106,130.525896,153.78978,114.669943,120.836693,98.495918,2004-09
8,2004-10-01,24.895962,24.895962,119.707613,99.60374,93.519358,89.045549,133.231484,95.903054,104.922286,104.343529,85.664089,80.08275,114.478921,80.901243,110.445124,91.011662,105.69441,89.526169,57.610933,66.835461,51.645502,104.581039,59.312567,39.977785,1.249,81.674583,81.576263,94.580765,0.0,93.546127,,103.907288,101.639977,98.051186,108.13389,126.589455,108.53441,120.913712,93.955093,123.396988,0.0,64.334121,125.975452,148.48407,97.557126,117.75412,92.720238,2004-10
9,2004-11-01,26.774785,26.774785,114.509588,86.511165,97.130763,93.852657,130.111098,105.53936,112.828512,112.273727,88.6233,82.967019,119.455503,84.270833,109.607827,86.753335,107.465868,92.995787,58.167131,59.107614,52.096793,101.189309,53.295969,41.445074,1.2991,81.607956,81.883324,94.580765,0.0,93.546127,,101.965965,103.843315,104.907255,112.222128,124.189651,117.230886,117.563454,96.948761,124.746834,0.0,60.583136,132.561621,143.708954,103.852026,109.563301,97.464546,2004-11


In [114]:
# fill missing values in Producer Prices_PRI27156_org with zero 
market_df['Producer Prices_PRI27156_org'].fillna(0, inplace=True)

In [115]:
missing(market_df)

No missing values in the dataset.


### Checking for duplicates

There are no duplicates in the market_df dataset.

In [116]:
# check for duplicates in market_df
market_df.duplicated().sum()

0

# Merge market data

In [117]:
for i in range(1, 15):
    df = globals()[f'product{i}']
    
    # Ensure both sides have datetime-type DATE columns (just in case)
    df['DATE'] = pd.to_datetime(df['DATE'])
    market_df['DATE'] = pd.to_datetime(market_df['DATE'])

    # Merge market_df into the product sales data
    merged_df = pd.merge(df, market_df, on='DATE', how='left')

    # Optional: sort by date (and product, if exists)
    merged_df = merged_df.sort_values(['DATE']).reset_index(drop=True)
    
    # Save back
    globals()[f'product{i}'] = merged_df

In [118]:
# # Ensure macro data is monthly
# market_df['YearMonth'] = market_df['DATE'].dt.to_period('M').dt.to_timestamp()

# # Merge with sales
# full_df = pd.merge(monthly_sales, market_df, on='YearMonth', how='left')

# # Sort for time series modeling
# full_df = full_df.sort_values(['Mapped_GCK', 'YearMonth']).reset_index(drop=True)

In [119]:
# check for rows where YearMonth_x is not equal to YearMonth_y
for i in range(1, 15):
    df = globals()[f'product{i}']
    print(f"product{i} shape: {df[df['YearMonth_x'] != df['YearMonth_y']].shape}")

product1 shape: (0, 52)
product2 shape: (0, 52)
product3 shape: (0, 52)
product4 shape: (0, 52)
product5 shape: (0, 52)
product6 shape: (0, 52)
product7 shape: (0, 52)
product8 shape: (0, 52)
product9 shape: (0, 52)
product10 shape: (0, 52)
product11 shape: (0, 52)
product12 shape: (0, 52)
product13 shape: (0, 52)
product14 shape: (0, 52)


In [120]:
# drop the YearMonth_y column in all product DataFrames because it's same as YearMonth_x
for i in range(1, 15):
    df = globals()[f'product{i}']
    df.drop(columns='YearMonth_y', inplace=True)
    globals()[f'product{i}'] = df

In [121]:
# change the name of YearMonth_x to YearMonth
for i in range(1, 15):
    df = globals()[f'product{i}']
    df.rename(columns={'YearMonth_x': 'YearMonth'}, inplace=True)
    globals()[f'product{i}'] = df

In [122]:
# Make DATE the index of the DataFrame
for i in range(1, 15):
    df = globals()[f'product{i}']
    df = df.set_index('DATE')
    globals()[f'product{i}'] = df

In [123]:
product1.columns

Index(['YearMonth', 'Mapped_GCK', 'Sales_EUR', 'China_MAB_ELE_PRO156',
       'China_MAB_ELE_SHP156', 'France_MAB_ELE_PRO250',
       'France_MAB_ELE_SHP250', 'Germany_MAB_ELE_PRO276',
       'Germany_MAB_ELE_SHP276', 'Italy_MAB_ELE_PRO380',
       'Italy_MAB_ELE_SHP380', 'Japan_MAB_ELE_PRO392', 'Japan_MAB_ELE_SHP392',
       'Switzerland_MAB_ELE_PRO756', 'Switzerland_MAB_ELE_SHP756',
       'United Kingdom_MAB_ELE_PRO826', 'United Kingdom_MAB_ELE_SHP826',
       'United States_MAB_ELE_PRO840', 'United States_MAB_ELE_SHP840',
       'Europe_MAB_ELE_PRO1100', 'Europe_MAB_ELE_SHP1100',
       'Europe_RohiBASEMET1000_org', 'Europe_RohiENERGY1000_org',
       'Europe_RohiMETMIN1000_org', 'Europe_RohiNATGAS1000_org',
       'Europe_RohCRUDE_PETRO1000_org', 'Europe_RohCOPPER1000_org',
       'Europe_WKLWEUR840_org', 'Producer Prices_PRI27840_org',
       'Producer Prices_PRI27826_org', 'Producer Prices_PRI27380_org',
       'Producer Prices_PRI27250_org', 'Producer Prices_PRI27276_org',
    

In [124]:
product4.shape

(43, 50)

In [125]:
# check if the change was successful
for i in range(1, len(product_dataframes) + 1):
     print(f"product{i} shape is:{globals()[f'product{i}'].shape}")

product1 shape is:(43, 50)
product2 shape is:(43, 50)
product3 shape is:(43, 50)
product4 shape is:(43, 50)
product5 shape is:(43, 50)
product6 shape is:(43, 50)
product7 shape is:(43, 50)
product8 shape is:(43, 50)
product9 shape is:(43, 50)
product10 shape is:(43, 50)
product11 shape is:(43, 50)
product12 shape is:(43, 50)
product13 shape is:(43, 50)
product14 shape is:(43, 50)


In [126]:
# check if the change was successful
for i in range(1, len(product_dataframes) + 1):
     print(f"product{i} first 5 rows:{globals()[f'product{i}'].head()}")

product1 first 5 rows:           YearMonth Mapped_GCK    Sales_EUR  China_MAB_ELE_PRO156  \
DATE                                                                 
2018-10-01   2018-10   Product1  36098918.79            211.955755   
2018-11-01   2018-11   Product1   5140760.00            220.519655   
2018-12-01   2018-12   Product1  37889612.12            241.846854   
2019-01-01   2019-01   Product1  27728148.35            175.668147   
2019-02-01   2019-02   Product1  34793163.53            175.668147   

            China_MAB_ELE_SHP156  France_MAB_ELE_PRO250  \
DATE                                                      
2018-10-01            211.955755             108.280608   
2018-11-01            220.519655              99.636911   
2018-12-01            241.846854              94.690312   
2019-01-01            175.668147              90.143775   
2019-02-01            175.668147              92.551521   

            France_MAB_ELE_SHP250  Germany_MAB_ELE_PRO276  \
DATE        

# Train-test split

> Doing the split before feature engineering **prevents data leakage**, which could give the model an unrealistic peek into the future.

We split the data in Jannuary 2022, leaving 39 months for training and 4 for testing (~90-10 split):
- train_product1 to train_product14 – data until December 2021
- test_product1 to test_product14 – data from Jannuary 2022 onward

In [127]:
# Define the split date
split_date = pd.to_datetime("2021-12-31")

# Loop through each product DataFrame
for i in range(1, 15):
    df = globals()[f'product{i}']

    # Ensure index is datetime and sorted
    df = df.sort_index()

    # Perform the split using the index
    train_df = df[df.index <= split_date].copy()
    test_df = df[df.index > split_date].copy()

    # Save them back to memory
    globals()[f'train_product{i}'] = train_df
    globals()[f'test_product{i}'] = test_df

In [128]:
train_product1.shape

(39, 50)

In [129]:
test_product1.shape

(4, 50)

In [130]:
test_product1.head()

Unnamed: 0_level_0,YearMonth,Mapped_GCK,Sales_EUR,China_MAB_ELE_PRO156,China_MAB_ELE_SHP156,France_MAB_ELE_PRO250,France_MAB_ELE_SHP250,Germany_MAB_ELE_PRO276,Germany_MAB_ELE_SHP276,Italy_MAB_ELE_PRO380,Italy_MAB_ELE_SHP380,Japan_MAB_ELE_PRO392,Japan_MAB_ELE_SHP392,Switzerland_MAB_ELE_PRO756,Switzerland_MAB_ELE_SHP756,United Kingdom_MAB_ELE_PRO826,United Kingdom_MAB_ELE_SHP826,United States_MAB_ELE_PRO840,United States_MAB_ELE_SHP840,Europe_MAB_ELE_PRO1100,Europe_MAB_ELE_SHP1100,Europe_RohiBASEMET1000_org,Europe_RohiENERGY1000_org,Europe_RohiMETMIN1000_org,Europe_RohiNATGAS1000_org,Europe_RohCRUDE_PETRO1000_org,Europe_RohCOPPER1000_org,Europe_WKLWEUR840_org,Producer Prices_PRI27840_org,Producer Prices_PRI27826_org,Producer Prices_PRI27380_org,Producer Prices_PRI27250_org,Producer Prices_PRI27276_org,Producer Prices_PRI27156_org,production index_PRO28840_org,production index_PRO281000_org,production index_PRO28756_org,production index_PRO28826_org,production index_PRO28380_org,production index_PRO28392_org,production index_PRO28250_org,production index_PRO28276_org,production index_PRO27840_org,production index_PRO271000_org,production index_PRO27756_org,production index_PRO27826_org,production index_PRO27380_org,production index_PRO27392_org,production index_PRO27250_org,production index_PRO27276_org
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
2022-01-01,2022-01,Product1,37942942.06,235.956129,235.956129,85.743503,108.15632,94.55061,120.353403,86.851008,101.258277,110.460181,110.823532,103.49926,101.70157,95.003541,95.957072,111.052133,129.565798,103.199827,120.338095,133.219393,121.309886,125.229641,196.91114,106.173052,129.829146,1.1314,131.62851,100.151243,115.390617,111.037476,117.853386,98.280171,110.894371,117.489883,100.305236,85.44417,92.292313,117.861377,90.558372,92.343117,111.36467,122.236023,108.999212,112.324119,74.355736,95.369065,77.944954,98.599052
2022-02-01,2022-02,Product1,36293076.33,235.956129,235.956129,90.60354,117.71577,103.987916,129.383676,106.583758,120.956538,117.879631,118.300232,100.294492,98.583952,98.458412,95.957072,116.336327,138.56033,113.500635,131.500126,138.905572,131.273215,131.176501,197.523679,118.348203,131.963648,1.1342,133.342178,100.151243,116.431107,112.057098,118.905647,98.714158,117.168167,124.627762,98.332942,89.021378,113.290565,124.710859,97.766502,102.820961,114.6884,127.373421,103.672183,115.55733,91.182419,103.950687,79.001831,106.128059
2022-03-01,2022-03,Product1,40220407.03,329.413367,329.413367,107.843548,136.85872,121.308119,151.201314,124.637966,153.645142,152.000561,156.400634,97.089723,95.466333,121.993915,95.957072,117.654038,165.926217,133.13301,158.055622,149.890871,163.186834,141.283339,271.079906,142.200872,135.782207,1.1019,136.153778,100.151243,117.471596,112.362991,119.852684,99.021554,118.910912,149.375229,96.360648,109.155949,134.288818,160.954233,114.72081,122.049515,115.164093,152.452942,98.345154,145.254965,102.475998,133.743932,96.704582,119.948433
2022-04-01,2022-04,Product1,41053203.12,267.373145,267.373145,87.69811,116.528738,99.522205,127.022869,103.55669,128.733305,114.262328,115.012049,97.089723,95.466333,95.266502,95.957072,116.961047,165.926217,112.902215,134.93551,146.090998,153.188945,138.094143,243.43603,130.83543,134.859685,1.0819,137.531616,100.151243,118.408043,113.280655,121.220627,98.857087,119.385483,128.285706,96.360648,84.728728,111.090744,120.09881,91.979698,98.675873,112.158089,134.843353,98.345154,114.359844,86.255684,102.36168,80.763306,101.074341


# Feature Engineering

## Lag features for Sales_EUR

`Lag features` answer the question "what were the sales X months ago?" - we are using lag with X=[1,3,12] (1 month, 1 trimester, 1 year)

In [131]:
# include recent sales for each product
for i in range(1, 15):
    df = globals()[f'train_product{i}']
    df['Sales_Lag_1'] = df['Sales_EUR'].shift(1)
    df['Sales_Lag_3'] = df['Sales_EUR'].shift(3)
    df['Sales_Lag_12'] = df['Sales_EUR'].shift(12)
    globals()[f'train_product{i}'] = df

`Rolling mean(moving average)`capture short-term trends or smooths out noise. For example, April 2021, it averages Jan–Mar 2021 (but shifted 1 step so we're not using the current month’s value).


In [132]:
## Include rolling mean for each product
for i in range(1, 15):
    df = globals()[f'train_product{i}']
    df['Rolling_Mean_3'] = df['Sales_EUR'].shift(1).rolling(window=3).mean() # mean of the last 3 months
    df['Rolling_Mean_6'] = df['Sales_EUR'].shift(1).rolling(window=6).mean() # mean of the last 6 months
    globals()[f'train_product{i}'] = df

## Temporal features

We are creating columns to keep the month, and the quarter, as it may help identify seasonal trends

In [133]:
# create a month and a quarter column
for i in range(1, 15):
    df = globals()[f'train_product{i}']
    df['Month'] = df.index.month
    df['Quarter'] = df.index.quarter
    globals()[f'train_product{i}'] = df

## Lag features for Macro Variables

In [134]:
# Loop over all 14 product DataFrames
for i in range(1, 15):
    df = globals()[f'train_product{i}']
    
    # Identify macro columns (exclude sales/date/product ID columns)
    macro_cols = [col for col in df.columns if col not in [
        'YearMonth_x', 'YearMonth_y', 'Mapped_GCK', 'Sales_EUR', 'DATE'
    ] and not col.startswith('Sales') and not col in ['Month', 'Quarter', 'Product_ID']]
    
    # Create lag and delta features
    for col in macro_cols:
        df[f'{col}_Lag1'] = df[col].shift(1)
        df[f'{col}_Delta'] = df[col] - df[col].shift(1)
    
    # Save back to global scope
    globals()[f'train_product{i}'] = df

# Feature selection

## Feature correlation analysis

After analysing the feature correlation to Sales_EUR in each product, **we decided to keep only the best 5 for each product**, in order to keep the model light. 

`Note:` with more resources (time & computational power), we could keep more features a maybe achieve bettter results.

In [135]:
for i in range(1, 15):
    df = globals()[f'train_product{i}']
    
    # Drop any rows with NaNs just to be safe
    df_clean = df.dropna()

    # Compute correlations with the target variable
    correlation = df_clean.corr(numeric_only=True)['Sales_EUR'].drop('Sales_EUR').abs().sort_values(ascending=False)

    # Get top 5 correlated features
    top_features = correlation.head(5)

    print(f"\nTop 5 Correlated Features for train_product{i}:")
    print(top_features)


Top 5 Correlated Features for train_product1:
production index_PRO27826_org_Lag1    0.617876
production index_PRO27276_org_Lag1    0.581394
United Kingdom_MAB_ELE_PRO826_Lag1    0.555097
production index_PRO28826_org_Lag1    0.504613
Quarter                               0.442357
Name: Sales_EUR, dtype: float64

Top 5 Correlated Features for train_product2:
Germany_MAB_ELE_SHP276_Delta            0.563012
production index_PRO28276_org_Delta     0.518644
Germany_MAB_ELE_PRO276_Delta            0.513384
Europe_MAB_ELE_SHP1100_Delta            0.506551
production index_PRO271000_org_Delta    0.492981
Name: Sales_EUR, dtype: float64

Top 5 Correlated Features for train_product3:
production index_PRO27840_org_Lag1    0.647807
Europe_MAB_ELE_SHP1100                0.624814
Producer Prices_PRI27826_org          0.598709
United States_MAB_ELE_PRO840_Lag1     0.597511
Germany_MAB_ELE_SHP276                0.596622
Name: Sales_EUR, dtype: float64

Top 5 Correlated Features for train_product4:
U

## Filter dataframes to only include the top 5 features

In [136]:
for i in range(1, 15):
    # Get train and test DataFrames
    train_df = globals()[f'train_product{i}']
    test_df = globals()[f'test_product{i}']

    # Drop NA rows from training set for correlation analysis
    train_clean = train_df.dropna()

    # Calculate correlation with Sales_EUR
    correlation = train_clean.corr(numeric_only=True)['Sales_EUR'].drop('Sales_EUR').abs().sort_values(ascending=False)
    top_5_features = correlation.head(5).index.tolist()

    # Include Sales_EUR in both sets
    selected_columns = top_5_features + ['Sales_EUR']

    # Only keep features present in both DataFrames
    available_columns = set(train_df.columns).intersection(test_df.columns)
    selected_columns = [col for col in selected_columns if col in available_columns]

    # Filter and drop NaNs
    train_filtered = train_df[selected_columns].dropna()
    test_filtered = test_df[selected_columns].dropna()

    # Create X and y
    if 'Sales_EUR' in train_filtered.columns:
        globals()[f'X_train_product{i}'] = train_filtered.drop(columns='Sales_EUR')
        globals()[f'y_train_product{i}'] = train_filtered['Sales_EUR']
    if 'Sales_EUR' in test_filtered.columns:
        globals()[f'X_test_product{i}'] = test_filtered.drop(columns='Sales_EUR')
        globals()[f'y_test_product{i}'] = test_filtered['Sales_EUR']

# TESTING - Model with XGBoost (ML Approach)

Why XGBoost?
- Handles non-linearity and interactions between features
- Works great with small-to-medium datasets
- Naturally includes feature importance
- Robust to multicollinearity

In [137]:
# To store models and RMSEs
xgb_models = {}
rmse_scores = {}

for i in range(1, 15):
    try:
        # Get training and test data
        X_train = globals()[f'X_train_product{i}']
        y_train = globals()[f'y_train_product{i}']
        X_test = globals()[f'X_test_product{i}']
        y_test = globals()[f'y_test_product{i}']

        # Train the model
        model = XGBRegressor(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            random_state=42,
            objective='reg:squarederror'
        )
        model.fit(X_train, y_train)

        # Predict and evaluate
        y_pred = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        # Store model and score
        xgb_models[f'product{i}'] = model
        rmse_scores[f'product{i}'] = rmse

        print(f"product{i} – RMSE: {rmse:.2f}")

    except Exception as e:
        print(f"product{i} – Error: {e}")

product1 – Error: list index out of range
product2 – Error: list index out of range
product3 – RMSE: 129211.62
product4 – Error: list index out of range
product5 – RMSE: 19374.22
product6 – Error: list index out of range
product7 – RMSE: 3252.25
product8 – RMSE: 2470790.51
product9 – RMSE: 8228.97
product10 – RMSE: 33379.01
product11 – Error: list index out of range
product12 – RMSE: 313094.22
product13 – Error: list index out of range
product14 – RMSE: 14929.03


# Visualize feature importance

In [138]:

xgb.plot_importance(xgb_models['product1'])
plt.title("Feature Importance – product1")
plt.show()

KeyError: 'product1'

In [221]:
# results = []

# # Loop through each product
# for gck in full_df['Mapped_GCK'].unique():
#     gck_df = full_df[full_df['Mapped_GCK'] == gck].dropna()
    
#     train = gck_df[gck_df['YearMonth'] < '2022-05-01']
#     test = gck_df[gck_df['YearMonth'] >= '2022-05-01']
    
#     features = [col for col in gck_df.columns if col not in ['Sales_EUR', 'Date', 'YearMonth', 'Mapped_GCK']]
    
#     model = XGBRegressor(n_estimators=100, learning_rate=0.1)
#     model.fit(train[features], train['Sales_EUR'])
    
#     preds = model.predict(test[features])
    
#     rmse = np.sqrt(mean_squared_error(test['Sales_EUR'], preds))
#     results.append((gck, rmse))
    
#     # Save predictions for submission
#     test['Predicted_Sales'] = preds
#     test[['YearMonth', 'Mapped_GCK', 'Predicted_Sales']].to_csv(f'predictions_{gck}.csv', index=False)

# Model with SARIMAX (Statistical Approach)

In [222]:
# # Example for 1 product
# gck = 'Product_X'
# gck_df = full_df[full_df['Mapped_GCK'] == gck].set_index('YearMonth')

# endog = gck_df['Sales_EUR']
# exog = gck_df[market_df.columns.difference(['Date', 'YearMonth'])]

# # Train/test split
# train_endog = endog[:'2022-04']
# test_endog = endog['2022-05':]

# train_exog = exog.loc[:'2022-04']
# test_exog = exog.loc['2022-05':]

# model = SARIMAX(train_endog, exog=train_exog, order=(1,1,1), seasonal_order=(1,1,1,12))
# sarimax_res = model.fit()

# forecast = sarimax_res.predict(start=test_endog.index[0], end=test_endog.index[-1], exog=test_exog)
# rmse = np.sqrt(mean_squared_error(test_endog, forecast))
# print(f"RMSE for {gck}: {rmse}")

# Evaluation

# Submission Formatting

In [223]:
# # Combine all test predictions into one file
# final_submission = pd.concat([pd.read_csv(f'predictions_{gck}.csv') for gck in full_df['Mapped_GCK'].unique()])
# final_submission.columns = ['Year Month', 'Mapped_GCK', 'Sales EUR']
# final_submission.to_csv("Submission_Template.csv", index=False)