
Data sources:
* [Consumer price indices](https://stat.gov.pl/en/latest-statistical-news/news-releases/)  
* [Poland macroeconomic indicators](https://stat.gov.pl/en/poland-macroeconomic-indicators/)  

Description:  
* Here we prepare the data covering the period from 2018 until now.
* The features used are listed in the file named 'featurename.csv'.

In [22]:
import pandas as pd
import numpy as np

### Preprocess the data
+ Read the data  
+ Check if there are missing values  
+ Split for target and features  

In [3]:
# Load the feature table. The file is semicolon-delimited and uses a comma as
# the decimal mark, so both have to be stated explicitly.
df = pd.read_csv(
    "features_from2018.csv",
    sep=";",
    decimal=",",
)

In [4]:
# Show the last five rows to check where the data ends.
df.tail(n=5)

Unnamed: 0,Month,CPIchange,fea_1,fea_2,fea_3,fea_4,fea_5,fea_6,fea_7,fea_8,...,fea_13,fea_14,fea_15,fea_16,fea_17,fea_18,fea_19,fea_20,fea_21,fea_22
60,2023-01,16.6,105.2,97.4,113.6,167.2,134.2,120.1,129.1,113.4,...,120.6,109.8,106.5,121.6,113.8,110.7,116.5,106.3,116.7,113.9
61,2023-02,18.4,102.0,91.7,112.0,190.2,123.4,118.2,129.0,111.3,...,124.0,111.1,106.6,122.7,113.4,110.1,123.7,107.8,116.0,114.0
62,2023-03,16.1,77.5,80.4,101.2,131.2,113.3,110.3,121.8,104.7,...,124.0,112.0,107.7,119.6,114.4,109.5,103.9,109.4,116.4,114.1
63,2023-04,14.7,67.2,65.8,96.4,132.8,101.6,106.8,117.5,101.7,...,119.7,112.4,108.1,118.2,113.4,109.5,104.6,109.4,114.9,113.8
64,2023-05,,,,,,,,,,...,,,,,,,,,,


In [26]:
# Lookup table pairing each feature index (fea_N) with its descriptive name.
fea_names = pd.read_csv(
    "featurename.csv",
    sep=";",
)
fea_names

Unnamed: 0,feature index,feature name
0,fea_1,Procurement price indices of wheat (excluding ...
1,fea_2,Procurement price indices of rye (excluding so...
2,fea_3,Procurement price indices of cattle (excluding...
3,fea_4,Procurement price indices of pigs
4,fea_5,Procurement price indices of milk
5,fea_6,Price indices of the sold production in industry
6,fea_7,in mining and quarrying
7,fea_8,in manufacturing
8,fea_9,"n electricity, gas, steam and air conditioning..."
9,fea_10,"in water supply; sewerage, waste management an..."


In [5]:
# Per-row flag: True when at least one column in that row is missing.
df.isna().any(axis="columns")

0     False
1     False
2     False
3     False
4     False
      ...  
60    False
61    False
62    False
63    False
64     True
Length: 65, dtype: bool

In [6]:
# Name of the target column to be predicted.
label_col = "CPIchange"

In [7]:
# Every column after the first two is used as a feature. In the table printed
# above, the first two columns are Month and CPIchange.
feature_cols = df.columns[2:]

+ Re-centering of features: columns whose values lie around 100 are shifted by 100 so that they are easier to interpret

In [8]:
# A column whose mean exceeds 50 is treated as being quoted around 100;
# subtract 100 from it so its values read as deviations from that base.
# Columns with a mean of 50 or less are left untouched.
for col in feature_cols:
    if df[col].mean() > 50:
        df[col] = df[col].sub(100)

In [8]:
# Show the first five rows after the re-centering step.
df.head(n=5)

Unnamed: 0,Month,CPIchange,fea_1,fea_2,fea_3,fea_4,fea_5,fea_6,fea_7,fea_8,...,fea_13,fea_14,fea_15,fea_16,fea_17,fea_18,fea_19,fea_20,fea_21,fea_22
0,2018-01,1.9,2.1,4.6,4.3,-13.2,7.6,0.2,0.1,0.2,...,4.8,1.2,-4.5,2.1,0.3,1.8,-1.2,2.0,2.5,2.1
1,2018-02,1.4,0.1,2.1,4.5,-8.3,2.0,-0.1,-1.8,0.0,...,3.4,1.0,-4.2,2.1,0.5,1.9,-2.0,0.6,1.5,2.1
2,2018-03,1.3,-2.8,-1.2,2.5,-7.2,2.1,0.5,-0.3,0.6,...,3.7,0.9,-3.7,2.0,0.0,1.4,-1.9,0.6,1.0,2.2
3,2018-04,1.6,-3.6,-1.8,5.4,-16.4,0.9,1.0,1.6,1.1,...,4.1,1.7,-3.8,2.0,0.5,1.4,-0.1,0.1,0.9,2.1
4,2018-05,1.7,-5.3,-3.6,5.6,-19.6,-1.8,3.0,6.2,3.1,...,3.0,1.6,-3.8,2.0,0.4,2.4,4.8,-1.9,1.0,2.2


+ Time series prediction
+ For each observation, use the previous 3 observations as features

In [9]:
# Number of preceding rows whose feature values are used to predict a row.
window_size = 3

# Build the lagged design matrix: for every feature, one column per lag
# 1..window_size, named "<feature>_shift_<lag>". Only the column that is
# needed gets shifted (rather than the whole frame on every iteration).
# The first `window_size` rows contain NaN because they lack a full history.
column_data_list = []

for fea in feature_cols:
    for lag in range(1, window_size + 1):
        column_data_list.append(df[fea].shift(lag).rename(f"{fea}_shift_{lag}"))

# A single concat avoids inserting columns one by one into a growing frame.
df_features = pd.concat(column_data_list, axis=1)

In [10]:
# Display the lagged feature table; the leading rows hold NaN where the lag
# reaches back before the first row.
df_features

Unnamed: 0,fea_1_shift_1,fea_1_shift_2,fea_1_shift_3,fea_2_shift_1,fea_2_shift_2,fea_2_shift_3,fea_3_shift_1,fea_3_shift_2,fea_3_shift_3,fea_4_shift_1,...,fea_19_shift_3,fea_20_shift_1,fea_20_shift_2,fea_20_shift_3,fea_21_shift_1,fea_21_shift_2,fea_21_shift_3,fea_22_shift_1,fea_22_shift_2,fea_22_shift_3
0,,,,,,,,,,,...,,,,,,,,,,
1,2.1,,,4.6,,,4.3,,,-13.2,...,,2.0,,,2.5,,,2.1,,
2,0.1,2.1,,2.1,4.6,,4.5,4.3,,-8.3,...,,0.6,2.0,,1.5,2.5,,2.1,2.1,
3,-2.8,0.1,2.1,-1.2,2.1,4.6,2.5,4.5,4.3,-7.2,...,-1.2,0.6,0.6,2.0,1.0,1.5,2.5,2.2,2.1,2.1
4,-3.6,-2.8,0.1,-1.8,-1.2,2.1,5.4,2.5,4.5,-16.4,...,-2.0,0.1,0.6,0.6,0.9,1.0,1.5,2.1,2.2,2.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,11.6,27.9,48.1,4.6,26.6,53.9,16.4,16.4,25.9,70.8,...,17.3,4.9,5.0,3.3,14.7,14.2,14.8,13.8,13.6,13.5
61,5.2,11.6,27.9,-2.6,4.6,26.6,13.6,16.4,16.4,67.2,...,14.4,6.3,4.9,5.0,16.7,14.7,14.2,13.9,13.8,13.6
62,2.0,5.2,11.6,-8.3,-2.6,4.6,12.0,13.6,16.4,90.2,...,13.3,7.8,6.3,4.9,16.0,16.7,14.7,14.0,13.9,13.8
63,-22.5,2.0,5.2,-19.6,-8.3,-2.6,1.2,12.0,13.6,31.2,...,16.5,9.4,7.8,6.3,16.4,16.0,16.7,14.1,14.0,13.9


### Model building
+ Linear regression
+ Lasso regularization

In [11]:
from sklearn.linear_model import LinearRegression, LassoLars

In [12]:
# L1-regularised linear model; LinearRegression() can be swapped in here for
# an unregularised fit.
# NOTE(review): the message printed below this cell concerns feature scaling
# inside LassoLars, so the fitted values may depend on the installed
# scikit-learn version - confirm before re-running.
lin_model = LassoLars(alpha=0.02)

# Training rows run from `window_size` (the first row with a complete set of
# lags) up to, but excluding, the last row, which is the row to be forecast.
lin_model.fit(
    df_features.iloc[window_size:-1],
    df[label_col].iloc[window_size:-1],
)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LassoLars())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


LassoLars(alpha=0.02)

+ Prediction of the model
+ Round to one decimal place; this is the submitted answer.

In [13]:
# The last row of the lagged feature table is the row being forecast; take
# its prediction and round it to one decimal place.
prediction_May = round(
    lin_model.predict(df_features.iloc[window_size:])[-1],
    1,
)

In [14]:
# Display the rounded forecast.
prediction_May

13.1

+ Predictions for all months with a full lag window (the last value is the May forecast)

In [15]:
# Predicted values for every row from `window_size` onward, including the
# final row, which is the one being forecast.
lin_model.predict(df_features.iloc[window_size:])

array([ 1.55488741,  1.72196988,  2.08340924,  2.22216677,  2.22888743,
        2.13834512,  2.15130998,  2.19608856,  1.71575452,  1.5113473 ,
        1.49713434,  1.87570408,  2.19838635,  2.62575832,  2.67691102,
        2.7970928 ,  2.90303077,  3.00237636,  2.87236441,  2.61504408,
        2.88561776,  3.39407941,  3.97917941,  4.17507174,  3.84561386,
        3.06766572,  2.7555567 ,  3.20633745,  2.98616546,  2.78057952,
        3.17870371,  3.23966746,  3.10220203,  2.84825113,  2.86768448,
        3.05959696,  3.9344685 ,  4.77910474,  5.29745238,  5.22721083,
        5.77543064,  6.27558919,  6.3914761 ,  7.1678462 ,  8.13270084,
        8.84273467,  9.91812369, 10.09986734, 11.94713937, 13.4656877 ,
       14.63717098, 15.78240117, 15.81675656, 16.42431801, 17.18154392,
       17.54847356, 16.97705813, 16.58705332, 16.8848368 , 16.87472439,
       14.57088943, 13.05907262])

### Evaluation
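+ Evaluate the accuracy with RMSE to get a general impression; the error is computed on the same months the model was fitted on, so it is an in-sample figure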

In [18]:
# Predictions for every row from `window_size` onward.
prediction_all = lin_model.predict(df_features.iloc[window_size:])
# Drop the final element (the forecast row) so that only the rows used for
# fitting remain for scoring.
prediction_other = prediction_all[:-1]

In [17]:
# Observed target values aligned with `prediction_other`: skip the first
# `window_size` rows (no complete lag window) and the last row (the row being
# forecast). Using `window_size` and `label_col` instead of the literals 3 and
# 'CPIchange' keeps this slice in step with the feature construction and the
# model fit if either setting is changed.
actual_change = df[label_col].values[window_size:-1]

In [19]:
# Root-mean-square error between the predictions and the observed values.
RMSE = np.sqrt(((prediction_other - actual_change) ** 2).mean())
RMSE

0.5211578061783615

+ The coefficients after Lasso regularization (most of them are shrunk to exactly zero)

In [27]:
# Fitted coefficients of the model, one per column of `df_features`.
coef = lin_model.coef_
coef

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.00315456, 0.        , 0.        , 0.        ,
       0.        , 0.00037856, 0.01196876, 0.        , 0.        ,
       0.16138604, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.08303995, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.37028375, 0.        ,
       0.        , 0.        , 0.03741103, 0.03247081, 0.        ,
       0.        , 0.        , 0.        , 0.05275269, 0.        ,
       0.01726212, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.21780843, 0.        , 0.        , 0.        , 0.        ,
       0.        ])