In [1]:
# %pip install pandas numpy statsmodels seaborn matplotlib

In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# This so-called "line magic" command renders plots inline and, amongst other things, stores them in the notebook document.
%matplotlib inline

# library for linear regression
import statsmodels.api as sm

# Suppress FutureWarning messages so they do not clutter the notebook output
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Read in the forecast dataset, marking all question marks as NaN.
# Drop 2025 right away since I want it to be forecasted using this new model;
# keeping 2024 using the Q1 2024 results of 1.5% increase.
df = pd.read_csv("data/forecast_zalando.csv", na_values='?').drop(columns=["2025"])

# Only two rows, so show the whole frame
df

Unnamed: 0.1,Unnamed: 0,2018,2019,2020,2021,2022,2023,2024
0,GMV,6.6,8.2,10.7,14.3,14.8,14.6,14.8
1,Net Revenue,5.3,6.4,8.0,10.4,10.3,10.1,10.3


In [5]:
# Use the metric labels in 'Unnamed: 0' as the index, then transpose
# (flip the axes so years are rows, and GMV/Revenue are columns).
# The guard keeps this cell safe to re-run: `df` is overwritten here, so on a
# second execution 'Unnamed: 0' is no longer a column and an unguarded
# set_index would raise a KeyError.
if 'Unnamed: 0' in df.columns:
    df = df.set_index('Unnamed: 0').T.copy()

In [6]:
# Check the transposed DataFrame: years should now be the rows, with GMV and Net Revenue as columns
df

Unnamed: 0,GMV,Net Revenue
2018,6.6,5.3
2019,8.2,6.4
2020,10.7,8.0
2021,14.3,10.4
2022,14.8,10.3
2023,14.6,10.1
2024,14.8,10.3


In [7]:
# Pick the regression variables from the transposed frame, cast to float:
#   X -> 'Net Revenue' (independent / explanatory variable)
#   y -> 'GMV'         (dependent / response variable)
X, y = (df[column].astype(float) for column in ('Net Revenue', 'GMV'))

In [11]:
# statsmodels' OLS does not include an intercept on its own, so prepend a
# column of 1's (named 'const') to the regressors; its coefficient becomes
# the fitted intercept.
X = sm.add_constant(X, prepend=True)

# Specify the model (GMV explained by const + Net Revenue) and estimate it
# by ordinary least squares.
model = sm.OLS(endog=y, exog=X).fit()

In [15]:
# Display the regression results table (coefficients, standard errors, R-squared and diagnostics).
# Note: with only 7 observations statsmodels warns that the omnibus normality test is not valid.
model.summary()

  warn("omni_normtest is not valid with less than 8 observations; %i "


0,1,2,3
Dep. Variable:,GMV,R-squared:,0.994
Model:,OLS,Adj. R-squared:,0.993
Method:,Least Squares,F-statistic:,823.0
Date:,"Wed, 22 May 2024",Prob (F-statistic):,9.64e-07
Time:,23:19:30,Log-Likelihood:,-0.25593
No. Observations:,7,AIC:,4.512
Df Residuals:,5,BIC:,4.404
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.1599,0.506,-4.267,0.008,-3.461,-0.859
Net Revenue,1.6303,0.057,28.688,0.000,1.484,1.776

0,1,2,3
Omnibus:,,Durbin-Watson:,1.402
Prob(Omnibus):,,Jarque-Bera (JB):,0.849
Skew:,-0.82,Prob(JB):,0.654
Kurtosis:,2.533,Cond. No.,40.7


In [17]:
# Pull the fitted coefficients out of `model.params` by label rather than by
# position, so the assignment does not depend on the column order of X.
# The labels match the rows of the regression summary: 'const' and 'Net Revenue'.
intercept = model.params['const']
slope = model.params['Net Revenue']

print(f'intercept: {intercept}, slope: {slope}')

intercept: -2.1599497802887537, slope: 1.630257376020087


###  R-squared $\textbf{R}^2$
The proportion of the variation in $y$ (GMV) that is explained by the model, measured on a scale from 0 (the model explains none of the variation) to 1 (the model explains all of it). Here $R^2 = 0.994$.  

### const coef
This is the intercept, in other words: the value of $y$ (GMV) when $x$ (Net Revenue) is 0. Here it is about $-2.16$.  

### Net Revenue coef
This is the slope, in other words: the amount by which $y$ (GMV) changes for each one-unit change in $x$ (Net Revenue). Here it is about $1.63$.  