In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from pandas_datareader import data as pdr
import datetime as dt
import yfinance as yf
yf.pdr_override()
%matplotlib inline

# !pip install yfinance --user

: 

In [None]:
# Ticker list and date window for the price download.
stocks = ['IFX.DE']
start = dt.datetime(year=2020, month=8, day=1)
# start = dt.datetime.strptime('2020-07-01', '%Y-%m-%d')
end = dt.datetime(year=2023, month=2, day=1)
# end = dt.datetime.strptime('2023-2-1', '%Y-%m-%d')

# Fetch the price history for the tickers between `start` and `end`.
stock_df = pdr.get_data_yahoo(stocks, start, end)


In [None]:
# Preview the first rows of the downloaded price frame.
stock_df.head()

In [None]:
# Plot every price column of the downloaded frame on one set of axes.
price_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close']
stock_df[price_columns].plot()

### Questions
1. How can we tell whether a product is active or not?
2. How to deal with NA values? (https://towardsdatascience.com/4-techniques-to-handle-missing-values-in-time-series-data-c3568589b5a8)
3. 

### Notes
1. All products are active

In [None]:
# Load the challenge data, index it by reporting time, and drop every row whose
# latest planning method is 'no Plan'.
# Shapes noted in the original notebook: (75088, 21) before the filter,
# (61756, 21) after it (described there as "no cancelled products").
df = (
    pd.read_csv(
        "20230411_SummerTerm23_Data_Challenge_Infineon_Data.csv",
        parse_dates=['reporting_time'],
    )
    .set_index('reporting_time')
    .loc[lambda frame: frame['planning_method_latest'] != 'no Plan']
)
df.tail()

In [None]:
# Column dtypes and non-null counts of the filtered frame.
df.info()

In [None]:
# Number of rows per product application.
df['product_application'].value_counts()

In [None]:
# Number of rows per product main family.
df['product_main_family'].value_counts()

In [None]:
# Number of rows per product name.
# Not usable as a time-series key: some product names have only one entry.
df['product_name'].value_counts()

In [None]:
# Number of rows per product marketing name.
df['product_marketing_name'].value_counts()

In [None]:
# Number of rows per product basic type.
# Not usable as a time-series key: some basic types have only one entry.
df['product_basic_type'].value_counts()

In [None]:
# Restrict to the 'maus' marketing name and discard rows with any missing value.
is_maus = df['product_marketing_name'] == 'maus'
df_sample = df[is_maus].dropna()

# Monthly means of the demand and of the seven external regressors.
exog_columns = [f'external{i}' for i in range(1, 8)]
test_series = df_sample['demand'].resample('M').mean()
test_exog_df = df_sample[exog_columns].resample('M').mean()

# Line plot of the monthly demand.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(test_series)
ax.grid()
plt.show()

In [None]:
# test_exog_df

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller


In [None]:
# Autocorrelation plot of the monthly demand series; assigning the return value
# keeps the figure from being echoed a second time as the cell output.
acf_plot = plot_acf(test_series)

In [None]:
# Partial autocorrelation plot of the monthly demand series.
pcf_plot = plot_pacf(test_series)

In [None]:
# Augmented Dickey-Fuller test on the monthly demand series, capped at 5 lags.
result = adfuller(test_series, maxlag = 5)
print(f'ADF Statistic: {result[0]}')
print(f'n_lags: {result[2]}')
print(f'p-value: {result[1]}')
# The header sits outside the loop so it is printed once, followed by one line
# per entry of the critical-value mapping.
print('Critical Values:')
for key, value in result[4].items():
    print(f'   {key}, {value}')

In [None]:
# Chronological split: the first 70% of months train the model, the next 10%
# validate it, and the remainder is held out for testing.
train_prop = 0.7
valid_prop = 0.1
# `test_prop` is derived from the other two so the proportions always sum to 1
# and the cell runs on a fresh kernel without relying on leftover state.
test_prop = 1 - train_prop - valid_prop

# Compute both cut points once so the series and the regressors share them.
n_obs = len(test_series)
train_end = int(n_obs * train_prop)
valid_end = int(n_obs * (train_prop + valid_prop))

train_set = test_series.iloc[:train_end]
valid_set = test_series.iloc[train_end:valid_end]
test_set = test_series.iloc[valid_end:]

# Exogenous regressors are cut at the same positions as the demand series.
assert len(test_exog_df) == n_obs, "demand series and exogenous frame must have equal length"
train_exog = test_exog_df.iloc[:train_end]
valid_exog = test_exog_df.iloc[train_end:valid_end]
test_exog = test_exog_df.iloc[valid_end:]

In [None]:
# First-difference the training portion of the series; the leading NaN that
# diff() produces is sliced off before plotting.
train_cutoff = int(len(test_series) * train_prop)
train_head = test_series.iloc[:train_cutoff]
train_set_1 = train_head.diff().iloc[1:]

# ACF and PACF of the differenced training data.
acf = plot_acf(train_set_1)
pacf = plot_pacf(train_set_1)

In [None]:
# Augmented Dickey-Fuller test on the differenced training series, capped at 5 lags.
# `result` is read again when the ARIMA order is chosen, so the name is kept.
result = adfuller(train_set_1, maxlag = 5)
print(f'ADF Statistic: {result[0]}')
print(f'n_lags: {result[2]}')
print(f'p-value: {result[1]}')
# The header sits outside the loop so it is printed once, followed by one line
# per entry of the critical-value mapping.
print('Critical Values:')
for key, value in result[4].items():
    print(f'   {key}, {value}')

In [None]:
# The AR order is the lag count reported by the most recent ADF run (`result[2]`).
# NOTE(review): that ADF ran on the differenced training series, while the model
# is fitted on the undifferenced `train_set` with d=0 - confirm this is intended.
ar_order = result[2]
model = ARIMA(endog=train_set, exog=train_exog, order=(ar_order, 0, 1))

In [None]:
# Fit the ARIMA model and print its summary.
model_fit = model.fit()
print(model_fit.summary())

In [None]:
# Predict across the validation window, supplying the matching exogenous rows.
first_valid, last_valid = valid_set.index[0], valid_set.index[-1]
predictions = model_fit.predict(start=first_valid, end=last_valid, exog=valid_exog)
residuals = valid_set - predictions

# Residuals over time, with a zero reference line and the window boundaries marked.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(residuals)
ax.set_title('residuals of test_series')
ax.set_ylabel('error')
ax.set_xlabel('date')
ax.axhline(0, color='r', linestyle='--', alpha=0.2)
for boundary in (first_valid, last_valid):
    ax.axvline(boundary, color='k', linestyle='--', alpha=0.2)

plt.show()

In [None]:
# Overlay the predictions on the validation data, marking the window boundaries.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(valid_set, label='validation set')
ax.plot(predictions, color='r', label='predictions')
ax.set_title('Validation set and prediction')
for boundary in (valid_set.index[0], valid_set.index[-1]):
    ax.axvline(boundary, color='k', linestyle='--', alpha=0.2)
ax.legend(loc="upper left")
ax.grid()

In [None]:
# Root mean squared error of the validation residuals.
squared_errors = residuals ** 2
rmse = np.sqrt(np.mean(squared_errors))
print(f"Root mean squared error: {rmse}")

In [None]:
# Show the validation slice for inspection.
valid_set

In [None]:
# Annotated heatmap of the pairwise correlations between the external columns.
external_columns = [f'external{i}' for i in range(1, 8)]
external_corr = df[external_columns].corr()
sns.heatmap(external_corr, annot=True)