# MAMO Projekt - Data Wrangling

In [102]:
# loading the libraries
import pandas as pd
from scipy import io
import numpy as np

In [103]:
# Load the simulation results of every model from its MATLAB .mat file,
# keyed by the model name used in the file name.
data_dict = {
    elem: io.matlab.loadmat(f"../Data/verifikace/simul_results_{elem}.mat")
    for elem in ('IM1', 'IM2', 'IM3', 'IM0')
}

In [104]:
# Read the observed series and discard the unnamed leading column that the
# CSV carries (a saved index).
data_observed = (
    pd.read_csv("../Data/verifikace/data_orig.csv")
    .drop(columns=["Unnamed: 0"])
)
data_observed

Unnamed: 0,Yhat,Chat,Ihat,qhat,Rhat,pihat
0,0.024378,0.011791,0.039947,0.056197,0.005926,0.006924
1,0.008916,0.007314,0.007435,-0.009764,0.003159,-0.000441
2,0.000361,-0.006465,0.003276,-0.021206,0.000654,-0.01381
3,0.006097,-0.005752,0.003605,-0.027877,-0.000193,-0.011187
4,-0.002412,0.001836,-0.027888,-0.011065,-0.001822,-0.008569
5,0.009639,0.003462,-0.006922,0.014308,-0.002673,-0.003942
6,0.0097,0.002962,0.03971,0.010851,-0.00245,0.002709
7,0.001745,0.005979,0.027927,0.015742,-0.001623,0.004405
8,0.001735,0.008306,-0.002724,0.018249,-0.000728,0.000162
9,0.006518,0.008952,0.013066,0.01908,0.000237,0.000996


# 1 Standard Errors on Variables

In [133]:

# Per-variable standard errors: average each model's 'sim_std' array along
# axis 1 and place the four models side by side, one column per model.
# Series of unequal length are aligned on their integer position, so a model
# with fewer rows is padded with NaN at the bottom.
std_err = pd.concat(
    [pd.Series(np.mean(data_dict[key]['sim_std'], axis=1))
     for key in ('IM1', 'IM2', 'IM3', 'IM0')],
    axis=1,
    keys=['M1', 'M2', 'M3', 'M0'],
)
# NOTE(review): this label order differs from data_observed.columns, which is
# what labels the simulated correlation matrices elsewhere in the notebook --
# confirm which order matches the rows stored in the .mat files.
# NOTE(review): a model with fewer than six rows may end up with its values
# labelled as the wrong variables -- TODO confirm.
std_err.index = ['Yhat', 'Chat', 'pihat', 'qhat', 'Rhat', 'Ihat']
print(std_err.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &        M1 &        M2 &        M3 &        M0 \\
\midrule
Yhat  &  0.183233 &  0.017557 &  0.031303 &  0.065100 \\
Chat  &  0.107606 &  0.020537 &  0.015315 &  0.058421 \\
pihat &  0.092172 &  0.008194 &  0.051572 &  0.038937 \\
qhat  &  0.003686 &  0.015774 &  0.042055 &  0.075616 \\
Rhat  &  0.026045 &  0.047620 &  0.002872 &  0.006538 \\
Ihat  &       NaN &  0.001529 &  0.012544 &  0.023978 \\
\bottomrule
\end{tabular}



  print(std_err.to_latex())


In [134]:
# Overall standard error per model: the mean over every entry of the model's
# 'sim_std' array, laid out as a single-row table for the LaTeX export.
# Note: this rebinds `std_err`, replacing the per-variable table.
std_err = pd.DataFrame(
    {
        label: [np.mean(data_dict[key]['sim_std'])]
        for key, label in zip(('IM1', 'IM2', 'IM3', 'IM0'),
                              ('M1', 'M2', 'M3', 'M0'))
    },
    index=['Mean_StdErr'],
)
print(std_err.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &        M1 &        M2 &        M3 &        M0 \\
\midrule
Mean\_StdErr &  0.082548 &  0.018535 &  0.025943 &  0.044765 \\
\bottomrule
\end{tabular}



  print(std_err.to_latex())


# 2 Autocorrelations

In [108]:
# Kept local so this cell still runs on its own; it belongs with the other
# imports at the top of the notebook.
import statsmodels.api as sm

# Sample autocorrelation of every observed series for lags 0..5, one column
# per variable. The frame is built once from a dict rather than grown with
# pd.concat inside a loop starting from an empty DataFrame.
df_acf = pd.DataFrame(
    {col: sm.tsa.acf(data_observed[col], nlags=5) for col in data_observed.columns}
)
# Row 0 is the lag-0 term; drop it so the index runs over lags 1..5.
df_acf = df_acf.drop(index=0)
df_acf

Unnamed: 0,Yhat,Chat,Ihat,qhat,Rhat,pihat
1,0.728603,0.812631,0.701256,0.606004,0.791582,0.781387
2,0.499782,0.636246,0.359306,0.274435,0.502886,0.515172
3,0.289927,0.53047,0.103021,-0.024391,0.247092,0.318639
4,0.098083,0.464438,-0.079851,-0.161415,-0.008164,0.090391
5,0.022489,0.320832,-0.174992,-0.125157,-0.218318,-0.005011


In [141]:
# Squared deviation between each model's simulated autocorrelations and the
# observed ones in df_acf.
# For every model, 'sim_auto' is averaged along axis 2 and transposed before
# df_acf is subtracted. A NumPy array carries no labels, so the subtraction is
# positional: the transposed mean must have the same shape and the same
# row/column order as df_acf.
# NOTE(review): assumes rows are lags 1..5 and columns follow
# data_observed.columns -- TODO confirm against the .mat files.
# NOTE(review): IM1 is not part of this comparison; the reason is not visible
# here.
# The former `squared_dev[element] = squared_dev[element]` line was a no-op
# and has been removed.
squared_dev = {
    element: (np.mean(data_dict[element]['sim_auto'], axis=2).transpose() - df_acf) ** 2
    for element in ['IM0', 'IM2', 'IM3']
}

In [142]:
# Inspect the squared-deviation table stored for model IM2.
squared_dev['IM2']

Unnamed: 0,Yhat,Chat,Ihat,qhat,Rhat,pihat
1,1.597495,1.644391,0.520665,1.337559,1.732645,1.499434
2,0.367542,0.682594,0.982848,0.101273,0.448447,0.483658
3,0.025119,0.105118,0.049757,0.009566,0.000107,0.006985
4,0.010795,0.079214,0.169603,0.049155,0.032911,0.001304
5,0.133321,0.594393,0.019319,0.006004,0.0461,0.157223


In [135]:
# squared_dev holds one squared-deviation table per model, so each table has
# to be collapsed to a single score before it can fill a one-row frame.
# The score is the column-wise mean (over the rows of the table) summed over
# the columns; this reproduces the figures previously reported for this table.
print(
    pd.DataFrame(
        {model: dev.mean().sum() for model, dev in squared_dev.items()},
        index=['Autocorrelation'],
    ).to_latex()
)

\begin{tabular}{lrrr}
\toprule
{} &       IM0 &       IM2 &       IM3 \\
\midrule
Autocorrelation &  0.292506 &  2.578909 &  0.466051 \\
\bottomrule
\end{tabular}



  print(pd.DataFrame(squared_dev, index=['Autocorrelation']).to_latex())


# 3 Correlations

In [154]:
# Mean simulated correlation matrix per model: average 'sim_corr' along
# axis 2 and label both rows and columns with the observed variable names.
# NOTE(review): assumes the simulated variables are stored in the same order
# as data_observed.columns -- TODO confirm.
mean_corr = {
    element: pd.DataFrame(
        np.mean(data_dict[element]['sim_corr'], axis=2),
        index=data_observed.columns,
        columns=data_observed.columns,
    )
    for element in ('IM0', 'IM3')
}
mean_corr['IM3']

Unnamed: 0,Yhat,Chat,Ihat,qhat,Rhat,pihat
Yhat,1.0,0.913159,0.939796,0.884552,-0.523456,0.051851
Chat,0.913159,1.0,0.721554,0.767831,-0.7185,0.092616
Ihat,0.939796,0.721554,1.0,0.865783,-0.293986,0.005068
qhat,0.884552,0.767831,0.865783,1.0,-0.450284,-0.144559
Rhat,-0.523456,-0.7185,-0.293986,-0.450284,1.0,-0.59143
pihat,0.051851,0.092616,0.005068,-0.144559,-0.59143,1.0


In [155]:
# Correlation matrix of the observed series, for comparison with the
# simulated ones.
data_observed.corr()

Unnamed: 0,Yhat,Chat,Ihat,qhat,Rhat,pihat
Yhat,1.0,0.515691,0.71994,0.211371,0.301331,0.031011
Chat,0.515691,1.0,0.295096,0.043896,0.368363,0.571885
Ihat,0.71994,0.295096,1.0,-0.0384,0.471488,-0.025316
qhat,0.211371,0.043896,-0.0384,1.0,-0.311476,-0.055458
Rhat,0.301331,0.368363,0.471488,-0.311476,1.0,0.204541
pihat,0.031011,0.571885,-0.025316,-0.055458,0.204541,1.0


In [162]:
# Mean squared error between each model's mean simulated correlation matrix
# and the observed correlation matrix. The average runs over every cell of
# the matrix, diagonal included.
# The observed matrix does not depend on the model, so it is computed once.
observed_corr = data_observed.corr()
# DataFrame.mean() is called explicitly (column means, then their mean)
# rather than np.mean on a DataFrame, whose reduction axis is not the same
# across pandas versions.
corr_dict = {
    model: ((sim_corr - observed_corr) ** 2).mean().mean()
    for model, sim_corr in mean_corr.items()
}

In [171]:
# Emit the per-model correlation MSE as a one-row LaTeX table.
print(
    pd.DataFrame(corr_dict, index=['MSE'])
    .to_latex()
)

\begin{tabular}{lrr}
\toprule
{} &       IM0 &       IM3 \\
\midrule
MSE &  0.405589 &  0.306795 \\
\bottomrule
\end{tabular}



  print(pd.DataFrame(corr_dict, index=['MSE']).to_latex())
