In [4]:
import pandas as pd
from utils import deviance_analysis


In [16]:

# Load the long-format toss table and derive the response variable:
# `same_side` is 1.0 when a toss ends on the side it started on, else 0.0.
df = pd.read_csv('../data/full-coin-data/coin data/analyses/data-long.csv').assign(
    same_side=lambda tosses: tosses['toss_start'].eq(tosses['toss_end']).astype(float)
)

# Lagged outcomes within a sequence: hop1_mem is the previous toss's
# `same_side`, hop2_mem the one before that. The shift follows the row order
# of the file inside each `sequence_id` group.
# NOTE(review): assumes rows are already ordered by toss within a sequence
# and that `sequence_id` identifies one sequence -- confirm against the data.
for lag in (1, 2):
    df[f'hop{lag}_mem'] = df.groupby('sequence_id')['same_side'].shift(lag)

# The first two tosses of a sequence have no complete lag history (NaN), so
# they are dropped. NOTE(review): presumes `toss_id` starts at 1 -- confirm.
df = df.loc[df['toss_id'].gt(2)].reset_index(drop=True)

# Optional subsample for quicker experiments (not needed here, the models
# have very few parameters):
#   df = df[df['sequence_id'] < 80].reset_index(drop=True)

print(df.shape)


(342253, 10)


In [17]:

# Three nested models for `same_side`: intercept only, plus the 1-step lag,
# plus the 2-step lag.
formula_cst = 'same_side~1'
formula_min1 = 'same_side~1+hop1_mem'
formula_min2 = 'same_side~1+hop1_mem+hop2_mem'

# `deviance_analysis` comes from the project's `utils` module; each call
# returns a (summary dict, fit results) pair. Only the largest model asks for
# the printed summary.
# NOTE(review): the third argument looks like a name/tag for the fit and
# `force=True` like a "recompute" switch -- confirm in utils.
dic_cst, results_cst = deviance_analysis(df, formula_cst, 'mem_cst_glm', force=True)
dic_min1, results_min1 = deviance_analysis(df, formula_min1, 'hop1_mem_glm', force=True)
dic_min2, results_min2 = deviance_analysis(df, formula_min2, 'hop2_mem_glm', force=True, summary=True)

# One row per model, in order of increasing complexity.
results = [dic_cst, dic_min1, dic_min2]
results_df = pd.DataFrame(results)

# Rebase AIC so the best (lowest) model sits at 0 and the others show their
# distance from it.
results_df['aic'] = results_df['aic'].sub(results_df['aic'].min())
print(results_df)


                 Generalized Linear Model Regression Results                  
Dep. Variable:              same_side   No. Observations:               342253
Model:                            GLM   Df Residuals:                   342250
Model Family:                Binomial   Df Model:                            2
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -2.3719e+05
Date:                Mon, 06 Jan 2025   Deviance:                   4.7438e+05
Time:                        17:29:30   Pearson chi2:                 3.42e+05
No. Iterations:                     4   Pseudo R-squ. (CS):          2.974e-06
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0262      0.006      4.384      0.0

In [18]:
# Export the model-comparison table as a LaTeX `table` environment.
# NOTE(review): the 'aic' column was rebased to its minimum in the fitting
# cell, so the values printed under the 'AIC' header are differences from the
# best model -- consider saying so in the header or caption.
print('Table for model comparison : ')
deviance_string = results_df.to_latex(index=False,
                columns=['formula', 'deviance', 'aic', 'df_model'],
                header=['Model', 'Deviance', 'AIC', 'Model DF'],
                # Escape underscores inside the formula cells only. A global
                # replace on the finished string would also rewrite the
                # \label key and the caption text.
                formatters={'formula': lambda x: r'\texttt{' + x.replace('_', r'\_') + r'}'},
                float_format='{:.2f}'.format,
                caption='Model comparison for models including : no memory, 1-hop memory and 2-hop memory.',
                label='tab:memory-model-comparison',
                position='htb',
                column_format='lccc')
# Put \centering on its own line right before the first \caption. Using a
# single counted replace avoids carrying a character offset across other
# string edits, and leaves the string untouched if no \caption is present.
deviance_string = deviance_string.replace(r'\caption', r'\centering' + '\n' + r'\caption', 1)
print(deviance_string)

Table for model comparison : 
\begin{table}[htb]
\centering
\caption{Model comparison for models including : no memory, 1-hop memory and 2-hop memory.}
\label{tab:memory-model-comparison}
\begin{tabular}{lccc}
\toprule
Model & Deviance & AIC & Model DF \\
\midrule
\texttt{1} & 474381.54 & 0.00 & 0 \\
\texttt{1+hop1\_mem} & 474380.73 & 1.19 & 1 \\
\texttt{1+hop1\_mem+hop2\_mem} & 474380.53 & 2.98 & 2 \\
\bottomrule
\end{tabular}
\end{table}

