In [None]:
# !wget -O worlds_polls.csv "https://dataverse.harvard.edu/api/access/datafile/3107210?format=tab&gbrecs=true"


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import random
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt

from scipy.spatial.distance import jensenshannon
from scipy.special import kl_div


from tqdm.notebook import tqdm, trange

In [None]:
# Load the raw tab-separated poll export.
df = pd.read_csv('worlds_polls.csv', sep='\t')
# Columns that, taken together, identify one poll (passed to generate_id_poll).
keys_cols = [
  'polldate',
  'election',
  'system',
  'rule',
  'round',
  'electionyr',
  'elecdate',
]

In [None]:
def entropy(p,q):
  """Return the element-wise cross-entropy terms ``-p * log(q)`` (natural log).

  No reduction is applied: callers sum the terms themselves.
  """
  log_q = np.log(q)
  return (-p) * log_q
  
def mse(y_true,y_pred):
  """Return the element-wise squared error ``(y_pred - y_true)**2``.

  Despite the name, no mean is taken; the caller aggregates the result.
  """
  residual = y_pred - y_true
  return np.square(residual)

def mae(y_true,y_pred):
  """Return the element-wise absolute error ``|y_pred - y_true|``.

  Despite the name, no mean is taken; the caller aggregates the result.
  """
  residual = y_pred - y_true
  return np.abs(residual)

def error_p(p,n,std=1.96):
  """Return ``std * sqrt(p * (1 - p) / n)``.

  This is the usual margin-of-error formula for a proportion ``p`` measured on
  ``n`` draws; ``std`` is the multiplier applied to the standard error
  (1.96 by default).
  """
  variance = p * (1 - p) / n
  return std * np.sqrt(variance)

def diff_error(p,q,n):
  """Return how far the observed gap ``|p - q|`` exceeds ``error_p(p, n)``.

  A positive value means the observed gap is larger than the theoretical
  margin computed by ``error_p`` with its default multiplier.
  """
  return np.abs(p - q) - error_p(p, n)

def odd_err(p,v):
  """Return the absolute log of the odds ratio between ``p`` and ``v``."""
  odds_p = p / (1 - p)
  ratio = odds_p * (1 - v) / v
  return np.abs(np.log(ratio))

def extract(df,id_poll):
  """Return the ``vote_``, ``poll_`` and ``idpoll`` columns of one poll.

  Only the rows whose ``idpoll`` equals ``id_poll`` are kept.
  """
  rows = df.loc[df.idpoll == id_poll]
  return rows.vote_, rows.poll_, rows.idpoll

In [None]:
def generate_id_poll(df_source,k_cols):
  """Return a copy of ``df_source`` with an integer ``idpoll`` column.

  Rows sharing the same values in every column of ``k_cols`` receive the same
  id. Ids are numbered in the order in which distinct key combinations first
  appear in the frame.

  Args:
    df_source: frame holding at least the columns listed in ``k_cols``.
    k_cols: list of column names that together identify one poll.

  Returns:
    A new DataFrame; ``df_source`` itself is not modified.
  """
  df=df_source.copy()
  df['idpoll']=0
  keys=df[k_cols].drop_duplicates(ignore_index=True)
  print(f'Nombre de clefs {len(keys)}')
  for i in trange(len(keys)):
    # A row belongs to key i when every one of its key columns matches.
    # NOTE: NaN never compares equal, so rows with a missing key value keep
    # the initial id 0.
    same_key=(df[k_cols]==keys.iloc[i]).all(axis=1)
    df.loc[same_key,'idpoll']=i
  return df

# Keep only the rows that have both an election result and a poll estimate,
# then tag every row with the id of the poll it belongs to.
df = generate_id_poll(df.dropna(subset=['vote_', 'poll_']), keys_cols)

In [None]:
# df.to_pickle("polls.p")

In [None]:
# Reload the cached frame and narrow it down, step by step:
# positive sample size, unused columns removed, polls after 2000 taken fewer
# than 7 days before the election.
# "sample" becomes "sample_" because attribute access (df.sample_) would
# otherwise hit the DataFrame.sample method.
df = (
  pd.read_pickle("polls.p")
  .loc[lambda d: d["sample"] > 0]
  .drop(columns=["countryid", "regime", "turnout", "inc_", "gov_", "espv"])
  .rename(columns={"sample": "sample_"})
  .loc[lambda d: d.yr > 2000]
  .loc[lambda d: d.daysbeforeED < 7]
)

In [None]:
# Show the filtered frame as the cell output.
df

# Plot error against sample size


In [None]:
# Absolute poll error of every row against the sample size of its poll.
ax = sns.scatterplot(x=df.sample_, y=mae(df.vote_, df.poll_))
ax.set_ylim(0, 6)
ax.set_xlim(0, 10000)

In [None]:
# Same cloud with a LOWESS trend line drawn in red.
ax = sns.regplot(
  x=df.sample_,
  y=mae(df.vote_, df.poll_),
  lowess=True,
  line_kws=dict(color="r"),
)
ax.set_ylim(0, 6)
ax.set_xlim(0, 10000)

In [None]:
max_sample = 10000
# Grid over sample sizes (200..max_sample) and proportions (0.001..1).
n = np.linspace(200, max_sample, 1000)
p = np.linspace(0.001, 1, 1000)
nn, pp = np.meshgrid(n, p)

# Theoretical error on the grid, scaled by 100 to match vote_/poll_ units.
# NOTE(review): 0.67 is presumably the multiplier of a ~50% interval - confirm.
z = 100 * error_p(pp, nn, 0.67)

# Polls whose sample size lies strictly inside the grid range.
sub_df = df[(df.sample_ > 200) & (df.sample_ < max_sample)]

In [None]:
from numpy import size
import plotly.graph_objects as go



# Theoretical error surface with the observed absolute poll errors on top.
fig = go.Figure()
fig.add_trace(go.Surface(x=nn, y=pp, z=z))
fig.add_trace(
  go.Scatter3d(
    x=sub_df.sample_,
    y=sub_df.vote_ / 100,
    z=mae(sub_df.vote_, sub_df.poll_),
    mode='markers',
    marker=dict(size=1),
  )
)
# update_layout returns the figure, which is displayed as the cell output.
fig.update_layout(
  title='test',
  width=1000,
  height=1000,
  margin=dict(l=65, r=50, b=65, t=90),
)

In [None]:
# Summary statistics of the sample sizes that survived the filters.
df.sample_.describe()

## Compute entropy


In [None]:
def poll_measure(df:pd.DataFrame,measure=entropy)->float:
  """
  Score one poll against the election result.

  ``df`` must hold the rows of a single poll. ``vote_`` and ``poll_`` are
  divided by 100, ``measure(result, estimate)`` is applied element-wise and
  the terms are summed.
  """
  votes=df.vote_.to_numpy()/100
  estimates=df.poll_.to_numpy()/100
  if votes.sum()<1:
    # The listed rows do not cover the whole vote: add a residual category to
    # both vectors and floor a negative residual estimate at zero.
    votes=np.append(votes,1-votes.sum())
    estimates=np.append(estimates,1-estimates.sum())
    estimates=np.where(estimates<0,0,estimates)
  # Exact zeros are replaced by a small positive value before scoring.
  estimates=np.where(estimates==0,0.00001,estimates)
  return measure(votes,estimates).sum()

# Example: score the rows of poll id 23756 with kl_div.
poll_measure(df[df.idpoll==23756],measure=kl_div)

## Compute random sample

In [None]:
def random_sample_measure(df:pd.DataFrame,n_samples:int=1000,poll_size:int=None,measure=kl_div)->float:
  """
  Median score of simulated random polls drawn from the election result.

  ``df`` must hold the rows of a single poll. ``n_samples`` polls of
  ``poll_size`` respondents are drawn from the distribution given by
  ``vote_``; each simulated poll is scored against that distribution with
  ``measure`` and the median score is returned.

  Args:
    df: rows of one poll. Needs ``vote_`` (in percent) and, when ``poll_size``
      is omitted, the sample-size column (``sample_``, or ``sample`` before
      the notebook renames it).
    n_samples: number of simulated polls.
    poll_size: respondents per simulated poll; defaults to the poll's own
      sample size.
    measure: element-wise function ``measure(result, estimate)``.

  Returns:
    The median over the simulated polls of ``measure(...).sum()``, or 0 (after
    printing the offending vector) when the vote shares cannot be used as
    probabilities.
  """
  y=df.vote_.to_numpy()/100
  if poll_size is None:
    # The loading cell renames "sample" to "sample_"; accept either name.
    size_col="sample_" if "sample_" in df.columns else "sample"
    poll_size=int(df[size_col].iloc[0])
  if y.sum()<1:
    # Residual category so that the shares cover the whole vote.
    y=np.append(y,1-y.sum())
  if y.sum()!=1.0:
    # Shares that do not sum to 1 are renormalised.
    y=y/y.sum()
  try:
    if not np.all(np.isfinite(y)) or np.any(y<0):
      raise ValueError("vote shares are not valid probabilities")
    # Draw the per-category counts directly: same distribution as drawing
    # every respondent and tallying, without the
    # (n_samples, poll_size, n_categories) intermediate array.
    counts=np.random.multinomial(poll_size,y,size=n_samples)
  except ValueError:
    print(y,y.sum())
    return 0
  yhat=counts.astype(float)
  # Empty categories get a small positive count before normalising each row.
  yhat[yhat==0]=0.00001
  yhat=yhat/yhat.sum(axis=1,keepdims=True)
  measures=measure(y,yhat).sum(axis=1)
  return np.median(measures)

# Example: 1000 simulated polls of 1500 respondents for poll id 23756, scored with kl_div.
random_sample_measure(df[df.idpoll==23756],1000,1500,measure=kl_div)

## Finding best random sampling through minimization

In [None]:
def optimal_size_sample(df:pd.DataFrame,n_samples:int=1000,measure=kl_div)->int:
  """
  Size of the random sample whose typical score matches the poll's score.

  The poll's own score (``poll_measure``) is compared with the median score of
  simulated random samples (``random_sample_measure``). The search starts at
  the poll's declared sample size, doubles the size while the poll scores
  lower than the simulation (stopping once 16 times the declared size is
  reached), then bisects between the last two bounds.

  Args:
    df: rows of a single poll; needs the sample-size column (``sample_``, or
      ``sample`` before the notebook renames it).
    n_samples: number of simulated samples per evaluation.
    measure: element-wise function passed to both scoring helpers.

  Returns:
    The upper bound of the search interval when the search stops.
  """
  # The loading cell renames "sample" to "sample_"; accept either name.
  size_col="sample_" if "sample_" in df.columns else "sample"
  sample_size=int(df[size_col].iloc[0])
  lb=1
  ub=sample_size
  y_hat=random_sample_measure(df,poll_size=ub,n_samples=n_samples,measure=measure)
  y=poll_measure(df,measure=measure)
  # The poll scores lower than a random sample of size ub: try larger samples.
  while y-y_hat<-1e-5 and ub<16*sample_size:
    lb=ub
    ub=ub*2
    y_hat=random_sample_measure(df,poll_size=ub,n_samples=n_samples,measure=measure)
  if y-y_hat>0:
    # The poll scores higher than a random sample of size ub: shrink the
    # interval until the bounds are adjacent.
    while ub!=lb and ub!=lb+1:
      n=(ub+lb)//2
      y_hat=random_sample_measure(df,poll_size=n,n_samples=n_samples,measure=measure)
      r=y-y_hat
      if r>=0:
        ub=n
      else:
        lb=n
  return ub

# Example: equivalent random-sample size for poll id 23756, 1000 simulations per step.
optimal_size_sample(df[df.idpoll==23756],1000)

In [None]:
from numpy.core.numeric import ones_like
samples_sizes=[]
# The loading cell renames "sample" to "sample_"; accept either name.
size_col="sample_" if "sample_" in df.columns else "sample"
# Single pass over the polls; sort=False keeps them in order of first
# appearance, as df.idpoll.unique() would.
for i,poll_df in tqdm(df.groupby("idpoll",sort=False),total=df.idpoll.nunique()):
  year=poll_df.yr.iloc[0]
  country=poll_df.country.iloc[0]
  election=poll_df.election.iloc[0]
  system=poll_df.system.iloc[0]
  daysbeforeED=poll_df.daysbeforeED.iloc[0]
  nb_candidates=len(poll_df)
  if poll_df.vote_.sum()<100:
    # Count the residual category that the scoring helpers add in that case.
    nb_candidates+=1
  n=int(poll_df[size_col].iloc[0])

  # Equivalent random-sample size under each of the four measures.
  optimal_kl=optimal_size_sample(poll_df)
  optimal_entropy=optimal_size_sample(poll_df,measure=entropy)
  optimal_mse=optimal_size_sample(poll_df,measure=mse)
  optimal_mae=optimal_size_sample(poll_df,measure=mae)
  samples_sizes.append((i,n,optimal_kl,
                        optimal_entropy,optimal_mse,optimal_mae,
                        year,country,election,system,
                        daysbeforeED,nb_candidates))
best_sample_size=pd.DataFrame(samples_sizes,
                              columns=['id','poll_sample','optimal_kl',
                                       'optimal_entropy','optimal_mse','optimal_mae',
                                       'year','country','election',
                                       'system','daysbeforeED','nb_candidates'])


In [None]:
# best_sample_size.to_pickle("bss.p")
# Reload the cached per-poll results (replaces any frame computed above).
best_sample_size = pd.read_pickle("bss.p")
# NOTE(review): the name suggests a 14-day window, but the filter keeps polls
# taken up to 21 days before the election - confirm which one is intended.
bss14 = best_sample_size.loc[best_sample_size["daysbeforeED"] <= 21].copy()

# Measures in the layout of the subplot grids; '' marks a panel left empty.
# NOTE(review): 'oneshot' is not among the columns built by the loop that
# creates best_sample_size; presumably it comes from the pickle - confirm.
liste_mesures = [
  ['optimal_kl', 'optimal_entropy'],
  ['optimal_mse', 'optimal_mae'],
  ['oneshot', ''],
]

nb_lignes, nb_colonnes = 3, 2

In [None]:
# Show the filtered results frame as the cell output.
bss14

## Précision en nombre de tirages aléatoires (TCL)

In [None]:
fig, axes = plt.subplots(nb_lignes, nb_colonnes, figsize=(10, 10), dpi=100)
line = list(range(5000))
# One panel per measure: equivalent random-sample size against the declared
# sample size, with a LOWESS trend in red. Panels mapped to '' stay empty.
for i, j in np.ndindex(nb_lignes, nb_colonnes):
  mesure = liste_mesures[i][j]
  if mesure == '':
    continue
  ax = axes[i, j]
  g = sns.regplot(
    x='poll_sample',
    y=mesure,
    lowess=True,
    data=bss14,
    ax=ax,
    line_kws={"color": "red"},
  )
  ax.set_xlim(1, 5000)
  ax.set_ylim(1, 5000)

plt.show()

In [None]:
fig, axes = plt.subplots(nb_lignes,nb_colonnes,figsize=(18,10),dpi=200)
line=list(range(5000))
# Quintile bins of the declared sample size. They are identical for every
# panel, so they are computed once instead of once per panel.
bss14["bins"]=pd.qcut(bss14["poll_sample"], 5)
for i in range(nb_lignes):
  for j in range(nb_colonnes):
    mesure=liste_mesures[i][j]
    if mesure=='':
      # No measure mapped to this panel: leave it empty.
      continue
    ax=axes[i,j]
    sns.violinplot(x="bins",
                   y=mesure,
                   data=bss14,
                   ax=ax)
    ax.set_xlabel("Taille d'échantillon du sondage")
    ax.set_ylim(0, 3000)
fig.suptitle("Précision en fonction de la taille initiale du sondage")
plt.show()
# Cell output: median optimal_kl of the polls that declare exactly 1000 respondents.
best_sample_size[best_sample_size['poll_sample']==1000].optimal_kl.median()

In [None]:
fig, axes = plt.subplots(nb_lignes,nb_colonnes,figsize=(18,10),dpi=200)
line=list(range(5000))
# Quintile bins of the declared sample size. They are identical for every
# panel, so they are computed once instead of once per panel.
bss14["bins"]=pd.qcut(bss14["poll_sample"], 5)
for i in range(nb_lignes):
  for j in range(nb_colonnes):
    mesure=liste_mesures[i][j]
    if mesure=='':
      # No measure mapped to this panel: leave it empty.
      continue
    ax=axes[i,j]
    sns.boxplot(x="bins",
                y=mesure,
                data=bss14,
                whis=1.5,
                ax=ax)
    # The 'oneshot' panel gets a wider y-range than the other measures.
    ax.set_ylim(0, 8000 if mesure=='oneshot' else 1300)
    ax.set_xlabel("Taille d'échantillon du sondage")
fig.suptitle("Précision en fonction de la taille initiale du sondage")
plt.show()
# Cell output: median optimal_kl of the polls that declare exactly 1000 respondents.
best_sample_size[best_sample_size['poll_sample']==1000].optimal_kl.median()

In [None]:
# Rolling median (window of 50 polls) of the numeric columns, with polls
# ordered by declared sample size. Non-numeric columns (e.g. the categorical
# "bins" added by the plotting cells) are dropped explicitly: recent pandas
# raises on them instead of silently skipping them.
rolling_best=(bss14.sort_values('poll_sample')
              .select_dtypes('number')
              .rolling(50)
              .median())

fig, axes = plt.subplots(nb_lignes,nb_colonnes,figsize=(18,10),dpi=200)
for i in range(nb_lignes):
  for j in range(nb_colonnes):
    mesure=liste_mesures[i][j]
    if mesure=='':
      # No measure mapped to this panel: leave it empty.
      continue
    ax=axes[i,j]
    sns.lineplot(x='poll_sample',y=mesure,data=rolling_best,ax=ax)
    ax.set_xlim(1, 6000)
    ax.set_ylim(1, 7000)
plt.show()

In [None]:
# Median optimal_kl for every distinct declared sample size, computed in one
# grouped pass (sort=False keeps the sizes in order of first appearance).
medians=best_sample_size.groupby('poll_sample',sort=False)['optimal_kl'].median()
x=medians.index.tolist()
y=medians.tolist()

fig, ax = plt.subplots(figsize=(5,5),dpi=100)
sns.regplot(x=x,y=y,
            ax=ax,
            # lowess=True,
            line_kws={"color": "red"})
# plt.plot(line,line,'r')
ax.set_xlim(1, 6000)
ax.set_ylim(1, 6000)
plt.show()

## Précision au cours du temps


In [None]:
# fig, ax = plt.subplots(figsize=(5,5),dpi=100)
# # g=sns.regplot(x='year',y='random sample',
# #               lowess=True,
# #               data=bss14,
# #               ax=ax,
# #               line_kws={"color": "red"})
# ax.set_ylim(1, 5000)
# plt.show()
fig, axes = plt.subplots(nb_lignes,nb_colonnes,figsize=(18,10),dpi=200)
line=list(range(5000))
# Quintile bins of the poll year. They are identical for every panel, so they
# are computed once instead of once per panel.
bss14["bins"]=pd.qcut(bss14["year"], 5)
for i in range(nb_lignes):
  for j in range(nb_colonnes):
    mesure=liste_mesures[i][j]
    if mesure=='':
      # No measure mapped to this panel: leave it empty.
      continue
    ax=axes[i,j]
    sns.violinplot(x="bins",
                   y=mesure,
                   data=bss14,
                   ax=ax)
    # The 'oneshot' panel gets a wider y-range than the other measures.
    ax.set_ylim(0, 10000 if mesure=='oneshot' else 5000)
    ax.set_xlabel("Année du sondage")
# Figure-level title: a title set on `ax` here would only label the last panel drawn.
fig.suptitle("Précision par rapport à l'année")
plt.show()

In [None]:
fig, axes = plt.subplots(nb_lignes,nb_colonnes,figsize=(18,10),dpi=200)
# Quintile bins of the poll year. They are identical for every panel, so they
# are computed once instead of once per panel.
bss14["bins"]=pd.qcut(bss14["year"], 5)
for i in range(nb_lignes):
  for j in range(nb_colonnes):
    mesure=liste_mesures[i][j]
    if mesure=='':
      # No measure mapped to this panel: leave it empty.
      continue
    ax=axes[i,j]
    sns.boxplot(x="bins",
                y=mesure,
                data=bss14,
                ax=ax)
    # The 'oneshot' panel gets a wider y-range than the other measures.
    ax.set_ylim(0, 10000 if mesure=='oneshot' else 2000)
    ax.set_xlabel("Année du sondage")
# Figure-level title: a title set on `ax` here would only label the last panel drawn.
fig.suptitle("Précision par rapport à l'année")
plt.show()

In [None]:
fig, axes = plt.subplots(nb_lignes, nb_colonnes, figsize=(10, 10), dpi=100)
# One panel per measure: linear trend of the equivalent sample size over the
# years, restricted to polls taken fewer than 14 days before the election.
# NOTE(review): the other cells use bss14 (<= 21 days) - confirm the window.
for i, j in np.ndindex(nb_lignes, nb_colonnes):
  mesure = liste_mesures[i][j]
  if mesure == '':
    continue
  ax = axes[i, j]
  g = sns.regplot(
    x='year',
    y=mesure,
    data=best_sample_size[best_sample_size['daysbeforeED'] < 14],
    ax=ax,
    line_kws={"color": "red"},
  )
  ax.set_ylim(1, 5000)
plt.show()

In [None]:
fig,ax=plt.subplots(figsize=(8,5),dpi=100)

# Distribution of the declared sample size within quintile bins of the year.
bss14["bins"]=pd.qcut(bss14["year"], 5)
sns.boxplot(x="bins",
            y="poll_sample",
            data=bss14,
            ax=ax)  # draw on the axes created above rather than the implicit current one
ax.set_ylim(0,4000)
ax.set_xlabel("Année du sondage")
# The y-axis is the sample size, not a precision measure.
ax.set_title("Taille d'échantillon par rapport à l'année")
plt.show()

## Précision en fonction de l'écart à la date d'élection

In [None]:

fig, axes = plt.subplots(nb_lignes*nb_colonnes,1,figsize=(18,10),dpi=200)
line=list(range(5000))
# 12 quantile bins of the number of days before the election. They are
# identical for every panel, so they are computed once instead of once per panel.
best_sample_size["bins"]=pd.qcut(best_sample_size["daysbeforeED"], 12)
for i in range(nb_lignes):
  for j in range(nb_colonnes):
    mesure=liste_mesures[i][j]
    if mesure=='':
      # No measure mapped to this panel: leave it empty.
      continue
    # Panels are stacked in a single column, in row-major order of liste_mesures.
    ax=axes[i*nb_colonnes+j]
    sns.violinplot(x="bins",
                   y=mesure,
                   data=best_sample_size,
                   ax=ax)
    # The 'oneshot' panel gets a wider y-range than the other measures.
    ax.set_ylim(0, 10000 if mesure=='oneshot' else 2000)
    ax.set_xlabel("Nb jours avant élection")
fig.suptitle("Précision en fonction du nombre de jours avant l'élection")
plt.show()

In [None]:
fig, axes = plt.subplots(nb_lignes*nb_colonnes,1,figsize=(18,10),dpi=200)
line=list(range(5000))
# 12 quantile bins of the number of days before the election. They are
# identical for every panel, so they are computed once instead of once per panel.
best_sample_size["bins"]=pd.qcut(best_sample_size["daysbeforeED"], 12)
for i in range(nb_lignes):
  for j in range(nb_colonnes):
    mesure=liste_mesures[i][j]
    if mesure=='':
      # No measure mapped to this panel: leave it empty.
      continue
    # Panels are stacked in a single column, in row-major order of liste_mesures.
    ax=axes[i*nb_colonnes+j]
    sns.boxplot(x="bins",
                y=mesure,
                data=best_sample_size,
                ax=ax)
    # The 'oneshot' panel gets a wider y-range than the other measures.
    ax.set_ylim(0, 10000 if mesure=='oneshot' else 1000)
    ax.set_xlabel("Nb jours avant élection")
fig.suptitle("Précision en fonction du nombre de jours avant l'élection")
plt.show()

## Précision en fonction du nombre de partis présentés


In [None]:
fig, axes = plt.subplots(nb_lignes, nb_colonnes, figsize=(18, 10), dpi=200)
line = list(range(5000))
# One panel per measure: equivalent sample size grouped by the number of
# competing parties. Panels mapped to '' stay empty.
for i, j in np.ndindex(nb_lignes, nb_colonnes):
  mesure = liste_mesures[i][j]
  if mesure == '':
    continue
  ax = axes[i, j]
  sns.boxplot(x="nb_candidates", y=mesure, data=bss14, ax=ax)
  # The 'oneshot' panel gets a wider y-range than the other measures.
  if mesure == 'oneshot':
    ax.set_ylim(0, 10000)
  else:
    ax.set_ylim(0, 3500)
  ax.set_xlabel("Nombre de partis présentés")
fig.suptitle("Précision en fonction du nombre de partis concurrents")
plt.show()

## Comparaison avec un tirage aléatoire unique

In [None]:


def optimal_size_one_shot(df:pd.DataFrame,n_samples:int=1000,measure=kl_div)->int:
  """
  Same search as ``optimal_size_sample`` with a single random draw as target.

  Instead of the poll's own score, the target is the score of one simulated
  random sample of the poll's declared size. It is compared with the median
  score of ``n_samples`` simulated samples. The size is doubled while the
  target scores lower (stopping once 16 times the declared size is reached),
  then bisected between the last two bounds.

  Args:
    df: rows of a single poll; needs the sample-size column (``sample_``, or
      ``sample`` before the notebook renames it).
    n_samples: number of simulated samples per evaluation of the median.
    measure: element-wise function passed to ``random_sample_measure``.

  Returns:
    The upper bound of the search interval when the search stops.
  """
  # The loading cell renames "sample" to "sample_"; accept either name.
  size_col="sample_" if "sample_" in df.columns else "sample"
  sample_size=int(df[size_col].iloc[0])
  lb=1
  ub=sample_size
  y_hat=random_sample_measure(df,poll_size=ub,n_samples=n_samples,measure=measure)
  # Target: the score of one single random draw of the declared size.
  y=random_sample_measure(df,poll_size=ub,n_samples=1,measure=measure)
  # The target scores lower than the median at size ub: try larger samples.
  while y-y_hat<-1e-5 and ub<16*sample_size:
    lb=ub
    ub=ub*2
    y_hat=random_sample_measure(df,poll_size=ub,n_samples=n_samples,measure=measure)
  if y-y_hat>0:
    # The target scores higher than the median at size ub: shrink the
    # interval until the bounds are adjacent.
    while ub!=lb and ub!=lb+1:
      n=(ub+lb)//2
      y_hat=random_sample_measure(df,poll_size=n,n_samples=n_samples,measure=measure)
      r=y-y_hat
      if r>=0:
        ub=n
      else:
        lb=n
  return ub

In [None]:
# samples_sizes=[]
# for i in tqdm(df.idpoll.unique()):
#   poll_df=df[df.idpoll==i]
#   year=poll_df.yr.iloc[0]
#   country=poll_df.country.iloc[0]
#   election=poll_df.election.iloc[0]
#   system=poll_df.system.iloc[0]
#   daysbeforeED=poll_df.daysbeforeED.iloc[0]
#   nb_candidates=len(poll_df)
#   if poll_df.vote_.sum()<100:
#     nb_candidates+=1
#   n=int(poll_df["sample"].iloc[0])

#   oneshot=optimal_size_one_shot(poll_df)

#   samples_sizes.append((i,n,oneshot,
#                         year,country,election,system,
#                         daysbeforeED,nb_candidates))
