# Analyse des sondages

In [None]:
from typing import Any,Callable
import numpy as np
import scipy as sp
import pandas as pd
import sklearn
import statsmodels
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

## Chargement et préparation des données

In [None]:
def get_data(path: str = "polls.p") -> pd.DataFrame:
    """Load the pickled poll table and derive the per-poll error columns.

    Args:
        path: Location of the pickled DataFrame. Defaults to ``"polls.p"``,
            so a bare ``get_data()`` call reads the same file as before.

    Returns:
        A DataFrame restricted to rows with a strictly positive ``sample``,
        without the unused metadata columns, with ``sample`` renamed to
        ``sample_``, the percentage columns rescaled to fractions, and two
        derived columns: ``residual_`` (``poll_ - vote_``) and ``error_``
        (absolute value of ``residual_``).
    """
    # NOTE: loading a pickle can execute arbitrary code; only read trusted files.
    df = pd.read_pickle(path)

    # Rows without a positive sample size cannot be used downstream.
    df = df[df["sample"] > 0]
    df = df.drop(columns=["countryid", "regime", "turnout", "inc_", "gov_", "espv", "rule"])
    df = df.rename(columns={"sample": "sample_"})
    df = df.astype({"daysbeforeED": int, "yr": int, "partyid": int})

    # These columns are stored as percentages; convert them to fractions.
    perc_col = ["vote_", "poll_", "ipoll_"]
    df[perc_col] = df[perc_col] / 100

    # Signed and absolute gap between the poll figure and the vote share.
    df["residual_"] = df.poll_ - df.vote_
    df["error_"] = df.residual_.abs()
    return df

# Build the cleaned poll table once; the cells below read this module-level `df`.
df = get_data()

## Définition des valeurs et métriques
 - Variable aléatoire de vote : $X$
 - Taille d'échantillon : $n$
 - Coefficient de marge (z-score) : $z$
 - Écart type empirique : $$\sigma = \sqrt{\sum^{n}_{i=1}{\frac{(x_i - \bar{x})^2}{n-1}}}$$
 - Écart type théorique : $$\sigma = \sqrt{P(X)(1-P(X))}$$
 - Marge d'erreur $L$ : $$L = MOE(X,n,z) = z\frac{\sigma}{\sqrt{n}}$$
 - Intervalle de confiance $CI$ : $$CI(X,n,z) = \bar{X} \pm z\frac{\sigma}{\sqrt{n}}$$
 - Taille d'échantillon estimée à partir de l'erreur moyenne $S_{inv}$ : $$S_{inv}(P_o,L,z) = z^2\frac{P_o (1-P_o)}{L^2}$$ avec $z = 0.68$ par défaut (valeur utilisée par `likely_sample`)

In [None]:
def moe(p:float, n:int, z:float=1.96)->float:
    """Return the margin of error ``z * sqrt(p * (1 - p) / n)``.

    Args:
        p: Proportion the margin is computed for.
        n: Sample size.
        z: Margin coefficient (z-score); defaults to 1.96.

    Returns:
        The margin of error. Only arithmetic operators are used, so NumPy
        arrays passed for ``p`` and ``n`` are handled element-wise.
    """
    standard_error = (p * (1 - p) / n) ** 0.5
    return z * standard_error

def ci(p:float, n:int, z:float=1.96)->tuple[float,float]:
    """Return the ``(lower, upper)`` bounds ``p - moe(p, n, z)`` and ``p + moe(p, n, z)``.

    Args:
        p: Proportion at the centre of the interval.
        n: Sample size.
        z: Margin coefficient (z-score); defaults to 1.96.
    """
    margin = moe(p, n, z)
    lower, upper = p - margin, p + margin
    return lower, upper

def likely_sample(Po:float,L:float,z:float=0.68)->float:
    """Return the sample size implied by an error ``L`` at proportion ``Po``.

    This is the margin-of-error formula ``L = z * sqrt(Po * (1 - Po) / n)``
    solved for ``n``.

    Args:
        Po: Proportion.
        L: Error (margin) to invert.
        z: Margin coefficient (z-score); defaults to 0.68.

    Returns:
        ``Po * (1 - Po) * (z / L) ** 2``.
    """
    variance = Po * (1 - Po)
    z_over_margin = z / L
    return variance * z_over_margin ** 2

## Première visualisation

In [None]:
# Bare expression: the cleaned poll table is shown as the cell's output.
df

In [None]:
# Signed residual (poll_ - vote_) against sample size; marker size follows
# the `vote_` column and colour follows the `yr` column.
px.scatter(
    df,
    x="sample_",
    y="residual_",
    size="vote_",
    color="yr",
)

In [None]:
# 3-D view of the absolute error against sample size (log-scaled axis) and
# vote share.
fig = px.scatter_3d(
    df,
    x="sample_",
    y="vote_",
    z="error_",
    log_x=True,
)
# Shrink the markers; this call is the cell's last expression, so its result
# is what the notebook displays.
fig.update_traces(marker_size=1)

## Calcul de l'erreur paramétrée par n et p

In [None]:
# Keep only rows with yr >= 2005 and daysbeforeED < 8.
# NOTE(review): presumably "recent elections, polls from the final week
# before election day" — confirm the meaning of `daysbeforeED`.
subdf = df.query('yr>=2005 and daysbeforeED<8 ')
# Bare expression: shows the filtered table as the cell's output.
subdf

In [None]:
from loess.loess_2d import loess_2d
# Inputs for the 2-D smoother, taken from the filtered polls: vote share and
# sample size are the two coordinates, absolute polling error is the response.
x= subdf.vote_.values
y= subdf.sample_.values
z= subdf.error_.values
# 300 x 300 evaluation grid spanning the observed range of both coordinates
# (a complex step in np.mgrid means "number of points", endpoints included).
X, Y = np.mgrid[x.min():x.max():300j, y.min():y.max():300j]
# Evaluate the smoother at the flattened grid points; its second return value
# is discarded.
# NOTE(review): degree=1 / frac=0.4 presumably mean local-linear fits on 40%
# of the points — confirm against the loess package documentation.
Z, _ = loess_2d(x, y, z, xnew=X.ravel(), ynew=Y.ravel(), degree=1, frac=0.4,)
# Back to the grid shape so X, Y and Z can be passed together to a surface plot.
Z = Z.reshape(X.shape)


In [None]:
# Smoothed error surface with the filtered polls overlaid as small markers.
fig = go.Figure(data=go.Surface(x=X, y=Y, z=Z))
fig.update_layout(
    scene={
        "xaxis_title": "Vote",
        "yaxis": {"title": "Sample", "type": "log"},
        "zaxis": {"title": "Error"},
    },
    margin={"l": 0, "r": 0, "b": 0, "t": 0},
    width=700,
)
fig.add_scatter3d(
    x=subdf.vote_,
    y=subdf.sample_,
    z=subdf.error_,
    mode='markers',
    marker={"size": 1},
)
fig.show()

## Calcul de la médiane théorique

In [None]:
# Rebuild the 300 x 300 evaluation grid, this time stopping the sample-size
# axis at 10000 instead of the largest observed sample.
X, Y = np.mgrid[x.min():x.max():300j, y.min():10000:300j]
# Re-evaluate the smoother on the new grid (same data and settings as the
# earlier loess_2d call); the second return value is discarded.
Z, _ = loess_2d(x, y, z, xnew=X.ravel(), ynew=Y.ravel(), degree=1, frac=0.4,)
Z = Z.reshape(X.shape)

In [None]:
# Theoretical error surface from the margin-of-error formula on the same grid.
# NOTE(review): z=0.67 is presumably the z-score of the median absolute error
# of a normal distribution (~0.6745) — confirm.
Z_th = moe(X, Y, 0.67)

# Theoretical surface first, then the smoothed empirical surface on top.
fig = go.Figure(data=go.Surface(x=X, y=Y, z=Z_th))
fig.add_surface(x=X, y=Y, z=Z)
# fig.add_scatter3d(x=subdf.vote_, y=subdf.sample_, z=subdf.error_, mode='markers', marker=dict(size=1))
fig.update_layout(
    scene={
        "xaxis_title": "Vote",
        "yaxis": {"title": "Sample", "type": "log"},
        "zaxis": {"title": "Error"},
    },
    margin={"l": 0, "r": 0, "b": 0, "t": 0},
    width=1200,
)
# Export a standalone HTML copy before displaying the figure inline.
fig.write_html("error.html")
fig.show()

### Visualisation de l'erreur proportionnelle à l'erreur théorique

In [None]:
# Ratio of the smoothed empirical error to the theoretical error on the grid.
# Z_th is recomputed here so the cell does not depend on the previous one
# having been run.
# NOTE(review): z=0.67 is presumably the z-score of the median absolute error
# of a normal distribution (~0.6745) — confirm.
Z_th = moe(X,Y,0.67)
fig = go.Figure(go.Surface(x=X, y=Y, z=Z/Z_th))
# fig.add_scatter3d(x=subdf.vote_, y=subdf.sample_, z=subdf.error_, mode='markers', marker=dict(size=1))
fig.update_layout(
    scene=dict(
        xaxis_title="Vote",
        yaxis=dict(title="Sample", type="log"),
        # The plotted quantity is a dimensionless ratio, not an error.
        zaxis=dict(title="Error / theoretical error"),
    ),
    margin=dict(l=0, r=0, b=0, t=0),
    width=1200,
)
# Written to its own file so it does not overwrite "error.html", which the
# absolute-error comparison cell exports.
fig.write_html("error_ratio.html")
fig.show()

## Estimation de la taille d'échantillon la plus probable

In [None]:
# Invert the margin-of-error formula to get the sample size implied by each
# error surface. The z passed here must match the one used to build the
# theoretical surface (moe(X, Y, 0.67)); with likely_sample's default of 0.68
# the round trip would give Y * (0.68 / 0.67) ** 2 instead of Y.
estim_Y = likely_sample(X, Z, z=0.67)
estim_Y_th = likely_sample(X, Z_th, z=0.67)


In [19]:
# Sample size implied by the smoothed empirical error, drawn over the
# (vote, sample) grid.
fig = go.Figure(data=go.Surface(x=X, y=Y, z=estim_Y))
# fig.add_surface(x=X, y=Y, z=estim_Y_th)
# fig.add_scatter3d(x=subdf.vote_, y=subdf.sample_, z=subdf.error_, mode='markers', marker=dict(size=1))
fig.update_layout(
    scene={
        "xaxis_title": "Vote",
        "yaxis": {"title": "Sample", "type": "log"},
        "zaxis": {"title": "Estimated Sample"},
    },
    margin={"l": 0, "r": 0, "b": 0, "t": 0},
    width=800,
    height=800,
)
fig.show()

In [None]:
# Mean of the estimated sample size over every point of the evaluation grid.
(estim_Y).mean()