# Analyse de données avec Polars et Plotly Express

## Bibliothèques

In [51]:
import pandas as pd
import seaborn as sns
import polars as pl
import plotly.express as px

## Données

In [52]:
# Fetch the "tips" example dataset through seaborn; it comes back as a
# pandas DataFrame.
df_pd = sns.load_dataset("tips")

# Convert the pandas frame to a Polars DataFrame, which the rest of the
# notebook works on.
df = pl.from_pandas(df_pd)

# Preview the first five rows.
print(df.head(5))


shape: (5, 7)
┌────────────┬──────┬────────┬────────┬─────┬────────┬──────┐
│ total_bill ┆ tip  ┆ sex    ┆ smoker ┆ day ┆ time   ┆ size │
│ ---        ┆ ---  ┆ ---    ┆ ---    ┆ --- ┆ ---    ┆ ---  │
│ f64        ┆ f64  ┆ cat    ┆ cat    ┆ cat ┆ cat    ┆ i64  │
╞════════════╪══════╪════════╪════════╪═════╪════════╪══════╡
│ 16.99      ┆ 1.01 ┆ Female ┆ No     ┆ Sun ┆ Dinner ┆ 2    │
│ 10.34      ┆ 1.66 ┆ Male   ┆ No     ┆ Sun ┆ Dinner ┆ 3    │
│ 21.01      ┆ 3.5  ┆ Male   ┆ No     ┆ Sun ┆ Dinner ┆ 3    │
│ 23.68      ┆ 3.31 ┆ Male   ┆ No     ┆ Sun ┆ Dinner ┆ 2    │
│ 24.59      ┆ 3.61 ┆ Female ┆ No     ┆ Sun ┆ Dinner ┆ 4    │
└────────────┴──────┴────────┴────────┴─────┴────────┴──────┘


## Analyse Univariée

### Variable discrète

In [53]:
# Relative frequency of each `sex` category, most frequent first.
df.get_column("sex").value_counts(sort=True, normalize=True)

sex,proportion
cat,f64
"""Male""",0.643443
"""Female""",0.356557


In [54]:
# With Plotly Express
# px.bar draws one rectangular mark per input row: with only `x` given, the raw
# rows would be stacked as many unit-height segments. Aggregate the counts
# first so that each category is drawn as a single bar.
sex_counts = df["sex"].value_counts(sort=True)
fig = px.bar(sex_counts, x="sex", y="count", title="Distribution de la variable 'sex'")
fig.update_traces(showlegend=False)
fig.update_layout(
    autosize=False,
    width=700,
    height=500,
    paper_bgcolor="rgb(230, 240, 240)",
)
fig.show()

### Variable continue

#### Statistiques

In [55]:
# Summary statistics (count, mean, std, quantiles, ...) of the `tip` column.
df.get_column("tip").describe()

statistic,value
str,f64
"""count""",244.0
"""null_count""",0.0
"""mean""",2.998279
"""std""",1.383638
"""min""",1.0
"""25%""",2.0
"""50%""",2.92
"""75%""",3.55
"""max""",10.0


#### Histogramme

In [56]:
# With Plotly Express: histogram of the `tip` column (nbins=50), drawn at a
# fixed 700x500 size with the legend hidden.
fig = (
    px.histogram(df, x="tip", nbins=50, title="Distribution de la variable 'tip'")
    .update_traces(showlegend=False)
    .update_layout(autosize=False, width=700, height=500)
)
fig.show()

#### BoxPlot

In [57]:
# With Plotly Express: horizontal box plot of the `tip` column.
fig = px.box(df, x="tip", title="BoxPlot de la variable 'tip'")
fig.update_traces(showlegend=False)
# Fixed figure size (700x500), automatic resizing disabled.
fig.update_layout(width=700, height=500, autosize=False)
fig.show()

## Analyse Multivariée

### Discret / Discret

In [58]:
# Contingency table of party size versus day:
#   - on="day"     -> one column per distinct value of `day`;
#   - index="size" -> one row per distinct value of `size`;
#   - values="day" with aggregate_function="len" -> number of rows for each
#     (size, day) combination;
#   - sort("size") -> rows ordered by party size.
df_DD = (
    df.pivot(
        on="day",
        index="size",
        values="day",
        aggregate_function="len",
        sort_columns=True,
    )
    .sort("size")
)
df_DD

size,Fri,Sat,Sun,Thur
i64,u32,u32,u32,u32
1,1,2,0,1
2,16,53,39,48
3,1,18,15,4
4,1,13,18,5
5,0,1,3,1
6,0,0,1,3


In [59]:
# Heatmap of the size x day contingency table stored in `df_DD`.
# Columns are displayed in weekday order.
order = ["Thur", "Fri", "Sat", "Sun"]
# Hand px.imshow a plain NumPy matrix together with explicit x/y labels, so the
# day names and party sizes appear on the axes without relying on how imshow
# treats non-pandas DataFrame inputs.
fig = px.imshow(
    df_DD.select(order).to_numpy(),
    x=order,
    y=df_DD["size"].to_list(),
    text_auto=True,
    labels=dict(x="Day", y="Size", color="Count"),
)
fig.update_layout(
    autosize=False,
    width=700,
    height=500)
# Put the day labels above the heatmap.
fig.update_xaxes(side="top")
fig.show()

### Discret / Continu

In [60]:
# Mean, standard deviation, minimum and maximum of `tip` for each `sex`.
# maintain_order=True keeps the groups in order of first appearance in `df`;
# without it the row order of a group_by result is not guaranteed, so the
# table could differ from one run to the next.
df.group_by("sex", maintain_order=True).agg(
    [
        pl.mean("tip").alias("mean_tip"),
        pl.std("tip").alias("std_tip"),
        pl.min("tip").alias("min_tip"),
        pl.max("tip").alias("max_tip"),
    ]
)

sex,mean_tip,std_tip,min_tip,max_tip
cat,f64,f64,f64,f64
"""Female""",2.833448,1.159495,1.0,6.5
"""Male""",3.089618,1.489102,1.0,10.0


In [61]:
# Bar chart of the average tip for each sex.
# maintain_order=True makes the group order (and therefore the bar order)
# deterministic; a plain group_by does not guarantee any row order.
mean_tip_by_sex = df.group_by("sex", maintain_order=True).agg(
    pl.mean("tip").alias("mean_tip")
)
fig = px.bar(
    mean_tip_by_sex,
    x="sex",
    y="mean_tip",
    title="Moyenne du pourboire par sexe",
)
fig.update_layout(
    autosize=False,
    width=700,
    height=500
)
fig.show()

In [62]:
# Histogram of `tip`, with one colour per `sex`, on a tinted paper background
# and at a fixed 700x500 size.
fig = px.histogram(
    df,
    x="tip",
    color="sex",
    title="Distribution du pourboire par sexe",
).update_layout(
    autosize=False,
    width=700,
    height=500,
    paper_bgcolor="rgb(230, 240, 240)",
)
fig.show()

In [63]:
# Horizontal box plots of `tip`, one box per `sex` category.
fig = px.box(df, y="sex", x="tip")
# Fixed figure size (700x500), automatic resizing disabled.
fig.update_layout(width=700, height=500, autosize=False)
fig.show()

In [64]:
# Scatter plot of `total_bill` (y axis) against `tip` (x axis), with points
# coloured by `sex`, at a fixed 700x500 size.
fig = (
    px.scatter(df, x="tip", y="total_bill", color="sex")
    .update_layout(autosize=False, width=700, height=500)
)
fig.show()