# <span style="color:brown;">Polars</span>

In [1]:
import polars as pl
from polars import col as c

In [2]:
pl_pokemon = (
    pl.read_csv(
        source = "data/pokemon.csv",
        schema_overrides = {
            "Type 1": pl.Categorical,
            "Type 2": pl.Categorical,
            "Generation": pl.Categorical,
            "Legendary": pl.Boolean
        }
    )
    .drop("#")
    .rename(lambda col: col.strip().replace(" ", "_").replace(".", ""))
    .with_columns(Generation = c("Generation").cast(pl.Utf8).cast(pl.Enum(["1", "2", "3", "4", "5", "6"])))
)

In [3]:
print(pl_pokemon.head(5))

shape: (5, 12)
┌───────────────────────┬────────┬────────┬───────┬───┬────────┬───────┬────────────┬───────────┐
│ Name                  ┆ Type_1 ┆ Type_2 ┆ Total ┆ … ┆ Sp_Def ┆ Speed ┆ Generation ┆ Legendary │
│ ---                   ┆ ---    ┆ ---    ┆ ---   ┆   ┆ ---    ┆ ---   ┆ ---        ┆ ---       │
│ str                   ┆ cat    ┆ cat    ┆ i64   ┆   ┆ i64    ┆ i64   ┆ enum       ┆ bool      │
╞═══════════════════════╪════════╪════════╪═══════╪═══╪════════╪═══════╪════════════╪═══════════╡
│ Bulbasaur             ┆ Grass  ┆ Poison ┆ 318   ┆ … ┆ 65     ┆ 45    ┆ 1          ┆ false     │
│ Ivysaur               ┆ Grass  ┆ Poison ┆ 405   ┆ … ┆ 80     ┆ 60    ┆ 1          ┆ false     │
│ Venusaur              ┆ Grass  ┆ Poison ┆ 525   ┆ … ┆ 100    ┆ 80    ┆ 1          ┆ false     │
│ VenusaurMega Venusaur ┆ Grass  ┆ Poison ┆ 625   ┆ … ┆ 120    ┆ 80    ┆ 1          ┆ false     │
│ Charmander            ┆ Fire   ┆ null   ┆ 309   ┆ … ┆ 50     ┆ 65    ┆ 1          ┆ false     │
└────

In [5]:
# Type_1 equal to 'Fire' AND Generation equal to '1'
print(
    pl_pokemon
    .filter((c("Type_1") == "Fire") & (c("Generation") == "1"))
    .select(["Name", "Type_1", "Type_2", "Generation"])
)

shape: (14, 4)
┌───────────────────────────┬────────┬────────┬────────────┐
│ Name                      ┆ Type_1 ┆ Type_2 ┆ Generation │
│ ---                       ┆ ---    ┆ ---    ┆ ---        │
│ str                       ┆ cat    ┆ cat    ┆ enum       │
╞═══════════════════════════╪════════╪════════╪════════════╡
│ Charmander                ┆ Fire   ┆ null   ┆ 1          │
│ Charmeleon                ┆ Fire   ┆ null   ┆ 1          │
│ Charizard                 ┆ Fire   ┆ Flying ┆ 1          │
│ CharizardMega Charizard X ┆ Fire   ┆ Dragon ┆ 1          │
│ CharizardMega Charizard Y ┆ Fire   ┆ Flying ┆ 1          │
│ …                         ┆ …      ┆ …      ┆ …          │
│ Ponyta                    ┆ Fire   ┆ null   ┆ 1          │
│ Rapidash                  ┆ Fire   ┆ null   ┆ 1          │
│ Magmar                    ┆ Fire   ┆ null   ┆ 1          │
│ Flareon                   ┆ Fire   ┆ null   ┆ 1          │
│ Moltres                   ┆ Fire   ┆ Flying ┆ 1          │
└────────

In [6]:
# HP less than 50 OR HP greater than 100
print(
    pl_pokemon
    .filter((c("HP") < 50) | (c("HP") > 100))
    .select(["Name", "Type_1", "Type_2", "HP", "Generation", "Legendary"])
)

shape: (215, 6)
┌─────────────────────┬────────┬────────┬─────┬────────────┬───────────┐
│ Name                ┆ Type_1 ┆ Type_2 ┆ HP  ┆ Generation ┆ Legendary │
│ ---                 ┆ ---    ┆ ---    ┆ --- ┆ ---        ┆ ---       │
│ str                 ┆ cat    ┆ cat    ┆ i64 ┆ enum       ┆ bool      │
╞═════════════════════╪════════╪════════╪═════╪════════════╪═══════════╡
│ Bulbasaur           ┆ Grass  ┆ Poison ┆ 45  ┆ 1          ┆ false     │
│ Charmander          ┆ Fire   ┆ null   ┆ 39  ┆ 1          ┆ false     │
│ Squirtle            ┆ Water  ┆ null   ┆ 44  ┆ 1          ┆ false     │
│ Caterpie            ┆ Bug    ┆ null   ┆ 45  ┆ 1          ┆ false     │
│ Weedle              ┆ Bug    ┆ Poison ┆ 40  ┆ 1          ┆ false     │
│ …                   ┆ …      ┆ …      ┆ …   ┆ …          ┆ …         │
│ PumpkabooSmall Size ┆ Ghost  ┆ Grass  ┆ 44  ┆ 6          ┆ false     │
│ Noibat              ┆ Flying ┆ Dragon ┆ 40  ┆ 6          ┆ false     │
│ Xerneas             ┆ Fairy  ┆ nu

In [7]:
# (Type_1 equal to 'Fire' OR Type_1 equal to 'Water') AND Generation > to '4'
print(
    pl_pokemon
    .filter(
        ((c("Type_1") == "Fire") | (c("Type_1") == "Water")) & (c("Generation") > "4") # because & has higher precedence than |
    )
)

shape: (40, 12)
┌───────────┬────────┬──────────┬───────┬───┬────────┬───────┬────────────┬───────────┐
│ Name      ┆ Type_1 ┆ Type_2   ┆ Total ┆ … ┆ Sp_Def ┆ Speed ┆ Generation ┆ Legendary │
│ ---       ┆ ---    ┆ ---      ┆ ---   ┆   ┆ ---    ┆ ---   ┆ ---        ┆ ---       │
│ str       ┆ cat    ┆ cat      ┆ i64   ┆   ┆ i64    ┆ i64   ┆ enum       ┆ bool      │
╞═══════════╪════════╪══════════╪═══════╪═══╪════════╪═══════╪════════════╪═══════════╡
│ Tepig     ┆ Fire   ┆ null     ┆ 308   ┆ … ┆ 45     ┆ 45    ┆ 5          ┆ false     │
│ Pignite   ┆ Fire   ┆ Fighting ┆ 418   ┆ … ┆ 55     ┆ 55    ┆ 5          ┆ false     │
│ Emboar    ┆ Fire   ┆ Fighting ┆ 528   ┆ … ┆ 65     ┆ 65    ┆ 5          ┆ false     │
│ Oshawott  ┆ Water  ┆ null     ┆ 308   ┆ … ┆ 45     ┆ 45    ┆ 5          ┆ false     │
│ Dewott    ┆ Water  ┆ null     ┆ 413   ┆ … ┆ 60     ┆ 60    ┆ 5          ┆ false     │
│ …         ┆ …      ┆ …        ┆ …     ┆ … ┆ …      ┆ …     ┆ …          ┆ …         │
│ Litleo    ┆ Fi

# <span style="color:brown;">TidyPolars4sci</span>

In [9]:
import tidypolars4sci as tp
from tidypolars4sci import col as c

In [10]:
tp_baseball = tp.read_data(fn = "data/baseball.csv", sep = ",")

print(tp_baseball.head(5))

Loading data 'baseball.csv'... done!
shape: (5, 7)
┌─────────────────┬──────┬───────────────┬────────┬────────┬───────┬─────────────┐
│ Name            ┆ Team ┆ Position      ┆ Height ┆ Weight ┆ Age   ┆ PosCategory │
│ ---             ┆ ---  ┆ ---           ┆ ---    ┆ ---    ┆ ---   ┆ ---         │
│ str             ┆ str  ┆ str           ┆ i64    ┆ i64    ┆ f64   ┆ str         │
╞═════════════════╪══════╪═══════════════╪════════╪════════╪═══════╪═════════════╡
│ Adam_Donachie   ┆ BAL  ┆ Catcher       ┆ 74     ┆ 180    ┆ 22.99 ┆ Catcher     │
│ Paul_Bako       ┆ BAL  ┆ Catcher       ┆ 74     ┆ 215    ┆ 34.69 ┆ Catcher     │
│ Ramon_Hernandez ┆ BAL  ┆ Catcher       ┆ 72     ┆ 210    ┆ 30.78 ┆ Catcher     │
│ Kevin_Millar    ┆ BAL  ┆ First_Baseman ┆ 72     ┆ 210    ┆ 35.43 ┆ Infielder   │
│ Chris_Gomez     ┆ BAL  ┆ First_Baseman ┆ 73     ┆ 188    ┆ 35.71 ┆ Infielder   │
└─────────────────┴──────┴───────────────┴────────┴────────┴───────┴─────────────┘


In [11]:
print(
    tp_baseball
    .mutate(
        Height = c("Height") * 0.0254, # Inch to M
        Weight = c("Weight") * 0.45,  # Pounds to Kg
        BMI = c("Weight") / c("Height")**2
    )
    .head(6)
)

shape: (6, 8)
┌─────────────────┬──────┬────────────────┬────────┬────────┬───────┬─────────────┬───────────┐
│ Name            ┆ Team ┆ Position       ┆ Height ┆ Weight ┆ Age   ┆ PosCategory ┆ BMI       │
│ ---             ┆ ---  ┆ ---            ┆ ---    ┆ ---    ┆ ---   ┆ ---         ┆ ---       │
│ str             ┆ str  ┆ str            ┆ f64    ┆ f64    ┆ f64   ┆ str         ┆ f64       │
╞═════════════════╪══════╪════════════════╪════════╪════════╪═══════╪═════════════╪═══════════╡
│ Adam_Donachie   ┆ BAL  ┆ Catcher        ┆ 1.8796 ┆ 81.0   ┆ 22.99 ┆ Catcher     ┆ 22.927365 │
│ Paul_Bako       ┆ BAL  ┆ Catcher        ┆ 1.8796 ┆ 96.75  ┆ 34.69 ┆ Catcher     ┆ 27.385464 │
│ Ramon_Hernandez ┆ BAL  ┆ Catcher        ┆ 1.8288 ┆ 94.5   ┆ 30.78 ┆ Catcher     ┆ 28.255265 │
│ Kevin_Millar    ┆ BAL  ┆ First_Baseman  ┆ 1.8288 ┆ 94.5   ┆ 35.43 ┆ Infielder   ┆ 28.255265 │
│ Chris_Gomez     ┆ BAL  ┆ First_Baseman  ┆ 1.8542 ┆ 84.6   ┆ 35.71 ┆ Infielder   ┆ 24.606917 │
│ Brian_Roberts   ┆ BAL  ┆

In [12]:
print(
    tp_baseball
    .group_by("Team")
    .summarize(
        count = tp.n(),
        avg_Height = c("Height").mean(),
        avg_Weight = c("Weight").mean(),
        avg_Age = c("Age").mean()
    )
)

shape: (30, 5)
┌──────┬───────┬────────────┬────────────┬───────────┐
│ Team ┆ count ┆ avg_Height ┆ avg_Weight ┆ avg_Age   │
│ ---  ┆ ---   ┆ ---        ┆ ---        ┆ ---       │
│ str  ┆ u32   ┆ f64        ┆ f64        ┆ f64       │
╞══════╪═══════╪════════════╪════════════╪═══════════╡
│ BAL  ┆ 34    ┆ 73.529412  ┆ 196.323529 ┆ 29.034706 │
│ CWS  ┆ 31    ┆ 74.580645  ┆ 209.935484 ┆ 28.077419 │
│ ANA  ┆ 35    ┆ 73.342857  ┆ 201.085714 ┆ 28.808857 │
│ BOS  ┆ 35    ┆ 74.171429  ┆ 204.114286 ┆ 29.741143 │
│ CLE  ┆ 34    ┆ 74.058824  ┆ 200.529412 ┆ 28.319706 │
│ …    ┆ …     ┆ …          ┆ …          ┆ …         │
│ SD   ┆ 32    ┆ 73.5       ┆ 203.875    ┆ 29.841563 │
│ WAS  ┆ 36    ┆ 74.138889  ┆ 199.75     ┆ 26.939444 │
│ PIT  ┆ 35    ┆ 73.6       ┆ 204.371429 ┆ 27.194857 │
│ SF   ┆ 34    ┆ 73.558824  ┆ 202.794118 ┆ 29.929706 │
│ STL  ┆ 32    ┆ 73.625     ┆ 201.625    ┆ 30.4775   │
└──────┴───────┴────────────┴────────────┴───────────┘


In [13]:
print(
    tp_baseball
    .group_by(["Team", "Position"])
    .summarize(
        count = tp.n(),
        avg_Height = c("Height").mean(),
        avg_Weight = c("Weight").mean(),
        avg_Age = c("Age").mean()
    )
)

shape: (238, 6)
┌──────┬──────────────────┬───────┬────────────┬────────────┬───────────┐
│ Team ┆ Position         ┆ count ┆ avg_Height ┆ avg_Weight ┆ avg_Age   │
│ ---  ┆ ---              ┆ ---   ┆ ---        ┆ ---        ┆ ---       │
│ str  ┆ str              ┆ u32   ┆ f64        ┆ f64        ┆ f64       │
╞══════╪══════════════════╪═══════╪════════════╪════════════╪═══════════╡
│ BAL  ┆ Catcher          ┆ 3     ┆ 73.333333  ┆ 201.666667 ┆ 29.486667 │
│ BAL  ┆ First_Baseman    ┆ 2     ┆ 72.5       ┆ 199.0      ┆ 35.57     │
│ BAL  ┆ Second_Baseman   ┆ 1     ┆ 69.0       ┆ 176.0      ┆ 29.39     │
│ BAL  ┆ Shortstop        ┆ 1     ┆ 69.0       ┆ 209.0      ┆ 30.77     │
│ BAL  ┆ Third_Baseman    ┆ 2     ┆ 73.5       ┆ 215.5      ┆ 32.63     │
│ …    ┆ …                ┆ …     ┆ …          ┆ …          ┆ …         │
│ STL  ┆ Shortstop        ┆ 1     ┆ 67.0       ┆ 165.0      ┆ 32.11     │
│ STL  ┆ Third_Baseman    ┆ 2     ┆ 75.0       ┆ 230.0      ┆ 33.175    │
│ STL  ┆ Outfielder   