# Selecting and Creating Columns

In [95]:
import polars as pl

In [96]:
# Read the Star Wars dataset from a Parquet file.
starwars = pl.read_parquet("data/starwars.parquet")

# Drop the "films" column and keep only the rows for the three named characters.
rebels = (
    starwars
    .drop("films")
    .filter(
        pl.col("name").is_in(["Luke Skywalker", "Han Solo", "Leia Organa"])
    )
)

# Show the columns in three slices: positions 0-5, 6-10, and 11 onward.
print(rebels[:, :6])
print(rebels[:, 6:11])
print(rebels[:, 11:])

shape: (3, 6)
┌────────────────┬────────┬──────┬────────────┬────────────┬───────────┐
│ name           ┆ height ┆ mass ┆ hair_color ┆ skin_color ┆ eye_color │
│ ---            ┆ ---    ┆ ---  ┆ ---        ┆ ---        ┆ ---       │
│ str            ┆ u16    ┆ f64  ┆ str        ┆ str        ┆ str       │
╞════════════════╪════════╪══════╪════════════╪════════════╪═══════════╡
│ Han Solo       ┆ 180    ┆ 80.0 ┆ brown      ┆ fair       ┆ brown     │
│ Leia Organa    ┆ 150    ┆ 49.0 ┆ brown      ┆ light      ┆ brown     │
│ Luke Skywalker ┆ 172    ┆ 77.0 ┆ blond      ┆ fair       ┆ blue      │
└────────────────┴────────┴──────┴────────────┴────────────┴───────────┘
shape: (3, 5)
┌────────────┬────────┬───────────┬───────────┬─────────┐
│ birth_year ┆ sex    ┆ gender    ┆ homeworld ┆ species │
│ ---        ┆ ---    ┆ ---       ┆ ---       ┆ ---     │
│ f64        ┆ cat    ┆ cat       ┆ str       ┆ str     │
╞════════════╪════════╪═══════════╪═══════════╪═════════╡
│ 29.0       ┆ male   ┆ m

In [97]:
rebels.shape

(3, 15)

## Selecting Columns

In [98]:
# Small two-column frame used for the struct example.
df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})

# Append a struct column "ab_struct" built from columns "a" and "b".
df_struct = df.with_columns(ab_struct=pl.struct("a", "b"))
print(df_struct)

shape: (2, 3)
┌─────┬─────┬───────────┐
│ a   ┆ b   ┆ ab_struct │
│ --- ┆ --- ┆ ---       │
│ i64 ┆ i64 ┆ struct[2] │
╞═════╪═════╪═══════════╡
│ 1   ┆ 3   ┆ {1,3}     │
│ 2   ┆ 4   ┆ {2,4}     │
└─────┴─────┴───────────┘


## Selecting Columns

In [99]:
print(
    rebels.select(
        # A plain string selects the column with that name.
        "name",
        # An explicit column expression.
        pl.col("homeworld"),
        # A "^...$" pattern is treated as a regex over column names.
        pl.col("^.*_color$"),
        # A derived column; the keyword becomes the output column name.
        height_m=pl.col("height") / 100,
    )
)

shape: (3, 6)
┌────────────────┬───────────┬────────────┬────────────┬───────────┬──────────┐
│ name           ┆ homeworld ┆ hair_color ┆ skin_color ┆ eye_color ┆ height_m │
│ ---            ┆ ---       ┆ ---        ┆ ---        ┆ ---       ┆ ---      │
│ str            ┆ str       ┆ str        ┆ str        ┆ str       ┆ f64      │
╞════════════════╪═══════════╪════════════╪════════════╪═══════════╪══════════╡
│ Han Solo       ┆ Corellia  ┆ brown      ┆ fair       ┆ brown     ┆ 1.8      │
│ Leia Organa    ┆ Alderaan  ┆ brown      ┆ light      ┆ brown     ┆ 1.5      │
│ Luke Skywalker ┆ Tatooine  ┆ blond      ┆ fair       ┆ blue      ┆ 1.72     │
└────────────────┴───────────┴────────────┴────────────┴───────────┴──────────┘


## Introducing Selectors

In [100]:
import polars.selectors as cs

In [101]:
print(
    rebels.select(
        # A plain string selects the column with that name.
        "name",
        # The same selection as before, expressed with selectors.
        cs.by_name("homeworld"),
        cs.by_name("^.*_color$"),
        # A derived column; the keyword becomes the output column name.
        height_m=cs.by_name("height") / 100,
    )
)

shape: (3, 6)
┌────────────────┬───────────┬────────────┬────────────┬───────────┬──────────┐
│ name           ┆ homeworld ┆ hair_color ┆ skin_color ┆ eye_color ┆ height_m │
│ ---            ┆ ---       ┆ ---        ┆ ---        ┆ ---       ┆ ---      │
│ str            ┆ str       ┆ str        ┆ str        ┆ str       ┆ f64      │
╞════════════════╪═══════════╪════════════╪════════════╪═══════════╪══════════╡
│ Han Solo       ┆ Corellia  ┆ brown      ┆ fair       ┆ brown     ┆ 1.8      │
│ Leia Organa    ┆ Alderaan  ┆ brown      ┆ light      ┆ brown     ┆ 1.5      │
│ Luke Skywalker ┆ Tatooine  ┆ blond      ┆ fair       ┆ blue      ┆ 1.72     │
└────────────────┴───────────┴────────────┴────────────┴───────────┴──────────┘


## Selecting Based on Name

| Función              | Descripción                                                                 |
|----------------------|-----------------------------------------------------------------------------|
| `cs.by_name()`       | Selecciona columnas por nombre exacto o patrón regex.                       |
| `cs.alpha()`         | Selecciona columnas cuyos nombres contienen solo caracteres alfabéticos.     |
| `cs.alphanumeric()`  | Selecciona columnas cuyos nombres contienen solo caracteres alfanuméricos.   |
| `cs.contains()`      | Selecciona columnas cuyos nombres contienen una subcadena dada.              |
| `cs.starts_with()`   | Selecciona columnas cuyos nombres comienzan con un prefijo dado.             |
| `cs.ends_with()`     | Selecciona columnas cuyos nombres terminan con un sufijo dado.               |
| `cs.matches()`       | Selecciona columnas cuyos nombres coinciden con un patrón regex.             |
| `cs.digit()`         | Selecciona columnas cuyos nombres contienen solo dígitos.                    |

In [102]:
rebels.select(cs.starts_with("birth_"))

birth_year,birth_date
f64,date
29.0,1948-06-01
19.0,1958-05-30
19.0,1958-05-30


In [103]:
rebels.select(cs.ends_with("_color"))

hair_color,skin_color,eye_color
str,str,str
"""brown""","""fair""","""brown"""
"""brown""","""light""","""brown"""
"""blond""","""fair""","""blue"""


In [104]:
rebels.select(cs.contains("_"))

hair_color,skin_color,eye_color,birth_year,birth_date,screen_time
str,str,str,f64,date,duration[μs]
"""brown""","""fair""","""brown""",29.0,1948-06-01,1h 12m 37s
"""brown""","""light""","""brown""",19.0,1958-05-30,1h 3m 40s
"""blond""","""fair""","""blue""",19.0,1958-05-30,1h 58m 44s


In [105]:
rebels.select(cs.matches("^[a-z]{4}$"))

name,mass
str,f64
"""Han Solo""",80.0
"""Leia Organa""",49.0
"""Luke Skywalker""",77.0


## Selecting Based on Data Type

In [106]:
rebels.group_by("hair_color").agg(cs.numeric().mean())

hair_color,height,mass,birth_year
str,f64,f64,f64
"""brown""",165.0,64.5,24.0
"""blond""",172.0,77.0,19.0


## Selección por tipo de dato

La siguiente tabla resume los selectores de columnas por tipo de dato en Polars:

| Función             | Descripción                                                                 |
|---------------------|-----------------------------------------------------------------------------|
| `cs.numeric()`      | Selecciona columnas de tipo numérico (enteros, flotantes).                  |
| `cs.binary()`       | Selecciona columnas de tipo binario.                                        |
| `cs.boolean()`      | Selecciona columnas de tipo booleano.                                       |
| `cs.string()`       | Selecciona columnas de tipo cadena de texto.                                |
| `cs.datetime()`     | Selecciona columnas de tipo fecha y hora (`datetime`).                      |
| `cs.date()`         | Selecciona columnas de tipo fecha (`date`).                                 |
| `cs.duration()`     | Selecciona columnas de tipo duración (`duration`).                          |
| `cs.categorical()`  | Selecciona columnas de tipo categórico.                                     |
| `cs.list()`         | Selecciona columnas que contienen listas.                                   |
| `cs.struct()`       | Selecciona columnas de tipo struct.                                         |
| `cs.by_dtype()`     | Selecciona columnas especificando uno o más tipos de dato de Polars.        |
| `cs.signed_integer()`    | Selecciona columnas de enteros con signo (`i8`, `i16`, `i32`, etc).|
| `cs.unsigned_integer()`  | Selecciona columnas de enteros sin signo (`u8`, `u16`, `u32`, etc).|
| `cs.float()`             | Selecciona columnas de tipo flotante (`f32`, `f64`).               |
| `cs.decimal()`           | Selecciona columnas de tipo decimal.                               |
| `cs.temporal()`          | Selecciona columnas de tipo temporal (`date`, `datetime`, `duration`, `time`). |
| `cs.time()`              | Selecciona columnas de tipo hora (`time`).                         |

Estos selectores permiten trabajar de forma eficiente con columnas según su tipo, facilitando operaciones como agregaciones, transformaciones y filtrado en tus DataFrames.

In [107]:
rebels.select(cs.string())

name,hair_color,skin_color,eye_color,homeworld,species
str,str,str,str,str,str
"""Han Solo""","""brown""","""fair""","""brown""","""Corellia""","""Human"""
"""Leia Organa""","""brown""","""light""","""brown""","""Alderaan""","""Human"""
"""Luke Skywalker""","""blond""","""fair""","""blue""","""Tatooine""","""Human"""


In [108]:
rebels.select(cs.temporal())

birth_date,screen_time
date,duration[μs]
1948-06-01,1h 12m 37s
1958-05-30,1h 3m 40s
1958-05-30,1h 58m 44s


In [109]:
# Select the columns whose dtype is a list of strings, using cs.by_dtype with pl.List(pl.String).
rebels.select(cs.by_dtype(pl.List(pl.String)))

vehicles,starships
list[str],list[str]
,"[""Millennium Falcon"", ""Imperial shuttle""]"
"[""Imperial Speeder Bike""]",
"[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]"


## Selecting Based on Position

| Función             | Descripción                                                                 |
|---------------------|-----------------------------------------------------------------------------|
| `cs.by_index()`     | Selecciona columnas por índice (posición), acepta enteros o listas de índices.|
| `cs.first()`        | Selecciona la primera columna del DataFrame.                                 |
| `cs.last()`         | Selecciona la última columna del DataFrame.                                  |
| `cs.head(n)`        | Selecciona las primeras `n` columnas.                                        |
| `cs.tail(n)`        | Selecciona las últimas `n` columnas.                                         |

Estas funciones permiten seleccionar columnas según su posición en el DataFrame, facilitando la manipulación cuando no se conocen los nombres o se requiere trabajar con rangos de columnas.

In [110]:
rebels.select(cs.by_index(range(0, rebels.width, 3)))

name,hair_color,birth_year,homeworld,starships
str,str,f64,str,list[str]
"""Han Solo""","""brown""",29.0,"""Corellia""","[""Millennium Falcon"", ""Imperial shuttle""]"
"""Leia Organa""","""brown""",19.0,"""Alderaan""",
"""Luke Skywalker""","""blond""",19.0,"""Tatooine""","[""X-wing"", ""Imperial shuttle""]"


In [111]:
# Pick "name" plus the columns at positions -2 and -1 (the last two).
rebels.select("name", cs.by_index(range(-2, 0)))

name,birth_date,screen_time
str,date,duration[μs]
"""Han Solo""",1948-06-01,1h 12m 37s
"""Leia Organa""",1958-05-30,1h 3m 40s
"""Luke Skywalker""",1958-05-30,1h 58m 44s


## Combining Selectors

In [112]:

rebels.select(cs.by_name("hair_color") | cs.numeric())

height,mass,hair_color,birth_year
u16,f64,str,f64
180,80.0,"""brown""",29.0
150,49.0,"""brown""",19.0
172,77.0,"""blond""",19.0


| Operation         | Inline Operator | Description                                                                                 |
|-------------------|-----------------|---------------------------------------------------------------------------------------------|
| Union             | `|`              | Combina selectores, seleccionando columnas presentes en cualquiera de los selectores.        |
| Intersection      | `&`              | Selecciona solo las columnas presentes en ambos selectores.                                 |
| Difference        | `-`              | Selecciona columnas presentes en el primer selector pero no en el segundo.                  |
| Symmetric Diff    | `^`              | Selecciona columnas presentes en uno u otro selector, pero no en ambos (diferencia simétrica). |

Estos operadores permiten combinar selectores de columnas en Polars de forma flexible y expresiva.

In [113]:
# One-row frame with an integer, three boolean, and one float column.
df = pl.DataFrame({"d": 1, "i": True, "s": True, "c": True, "o": 1.0})

print(df)

# x selects by name, y selects by dtype; they overlap on "i" and "s".
x = cs.by_name("d", "i", "s")
y = cs.boolean()

print("\nselector => columns")

# Pair each printed label with the selector it denotes. Building the
# selectors directly avoids eval()-ing strings while producing the same output.
combinations = {
    "x": x,
    "y": y,
    "x | y": x | y,
    "x & y": x & y,
    "x - y": x - y,
    "x ^ y": x ^ y,
    "~x": ~x,
    "x - x": x - x,
}

for s, selector in combinations.items():
    # expand_selector resolves the selector to the column names it matches in df.
    print(f"{s:8} => {cs.expand_selector(df, selector)}")

shape: (1, 5)
┌─────┬──────┬──────┬──────┬─────┐
│ d   ┆ i    ┆ s    ┆ c    ┆ o   │
│ --- ┆ ---  ┆ ---  ┆ ---  ┆ --- │
│ i64 ┆ bool ┆ bool ┆ bool ┆ f64 │
╞═════╪══════╪══════╪══════╪═════╡
│ 1   ┆ true ┆ true ┆ true ┆ 1.0 │
└─────┴──────┴──────┴──────┴─────┘

selector => columns
x        => ('d', 'i', 's')
y        => ('i', 's', 'c')
x | y    => ('d', 'i', 's', 'c')
x & y    => ('i', 's')
x - y    => ('d',)
x ^ y    => ('d', 'c')
~x       => ('c', 'o')
x - x    => ()


In [114]:
df.select(x - x)

El operador "walrus" (`:=`) permite asignar valores a variables dentro de una expresión, sin necesidad de una línea aparte. Es útil para reutilizar resultados intermedios en operaciones complejas, como al crear nuevas columnas en Polars:

```python
df.with_columns([
    (col_a := pl.col("a")),
    (col_b := pl.col("b")),
    (col_a + col_b).alias("sum_ab")
])
```

Aquí, `col_a` y `col_b` se definen y reutilizan en la misma expresión, haciendo el código más conciso y legible. El walrus está disponible desde Python 3.8.

## Creating Columns

In [115]:
rebels.with_columns(
    # mass divided by the square of (height / 100).
    (pl.col("mass") / (pl.col("height") / 100) ** 2).alias("bmi"),
    # Whole days between birth_date and 1983-05-25, divided by 365,
    # then cast to an unsigned 8-bit integer.
    ((pl.date(1983, 5, 25) - pl.col("birth_date")).dt.total_days() / 365)
    .cast(pl.UInt8)
    .alias("age_destroy"),
)

name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,vehicles,starships,birth_date,screen_time,bmi,age_destroy
str,u16,f64,str,str,str,f64,cat,cat,str,str,list[str],list[str],date,duration[μs],f64,u8
"""Han Solo""",180,80.0,"""brown""","""fair""","""brown""",29.0,"""male""","""masculine""","""Corellia""","""Human""",,"[""Millennium Falcon"", ""Imperial shuttle""]",1948-06-01,1h 12m 37s,24.691358,35
"""Leia Organa""",150,49.0,"""brown""","""light""","""brown""",19.0,"""female""","""feminine""","""Alderaan""","""Human""","[""Imperial Speeder Bike""]",,1958-05-30,1h 3m 40s,21.777778,25
"""Luke Skywalker""",172,77.0,"""blond""","""fair""","""blue""",19.0,"""male""","""masculine""","""Tatooine""","""Human""","[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]",1958-05-30,1h 58m 44s,26.027582,25


Las expresiones dentro de `with_columns` en Polars no pueden referenciar directamente columnas recién creadas en la misma llamada. Es decir, si defines una nueva columna, no puedes usarla inmediatamente en otra expresión dentro del mismo `with_columns`. Para reutilizar resultados intermedios, debes usar el operador "walrus" (`:=`) en Python 3.8+, o encadenar múltiples llamadas a `with_columns`. Esto garantiza que cada expresión solo acceda a columnas ya existentes antes de la operación.

In [116]:
(
    rebels
    # First call: create the "bmi" column.
    .with_columns(
        (pl.col("mass") / (pl.col("height") / 100) ** 2).alias("bmi")
    )
    # Second call: "bmi" now exists, so it can be binned at 18.5 and 25
    # into three labelled categories.
    .with_columns(
        pl.col("bmi")
        .cut(breaks=[18.5, 25], labels=["underweight", "normal", "overweight"])
        .alias("bmi_cat")
    )
)

name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,vehicles,starships,birth_date,screen_time,bmi,bmi_cat
str,u16,f64,str,str,str,f64,cat,cat,str,str,list[str],list[str],date,duration[μs],f64,cat
"""Han Solo""",180,80.0,"""brown""","""fair""","""brown""",29.0,"""male""","""masculine""","""Corellia""","""Human""",,"[""Millennium Falcon"", ""Imperial shuttle""]",1948-06-01,1h 12m 37s,24.691358,"""normal"""
"""Leia Organa""",150,49.0,"""brown""","""light""","""brown""",19.0,"""female""","""feminine""","""Alderaan""","""Human""","[""Imperial Speeder Bike""]",,1958-05-30,1h 3m 40s,21.777778,"""normal"""
"""Luke Skywalker""",172,77.0,"""blond""","""fair""","""blue""",19.0,"""male""","""masculine""","""Tatooine""","""Human""","[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]",1958-05-30,1h 58m 44s,26.027582,"""overweight"""


In [117]:
(
    starwars
    # Keep name and species, with the computed "bmi" column between them.
    .select(
        pl.col("name"),
        (pl.col("mass") / (pl.col("height") / 100) ** 2).alias("bmi"),
        pl.col("species"),
    )
    # Discard rows with a null in any of the three columns.
    .drop_nulls()
    # Keep the five rows with the largest "bmi".
    .top_k(k=5, by="bmi")
)

name,bmi,species
str,f64,str
"""Jabba Desilijic Tiure""",443.428571,"""Hutt"""
"""Dud Bolt""",50.928022,"""Vulptereen"""
"""Yoda""",39.02663,"""Yoda's species"""
"""Owen Lars""",37.874006,"""Human"""
"""IG-88""",35.0,"""Droid"""


## Related Column Operations

### Dropping

In [118]:
rebels.drop("name", "films", "screen_time", strict=False)

height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,vehicles,starships,birth_date
u16,f64,str,str,str,f64,cat,cat,str,str,list[str],list[str],date
180,80.0,"""brown""","""fair""","""brown""",29.0,"""male""","""masculine""","""Corellia""","""Human""",,"[""Millennium Falcon"", ""Imperial shuttle""]",1948-06-01
150,49.0,"""brown""","""light""","""brown""",19.0,"""female""","""feminine""","""Alderaan""","""Human""","[""Imperial Speeder Bike""]",,1958-05-30
172,77.0,"""blond""","""fair""","""blue""",19.0,"""male""","""masculine""","""Tatooine""","""Human""","[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]",1958-05-30


### Renaming

In [120]:
(
    rebels
    # Explicit old -> new mapping for two columns.
    .rename({"homeworld": "planet", "mass": "weight"})
    # A callable is applied to the column names; this one strips a trailing "_color".
    .rename(lambda column: column.removesuffix("_color"))
    .select(["name", "planet", "weight", "hair", "skin", "eye"])
)

name,planet,weight,hair,skin,eye
str,str,f64,str,str,str
"""Han Solo""","""Corellia""",80.0,"""brown""","""fair""","""brown"""
"""Leia Organa""","""Alderaan""",49.0,"""brown""","""light""","""brown"""
"""Luke Skywalker""","""Tatooine""",77.0,"""blond""","""fair""","""blue"""


### Stacking

In [126]:
# Three pieces to be stacked side by side: two frames and one Series.
rebel_names = rebels.select("name")
rebel_colors = rebels.select(cs.ends_with("_color"))
rebel_quotes = pl.Series(
    name="quote",
    values=[
        "You know, sometimes I amaze even myself.",
        "That doesn't sound too hard.",
        "I have a bad feeling about this.",
    ],
)

# hstack accepts either a DataFrame or a list of Series.
rebel_names.hstack(rebel_colors).hstack([rebel_quotes])

name,hair_color,skin_color,eye_color,quote
str,str,str,str,str
"""Han Solo""","""brown""","""fair""","""brown""","""You know, sometimes I amaze ev…"
"""Leia Organa""","""brown""","""light""","""brown""","""That doesn't sound too hard."""
"""Luke Skywalker""","""blond""","""fair""","""blue""","""I have a bad feeling about thi…"


### Adding Row Indices

In [127]:
rebels.with_row_index(name="rebel_id", offset=1)

rebel_id,name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,vehicles,starships,birth_date,screen_time
u32,str,u16,f64,str,str,str,f64,cat,cat,str,str,list[str],list[str],date,duration[μs]
1,"""Han Solo""",180,80.0,"""brown""","""fair""","""brown""",29.0,"""male""","""masculine""","""Corellia""","""Human""",,"[""Millennium Falcon"", ""Imperial shuttle""]",1948-06-01,1h 12m 37s
2,"""Leia Organa""",150,49.0,"""brown""","""light""","""brown""",19.0,"""female""","""feminine""","""Alderaan""","""Human""","[""Imperial Speeder Bike""]",,1958-05-30,1h 3m 40s
3,"""Luke Skywalker""",172,77.0,"""blond""","""fair""","""blue""",19.0,"""male""","""masculine""","""Tatooine""","""Human""","[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]",1958-05-30,1h 58m 44s


In [143]:
import uuid

# Add a "rebel_id" column via with_row_index, then overwrite each value with a
# random UUID string using map_elements (the index value itself is ignored).
rebels.with_row_index(name="rebel_id").with_columns(
    rebel_id=pl.col("rebel_id").map_elements(lambda _: str(uuid.uuid4()), return_dtype=pl.String)
)

rebel_id,name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,vehicles,starships,birth_date,screen_time
str,str,u16,f64,str,str,str,f64,cat,cat,str,str,list[str],list[str],date,duration[μs]
"""62390de0-f9ce-4143-944a-6e6445…","""Han Solo""",180,80.0,"""brown""","""fair""","""brown""",29.0,"""male""","""masculine""","""Corellia""","""Human""",,"[""Millennium Falcon"", ""Imperial shuttle""]",1948-06-01,1h 12m 37s
"""86c45297-3424-4785-a807-4e0318…","""Leia Organa""",150,49.0,"""brown""","""light""","""brown""",19.0,"""female""","""feminine""","""Alderaan""","""Human""","[""Imperial Speeder Bike""]",,1958-05-30,1h 3m 40s
"""226bbe96-151a-49f1-babc-a565ee…","""Luke Skywalker""",172,77.0,"""blond""","""fair""","""blue""",19.0,"""male""","""masculine""","""Tatooine""","""Human""","[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]",1958-05-30,1h 58m 44s
