In [2]:
import polars as pl

## SELECTORES

In [3]:
# Small sample frame (two text columns, two integer columns) used by the
# selector examples below.
salarios = pl.DataFrame({
    'lenguaje': ['Python', 'Scala', 'C', 'Python', 'Scala', 'Java'],
    'puesto': ['Sr', 'Sr', 'Jr', 'Jr', 'Sr', 'Jr'],
    'salario': [5000, 7000, 10000, 4500, 5400, 6000],
    'experiencia': [3, 5, 8, 2, 4, 2],
})

In [6]:
# Schema-only frame (no rows): one column per dtype, so each selector example
# can show which columns it picks by type or by name.
operaciones = pl.DataFrame(
    schema=dict(
        a=pl.UInt32,
        b=pl.Float64,
        c=pl.Float64,
        d=pl.Boolean,
        e=pl.Time,
        f=pl.Date,
        g=pl.Duration,
        h=pl.Datetime("ms"),
        i=pl.String,
    ),
)

In [7]:
# Display the sample frame to check its rows and inferred dtypes.
salarios

lenguaje,puesto,salario,experiencia
str,str,i64,i64
"""Python""","""Sr""",5000,3
"""Scala""","""Sr""",7000,5
"""C""","""Jr""",10000,8
"""Python""","""Jr""",4500,2
"""Scala""","""Sr""",5400,4
"""Java""","""Jr""",6000,2


In [8]:
# Display the empty frame: only the column names and dtypes are shown.
operaciones

a,b,c,d,e,f,g,h,i
u32,f64,f64,bool,time,date,duration[μs],datetime[ms],str


In [9]:
import polars.selectors as cs

In [10]:
# Show the input again right before applying selectors to it.
salarios

lenguaje,puesto,salario,experiencia
str,str,i64,i64
"""Python""","""Sr""",5000,3
"""Scala""","""Sr""",7000,5
"""C""","""Jr""",10000,8
"""Python""","""Jr""",4500,2
"""Scala""","""Sr""",5400,4
"""Java""","""Jr""",6000,2


In [14]:
# Group by every string column and average every numeric column; the output
# columns get an `_avg` suffix. The result's row order is not guaranteed.
salarios.group_by(cs.string()).agg(cs.numeric().mean().name.suffix('_avg'))

lenguaje,puesto,salario_avg,experiencia_avg
str,str,f64,f64
"""Python""","""Sr""",5000.0,3.0
"""Java""","""Jr""",6000.0,2.0
"""C""","""Jr""",10000.0,8.0
"""Scala""","""Sr""",6200.0,4.5
"""Python""","""Jr""",4500.0,2.0


In [16]:
# Union of selectors: columns that are numeric OR string.
operaciones.select(cs.numeric() | cs.string())

a,b,c,i
u32,f64,f64,str


In [17]:
# Intersection of selectors: temporal columns whose name also matches the
# regular expression 'g|h'.
operaciones.select(cs.temporal() & cs.matches('g|h'))

g,h
duration[μs],datetime[ms]


In [18]:
# Difference of selectors: numeric columns, excluding the frame's first column.
operaciones.select(cs.numeric() - cs.first())

b,c
f64,f64


In [20]:
# Complement of a selector: every column that is NOT numeric.
operaciones.select(~cs.numeric())

d,e,f,g,h,i
bool,time,date,duration[μs],datetime[ms],str


### LECTURA 49 - Funciones de selección

In [21]:
# Schema-only frame (no rows) with descriptive column names, used to try the
# selector functions by dtype and by name.
df = pl.DataFrame(
    schema=dict(
        edad=pl.UInt32,
        salario=pl.Float64,
        estatura=pl.Float32,
        residente=pl.Boolean,
        hora=pl.Time,
        nacimiento=pl.Date,
        trabajando=pl.Duration,
        log=pl.Datetime("ms"),
        direccion=pl.String,
    ),
)

In [22]:
# Display the empty frame to see its column names and dtypes.
df

edad,salario,estatura,residente,hora,nacimiento,trabajando,log,direccion
u32,f64,f32,bool,time,date,duration[μs],datetime[ms],str


In [23]:
import polars.selectors as cs

In [24]:
# Pick columns by exact dtype: here the String and Date columns.
df.select(cs.by_dtype(pl.String, pl.Date))

nacimiento,direccion
date,str


In [25]:
# Pick columns whose name contains the substring 're'.
df.select(cs.contains('re'))

residente,direccion
bool,str


## LECTURA 50 - FUNCIONES DE CONVERSIÓN

In [3]:
import pyarrow.parquet as pq
import polars as pl

In [4]:
# Sample data as a list of two inner lists: three letters, then three integers.
valores = [
    ['a', 'b', 'c'],
    [1, 2, 3],
]

In [5]:
# Display the nested list.
valores

[['a', 'b', 'c'], [1, 2, 3]]

In [6]:
# Sample data as a mapping of column name -> list of values.
diccionario = {
    "id": [1, 2, 3],
    'letra': ['x', 'y', 'z'],
}

In [7]:
# Display the dictionary.
diccionario

{'id': [1, 2, 3], 'letra': ['x', 'y', 'z']}

In [9]:
# Read the flights parquet file into a pyarrow Table.
# NOTE(review): the path is an absolute Google Drive location; the notebook only
# runs where that Drive folder is mounted.
tabla_vuelos = pq.read_table('/content/drive/MyDrive/polars/seccion05/vuelos/vuelos.parquet')

In [10]:
# Display the Arrow table: its schema followed by a preview of each column.
tabla_vuelos

pyarrow.Table
YEAR: int32
MONTH: int32
DAY: int32
DAY_OF_WEEK: int32
AIRLINE: string
FLIGHT_NUMBER: int32
TAIL_NUMBER: string
ORIGIN_AIRPORT: string
DESTINATION_AIRPORT: string
SCHEDULED_DEPARTURE: int32
DEPARTURE_TIME: int32
DEPARTURE_DELAY: int32
TAXI_OUT: int32
WHEELS_OFF: int32
SCHEDULED_TIME: int32
ELAPSED_TIME: int32
AIR_TIME: int32
DISTANCE: int32
WHEELS_ON: int32
TAXI_IN: int32
SCHEDULED_ARRIVAL: int32
ARRIVAL_TIME: int32
ARRIVAL_DELAY: int32
DIVERTED: int32
CANCELLED: int32
CANCELLATION_REASON: string
AIR_SYSTEM_DELAY: int32
SECURITY_DELAY: int32
AIRLINE_DELAY: int32
LATE_AIRCRAFT_DELAY: int32
WEATHER_DELAY: int32
----
YEAR: [[2015,2015,2015,2015,2015,...,2015,2015,2015,2015,2015],[2015,2015,2015,2015,2015,...,2015,2015,2015,2015,2015],...,[2015,2015,2015,2015,2015,...,2015,2015,2015,2015,2015],[2015,2015,2015,2015,2015,...,2015,2015,2015,2015,2015]]
MONTH: [[1,1,1,1,1,...,1,1,1,1,1],[1,1,1,1,1,...,1,1,1,1,1],...,[12,12,12,12,12,...,12,12,12,12,12],[12,12,12,12,12,...,12,12,12

In [12]:
# pl.from_arrow creates a DataFrame or Series from an Arrow Table or Array.
vuelos_df = pl.from_arrow(tabla_vuelos)

In [14]:
# Print the plain-text table: the shape plus a truncated view of rows and columns.
print(vuelos_df)

shape: (5_819_079, 31)
┌──────┬───────┬─────┬─────────────┬───┬──────────────┬──────────────┬──────────────┬──────────────┐
│ YEAR ┆ MONTH ┆ DAY ┆ DAY_OF_WEEK ┆ … ┆ SECURITY_DEL ┆ AIRLINE_DELA ┆ LATE_AIRCRAF ┆ WEATHER_DELA │
│ ---  ┆ ---   ┆ --- ┆ ---         ┆   ┆ AY           ┆ Y            ┆ T_DELAY      ┆ Y            │
│ i32  ┆ i32   ┆ i32 ┆ i32         ┆   ┆ ---          ┆ ---          ┆ ---          ┆ ---          │
│      ┆       ┆     ┆             ┆   ┆ i32          ┆ i32          ┆ i32          ┆ i32          │
╞══════╪═══════╪═════╪═════════════╪═══╪══════════════╪══════════════╪══════════════╪══════════════╡
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null         ┆ null         ┆ null         ┆ null         │
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null         ┆ null         ┆ null         ┆ null         │
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null         ┆ null         ┆ null         ┆ null         │
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null         ┆ null        

In [15]:
# Convert the Arrow table to a pandas DataFrame.
vuelos_pd = tabla_vuelos.to_pandas()

In [16]:
# Display the pandas frame (pandas truncates the middle rows and columns).
vuelos_pd

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5819074,2015,12,31,4,B6,688,N657JB,LAX,BOS,2359,...,753.0,-26.0,0,0,,,,,,
5819075,2015,12,31,4,B6,745,N828JB,JFK,PSE,2359,...,430.0,-16.0,0,0,,,,,,
5819076,2015,12,31,4,B6,1503,N913JB,JFK,SJU,2359,...,432.0,-8.0,0,0,,,,,,
5819077,2015,12,31,4,B6,333,N527JB,MCO,SJU,2359,...,330.0,-10.0,0,0,,,,,,


In [17]:
# Build a polars DataFrame from the pandas one; this rebinds `vuelos_df`,
# which previously held the frame created with pl.from_arrow.
# In the printed schema the trailing delay columns (which contain nulls) show
# as f64 here, whereas they were i32 when the frame came straight from Arrow.
vuelos_df = pl.from_pandas(vuelos_pd)
print(vuelos_df)

shape: (5_819_079, 31)
┌──────┬───────┬─────┬─────────────┬───┬──────────────┬──────────────┬──────────────┬──────────────┐
│ YEAR ┆ MONTH ┆ DAY ┆ DAY_OF_WEEK ┆ … ┆ SECURITY_DEL ┆ AIRLINE_DELA ┆ LATE_AIRCRAF ┆ WEATHER_DELA │
│ ---  ┆ ---   ┆ --- ┆ ---         ┆   ┆ AY           ┆ Y            ┆ T_DELAY      ┆ Y            │
│ i32  ┆ i32   ┆ i32 ┆ i32         ┆   ┆ ---          ┆ ---          ┆ ---          ┆ ---          │
│      ┆       ┆     ┆             ┆   ┆ f64          ┆ f64          ┆ f64          ┆ f64          │
╞══════╪═══════╪═════╪═════════════╪═══╪══════════════╪══════════════╪══════════════╪══════════════╡
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null         ┆ null         ┆ null         ┆ null         │
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null         ┆ null         ┆ null         ┆ null         │
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null         ┆ null         ┆ null         ┆ null         │
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null         ┆ null        

In [18]:
# Convert the pandas frame to a single NumPy array.
vuelos_np = vuelos_pd.to_numpy()

In [19]:
# Confirm that the conversion produced a numpy.ndarray.
type(vuelos_np)

numpy.ndarray

In [20]:
# Display the array; its repr reports dtype=object.
vuelos_np

array([[2015, 1, 1, ..., nan, nan, nan],
       [2015, 1, 1, ..., nan, nan, nan],
       [2015, 1, 1, ..., nan, nan, nan],
       ...,
       [2015, 12, 31, ..., nan, nan, nan],
       [2015, 12, 31, ..., nan, nan, nan],
       [2015, 12, 31, ..., nan, nan, nan]], dtype=object)

In [21]:
# Build a polars DataFrame from the NumPy array. No schema is passed, so the
# printed frame has generated names (column_0, column_1, ...) and every column
# has dtype `object`, inherited from the object-typed array.
vuelos_nf = pl.from_numpy(vuelos_np)
print(vuelos_nf)

shape: (5_819_079, 31)
┌──────────┬──────────┬──────────┬──────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ column_0 ┆ column_1 ┆ column_2 ┆ column_3 ┆ … ┆ column_27 ┆ column_28 ┆ column_29 ┆ column_30 │
│ ---      ┆ ---      ┆ ---      ┆ ---      ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ object   ┆ object   ┆ object   ┆ object   ┆   ┆ object    ┆ object    ┆ object    ┆ object    │
╞══════════╪══════════╪══════════╪══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 2015     ┆ 1        ┆ 1        ┆ 4        ┆ … ┆ nan       ┆ nan       ┆ nan       ┆ nan       │
│ 2015     ┆ 1        ┆ 1        ┆ 4        ┆ … ┆ nan       ┆ nan       ┆ nan       ┆ nan       │
│ 2015     ┆ 1        ┆ 1        ┆ 4        ┆ … ┆ nan       ┆ nan       ┆ nan       ┆ nan       │
│ 2015     ┆ 1        ┆ 1        ┆ 4        ┆ … ┆ nan       ┆ nan       ┆ nan       ┆ nan       │
│ 2015     ┆ 1        ┆ 1        ┆ 4        ┆ … ┆ nan       ┆ nan       ┆ nan       ┆ nan      

In [22]:
# Recall the nested list before converting it to a DataFrame.
valores

[['a', 'b', 'c'], [1, 2, 3]]

In [23]:
# Recall the dictionary before converting it to a DataFrame.
diccionario

{'id': [1, 2, 3], 'letra': ['x', 'y', 'z']}

In [26]:
# `valores` is column-oriented: each inner list is one full column (letters,
# then ids). State the orientation explicitly instead of relying on polars
# inferring it from the shape of the data versus the schema, which becomes
# ambiguous when the number of rows equals the number of columns.
df_valores = pl.from_records(valores, schema=['letra', 'id'], orient='col')

In [27]:
# Display the resulting frame (rich notebook rendering).
df_valores

letra,id
str,i64
"""a""",1
"""b""",2
"""c""",3


In [28]:
# Print the same frame as a plain-text table.
print(df_valores)

shape: (3, 2)
┌───────┬─────┐
│ letra ┆ id  │
│ ---   ┆ --- │
│ str   ┆ i64 │
╞═══════╪═════╡
│ a     ┆ 1   │
│ b     ┆ 2   │
│ c     ┆ 3   │
└───────┴─────┘


In [29]:
# Build a DataFrame from the dictionary: each key becomes a column name and
# each list becomes that column's values.
df_diccionario = pl.from_dict(diccionario)

In [32]:
# Print the frame as a plain-text table.
print(df_diccionario)

shape: (3, 2)
┌─────┬───────┐
│ id  ┆ letra │
│ --- ┆ ---   │
│ i64 ┆ str   │
╞═════╪═══════╡
│ 1   ┆ x     │
│ 2   ┆ y     │
│ 3   ┆ z     │
└─────┴───────┘


In [33]:
# Rebuild a DataFrame by parsing the text of a printed frame (the same box
# layout that print() produces for a polars DataFrame), then print the result
# to check that it matches the text that was parsed.
df_str = pl.from_repr(
"""
shape: (3, 2)
┌─────┬───────┐
│ id  ┆ letra │
│ --- ┆ ---   │
│ i64 ┆ str   │
╞═════╪═══════╡
│ 1   ┆ x     │
│ 2   ┆ y     │
│ 3   ┆ z     │
└─────┴───────┘
"""
)
print(df_str)

shape: (3, 2)
┌─────┬───────┐
│ id  ┆ letra │
│ --- ┆ ---   │
│ i64 ┆ str   │
╞═════╪═══════╡
│ 1   ┆ x     │
│ 2   ┆ y     │
│ 3   ┆ z     │
└─────┴───────┘


# **LECTURA 51 - OTRAS FUNCIONES**

In [6]:
import polars as pl
from polars import col

In [7]:
# One-row frame with columns `nombre` and `sexo`.
df1 = pl.DataFrame({'nombre': ['Jose'], 'sexo': ['M']})

In [8]:
# Two-row frame with columns `nombre` and `sexo`.
df2 = pl.DataFrame({'nombre': ['Rosa', 'Teresa'], 'sexo': ['F', 'F']})

In [9]:
# Lazy query over the flights parquet file; nothing is read until the query
# is collected.
# `reverse=True` is the current name of the flag previously spelled
# `descending=True`, which polars deprecated; the selected rows are unchanged
# and the deprecation warning goes away.
# NOTE(review): per the polars docs, bottom_k with reverse=True keeps the k
# largest values of `by`, i.e. it behaves like top_k(20, by='DEPARTURE_TIME').
# Confirm that this is the intent.
query_1 = (
    pl.scan_parquet('/content/drive/MyDrive/polars/seccion05/vuelos/vuelos.parquet')
    .bottom_k(20, by='DEPARTURE_TIME', reverse=True)
)

  pl.scan_parquet('/content/drive/MyDrive/polars/seccion05/vuelos/vuelos.parquet')


In [2]:
import polars as pl

In [4]:
# Lazy query: flights after June that departed early (negative delay), keeping
# only the two columns used by the filter.
query_2 = (
    pl.scan_parquet('/content/drive/MyDrive/polars/seccion05/vuelos/vuelos.parquet')
    .filter(
        # Each comparison needs its own parentheses because `&` binds tighter
        # than `>` and `<` in Python.
        (pl.col('MONTH') > 6)
        & (pl.col('DEPARTURE_DELAY') < 0)
    )
    .select(pl.col('MONTH', 'DEPARTURE_DELAY'))
)

In [10]:
# Stack the two frames on top of each other ('vertical' is pl.concat's default
# strategy); both frames share the same column names and dtypes.
df = pl.concat([df1, df2], how='vertical')
print(df)

shape: (3, 2)
┌────────┬──────┐
│ nombre ┆ sexo │
│ ---    ┆ ---  │
│ str    ┆ str  │
╞════════╪══════╡
│ Jose   ┆ M    │
│ Rosa   ┆ F    │
│ Teresa ┆ F    │
└────────┴──────┘


In [11]:
# Rename df2's columns so they no longer clash with df1's names, then display
# the renamed frame.
# NOTE(review): this rebinds `df2`, so the cell is not idempotent; re-running
# it presumably fails because 'nombre' and 'sexo' no longer exist. Confirm.
df2 = df2.rename({'nombre': 'n', 'sexo': 's'})
df2

n,s
str,str
"""Rosa""","""F"""
"""Teresa""","""F"""


In [13]:
# Place the two frames side by side. df1 has one row fewer than df2, and the
# printed result shows its missing row filled with nulls.
df_1 = pl.concat([df1,df2], how='horizontal')
print(df_1)

shape: (2, 4)
┌────────┬──────┬────────┬─────┐
│ nombre ┆ sexo ┆ n      ┆ s   │
│ ---    ┆ ---  ┆ ---    ┆ --- │
│ str    ┆ str  ┆ str    ┆ str │
╞════════╪══════╪════════╪═════╡
│ Jose   ┆ M    ┆ Rosa   ┆ F   │
│ null   ┆ null ┆ Teresa ┆ F   │
└────────┴──────┴────────┴─────┘


In [15]:
# Execute both lazy queries in a single call; the results come back in the
# same order as the list of queries.
# Note: this rebinds `df1` and `df2`, replacing the small name/sex frames
# defined earlier with the collected flight results.
df1,df2 = pl.collect_all([query_1,query_2])

In [16]:
# Result of query_1: 20 rows with all 31 columns.
print(df1)

shape: (20, 31)
┌──────┬───────┬─────┬─────────────┬───┬──────────────┬──────────────┬──────────────┬──────────────┐
│ YEAR ┆ MONTH ┆ DAY ┆ DAY_OF_WEEK ┆ … ┆ SECURITY_DEL ┆ AIRLINE_DELA ┆ LATE_AIRCRAF ┆ WEATHER_DELA │
│ ---  ┆ ---   ┆ --- ┆ ---         ┆   ┆ AY           ┆ Y            ┆ T_DELAY      ┆ Y            │
│ i32  ┆ i32   ┆ i32 ┆ i32         ┆   ┆ ---          ┆ ---          ┆ ---          ┆ ---          │
│      ┆       ┆     ┆             ┆   ┆ i32          ┆ i32          ┆ i32          ┆ i32          │
╞══════╪═══════╪═════╪═════════════╪═══╪══════════════╪══════════════╪══════════════╪══════════════╡
│ 2015 ┆ 2     ┆ 1   ┆ 7           ┆ … ┆ 0            ┆ 11           ┆ 0            ┆ 0            │
│ 2015 ┆ 1     ┆ 2   ┆ 5           ┆ … ┆ null         ┆ null         ┆ null         ┆ null         │
│ 2015 ┆ 1     ┆ 2   ┆ 5           ┆ … ┆ null         ┆ null         ┆ null         ┆ null         │
│ 2015 ┆ 1     ┆ 3   ┆ 6           ┆ … ┆ 0            ┆ 361          ┆ 0   

In [17]:
# Result of query_2: only the MONTH and DEPARTURE_DELAY columns of the rows
# that passed the filter.
print(df2)

shape: (1_712_529, 2)
┌───────┬─────────────────┐
│ MONTH ┆ DEPARTURE_DELAY │
│ ---   ┆ ---             │
│ i32   ┆ i32             │
╞═══════╪═════════════════╡
│ 7     ┆ -5              │
│ 7     ┆ -5              │
│ 7     ┆ -5              │
│ 7     ┆ -7              │
│ 7     ┆ -2              │
│ …     ┆ …               │
│ 12    ┆ -1              │
│ 12    ┆ -4              │
│ 12    ┆ -4              │
│ 12    ┆ -9              │
│ 12    ┆ -6              │
└───────┴─────────────────┘
