# Limpieza y preparación de datos

## 2.1 Tratamiento de los datos que faltan

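Valor centinela (sentinel value): valor especial que se usa para marcar un dato ausente (o nulo). En pandas, el centinela para datos de punto flotante es `NaN`.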

In [2]:
import pandas as pd
import numpy as np

### Flotantes 

In [3]:
float_data = pd.Series(
    [100.8, -30.5, 67, 101.9, -7.2, 
     -400, np.nan, 0, np.nan]
    )

In [4]:
float_data

0    100.8
1    -30.5
2     67.0
3    101.9
4     -7.2
5   -400.0
6      NaN
7      0.0
8      NaN
dtype: float64

In [5]:
float_data.isna() # devuelve serie booleana, True si tiene nulos 

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7    False
8     True
dtype: bool

### String - cadenas 

In [6]:
string_data = pd.Series(['abcd', np.nan, None, 'dfeg', None, 'pyhton', None])

string_data

0      abcd
1       NaN
2      None
3      dfeg
4      None
5    pyhton
6      None
dtype: object

In [7]:
string_data.isna()

0    False
1     True
2     True
3    False
4     True
5    False
6     True
dtype: bool

### Flotantes

In [8]:
float_data = pd.Series([None, 1, 2, None, 3, np.nan], dtype='float64')
float_data

0    NaN
1    1.0
2    2.0
3    NaN
4    3.0
5    NaN
dtype: float64

In [9]:
float_data.isna()

0     True
1    False
2    False
3     True
4    False
5     True
dtype: bool

In [10]:
float_data.dropna() #Elimina nulos 

1    1.0
2    2.0
4    3.0
dtype: float64

### Filtrar los datos que faltan

In [11]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7, 2.7, np.nan, 9.1, 800, np.nan])
data


0      1.0
1      NaN
2      3.5
3      NaN
4      7.0
5      2.7
6      NaN
7      9.1
8    800.0
9      NaN
dtype: float64

In [12]:
data[data.notna()]

0      1.0
2      3.5
4      7.0
5      2.7
7      9.1
8    800.0
dtype: float64

In [13]:
data.dropna()

0      1.0
2      3.5
4      7.0
5      2.7
7      9.1
8    800.0
dtype: float64

==============================================================================

In [14]:
data = pd.DataFrame([[1., 700, 6.5, 100, 3.], [1., np.nan, np.nan, 300, 900],
             [300, np.nan, 400, np.nan, np.nan],
             [300, 6.5, 3., 500., 900], 
             [np.nan, 100, 400, 600, 800],
             [np.nan, np.nan, np.nan, np.nan, np.nan]])

data

Unnamed: 0,0,1,2,3,4
0,1.0,700.0,6.5,100.0,3.0
1,1.0,,,300.0,900.0
2,300.0,,400.0,,
3,300.0,6.5,3.0,500.0,900.0
4,,100.0,400.0,600.0,800.0
5,,,,,


`dropna()` por defecto elimina cualquier FILA que contenga un valor perdido:

In [15]:
data.dropna()

Unnamed: 0,0,1,2,3,4
0,1.0,700.0,6.5,100.0,3.0
3,300.0,6.5,3.0,500.0,900.0


Si se pasa `how="all"`, sólo se eliminarán las filas que sean todas NA:

In [16]:
data.dropna(how="all")

Unnamed: 0,0,1,2,3,4
0,1.0,700.0,6.5,100.0,3.0
1,1.0,,,300.0,900.0
2,300.0,,400.0,,
3,300.0,6.5,3.0,500.0,900.0
4,,100.0,400.0,600.0,800.0


In [17]:
data[5] = np.nan #Agrego nueva columna, la 5 con todos sus valores NaN 
data

Unnamed: 0,0,1,2,3,4,5
0,1.0,700.0,6.5,100.0,3.0,
1,1.0,,,300.0,900.0,
2,300.0,,400.0,,,
3,300.0,6.5,3.0,500.0,900.0,
4,,100.0,400.0,600.0,800.0,
5,,,,,,


In [18]:
data.dropna(axis="columns", how="all") 
# axis="columns" drops whole columns; axis="rows" drops whole rows.
# how="all" drops a row/column only when every value in it is NaN,
# so here only the all-NaN column 5 is removed.

Unnamed: 0,0,1,2,3,4
0,1.0,700.0,6.5,100.0,3.0
1,1.0,,,300.0,900.0
2,300.0,,400.0,,
3,300.0,6.5,3.0,500.0,900.0
4,,100.0,400.0,600.0,800.0
5,,,,,


=======================================================================

In [19]:
df = pd.DataFrame(np.random.standard_normal((7, 3)))
# Builds a DataFrame with 7 rows and 3 columns of standard-normal samples

In [20]:
df

Unnamed: 0,0,1,2
0,0.144478,0.011884,0.242829
1,0.078994,-0.096676,1.167637
2,-0.600723,0.002118,0.945123
3,1.384015,0.480928,-0.538316
4,0.475776,0.520785,-0.475529
5,-1.616989,0.351939,-0.621924
6,-0.053996,-1.179206,-0.973546


In [21]:
df.iloc[1:4, 1] = np.nan # Asigna np.nan a las filas 1 hasta la 3 en la columna 1
df.iloc[2:5, 2] = np.nan #Asigna np.nan a las filas 2 hasta la 4 en la columna 2

In [22]:
df

Unnamed: 0,0,1,2
0,0.144478,0.011884,0.242829
1,0.078994,,1.167637
2,-0.600723,,
3,1.384015,,
4,0.475776,0.520785,
5,-1.616989,0.351939,-0.621924
6,-0.053996,-1.179206,-0.973546


In [23]:
df.dropna() #Elimina filas con nulos 

Unnamed: 0,0,1,2
0,0.144478,0.011884,0.242829
5,-1.616989,0.351939,-0.621924
6,-0.053996,-1.179206,-0.973546


Con `thresh`, especifica el número mínimo de valores no nulos que deben estar presentes en una FILA o COLUMNA para que no sea eliminada.

In [24]:
df.dropna(thresh=2) #una fila debe tener al menos 2 valores no nulos para que no sea eliminada

Unnamed: 0,0,1,2
0,0.144478,0.011884,0.242829
1,0.078994,,1.167637
4,0.475776,0.520785,
5,-1.616989,0.351939,-0.621924
6,-0.053996,-1.179206,-0.973546


In [25]:
df

Unnamed: 0,0,1,2
0,0.144478,0.011884,0.242829
1,0.078994,,1.167637
2,-0.600723,,
3,1.384015,,
4,0.475776,0.520785,
5,-1.616989,0.351939,-0.621924
6,-0.053996,-1.179206,-0.973546


In [26]:
df.dropna(thresh=5, axis=1)
#Elimina las columnas que no tienen al menos 5 valores NO NULOS .

Unnamed: 0,0
0,0.144478
1,0.078994
2,-0.600723
3,1.384015
4,0.475776
5,-1.616989
6,-0.053996


### Rellenar los datos que faltan

In [27]:
df

Unnamed: 0,0,1,2
0,0.144478,0.011884,0.242829
1,0.078994,,1.167637
2,-0.600723,,
3,1.384015,,
4,0.475776,0.520785,
5,-1.616989,0.351939,-0.621924
6,-0.053996,-1.179206,-0.973546


In [28]:
df.fillna(0) # Rellena los NaN con cero 

Unnamed: 0,0,1,2
0,0.144478,0.011884,0.242829
1,0.078994,0.0,1.167637
2,-0.600723,0.0,0.0
3,1.384015,0.0,0.0
4,0.475776,0.520785,0.0
5,-1.616989,0.351939,-0.621924
6,-0.053996,-1.179206,-0.973546


Llamando a `fillna()` con un diccionario, puede utilizar un valor de relleno diferente para cada columna:

In [29]:
df.fillna({1: 400, 2: 500})

Unnamed: 0,0,1,2
0,0.144478,0.011884,0.242829
1,0.078994,400.0,1.167637
2,-0.600723,400.0,500.0
3,1.384015,400.0,500.0
4,0.475776,0.520785,500.0
5,-1.616989,0.351939,-0.621924
6,-0.053996,-1.179206,-0.973546


==========================================================================================

Los mismos métodos de relleno disponibles para la reindexación (por ejemplo, el relleno hacia adelante o *forward fill*) pueden utilizarse para rellenar los datos que faltan:

In [30]:
df = pd.DataFrame(np.random.standard_normal((7, 4)))

In [31]:
df.iloc[2:6, 0] = np.nan
df.iloc[4:, 2] = np.nan
df

Unnamed: 0,0,1,2,3
0,0.863736,1.597964,-1.240931,0.278419
1,0.126226,0.52045,0.133521,0.836273
2,,0.132778,-2.293522,-0.52499
3,,1.09697,0.507357,0.656633
4,,-2.631458,,0.188368
5,,1.219062,,2.111522
6,0.315611,1.41114,,-0.168688


In [32]:
df.ffill()  # Forward fill: each NaN takes the last non-null value above it in the same column

  df.fillna(method="ffill") # Rellena nuloscon el primer valor de la columna antes del nulo


Unnamed: 0,0,1,2,3
0,0.863736,1.597964,-1.240931,0.278419
1,0.126226,0.52045,0.133521,0.836273
2,0.126226,0.132778,-2.293522,-0.52499
3,0.126226,1.09697,0.507357,0.656633
4,0.126226,-2.631458,0.507357,0.188368
5,0.126226,1.219062,0.507357,2.111522
6,0.315611,1.41114,0.507357,-0.168688


In [33]:
df

Unnamed: 0,0,1,2,3
0,0.863736,1.597964,-1.240931,0.278419
1,0.126226,0.52045,0.133521,0.836273
2,,0.132778,-2.293522,-0.52499
3,,1.09697,0.507357,0.656633
4,,-2.631458,,0.188368
5,,1.219062,,2.111522
6,0.315611,1.41114,,-0.168688


In [34]:
df.ffill(limit=2)  # Forward fill, but fill at most 2 consecutive NaN values in each gap

  df.fillna(method="ffill", limit=2) # Solo rellena 2 nulos por cada columna


Unnamed: 0,0,1,2,3
0,0.863736,1.597964,-1.240931,0.278419
1,0.126226,0.52045,0.133521,0.836273
2,0.126226,0.132778,-2.293522,-0.52499
3,0.126226,1.09697,0.507357,0.656633
4,,-2.631458,0.507357,0.188368
5,,1.219062,0.507357,2.111522
6,0.315611,1.41114,,-0.168688


==========================================================================================

Con `fillna()` también se puede hacer una imputación simple de datos, por ejemplo rellenando los valores ausentes con la media o la mediana:

In [35]:
data = pd.Series([1., np.nan, 3.5, np.nan, 7, np.nan, np.nan])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
5    NaN
6    NaN
dtype: float64

In [36]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
5    3.833333
6    3.833333
dtype: float64

__________________________

## 2.2 Transformación de datos

### Remover duplicados

In [37]:
# Two key columns; the last row repeats the previous (k1, k2) pair so the
# de-duplication examples have a row to drop.
data = pd.DataFrame(
    {
        "k1": ["one", "two", "one", "two", "one", "two", "two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)

In [38]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [39]:
data.duplicated() #Devuelve serie booleana 

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [40]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [41]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [42]:
# Añadimos una tercera columna
data["v1"] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


Elimina las filas duplicadas basándose en los valores de la columna "k1". La primera aparición de cada valor único en "k1" se conserva, y las filas subsecuentes con el mismo valor en "k1" se eliminan.

In [43]:
data.drop_duplicates(subset=["k1"]) #Eliminó duplicados y dejó por defecto la 1ra fila de cada duplicado

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [44]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [45]:
data.drop_duplicates(["k1", "k2"], keep="last") #Elimina duplicados de  k1 y k2 y deja el último

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


-------------------------------

### Transformación de datos mediante una `Function` o `Mapping`

In [46]:
data = pd.DataFrame({"food": ["bacon", "pulled pork",
                              "bacon","pastrami", "corned beef",
                              "bacon", "pastrami", "honey ham",
                              "nova lox", "bacon", "bacon"],
                     "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6, 20, 30]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0
9,bacon,20.0


Añadimos una columna 

In [47]:
meat_to_animal = {
  "bacon": "pig",
  "pulled pork": "pig",
  "pastrami": "cow",
  "corned beef": "cow",
  "honey ham": "pig",
  "nova lox": "salmon",
}

El método `map` mapea los valores de la columna `food` según el diccionario `meat_to_animal` (cada alimento se sustituye por su animal de origen), y el resultado se guarda en una nueva columna `animal`.

In [48]:
data["animal"] = data["food"].map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon
9,bacon,20.0,pig


In [49]:
def get_animal(x):
    """Return the animal that the meat name ``x`` comes from.

    Looks ``x`` up in the module-level ``meat_to_animal`` mapping; a name
    missing from that mapping raises ``KeyError``.
    """
    animal = meat_to_animal[x]
    return animal

data["animal"] = data["food"].map(get_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon
9,bacon,20.0,pig


--------------------

### Sustitución de valores

In [50]:
data = pd.Series([1., -999., 2., -999., -1000., 3., 100, np.nan])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
6     100.0
7       NaN
dtype: float64

In [51]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
6     100.0
7       NaN
dtype: float64

In [52]:
data.replace([-999, -1000], np.nan)

0      1.0
1      NaN
2      2.0
3      NaN
4      NaN
5      3.0
6    100.0
7      NaN
dtype: float64

In [53]:
data.replace([-999, -1000], [np.nan, 0])

0      1.0
1      NaN
2      2.0
3      NaN
4      0.0
5      3.0
6    100.0
7      NaN
dtype: float64

In [54]:
data.replace({-999: np.nan, -1000: 0})

0      1.0
1      NaN
2      2.0
3      NaN
4      0.0
5      3.0
6    100.0
7      NaN
dtype: float64

------------------------

### Renombrar índices de ejes

In [55]:
data = pd.DataFrame(np.arange(20).reshape((4, 5)),
                    index=["Ohio", "Colorado", "New York", "Ecuador"],
                    columns=["one", "two", "three", "four", "ten"])
                    
data

Unnamed: 0,one,two,three,four,ten
Ohio,0,1,2,3,4
Colorado,5,6,7,8,9
New York,10,11,12,13,14
Ecuador,15,16,17,18,19


In [56]:
def transform(x):
    """Return the first four characters of ``x`` converted to upper case."""
    prefix = x[:4]
    return prefix.upper()

data.index.map(transform)
# Expected: Index(['OHIO', 'COLO', 'NEW ', 'ECUA'], dtype='object')

Index(['OHIO', 'COLO', 'NEW ', 'ECUA'], dtype='object')

In [57]:
data

Unnamed: 0,one,two,three,four,ten
Ohio,0,1,2,3,4
Colorado,5,6,7,8,9
New York,10,11,12,13,14
Ecuador,15,16,17,18,19


In [58]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four,ten
OHIO,0,1,2,3,4
COLO,5,6,7,8,9
NEW,10,11,12,13,14
ECUA,15,16,17,18,19


In [59]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR,TEN
Ohio,0,1,2,3,4
Colo,5,6,7,8,9
New,10,11,12,13,14
Ecua,15,16,17,18,19


In [60]:
data.rename(index={"OHIO": "Madrid"},
            columns={"three": "tres"})
            

Unnamed: 0,one,two,tres,four,ten
Madrid,0,1,2,3,4
COLO,5,6,7,8,9
NEW,10,11,12,13,14
ECUA,15,16,17,18,19


------------------------------------

### Discretización y `binning`

Convertir datos continuos en discretos. 

Técnicas de PREPROCESAMIENTO DE DATOS = La discretización y el binning

> Discretización ---> Convierte datos continuos en datos categóricos dividiendo el rango de valores continuos en intervalos (bins).

> Binning ---> agrupa los datos en intervalos, o `"bins"`.  

In [61]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32, 80, 100, 101, 1, 4, 77, 44, 88, 87, 92, 104]

In [62]:
bins = [10, 20, 30, 40, 50 ,60, 70, 80, 90, 100, 110]

In [63]:
age_categories = pd.cut(ages, bins)

In [64]:
age_categories

[(10, 20], (20, 30], (20, 30], (20, 30], (20, 30], ..., (40, 50], (80, 90], (80, 90], (90, 100], (100, 110]]
Length: 23
Categories (10, interval[int64, right]): [(10, 20] < (20, 30] < (30, 40] < (40, 50] ... (70, 80] < (80, 90] < (90, 100] < (100, 110]]

El objeto que pandas devuelve es un objeto `Categorical` especial. La salida que se ve describe los bins calculados por `pandas.cut`. Cada bin se identifica por un tipo de valor de intervalo especial (único en pandas) que contiene el límite inferior y superior de cada bin:

In [65]:
age_categories.codes

array([ 0,  1,  1,  1,  1,  1,  2,  2,  5,  3,  3,  2,  6,  8,  9, -1, -1,
        6,  3,  7,  7,  8,  9], dtype=int8)

In [66]:
age_categories.categories

IntervalIndex([  (10, 20],   (20, 30],   (30, 40],   (40, 50],   (50, 60],
                 (60, 70],   (70, 80],   (80, 90],  (90, 100], (100, 110]],
              dtype='interval[int64, right]')

In [67]:
age_categories.categories[0]

Interval(10, 20, closed='right')

In [68]:
# Count how many ages fall in each bin, most frequent bin first.
# (The top-level pd.value_counts function is deprecated; wrap in a Series instead.)
pd.Series(age_categories).value_counts()

  pd.value_counts(age_categories)


(20, 30]      5
(30, 40]      3
(40, 50]      3
(70, 80]      2
(80, 90]      2
(90, 100]     2
(100, 110]    2
(10, 20]      1
(60, 70]      1
(50, 60]      0
Name: count, dtype: int64

In [69]:
pd.cut(ages, bins, right=False)

[[20, 30), [20, 30), [20, 30), [20, 30), [20, 30), ..., [40, 50), [80, 90), [80, 90), [90, 100), [100, 110)]
Length: 23
Categories (10, interval[int64, left]): [[10, 20) < [20, 30) < [30, 40) < [40, 50) ... [70, 80) < [80, 90) < [90, 100) < [100, 110)]

In [70]:
data = np.random.uniform(size=20)
data

array([0.95848018, 0.65708787, 0.86658914, 0.78184018, 0.54376347,
       0.63082504, 0.38426943, 0.89796208, 0.290644  , 0.96127923,
       0.73587681, 0.68413314, 0.86191452, 0.75411735, 0.58097998,
       0.59816618, 0.79914479, 0.43049885, 0.34209414, 0.32066985])

In [71]:
pd.cut(data, 4, precision=2)

[(0.79, 0.96], (0.63, 0.79], (0.79, 0.96], (0.63, 0.79], (0.46, 0.63], ..., (0.46, 0.63], (0.79, 0.96], (0.29, 0.46], (0.29, 0.46], (0.29, 0.46]]
Length: 20
Categories (4, interval[float64, right]): [(0.29, 0.46] < (0.46, 0.63] < (0.63, 0.79] < (0.79, 0.96]]

============================================================================

In [72]:
data = np.random.standard_normal(1000)
data

array([ 1.64839945e+00, -7.28946569e-01,  5.72544143e-02,  3.43823538e-01,
       -5.59727139e-01,  1.92112087e-01, -4.11939532e-01,  7.74662633e-01,
        1.32471286e+00,  2.19118876e+00, -4.61452407e-02,  3.30905725e-01,
       -4.48823698e-01, -1.69643085e+00,  1.40478298e+00,  2.23817987e-02,
       -3.07252963e-01,  7.12730104e-01, -3.32668009e-01, -5.26632073e-01,
        2.28710124e+00,  4.31122052e-01, -1.51113947e+00, -1.78354444e-01,
        4.13706743e-01,  8.84026674e-01, -5.46937344e-01,  6.96749466e-01,
        3.41408325e-01,  1.23682710e+00,  3.08658982e-01,  1.01345726e+00,
       -1.74824562e-02,  8.70037111e-01,  1.07311954e+00,  1.50408434e-02,
        7.91789035e-01,  2.65560432e-01, -7.07919961e-01,  3.38467553e-01,
        7.80840039e-02,  1.96033668e+00, -3.03179801e-01,  3.45616399e-01,
       -4.74922820e-01, -1.51915757e+00,  1.21024628e-02,  5.19207186e-03,
        9.66544275e-01,  9.48847357e-01, -6.18328364e-01, -4.21485975e-01,
       -1.06007593e+00,  

In [73]:
quartiles = pd.qcut(data, 4, precision=2)
quartiles

[(0.63, 3.68], (-2.9099999999999997, -0.71], (-0.011, 0.63], (-0.011, 0.63], (-0.71, -0.011], ..., (-0.011, 0.63], (-2.9099999999999997, -0.71], (-0.011, 0.63], (-2.9099999999999997, -0.71], (-0.71, -0.011]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.9099999999999997, -0.71] < (-0.71, -0.011] < (-0.011, 0.63] < (0.63, 3.68]]

In [74]:
# Count the observations per quartile bin.
# (The top-level pd.value_counts function is deprecated; wrap in a Series instead.)
pd.Series(quartiles).value_counts()

  pd.value_counts(quartiles)


(-2.9099999999999997, -0.71]    250
(-0.71, -0.011]                 250
(-0.011, 0.63]                  250
(0.63, 3.68]                    250
Name: count, dtype: int64

In [75]:
pd.Series(quartiles).value_counts()

(-2.9099999999999997, -0.71]    250
(-0.71, -0.011]                 250
(-0.011, 0.63]                  250
(0.63, 3.68]                    250
Name: count, dtype: int64

In [76]:
data

array([ 1.64839945e+00, -7.28946569e-01,  5.72544143e-02,  3.43823538e-01,
       -5.59727139e-01,  1.92112087e-01, -4.11939532e-01,  7.74662633e-01,
        1.32471286e+00,  2.19118876e+00, -4.61452407e-02,  3.30905725e-01,
       -4.48823698e-01, -1.69643085e+00,  1.40478298e+00,  2.23817987e-02,
       -3.07252963e-01,  7.12730104e-01, -3.32668009e-01, -5.26632073e-01,
        2.28710124e+00,  4.31122052e-01, -1.51113947e+00, -1.78354444e-01,
        4.13706743e-01,  8.84026674e-01, -5.46937344e-01,  6.96749466e-01,
        3.41408325e-01,  1.23682710e+00,  3.08658982e-01,  1.01345726e+00,
       -1.74824562e-02,  8.70037111e-01,  1.07311954e+00,  1.50408434e-02,
        7.91789035e-01,  2.65560432e-01, -7.07919961e-01,  3.38467553e-01,
        7.80840039e-02,  1.96033668e+00, -3.03179801e-01,  3.45616399e-01,
       -4.74922820e-01, -1.51915757e+00,  1.21024628e-02,  5.19207186e-03,
        9.66544275e-01,  9.48847357e-01, -6.18328364e-01, -4.21485975e-01,
       -1.06007593e+00,  

In [77]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts() # custom quantile edges (fractions between 0 and 1) that define the bin boundaries

(-2.903, -1.336]     100
(-1.336, -0.0113]    400
(-0.0113, 1.23]      400
(1.23, 3.681]        100
Name: count, dtype: int64

-----------------------------------

### Detección y filtrado de valores atípicos (outliers)


In [78]:
data = pd.DataFrame(np.random.standard_normal((50, 4)))
data

Unnamed: 0,0,1,2,3
0,-0.549387,2.500095,-0.312986,-0.428554
1,-1.009456,-0.091466,0.403415,1.362264
2,-0.414647,-1.160948,0.43559,1.025078
3,-0.155117,-0.436029,1.210193,0.092144
4,0.040193,-0.659973,0.425066,-1.000703
5,0.548556,-2.69281,0.239153,-0.908703
6,-0.588691,0.081287,-0.016639,0.483252
7,0.592319,-0.007525,0.353154,0.336143
8,-1.689028,0.627111,-1.005869,0.601956
9,1.86403,0.200176,-0.203874,-0.096117


In [79]:
data.describe()

Unnamed: 0,0,1,2,3
count,50.0,50.0,50.0,50.0
mean,0.147674,-0.312968,0.037895,0.11454
std,0.734964,1.052833,0.950657,0.970667
min,-1.689028,-2.711853,-1.853466,-2.464253
25%,-0.350556,-1.069854,-0.484474,-0.40834
50%,0.254508,-0.118029,0.066091,0.1041
75%,0.582073,0.315565,0.501686,0.593667
max,1.86403,2.500095,3.018854,2.235604


In [80]:
col = data[2]
col

0    -0.312986
1     0.403415
2     0.435590
3     1.210193
4     0.425066
5     0.239153
6    -0.016639
7     0.353154
8    -1.005869
9    -0.203874
10   -0.439729
11    0.510095
12    0.823915
13   -1.853466
14    0.151973
15   -0.150003
16   -1.641312
17   -0.269652
18    0.079281
19    0.297928
20    0.476462
21   -1.817205
22    0.862095
23   -0.034202
24   -1.207945
25    1.989367
26   -1.264346
27   -1.187607
28    0.051513
29   -0.145025
30   -0.073923
31   -1.342301
32    0.716441
33    0.793536
34   -0.499389
35    0.109686
36    0.598954
37    0.052902
38   -0.549538
39   -0.631276
40   -1.216606
41   -0.526837
42    1.529180
43    0.810895
44    1.045480
45    3.018854
46    1.141681
47    0.349618
48   -0.368345
49    0.176414
Name: 2, dtype: float64

In [81]:
col[col.abs() > 2]

45    3.018854
Name: 2, dtype: float64

In [82]:
data[(data.abs() > 2).any(axis="columns")]
# any(axis="columns") reduces across the columns of each row, so the mask keeps
# every full row that has at least one value whose absolute value exceeds 2.

Unnamed: 0,0,1,2,3
0,-0.549387,2.500095,-0.312986,-0.428554
5,0.548556,-2.69281,0.239153,-0.908703
13,1.260756,-2.13292,-1.853466,-0.347698
21,0.840929,-2.711853,-1.817205,0.282217
29,0.358618,-2.198916,-0.145025,-0.474329
37,-0.505708,1.239307,0.052902,-2.464253
44,1.205236,0.110905,1.04548,2.235604
45,0.603615,-0.807975,3.018854,1.589379


In [83]:
data_1 = data[data.abs() > 2] 
data_1

Unnamed: 0,0,1,2,3
0,,2.500095,,
1,,,,
2,,,,
3,,,,
4,,,,
5,,-2.69281,,
6,,,,
7,,,,
8,,,,
9,,,,


In [84]:
data_2 = np.sign(data) * 3
data_2

Unnamed: 0,0,1,2,3
0,-3.0,3.0,-3.0,-3.0
1,-3.0,-3.0,3.0,3.0
2,-3.0,-3.0,3.0,3.0
3,-3.0,-3.0,3.0,3.0
4,3.0,-3.0,3.0,-3.0
5,3.0,-3.0,3.0,-3.0
6,-3.0,3.0,-3.0,3.0
7,3.0,-3.0,3.0,3.0
8,-3.0,3.0,-3.0,3.0
9,3.0,3.0,-3.0,-3.0


In [85]:
data_1.describe()

Unnamed: 0,0,1,2,3
count,0.0,5.0,1.0,2.0
mean,,-1.447281,3.018854,-0.114324
std,,2.223023,,3.3233
min,,-2.711853,3.018854,-2.464253
25%,,-2.69281,3.018854,-1.289289
50%,,-2.198916,3.018854,-0.114324
75%,,-2.13292,3.018854,1.06064
max,,2.500095,3.018854,2.235604


In [86]:
np.sign(data).head() #produce valores 1 y -1 en función de si los valores de los datos son positivos o negativo

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,-1.0
1,-1.0,-1.0,1.0,1.0
2,-1.0,-1.0,1.0,1.0
3,-1.0,-1.0,1.0,1.0
4,1.0,-1.0,1.0,-1.0


------------------------------------

### Permutación y muestreo aleatorio

In [87]:
df = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [88]:
sampler = np.random.permutation(5)
sampler

array([4, 2, 3, 0, 1])

In [89]:
df.take(sampler)

Unnamed: 0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13


In [90]:
df.iloc[sampler]

Unnamed: 0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13


In [91]:
column_sampler = np.random.permutation(7)
column_sampler

array([6, 5, 3, 2, 1, 4, 0])

In [92]:
df.take(column_sampler, axis="columns")

Unnamed: 0,6,5,3,2,1,4,0
0,6,5,3,2,1,4,0
1,13,12,10,9,8,11,7
2,20,19,17,16,15,18,14
3,27,26,24,23,22,25,21
4,34,33,31,30,29,32,28


In [93]:
df.sample(n=3) #selecciona aleatoriamente 

Unnamed: 0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
4,28,29,30,31,32,33,34
0,0,1,2,3,4,5,6


In [94]:
choices = pd.Series([5, 7, -1, 6, 4])
choices

0    5
1    7
2   -1
3    6
4    4
dtype: int64

In [95]:
choices.sample(n=10, replace=True) #Muestra con elecciones repetidas

4    4
3    6
1    7
1    7
1    7
3    6
2   -1
3    6
1    7
0    5
dtype: int64

-----------------------

### Cálculo de indicadores/variables ficticias (dummy)

In [96]:
df = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b", "z"],
                   "data1": range(7)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5
6,z,6


In [97]:
pd.get_dummies(df["key"], dtype=float)

Unnamed: 0,a,b,c,z
0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0
5,0.0,1.0,0.0,0.0
6,0.0,0.0,0.0,1.0


In [98]:
dummies = pd.get_dummies(df["key"], prefix="key", dtype=float)
dummies

Unnamed: 0,key_a,key_b,key_c,key_z
0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0
5,0.0,1.0,0.0,0.0
6,0.0,0.0,0.0,1.0


In [99]:
df_with_dummy = df[["data1"]].join(dummies) # .join lo veremos a detalle mas adelante
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c,key_z
0,0,0.0,1.0,0.0,0.0
1,1,0.0,1.0,0.0,0.0
2,2,1.0,0.0,0.0,0.0
3,3,0.0,0.0,1.0,0.0
4,4,1.0,0.0,0.0,0.0
5,5,0.0,1.0,0.0,0.0
6,6,0.0,0.0,0.0,1.0


============================================================================

Lectura del dataset movies.dat

In [100]:
mnames = ["movie_id", "title", "genres"]
movies = pd.read_table('movies.dat', sep="::",
                       header=None, names=mnames, engine="python")

movies[:10]                      

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [130]:
dummies = movies["genres"].str.get_dummies("|")
dummies

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [132]:
dummies.iloc[:10, :6] # Selects rows 0 through 9 and the first six columns (positions 0 through 5).

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime
0,0,0,1,1,1,0
1,0,1,0,1,0,0
2,0,0,0,0,1,0
3,0,0,0,0,1,0
4,0,0,0,0,1,0
5,1,0,0,0,0,1
6,0,0,0,0,1,0
7,0,1,0,1,0,0
8,1,0,0,0,0,0
9,1,1,0,0,0,0


In [133]:
movies_windic = movies.join(dummies.add_prefix("Genre_"))

In [134]:
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Action                                   0
Genre_Adventure                                0
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Crime                                    0
Genre_Documentary                              0
Genre_Drama                                    0
Genre_Fantasy                                  0
Genre_Film-Noir                                0
Genre_Horror                                   0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Romance                                  0
Genre_Sci-Fi                                   0
Genre_Thriller                                 0
Genre_War                                      0
Genre_Western       

In [135]:
np.random.seed(12345) # para que el ejemplo sea repetible

values = np.random.uniform(size=10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [136]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

In [137]:
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,False,False,False,False,True
1,False,True,False,False,False
2,True,False,False,False,False
3,False,True,False,False,False
4,False,False,True,False,False
5,False,False,True,False,False
6,False,False,False,False,True
7,False,False,False,True,False
8,False,False,False,True,False
9,False,False,False,True,False


----------------------------------

## 2.3 Extensión de tipos de datos

In [139]:
s = pd.Series([1, 2, 3, 400, None])
s

0      1.0
1      2.0
2      3.0
3    400.0
4      NaN
dtype: float64

In [140]:
s.dtype

dtype('float64')

In [141]:
s = pd.Series([1, 2, 3, None], dtype=pd.Int64Dtype())
s

0       1
1       2
2       3
3    <NA>
dtype: Int64

In [142]:
s.isna()

0    False
1    False
2    False
3     True
dtype: bool

In [143]:
s.dtype

Int64Dtype()

In [144]:
s[3]

<NA>

In [145]:
s[3] is pd.NA

True

In [115]:
s = pd.Series([1, 2, 3, None], dtype="Int64")
s

0       1
1       2
2       3
3    <NA>
dtype: Int64

In [146]:
s = pd.Series(['one', 'two', None, 'three'], dtype=pd.StringDtype())
s

0      one
1      two
2     <NA>
3    three
dtype: string

======================================================================================

In [147]:
df = pd.DataFrame({"A": [1, 2, None, 4],
                   "B": ["one", "two", "three", None],
                   "C": [False, None, False, True]})

df


Unnamed: 0,A,B,C
0,1.0,one,False
1,2.0,two,
2,,three,False
3,4.0,,True


In [148]:
df["A"] = df["A"].astype("Int64")

In [149]:
df["B"] = df["B"].astype("string")

In [152]:
df["C"] = df["C"].astype("boolean")

In [151]:
df

Unnamed: 0,A,B,C
0,1.0,one,False
1,2.0,two,
2,,three,False
3,4.0,,True


## 2.4 Manipulación de cadenas (string)

In [155]:
val = "a,b,  separacion, es"
val.split(",")

['a', 'b', '  separacion', ' es']

In [156]:
pieces = [x.strip() for x in val.split(",")]
pieces

['a', 'b', 'separacion', 'es']

In [159]:
first, second, third, four = pieces
first + "::" + second + "::" + third + "::" + four

'a::b::separacion::es'

In [160]:
"::".join(pieces)

'a::b::separacion::es'

In [161]:
"separacion" in val

True

In [162]:
val.index(",")

1

In [163]:
val.find(":")

-1

In [164]:
val.index(":")

ValueError: substring not found

In [165]:
val.count(",")

3

In [166]:
val.replace(",", "::")

'a::b::  separacion:: es'

In [167]:
val.replace(",", "")

'ab  separacion es'

-----------------------------------------------------------

### Expresiones Regulares

La expresión regular que describe uno o más caracteres de espacio en blanco es `\s+`:

In [169]:
import re

# Four spaces after "foo", so that the outputs recorded for this cell and the
# following ones (the echoed string and the '    ' item returned by findall)
# are reproduced exactly.
text = "foo    bar\t baz  \tqux"
text

'foo    bar\t baz  \tqux'

In [170]:
re.split(r"\s+", text)

['foo', 'bar', 'baz', 'qux']

In [171]:
regex = re.compile(r"\s+")

regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [172]:
regex.findall(text)  # list of every substring that matches the regular expression

['    ', '\t ', '  \t']

In [173]:
# Sample text: one "Name address" pair per line.
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com"""

# Local part, "@", domain, a literal dot and a 2-4 letter suffix.
pattern = r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}"

# The character classes only list upper-case letters; re.IGNORECASE makes
# the compiled expression match lower-case text as well.
regex = re.compile(pattern, re.IGNORECASE)



In [174]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [175]:
m = regex.search(text)
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [176]:
text[m.start():m.end()]

'dave@google.com'

In [177]:
print(regex.match(text))

None


In [178]:
print(regex.sub("REDACTED", text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED


In [179]:
# E-mail pattern with three parenthesised capture groups: the part before
# "@", the domain name, and the 2-4 letter suffix after the last dot.
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"

# Case-insensitive compilation, since the classes only list upper-case letters.
regex = re.compile(pattern, re.IGNORECASE)

In [180]:
m = regex.match("wesm@bright.net")

m.groups()

('wesm', 'bright', 'net')

In [181]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [182]:
print(regex.sub(r"Username: \1, Domain: \2, Suffix: \3", text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com


-------------------------------

### Funciones de cadena en pandas

In [183]:
# Build the Series straight from a name -> e-mail mapping; the keys become
# the index, and "Wes" is given NaN in place of an address.
data = pd.Series({
    "Dave": "dave@google.com",
    "Steve": "steve@gmail.com",
    "Rob": "rob@gmail.com",
    "Wes": np.nan,
})
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [184]:
data.isna()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [185]:
data.str.contains("gmail")

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [186]:
data_as_string_ext = data.astype('string')
data_as_string_ext

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                 <NA>
dtype: string

In [187]:
data_as_string_ext.str.contains("gmail")

Dave     False
Steve     True
Rob       True
Wes       <NA>
dtype: boolean

In [188]:
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [189]:
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]
# .str[0] takes the first element of each list returned by findall, i.e.
# the tuple holding all the capture groups of the first match
# (missing entries stay NaN).
matches

Dave     (dave, google, com)
Steve    (steve, gmail, com)
Rob        (rob, gmail, com)
Wes                      NaN
dtype: object

In [190]:
matches.str.get(1)

Dave     google
Steve     gmail
Rob       gmail
Wes         NaN
dtype: object

In [191]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object

In [192]:
data.str.extract(pattern, flags=re.IGNORECASE)

Unnamed: 0,0,1,2
Dave,dave,google,com
Steve,steve,gmail,com
Rob,rob,gmail,com
Wes,,,


---------------------------------------------------------------------

## 2.5 Datos Categóricos

In [193]:
# Eight fruit labels: the four-item pattern repeated twice.
values = pd.Series(2 * ['apple', 'orange', 'apple', 'apple'])
values

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

In [194]:
pd.unique(values)

array(['apple', 'orange'], dtype=object)

In [195]:
# Use the Series method: the top-level pd.value_counts() function is
# deprecated in recent pandas releases and emits a FutureWarning.
values.value_counts()

  pd.value_counts(values)


apple     6
orange    2
Name: count, dtype: int64

In [196]:
values = pd.Series([0, 1, 0, 0] * 2)
values

0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64

In [197]:
dim = pd.Series(['apple', 'orange'])
dim

0     apple
1    orange
dtype: object

In [198]:
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

---------------------------------------------------------------

### Extensión de tipos de datos categóricos en pandas

In [199]:
fruits = ['apple', 'orange', 'apple', 'apple'] * 2

In [200]:
N = len(fruits)
N

8

In [201]:
rng = np.random.default_rng(seed=12345)

In [202]:
# The dict values are evaluated in order, so 'count' is drawn from rng
# before 'weight'; `columns=` then fixes the column order of the frame.
df = pd.DataFrame({'fruit': fruits,
                   'basket_id': np.arange(N),
                   'count': rng.integers(3, 15, size=N),
                   'weight': rng.uniform(0, 4, size=N)},
                    columns=['basket_id', 'fruit', 'count', 'weight'])

df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,11,1.564438
1,1,orange,5,1.331256
2,2,apple,12,2.393235
3,3,apple,6,0.746937
4,4,apple,5,2.691024
5,5,orange,12,3.767211
6,6,apple,10,0.992983
7,7,apple,11,3.795525


In [203]:
fruit_cat = df['fruit'].astype('category')
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [204]:
c = fruit_cat.array
type(c)

pandas.core.arrays.categorical.Categorical

In [205]:
c.categories

Index(['apple', 'orange'], dtype='object')

In [206]:
c.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

In [207]:
dict(enumerate(c.categories))


{0: 'apple', 1: 'orange'}

In [208]:
df['fruit'] = df['fruit'].astype('category')

In [209]:
df["fruit"]

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [210]:
my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])
my_categories

['foo', 'bar', 'baz', 'foo', 'bar']
Categories (3, object): ['bar', 'baz', 'foo']

In [211]:
categories = ['foo', 'bar', 'baz']
codes = [0, 1, 2, 0, 0, 1]

In [212]:
my_cats_2 = pd.Categorical.from_codes(codes, categories)
my_cats_2

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo', 'bar', 'baz']

In [213]:
ordered_cat = pd.Categorical.from_codes(codes, categories,
                                        ordered=True)
ordered_cat

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo' < 'bar' < 'baz']

In [214]:
my_cats_2.as_ordered()

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo' < 'bar' < 'baz']

-------------------------------------------------------

### Cálculos con datos categóricos

In [216]:
rng = np.random.default_rng(seed=12345)

draws = rng.standard_normal(1000)

draws[:5]

array([-1.42382504,  1.26372846, -0.87066174, -0.25917323, -0.07534331])

In [217]:
bins = pd.qcut(draws, 4)
bins

[(-3.121, -0.675], (0.687, 3.211], (-3.121, -0.675], (-0.675, 0.0134], (-0.675, 0.0134], ..., (0.0134, 0.687], (0.0134, 0.687], (-0.675, 0.0134], (0.0134, 0.687], (-0.675, 0.0134]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.121, -0.675] < (-0.675, 0.0134] < (0.0134, 0.687] < (0.687, 3.211]]

In [218]:
# Same four-way quantile binning, but each bin is given a short label
# (Q1..Q4) instead of being shown as an interval.
bins = pd.qcut(draws, 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
bins

['Q1', 'Q4', 'Q1', 'Q2', 'Q2', ..., 'Q3', 'Q3', 'Q2', 'Q3', 'Q2']
Length: 1000
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [219]:
bins.codes[:10]

array([0, 3, 0, 1, 1, 0, 0, 2, 2, 0], dtype=int8)

In [220]:
# Wrap the categorical in a Series named 'quartile'.
bins = pd.Series(bins, name='quartile')

In [221]:
# Group the draws by their quartile label and compute, for each group, the
# number of elements, the minimum value and the maximum value.
# `bins` is categorical, so `observed` is passed explicitly: relying on the
# default emits a FutureWarning in recent pandas releases, and
# observed=False keeps the current behavior of reporting every category.
results = (pd.Series(draws)
           .groupby(bins, observed=False)
           .agg(['count', 'min', 'max'])
           .reset_index())
results

  .groupby(bins)


Unnamed: 0,quartile,count,min,max
0,Q1,250,-3.119609,-0.678494
1,Q2,250,-0.673305,0.008009
2,Q3,250,0.018753,0.686183
3,Q4,250,0.688282,3.211418


In [222]:
results['quartile']

0    Q1
1    Q2
2    Q3
3    Q4
Name: quartile, dtype: category
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

----------------------------------------------------------------------------

### Mejor rendimiento con categóricos

In [223]:
N = 10_000_000

In [224]:
labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))
labels

0          foo
1          bar
2          baz
3          qux
4          foo
          ... 
9999995    qux
9999996    foo
9999997    bar
9999998    baz
9999999    qux
Length: 10000000, dtype: object

In [225]:
categories = labels.astype('category')

In [226]:
labels.memory_usage(deep=True)

600000132

In [227]:
categories.memory_usage(deep=True)

10000544

In [228]:
%time _ = labels.astype('category')

CPU times: total: 469 ms
Wall time: 469 ms


Las operaciones `GroupBy` pueden ser significativamente más rápidas con datos categóricos porque los algoritmos subyacentes utilizan el array de códigos basado en enteros en lugar de un array de cadenas. Aquí comparamos el rendimiento de `value_counts()`, que utiliza internamente la maquinaria de `GroupBy`:

In [None]:
 %timeit labels.value_counts()

718 ms ± 72.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%timeit categories.value_counts()

115 ms ± 7.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


###  Métodos categóricos

Las series que contienen datos categóricos disponen de varios métodos especiales similares a los métodos de cadena especializados de Series.str. Esto también proporciona un acceso conveniente a las categorías y códigos. Considere la Serie:

In [None]:
 s = pd.Series(['a', 'b', 'c', 'd'] * 2)
    
cat_s = s.astype('category')
cat_s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

El atributo especial de acceso `cat` proporciona acceso a métodos categóricos:

In [None]:
cat_s.cat.codes

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

In [None]:
cat_s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

Supongamos que sabemos que el conjunto real de categorías para estos datos se extiende más allá de los cuatro valores observados en los datos. Podemos utilizar el método `set_categories` para cambiarlas:

In [None]:
actual_categories = ['a', 'b', 'c', 'd', 'e']

cat_s2 = cat_s.cat.set_categories(actual_categories)

cat_s2

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (5, object): ['a', 'b', 'c', 'd', 'e']

Aunque parezca que los datos no cambian, las nuevas categorías se reflejarán en las operaciones que las utilicen. Por ejemplo, value_counts respeta las categorías, si están presentes:



In [None]:
 cat_s.value_counts()

a    2
b    2
c    2
d    2
Name: count, dtype: int64

In [None]:
cat_s2.value_counts()

a    2
b    2
c    2
d    2
e    0
Name: count, dtype: int64

En grandes conjuntos de datos, las categorías se utilizan a menudo como una herramienta conveniente para ahorrar memoria y mejorar el rendimiento. Después de filtrar un gran DataFrame o Series, muchas de las categorías pueden no aparecer en los datos. Para ayudar con esto, podemos utilizar el método remove_unused_categories para recortar las categorías no observadas:

In [None]:
cat_s3 = cat_s[cat_s.isin(['a', 'b'])]
cat_s3

0    a
1    b
4    a
5    b
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [None]:
cat_s3.cat.remove_unused_categories()

0    a
1    b
4    a
5    b
dtype: category
Categories (2, object): ['a', 'b']

### Creación de variables ficticias para la modelización

Cuando se utilizan herramientas estadísticas o de aprendizaje automático, a menudo se transforman los datos categóricos en variables ficticias, lo que también se conoce como codificación one-hot (one-hot encoding). Esto implica crear un DataFrame con una columna para cada categoría distinta; estas columnas contienen 1 para las ocurrencias de una categoría dada y 0 en caso contrario.

Consideremos el ejemplo anterior:

In [None]:
cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')

Como se ha mencionado anteriormente, la función `pandas.get_dummies` convierte estos datos categóricos unidimensionales en un DataFrame que contiene las variables ficticias:

In [None]:
pd.get_dummies(cat_s, dtype=float)

Unnamed: 0,a,b,c,d
0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0
5,0.0,1.0,0.0,0.0
6,0.0,0.0,1.0,0.0
7,0.0,0.0,0.0,1.0
