# Guía Definitiva para Pandas

In [1]:
# Carga de librerías necesarias

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

Pandas es una librería de Python que añade una funcionalidad que, a diferencia de R, Python no trae en su distribución base: los dataframes. Son estructuras similares a las tablas de las bases de datos relacionales, en las que los datos se almacenan por filas (registros) y columnas (variables).

In [2]:
# Sample data as a dictionary mapping each column name to its list of values
raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy', 'Richy', 'Josh'],
    'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Recarey', 'Brown'],
    'age': [42, 52, 36, 24, 73, 12, 19],
    'preTestScore': [4, 24, 31, 2, 3, 21, 9],
    'postTestScore': [25, 94, 57, 62, 70, 21, 10],
    'Birthday': [
        '1994-02-19', '1982-02-07', '1987-09-11', '1992-08-02',
        '1994-05-04', '1992-03-03', '2000-09-12',
    ],
}

# Two equivalent constructions: the column names can be listed explicitly...
df = pd.DataFrame(
    data=raw_data,
    columns=[
        'first_name', 'last_name', 'age',
        'preTestScore', 'postTestScore', 'Birthday',
    ],
)
# ...or inferred from the dictionary keys (this second frame is the one kept).
df = pd.DataFrame(data=raw_data)
df

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore,Birthday
0,Jason,Miller,42,4,25,1994-02-19
1,Molly,Jacobson,52,24,94,1982-02-07
2,Tina,Ali,36,31,57,1987-09-11
3,Jake,Milner,24,2,62,1992-08-02
4,Amy,Cooze,73,3,70,1994-05-04
5,Richy,Recarey,12,21,21,1992-03-03
6,Josh,Brown,19,9,10,2000-09-12


### Comandos básicos de información sobre el DataFrame

In [44]:
# Shape of the frame, then its column labels, each on its own line
print(df.shape, df.columns, sep="\n")

(7, 7)
Index(['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore',
       'Birthday', 'Team'],
      dtype='object')


In [4]:
# Column dtypes before the conversion
print(df["age"].dtype, df["Birthday"].dtype, sep="\n")

# Parse the '%Y-%m-%d' strings of the Birthday column into datetime values
df["Birthday"] = pd.to_datetime(df["Birthday"], format="%Y-%m-%d")

# Column dtypes after the conversion
print(df["age"].dtype, df["Birthday"].dtype, sep="\n")

int64
object
int64
datetime64[ns]


In [5]:
# Transpose the frame: rows become columns and columns become rows
df.transpose()

Unnamed: 0,0,1,2,3,4,5,6
first_name,Jason,Molly,Tina,Jake,Amy,Richy,Josh
last_name,Miller,Jacobson,Ali,Milner,Cooze,Recarey,Brown
age,42,52,36,24,73,12,19
preTestScore,4,24,31,2,3,21,9
postTestScore,25,94,57,62,70,21,10
Birthday,1994-02-19 00:00:00,1982-02-07 00:00:00,1987-09-11 00:00:00,1992-08-02 00:00:00,1994-05-04 00:00:00,1992-03-03 00:00:00,2000-09-12 00:00:00


In [6]:
# The underlying data as an array of arrays, where each inner array is one ROW
row_matrix = df.values
print(row_matrix, '\n\n')
print(type(row_matrix))

[['Jason' 'Miller' 42 4 25 Timestamp('1994-02-19 00:00:00')]
 ['Molly' 'Jacobson' 52 24 94 Timestamp('1982-02-07 00:00:00')]
 ['Tina' 'Ali' 36 31 57 Timestamp('1987-09-11 00:00:00')]
 ['Jake' 'Milner' 24 2 62 Timestamp('1992-08-02 00:00:00')]
 ['Amy' 'Cooze' 73 3 70 Timestamp('1994-05-04 00:00:00')]
 ['Richy' 'Recarey' 12 21 21 Timestamp('1992-03-03 00:00:00')]
 ['Josh' 'Brown' 19 9 10 Timestamp('2000-09-12 00:00:00')]] 


<class 'numpy.ndarray'>


## Slicing básico

In [7]:
# Access a single column through attribute notation.
## The column values themselves:
print(df.first_name)

## The Python type of the object returned by that selection:
print(type(df.first_name))

# The dtype of the elements stored in the column:
print(df.first_name.dtype)

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
5    Richy
6     Josh
Name: first_name, dtype: object
<class 'pandas.core.series.Series'>
object


In [8]:
# Access a single column through attribute notation.
## The column values themselves:
print(df.age)

## The Python type of the object returned by that selection:
print(type(df.age))



0    42
1    52
2    36
3    24
4    73
5    12
6    19
Name: age, dtype: int64
<class 'pandas.core.series.Series'>


### Slicing básico, estilo R

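El operador `[]` con un slice selecciona filas por defecto, y lo hace por posición como en R (aunque en pandas se cuenta desde 0 y el extremo final no se incluye).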

In [52]:
# From position 2 (the third row, included) to the end
print(df[2:],'\n\n')

# From the start up to position 5 (excluded), i.e. the first five rows
print(df[:5], '\n\n')

# A single row (position 1 only)
print(df[1:2],'\n\n')

# Positional selection on both axes: rows from position 2 on, columns at positions 1 to 3
print(df.iloc[2:, 1:4],'\n\n')


# Type of the object returned when a single row is selected by position
print(type(df.iloc[1]), "\n")

# Selecting rows by index label instead of by position
print(df.loc[[1,2,5]])

  first_name last_name  age  preTestScore  postTestScore   Birthday  Team
4       Tina       Ali   36            31             57 1987-09-11     1
5       Jake    Milner   24             2             62 1992-08-02     1
2        Amy     Cooze   73             3             70 1994-05-04     0
3      Richy   Recarey   12            21             21 1992-03-03     1
6       Josh     Brown   19             9             10 2000-09-12     1 


  first_name last_name  age  preTestScore  postTestScore   Birthday  Team
1      Jason    Miller   42             4             25 1994-02-19     0
0      Molly  Jacobson   52            24             94 1982-02-07     0
4       Tina       Ali   36            31             57 1987-09-11     1
5       Jake    Milner   24             2             62 1992-08-02     1
2        Amy     Cooze   73             3             70 1994-05-04     0 


  first_name last_name  age  preTestScore  postTestScore   Birthday  Team
0      Molly  Jacobson   52     

In [10]:
# Slicing a single column (a Series) with a plain [start:stop] slice
df.first_name[1:3]

1    Molly
2     Tina
Name: first_name, dtype: object

In [11]:
# Take every second element of the column (slice with step 2)
df.age[::2]

0    42
2    36
4    73
6    19
Name: age, dtype: int64

### More advanced slicing:

- El slicing normal (`df[a:b]`) va por posición y devuelve filas completas. Una vez hecho, se pueden seleccionar determinadas columnas a mano.
- Con `.loc` se permiten máscaras booleanas o etiquetas.
- Con `.iloc` se indexa por posición; no acepta una Series booleana como máscara (solo arrays booleanos sin índice).


In [12]:
# Keep only the rows whose age equals 19
print(df[df.age == 19], '\n\n')

# Conditions are combined with the element-wise "&" operator in pandas;
# each comparison needs its own parentheses.
print(df[(df.age > 19) & (df.postTestScore > 10)])

  first_name last_name  age  preTestScore  postTestScore   Birthday
6       Josh     Brown   19             9             10 2000-09-12 


  first_name last_name  age  preTestScore  postTestScore   Birthday
0      Jason    Miller   42             4             25 1994-02-19
1      Molly  Jacobson   52            24             94 1982-02-07
2       Tina       Ali   36            31             57 1987-09-11
3       Jake    Milner   24             2             62 1992-08-02
4        Amy     Cooze   73             3             70 1994-05-04


### Esto sería lo mismo

In [53]:
# Rows at positions 1 and 2, all columns, through .iloc ...
print(df.iloc[1:3,:])
# ... and the same rows through a plain positional slice
df[1:3]

  first_name last_name  age  preTestScore  postTestScore   Birthday  Team
0      Molly  Jacobson   52            24             94 1982-02-07     0
4       Tina       Ali   36            31             57 1987-09-11     1


Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore,Birthday,Team
0,Molly,Jacobson,52,24,94,1982-02-07,0
4,Tina,Ali,36,31,57,1987-09-11,1


### Esto también sería lo mismo, ya que loc permite máscaras booleanas:

In [56]:
# Boolean mask applied with plain []: rows whose age is greater than 40
print(df[df.age > 40])

  first_name last_name  age  preTestScore  postTestScore   Birthday  Team
1      Jason    Miller   42             4             25 1994-02-19     0
0      Molly  Jacobson   52            24             94 1982-02-07     0
2        Amy     Cooze   73             3             70 1994-05-04     0


In [58]:
# The same boolean mask applied through .loc, keeping every column
print(df.loc[df.age > 40,:])

  first_name last_name  age  preTestScore  postTestScore   Birthday  Team
1      Jason    Miller   42             4             25 1994-02-19     0
0      Molly  Jacobson   52            24             94 1982-02-07     0
2        Amy     Cooze   73             3             70 1994-05-04     0


In [65]:
# However, .loc also lets us pick specific columns together with the row mask
print(df.loc[df.age > 40,['age','Team']])

   age  Team
1   42     0
0   52     0
2   73     0


### Slicing con el equivalente a `%in%` de R, elemento a elemento

In [71]:
# pandas supports element-wise comparisons such as this one (the result is a boolean Series)
print(df.age == 19)

1    False
0    False
4    False
5    False
2    False
3    False
6     True
Name: age, dtype: bool


In [75]:
# ...but not Python's `in` operator: it needs a single True/False answer,
# which is ambiguous for a whole Series, so pandas raises ValueError.
# The error is caught and printed so that it is shown as the cell's output
# without stopping a "Restart & Run All" execution of the notebook.
try:
    print(df.age in [18,19,20,21,22,23,24,25])
except ValueError as exc:
    print("ValueError:", exc)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [76]:
# The membership test is written with .isin, which checks every element against the list
print(df.age.isin([18,19,20,21,22,23,24,25]))

1    False
0    False
4    False
5     True
2    False
3    False
6     True
Name: age, dtype: bool


##### Información sobre el índice:

In [14]:
# Index of the DataFrame, followed by its Python type
print(df.index, type(df.index), sep="\n")

# A Series created with explicit string labels
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

# Its index is no longer the default one
print(data.index, type(data.index), sep="\n")

RangeIndex(start=0, stop=7, step=1)
<class 'pandas.core.indexes.range.RangeIndex'>
Index(['a', 'b', 'c', 'd'], dtype='object')
<class 'pandas.core.indexes.base.Index'>


In [15]:
# Index objects are immutable. Some more information about this one:
# number of elements, shape, number of dimensions and dtype.
print(df.index.size, df.index.shape, df.index.ndim, df.index.dtype)

7 (7,) 1 int64


# iloc vs loc

In [16]:
# Replace the default index with shuffled integer labels so that
# label-based and position-based selection give different results.
df.index = [1,0,4,5,2,3,6]
print(df, "\n")
# .loc works with labels and boolean masks over the rows: fetch Jason (label 1) and label 2
print(df.loc[[1,2]], "\n") # Rows whose index label is 1 or 2
print(df.loc[(df.index == 1) | (df.index == 2)], "\n") # Same rows, selected with a boolean mask
# Rows at positions 1 and 2 (not labels)
print(df[1:3])

  first_name last_name  age  preTestScore  postTestScore   Birthday
1      Jason    Miller   42             4             25 1994-02-19
0      Molly  Jacobson   52            24             94 1982-02-07
4       Tina       Ali   36            31             57 1987-09-11
5       Jake    Milner   24             2             62 1992-08-02
2        Amy     Cooze   73             3             70 1994-05-04
3      Richy   Recarey   12            21             21 1992-03-03
6       Josh     Brown   19             9             10 2000-09-12 

  first_name last_name  age  preTestScore  postTestScore   Birthday
1      Jason    Miller   42             4             25 1994-02-19
2        Amy     Cooze   73             3             70 1994-05-04 

  first_name last_name  age  preTestScore  postTestScore   Birthday
1      Jason    Miller   42             4             25 1994-02-19
2        Amy     Cooze   73             3             70 1994-05-04 

  first_name last_name  age  preTestScore 

In [17]:
# .iloc selects by position: position 1 is the second row (Molly, index label 0)
print(df.iloc[1])

first_name                     Molly
last_name                   Jacobson
age                               52
preTestScore                      24
postTestScore                     94
Birthday         1982-02-07 00:00:00
Name: 0, dtype: object


In [18]:
print(df, '\n')
# .iloc: positions 0-2 of the rows and positions 0-1 of the columns (end excluded)
print(df.iloc[0:3, 0:2])
# .loc: from label 0 through label 3 following the current index order, both ends included
df.loc[0:3, ['first_name', 'last_name']]

  first_name last_name  age  preTestScore  postTestScore   Birthday
1      Jason    Miller   42             4             25 1994-02-19
0      Molly  Jacobson   52            24             94 1982-02-07
4       Tina       Ali   36            31             57 1987-09-11
5       Jake    Milner   24             2             62 1992-08-02
2        Amy     Cooze   73             3             70 1994-05-04
3      Richy   Recarey   12            21             21 1992-03-03
6       Josh     Brown   19             9             10 2000-09-12 

  first_name last_name
1      Jason    Miller
0      Molly  Jacobson
4       Tina       Ali


Unnamed: 0,first_name,last_name
0,Molly,Jacobson
4,Tina,Ali
5,Jake,Milner
2,Amy,Cooze
3,Richy,Recarey


In [19]:
# Row at position 2 (Tina, index label 4)
print(df.iloc[2])
# Boolean row mask combined with a column label: the 'age' value of the rows where age is 42
df.loc[df.age == 42, 'age']


first_name                      Tina
last_name                        Ali
age                               36
preTestScore                      31
postTestScore                     57
Birthday         1987-09-11 00:00:00
Name: 4, dtype: object


1    42
Name: age, dtype: int64

In [20]:
# Label-based slicing with .loc
print(df.loc[1:5,:]) # All columns, for the rows from label 1 through label 5 (both included, in index order)
# NOTE(review): the original note here says "we want the ones where age is 24",
# but no code follows it — TODO add the boolean-mask example or drop the note.

  first_name last_name  age  preTestScore  postTestScore   Birthday
1      Jason    Miller   42             4             25 1994-02-19
0      Molly  Jacobson   52            24             94 1982-02-07
4       Tina       Ali   36            31             57 1987-09-11
5       Jake    Milner   24             2             62 1992-08-02


In [21]:
# A small Series whose integer labels are deliberately not in positional order
data = pd.Series(data=['a', 'b', 'c'], index=[1, 5, 3])
print(data)

1    a
5    b
3    c
dtype: object


In [22]:
# Plain [] indexing: with this integer index the key 1 is matched against the
# labels, so it returns 'a' (stored under label 1), not the element at position 1.
print(data[1])

a


In [23]:
# .loc looks elements up by index label
print(data.loc[1])
# NOTE(review): this line repeats the previous one — possibly a different label was intended; confirm.
print(data.loc[1])

a
a


In [24]:
# .iloc indexes by position, like an ordinary sequence: position 0 is the first element
print(data.iloc[0])

a


In [25]:
# Filtering with a boolean mask through .loc ...
print(data.loc[data == 'a'])
# ... and through plain [], which gives the same result
print(data[data == 'a'])
# The mask itself: one boolean per element
print(data == 'a')

1    a
dtype: object
1    a
dtype: object
1     True
5    False
3    False
dtype: bool


##### iloc permite slicing como el de R; con loc hay que ser congruente (etiquetas tanto en filas como en columnas)

In [26]:
# .loc: row with index label 1, column with label 'age'
print(df.loc[1,'age'])
# print(df[1,'age']) # ERROR! (the original author notes that plain [] fails with this pair)
# .iloc: row at position 1, column at position 2
print(df.iloc[1,2])

42
52


In [27]:
# Purely positional indexing goes through .iloc:
# rows in the slice 1:2 (position 1 only) and the column at position 2 ('age')
df.iloc[1:2,2]

0    52
Name: age, dtype: int64

# Group by

In [28]:
# Display the current DataFrame (with the reordered index) before grouping
df

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore,Birthday
1,Jason,Miller,42,4,25,1994-02-19
0,Molly,Jacobson,52,24,94,1982-02-07
4,Tina,Ali,36,31,57,1987-09-11
5,Jake,Milner,24,2,62,1992-08-02
2,Amy,Cooze,73,3,70,1994-05-04
3,Richy,Recarey,12,21,21,1992-03-03
6,Josh,Brown,19,9,10,2000-09-12


In [29]:
# Add a Team column: 1 for people younger than 40, 0 for everyone else
is_under_40 = df.age < 40
df["Team"] = np.where(is_under_40, 1, 0)

#### Automáticamente elige las columnas que puede sumar, y las agrega todas

In [30]:
# Sum the numeric columns within each team.
# numeric_only=True is passed explicitly: recent pandas versions no longer
# discard non-numeric columns on their own, and summing the datetime
# Birthday column raises a TypeError.
df.groupby("Team").agg("sum", numeric_only=True)

Unnamed: 0_level_0,age,preTestScore,postTestScore
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,167,31,189
1,91,63,150


### Seleccionamos las que queremos agregar únicamente

In [31]:
# Aggregate only the chosen columns, each with its own function:
# the mean age and the total postTestScore per team.
# The string aliases select pandas' built-in group reductions and avoid the
# FutureWarning that recent pandas versions emit for NumPy reducers such as np.sum.
df.groupby("Team").agg({"age": "mean", "postTestScore": "sum"})

Unnamed: 0_level_0,age,postTestScore
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
0,55.666667,189
1,22.75,150


#### O incluso poner más de una función para un mismo campo

In [32]:
# Several aggregations for one column: pass a list of functions.
# The resulting column labels are taken from the function names.
# NOTE(review): recent pandas versions appear to emit a FutureWarning when
# np.sum is passed to agg — confirm against the installed version.
df.groupby("Team").agg({"age" : [np.average,np.sum]})

Unnamed: 0_level_0,age,age
Unnamed: 0_level_1,average,sum
Team,Unnamed: 1_level_2,Unnamed: 2_level_2
0,55.666667,167
1,22.75,91


In [33]:
# Callables and string aliases can be mixed in the same list
df.groupby("Team").agg({"age" : [np.average,'sum']})

Unnamed: 0_level_0,age,age
Unnamed: 0_level_1,average,sum
Team,Unnamed: 1_level_2,Unnamed: 2_level_2
0,55.666667,167
1,22.75,91


### Writing your own functions

In [34]:
def my_agg(serie):
    """Return the sum of the values in `serie`.

    `serie` only needs to provide a ``sum()`` method; in this notebook it is
    used as a custom aggregation passed to ``groupby(...).agg``.
    """
    total = serie.sum()
    return total

def my_agg2(serie):
    """Return the sum of the 'age' entry of `serie`.

    `serie` must support ``serie['age']`` and that entry must provide
    ``sum()``. Presumably meant for a per-group DataFrame; it is not called
    anywhere in the visible notebook.
    """
    ages = serie['age']
    return ages.sum()

In [35]:
# Apply the custom aggregation to every remaining column of each team
# (string columns are concatenated by sum()).
# Birthday is dropped first: a datetime column cannot be summed, and recent
# pandas versions raise a TypeError instead of silently discarding it.
df.drop(columns="Birthday").groupby("Team").agg(my_agg)

Unnamed: 0_level_0,first_name,last_name,age,preTestScore,postTestScore
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,JasonMollyAmy,MillerJacobsonCooze,167,31,189
1,TinaJakeRichyJosh,AliMilnerRecareyBrown,91,63,150


In [36]:
# Apply the custom aggregation to every remaining column of each team.
# Birthday is dropped first: a datetime column cannot be summed, and recent
# pandas versions raise a TypeError instead of silently discarding it.
# NOTE(review): this cell repeats the preceding call with my_agg, while
# my_agg2 is defined but never used — possibly my_agg2 was intended here; confirm.
df.drop(columns="Birthday").groupby("Team").agg(my_agg)

Unnamed: 0_level_0,first_name,last_name,age,preTestScore,postTestScore
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,JasonMollyAmy,MillerJacobsonCooze,167,31,189
1,TinaJakeRichyJosh,AliMilnerRecareyBrown,91,63,150


# Análisis del objeto groupby

In [106]:
# Type of the object returned by groupby, before any aggregation is applied
print(type(df.groupby("Team")), "\n")
# First two rows of the frame, for reference
print(df.head(2))

<class 'pandas.core.groupby.groupby.DataFrameGroupBy'> 

  first_name last_name  age  preTestScore  postTestScore   Birthday  Team
1      Jason    Miller   42             4             25 1994-02-19     0
0      Molly  Jacobson   52            24             94 1982-02-07     0


In [109]:
# Columns can be selected from the grouped object with attribute access.
# Only the last expression of the cell is displayed as its output.
df.groupby("Team").Birthday
df.groupby("Team").first_name
# ...and so on: the columns exist on the groupby object and are already grouped.

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x0000025026F20518>

In [127]:
# Wrapping the grouped column in a Series exposes the groups:
# each element is a (team key, Birthday values of that team) pair.
pd.Series(df.groupby("Team").Birthday)

0    (0, [1994-02-19 00:00:00, 1982-02-07 00:00:00,...
1    (1, [1987-09-11 00:00:00, 1992-08-02 00:00:00,...
dtype: object

### Queda pendiente de mejorar el principio de la notebook y este final

In [149]:
def my_agg3(series):
    """Return the sum of the values in `series`.

    `series` only needs to provide a ``sum()`` method; in this notebook it is
    used as a custom aggregation passed to ``groupby(...).agg``.
    """
    total = series.sum()
    return total

In [150]:
# Custom aggregation restricted to one column: my_agg3 receives the 'age' values of each team
df.groupby("Team").agg({'age':my_agg3})

Unnamed: 0_level_0,age
Team,Unnamed: 1_level_1
0,167
1,91


In [151]:
# Custom aggregation applied to every remaining column of each team
# (string columns are concatenated by sum()).
# Birthday is dropped first: a datetime column cannot be summed, and recent
# pandas versions raise a TypeError instead of silently discarding it.
df.drop(columns="Birthday").groupby("Team").agg(my_agg3)

Unnamed: 0_level_0,first_name,last_name,age,preTestScore,postTestScore
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,JasonMollyAmy,MillerJacobsonCooze,167,31,189
1,TinaJakeRichyJosh,AliMilnerRecareyBrown,91,63,150


Referencia sobre funciones de agregación personalizadas en pandas: https://theplopfactor.wordpress.com/2016/07/22/custom-aggregate-functions-in-pandas/