### Librería Pandas

Libreria que nos permite trabajar con dataframes (tablas). Trabaja sobre series.

https://pandas.pydata.org/pandas-docs/stable/

In [91]:
# Importación librería 
import pandas as pd
import numpy as np

In [92]:
df =pd.DataFrame(np.array([[1,2],[3,4],[2,3],[4,5]]),columns=['a','b']) # Creamos un dataframe 
df.apply(lambda x: x+3) # Aplicamos la formula x+3 usando lambda

Unnamed: 0,a,b
0,4,5
1,6,7
2,5,6
3,7,8


In [93]:
df =pd.DataFrame(np.array([[1,2],[3,4],[2,3],[4,5]]),columns=['a','b']) # Creamos un dataframe 
df['a'] = df.apply(lambda x:x['a']+3,axis=1) # Aplicamos la formula x+3 a la columna a
df

Unnamed: 0,a,b
0,4,2
1,6,4
2,5,3
3,7,5


##### SERIES

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html

In [94]:
array = np.random.randint(1, 50,5)
print(array)

# Creamos series desde un array de numpy, *Un DataFrame es un conjunto de series
series = pd.Series(array)
series

[28  6 20 33  2]


0    28
1     6
2    20
3    33
4     2
dtype: int32

In [95]:
series?

[31mType:[39m        Series
[31mString form:[39m
0    28
1     6
2    20
3    33
4     2
dtype: int32
[31mLength:[39m      5
[31mFile:[39m        c:\users\pauri\appdata\roaming\python\python313\site-packages\pandas\core\series.py
[31mDocstring:[39m  
One-dimensional ndarray with axis labels (including time series).

Labels need not be unique but must be a hashable type. The object
supports both integer- and label-based indexing and provides a host of
methods for performing operations involving the index. Statistical
methods from ndarray have been overridden to automatically exclude
missing data (currently represented as NaN).

Operations between Series (+, -, /, \*, \*\*) align values based on their
associated index values-- they need not be the same length. The result
index will be the sorted union of the two indexes.

Parameters
----------
data : array-like, Iterable, dict, or scalar value
    Contains data stored in Series. If data is a dict, argument order is
    maintain

In [96]:
# Series funciones
print(type(series.values))
print(type(series.index))
print(series.values)
print(series.index)

<class 'numpy.ndarray'>
<class 'pandas.core.indexes.range.RangeIndex'>
[28  6 20 33  2]
RangeIndex(start=0, stop=5, step=1)


In [97]:
# Podemos cambiar el indice 
series.index = [list('abcde')]
print(series)

a    28
b     6
c    20
d    33
e     2
dtype: int32


In [98]:
# indexació
print(series[0])
print(series['a'])
print(series[1:3])
print(series[['e','d']])

28
28
b     6
c    20
dtype: int32
e     2
d    33
dtype: int32


  print(series[0])


In [99]:
# Transformaciones
data = {'a': 0., 'b':1., 'c': 2.} # Creamos un diccionario
serie1 = pd.Series(data) # Creamos una serie de pandas a raiz del diccionario

In [100]:
serie1

a    0.0
b    1.0
c    2.0
dtype: float64

In [101]:
# condiciones
serie1 > 0

a    False
b     True
c     True
dtype: bool

In [102]:
serie1[serie1 > 0] # Filramos a raiz de una condición

b    1.0
c    2.0
dtype: float64

In [103]:
serie2 = pd.Series(['Lamine Yamal','Joan Garcia','Eric Garcia','Pau Cubarsí'])
serie2

0    Lamine Yamal
1     Joan Garcia
2     Eric Garcia
3     Pau Cubarsí
dtype: object

In [104]:
serie2?

[31mType:[39m        Series
[31mString form:[39m
0    Lamine Yamal
1     Joan Garcia
2     Eric Garcia
3     Pau Cubarsí
dtype: object
[31mLength:[39m      4
[31mFile:[39m        c:\users\pauri\appdata\roaming\python\python313\site-packages\pandas\core\series.py
[31mDocstring:[39m  
One-dimensional ndarray with axis labels (including time series).

Labels need not be unique but must be a hashable type. The object
supports both integer- and label-based indexing and provides a host of
methods for performing operations involving the index. Statistical
methods from ndarray have been overridden to automatically exclude
missing data (currently represented as NaN).

Operations between Series (+, -, /, \*, \*\*) align values based on their
associated index values-- they need not be the same length. The result
index will be the sorted union of the two indexes.

Parameters
----------
data : array-like, Iterable, dict, or scalar value
    Contains data stored in Series. If data is a dic

In [105]:
prueba = serie2.str.split(expand=True) # haznos un split por el espacio (por defecto) y expandes la dimension a otra serie
prueba

Unnamed: 0,0,1
0,Lamine,Yamal
1,Joan,Garcia
2,Eric,Garcia
3,Pau,Cubarsí


In [106]:
prueba1 = serie2.str.split('a',expand=True) # haznos un split por a y expandes la dimension a otra serie
prueba1

Unnamed: 0,0,1,2,3
0,L,mine Y,m,l
1,Jo,n G,rci,
2,Eric G,rci,,
3,P,u Cub,rsí,


In [107]:
prueba?

[31mType:[39m        DataFrame
[31mString form:[39m
        0        1
0  Lamine    Yamal
1    Joan   Garcia
2    Eric   Garcia
3     Pau  Cubarsí
[31mLength:[39m      4
[31mFile:[39m        c:\users\pauri\appdata\roaming\python\python313\site-packages\pandas\core\frame.py
[31mDocstring:[39m  
Two-dimensional, size-mutable, potentially heterogeneous tabular data.

Data structure also contains labeled axes (rows and columns).
Arithmetic operations align on both row and column labels. Can be
thought of as a dict-like container for Series objects. The primary
pandas data structure.

Parameters
----------
data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
    Dict can contain Series, arrays, constants, dataclass or list-like objects. If
    data is a dict, column order follows insertion-order. If a dict contains Series
    which have an index defined, it is aligned by its index. This alignment also
    occurs if data is a Series or a DataFrame itself. Alignm

In [108]:
serie2.columns = ['nombre','apellido'] # Le damos nombre a las columnas

In [109]:
serie2.columns

['nombre', 'apellido']

In [110]:
serie2.sort_values(ascending=True) # ordenamos 

2     Eric Garcia
1     Joan Garcia
0    Lamine Yamal
3     Pau Cubarsí
dtype: object

In [111]:
s1 = pd.Series(np.random.randn(500))
s2 = pd.Series(np.random.randn(500))

print(s1.corr(s2)) # correlación entre series
print(s1.cov(s2)) # Covarianza entre vectores

-0.09087928183376388
-0.08967302223452864


##### DataFrames

In [112]:
data = [['lucia', 10], ['marta',15],['ana',14]] # creamos un array con tres arrays como elementos
data

[['lucia', 10], ['marta', 15], ['ana', 14]]

In [113]:
df0 = pd.DataFrame(data, columns = ['nombre','edad']) # Creamos el dataframe con los arrays anteriores, definiendo los nombres de las columnas
df0

Unnamed: 0,nombre,edad
0,lucia,10
1,marta,15
2,ana,14


In [114]:
# Creamos dataframes desde listas
nombre  = ['lucia','marta','ana']
edad = [10, 15, 14]
lista = list(zip(nombre,edad)) # con la función zip juntamos dos arrays distnitos, tienen que tener la misma dimensión los arrays
lista

[('lucia', 10), ('marta', 15), ('ana', 14)]

In [116]:
df1 = pd.DataFrame([nombre,edad])
df1 = df1.T # Aplicamos la traspuesta
df1.columns = ['nombre','edad']
df1

Unnamed: 0,nombre,edad
0,lucia,10
1,marta,15
2,ana,14


In [118]:
# Creamos dataframes desde diccionarios
data = {'1-edad':[10,15,14], '0-nombre':['lucia','marta','ana']} # diccionario de arrays
df2 = pd.DataFrame(data)
df2

Unnamed: 0,1-edad,0-nombre
0,10,lucia
1,15,marta
2,14,ana


In [120]:
lista = [{'nombre':'juan','edad':18},{'nombre':'hugo','edad':19}] # array de diccionarios
pd.DataFrame(lista)

Unnamed: 0,nombre,edad
0,juan,18
1,hugo,19


##### Filtración en dataframes

In [121]:
# Condicionados
df1[df1['edad'] >10]

Unnamed: 0,nombre,edad
1,marta,15
2,ana,14


In [124]:
# multicondicion
df1[(df1['edad'] >10) & (df1['nombre'].str.len()>3)] #edad mayor a 10 y texto mayor a tres 

Unnamed: 0,nombre,edad
1,marta,15


In [127]:
# borrar columnas
df1.drop(columns=['nombre'])

Unnamed: 0,edad
0,10
1,15
2,14
