# Introducción a NumPy

NumPy es una extensión de Python, que le agrega mayor soporte para vectores y matrices, constituyendo una biblioteca de funciones matemáticas de alto nivel para operar con esos vectores o matrices.

**Fuente:** https://jakevdp.github.io/PythonDataScienceHandbook/02.03-computation-on-arrays-ufuncs.html

In [3]:
import numpy as np

In [5]:
a = np.array([1,2,3])

In [6]:
type(a)

numpy.ndarray

In [7]:
a

array([1, 2, 3])

NumPy convierte todos los elementos de un array a un mismo tipo de dato.

In [14]:
b = np.array([1.12312,2,3]) # todos a flotantes

b

In [13]:
c = np.array([1.12312,2,"dadasd"]) # todos a cadenas

In [11]:
c

array(['1.12312', '2', 'dadasd'], dtype='<U32')

Podemos incluso forzar el tipo de un array con `dtype`.

In [15]:
d = np.array([1,2,3], dtype='float32')

Es fácil crear y ver matrices:

In [16]:
e = np.array([[0,0,0], [1,1,1], [2,2,2]])

In [17]:
e

array([[0, 0, 0],
       [1, 1, 1],
       [2, 2, 2]])

Se pueden crear arrays de ceros y unos en un momento. Esto nos puede servir para hacer una máscara (como, por ejemplo, en los filtros de Instagram).

In [26]:
np.zeros(5) # una dimensión

array([0., 0., 0., 0., 0.])

In [27]:
np.zeros((3,4)) # 3 x 4

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [24]:
np.ones((3,4))

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

Con `full` llenamos la matriz con el valor indicado.

In [28]:
np.full((4,4), 15)

array([[15, 15, 15, 15],
       [15, 15, 15, 15],
       [15, 15, 15, 15],
       [15, 15, 15, 15]])

In [29]:
np.full((4,3,2), 15, dtype="float") #profundidad(z), filas y columnas

array([[[15., 15.],
        [15., 15.],
        [15., 15.]],

       [[15., 15.],
        [15., 15.],
        [15., 15.]],

       [[15., 15.],
        [15., 15.],
        [15., 15.]],

       [[15., 15.],
        [15., 15.],
        [15., 15.]]])

In [34]:
np.arange(0,10) # array usando range(inicio,fin,step)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [35]:
np.arange(0,10,3)

array([0, 3, 6, 9])

In [38]:
np.linspace(0, 10, 10) # (mínimo, máximo, cuántos valores quieres dentro) #equidistantes

array([ 0.        ,  1.11111111,  2.22222222,  3.33333333,  4.44444444,
        5.55555556,  6.66666667,  7.77777778,  8.88888889, 10.        ])

In [40]:
np.random.random((3,3)) # datos aleatorios, entre 0 y 1, distribuidos uniformemente

array([[0.15643609, 0.11269101, 0.83583047],
       [0.3814178 , 0.79765132, 0.10837773],
       [0.69381183, 0.36204625, 0.04783755]])

In [52]:
np.random.normal(0, 1 ,(3 ,3)) # podemos indicar la media y 
#la desviación estándar de la distribución normal

array([[-1.93821768, -0.17197212,  0.85056153],
       [ 0.25000099,  0.51164398, -1.3740405 ],
       [-1.61265242,  0.19697467, -0.84006013]])

In [45]:
np.random.randint(0,30, (4,4)) # igual, pero con enteros

array([[ 0, 26,  2, 25],
       [23, 18, 25, 23],
       [16, 24, 11, 23],
       [25,  5, 19, 28]])

In [46]:
np.eye(4) # matriz con diagonal a 1

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [50]:
np.empty((3, 5)) # rellena con cosas de la memoria

array([[1.79408596e-316, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000]])

## Tipos de datos

In [53]:
np.zeros(10, dtype="int")

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [54]:
np.zeros(10, dtype="int32")

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [58]:
np.zeros(10, dtype=np.int32) # int32 holds values from -2147483648 to 2147483647 (-128..127 is the int8 range)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [59]:
np.zeros(10, dtype=np.uint32) # unsigned: uint32 holds values from 0 to 4294967295 (0..255 is the uint8 range)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint32)

## Propiedades de los arrays de numpy

In [60]:
f = np.random.randint(10, size=6)

In [61]:
f

array([9, 9, 8, 4, 8, 5])

In [64]:
g = np.random.randint(10, size=(3,4)) # tres filas cuatro columnas

In [63]:
g

array([[0, 5, 5, 9],
       [7, 5, 8, 2],
       [1, 9, 9, 9]])

In [65]:
h = np.random.randint(10, size=(3,4,5))

In [66]:
h

array([[[0, 3, 9, 8, 2],
        [1, 1, 4, 0, 7],
        [8, 4, 4, 1, 8],
        [0, 8, 4, 0, 7]],

       [[0, 8, 0, 8, 3],
        [6, 3, 1, 5, 4],
        [9, 8, 2, 8, 9],
        [2, 1, 0, 3, 5]],

       [[3, 5, 7, 9, 5],
        [3, 0, 4, 6, 4],
        [3, 7, 0, 8, 3],
        [3, 6, 4, 1, 9]]])

In [67]:
f.ndim # len de dimensiones

1

In [68]:
g.ndim

2

In [69]:
h.ndim

3

In [71]:
f.shape # su forma, cómo están dispuestos

(6,)

In [72]:
g.shape

(3, 4)

In [75]:
h.shape

(3, 4, 5)

In [77]:
f.dtype #consulta el tipo

dtype('int64')

In [78]:
f.itemsize # size in bytes (not bits) of each element: 8 bytes for int64

8

In [79]:
f.nbytes # tamaño total, sumando todos los elementos, en bytes

48

### Indexado en arrays

##### Sigue las reglas de slicing de Python

In [80]:
a

array([1, 2, 3])

In [81]:
a[0]

1

In [83]:
a[2]

3

In [84]:
a[1:2]

array([2])

In [85]:
a[::-1]

array([3, 2, 1])

In [87]:
g

array([[4, 1, 2, 9],
       [3, 1, 4, 5],
       [2, 9, 9, 9]])

In [89]:
g[2,3] #fila 2, elemento 3

9

In [90]:
g[-1, -1] #última fila, último elemento

9

In [91]:
g[-1, -1] = 10

In [92]:
g[-1, -1]

10

In [95]:
g[1, 2] = 2.0 # the value is cast to the array's integer dtype, so any fractional part would be dropped

In [94]:
g

array([[ 4,  1,  2,  9],
       [ 3,  1,  2,  5],
       [ 2,  9,  9, 10]])

### Subarrays multidimensionales

In [99]:
g

array([[ 4,  1,  2,  9],
       [ 3,  1,  2,  5],
       [ 2,  9,  9, 10]])

In [104]:
g[1:, 2:] # hemos hecho un trocito del array

array([[ 2,  5],
       [ 9, 10]])

In [103]:
g[:1, :2]

array([[4, 1]])

In [106]:
g[:3, 1::2]

array([[ 1,  9],
       [ 1,  5],
       [ 9, 10]])

In [108]:
g[:,0] # every row of column 0, i.e. the first column

array([4, 3, 2])

In [109]:
g[2] # the row at index 2 (third row); a single index selects a row, not a column

array([ 2,  9,  9, 10])

In [110]:
g[:, :2] # all rows, columns from the start up to (but not including) index 2

array([[4, 1],
       [3, 1],
       [2, 9]])

In [112]:
g #quiero del 3 al 9

array([[ 4,  1,  2,  9],
       [ 3,  1,  2,  5],
       [ 2,  9,  9, 10]])

In [115]:
trocito = g[1:, :2]

In [116]:
trocito

array([[3, 1],
       [2, 9]])

In [117]:
trocito[0,0] = 42

In [118]:
g

array([[ 4,  1,  2,  9],
       [42,  1,  2,  5],
       [ 2,  9,  9, 10]])

Al asignar un slice a una variable estamos creando una vista. Los cambios en la vista se reflejan en el array original. Si queremos que no sea una vista, podemos usar el método `.copy()`.

In [119]:
trocito = g[1:, :2].copy()

In [120]:
trocito

array([[42,  1],
       [ 2,  9]])

In [121]:
trocito[0,0] = 69

In [122]:
trocito

array([[69,  1],
       [ 2,  9]])

In [123]:
g

array([[ 4,  1,  2,  9],
       [42,  1,  2,  5],
       [ 2,  9,  9, 10]])

### Reshapear arrays

In [126]:
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [130]:
np.arange(10).reshape((5,2)) # primero filas y luego columnas

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

In [132]:
np.arange(10).reshape((3,3)) # no encaja y da error 3x3 = 9 elementos

ValueError: cannot reshape array of size 10 into shape (3,3)

In [134]:
j= np.arange(10)

In [135]:
j

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [139]:
j[np.newaxis,:] # nos crea una dimensión más

array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])

In [138]:
j.shape

(10,)

## Concatenar y dividir arrays

In [140]:
a = np.arange(10)

In [144]:
b = np.arange(9, -1, -1)

In [142]:
a

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [145]:
b

array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

In [146]:
np.concatenate([a,b])

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

In [147]:
np.concatenate([a, b], axis=1) # no tiene suficientes dimensiones

AxisError: axis 1 is out of bounds for array of dimension 1

In [148]:
# Hay que añadirle la segunda dimensión

In [151]:
a = a.reshape(1,10)

In [152]:
b = b.reshape(1,10)

In [156]:
np.concatenate([a, b], axis=1) # joins along axis 1 (columns): two (1, 10) arrays become one (1, 20) array

array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]])

In [157]:
np.concatenate([a, b], axis=0) # joins along axis 0 (rows): two (1, 10) arrays become one (2, 10) array

array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
       [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]])

In [155]:
np.concatenate([a, b], axis=2) #no tiene suficientes dimensiones y peta

AxisError: axis 2 is out of bounds for array of dimension 2

In [160]:
a = np.array([[1, 2, 3], [4,5,6]])

In [161]:
b = np.array([[1, 2, 3, 4, 5], [4,5,6, 7, 7]])

In [162]:
a

array([[1, 2, 3],
       [4, 5, 6]])

In [163]:
b

array([[1, 2, 3, 4, 5],
       [4, 5, 6, 7, 7]])

In [164]:
np.hstack([a,b])

array([[1, 2, 3, 1, 2, 3, 4, 5],
       [4, 5, 6, 4, 5, 6, 7, 7]])

In [166]:
np.vstack([a,b]) #no se pueden concatenar en vertical por que no pegan

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 3 and the array at index 1 has size 5

## Splitting arrays

In [168]:
x = [1, 2, 3, 99, 99, 3, 2, 1]

In [171]:
x1, x2, x3 = np.split(x, [3, 5])

In [172]:
print(x1, x2, x3)

[1 2 3] [99 99] [3 2 1]


In [177]:
grid = np.arange(16).reshape((4, 4))

In [178]:
grid

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [179]:
upper, lower = np.vsplit(grid, [2])
print(upper)
print(lower)

[[0 1 2 3]
 [4 5 6 7]]
[[ 8  9 10 11]
 [12 13 14 15]]


In [180]:
left, right = np.hsplit(grid, [2])
print(left)
print(right)

[[ 0  1]
 [ 4  5]
 [ 8  9]
 [12 13]]
[[ 2  3]
 [ 6  7]
 [10 11]
 [14 15]]


## Ufuncs: Funciones Universales

In [182]:
grid

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [184]:
grid + 1 # suma uno a cada elemento del array

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12],
       [13, 14, 15, 16]])

In [185]:
grid / 2

array([[0. , 0.5, 1. , 1.5],
       [2. , 2.5, 3. , 3.5],
       [4. , 4.5, 5. , 5.5],
       [6. , 6.5, 7. , 7.5]])

In [186]:
grid ** 2

array([[  0,   1,   4,   9],
       [ 16,  25,  36,  49],
       [ 64,  81, 100, 121],
       [144, 169, 196, 225]])

In [187]:
grid // 2

array([[0, 0, 1, 1],
       [2, 2, 3, 3],
       [4, 4, 5, 5],
       [6, 6, 7, 7]])

In [188]:
grid

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [194]:
res = np.empty(16).reshape((4,4))

In [195]:
# Element-by-element equivalent of `grid + 2`: walk every cell by its
# (row, column) index and store the value plus 2 at the same position in `res`.
for ridx, row in enumerate(grid):
    for cidx, col in enumerate(row):
        res[ridx][cidx] = col + 2

In [197]:
res # esto es igual que grid + 2

array([[ 2.,  3.,  4.,  5.],
       [ 6.,  7.,  8.,  9.],
       [10., 11., 12., 13.],
       [14., 15., 16., 17.]])

In [200]:
2 ** grid

array([[    1,     2,     4,     8],
       [   16,    32,    64,   128],
       [  256,   512,  1024,  2048],
       [ 4096,  8192, 16384, 32768]])

In [203]:
ngrid = -grid

In [204]:
ngrid

array([[  0,  -1,  -2,  -3],
       [ -4,  -5,  -6,  -7],
       [ -8,  -9, -10, -11],
       [-12, -13, -14, -15]])

In [205]:
np.abs(ngrid)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [206]:
np.sin(ngrid)

array([[ 0.        , -0.84147098, -0.90929743, -0.14112001],
       [ 0.7568025 ,  0.95892427,  0.2794155 , -0.6569866 ],
       [-0.98935825, -0.41211849,  0.54402111,  0.99999021],
       [ 0.53657292, -0.42016704, -0.99060736, -0.65028784]])

### Funciones trigonométricas

In [211]:
theta = np.linspace(0, np.pi, 3) # array de ángulos

In [208]:
theta

array([0.        , 1.57079633, 3.14159265])

In [209]:
print("theta      = ", theta)
print("sin(theta) = ", np.sin(theta))
print("cos(theta) = ", np.cos(theta))
print("tan(theta) = ", np.tan(theta))

theta      =  [0.         1.57079633 3.14159265]
sin(theta) =  [0.0000000e+00 1.0000000e+00 1.2246468e-16]
cos(theta) =  [ 1.000000e+00  6.123234e-17 -1.000000e+00]
tan(theta) =  [ 0.00000000e+00  1.63312394e+16 -1.22464680e-16]


In [212]:
x = [1, 2, 3]
print("x     =", x)
print("e^x   =", np.exp(x))
print("2^x   =", np.exp2(x))
print("3^x   =", np.power(3, x))

x     = [1, 2, 3]
e^x   = [ 2.71828183  7.3890561  20.08553692]
2^x   = [2. 4. 8.]
3^x   = [ 3  9 27]


## Agregados

In [213]:
from functools import reduce

In [216]:
reduce(lambda a,b: a+b, [1,2,3,4,5,6]) #f = para cada par de elementos, los sumas

21

In [217]:
sum([1,2,3,4,5,6])

21

In [218]:
reduce(lambda a,b: a*b, [1,2,3,4,5,6])

720

In [221]:
from operator import mul

In [222]:
reduce(mul,[1,2,3,4,5,6])

720

In [223]:
 x = np.arange(1,6)

In [224]:
x

array([1, 2, 3, 4, 5])

In [225]:
np.add.reduce(x)

15

In [226]:
np.subtract.reduce(x)

-13

In [227]:
np.power.reduce(x)

1

In [230]:
x = np.arange(2,6)

In [229]:
np.power.reduce(x)

1152921504606846976

In [231]:
np.add.accumulate(x)

array([ 2,  5,  9, 14])

In [235]:
np.multiply.accumulate(x) # guardamos cada resultado y lo mostramos,
#para ver la progresión

array([  2,   6,  24, 120])

In [233]:
np.power.accumulate(x)

array([                  2,                   8,                4096,
       1152921504606846976])

In [236]:
np.prod(x)

120

In [239]:
np.cumsum(x) # cum = acumulado

array([ 2,  5,  9, 14])

In [238]:
np.cumprod(x)

array([  2,   6,  24, 120])

In [241]:
x

array([2, 3, 4, 5])

In [240]:
np.multiply.outer(x,x) # genera todas las combinaciones

array([[ 4,  6,  8, 10],
       [ 6,  9, 12, 15],
       [ 8, 12, 16, 20],
       [10, 15, 20, 25]])

## Suma, máximos y mínimos - Eficiencia de usar numpy

En las sumas, sumar con np es mucho más eficiente.

In [243]:
big_array = np.random.rand(1000000)
%timeit sum(big_array)
%timeit np.sum(big_array)

131 ms ± 5.57 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
373 µs ± 3.23 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [244]:
min(big_array), max(big_array)

(3.036509018672362e-07, 0.9999996818659987)

In [246]:
np.min(big_array), np.max(big_array)

(3.036509018672362e-07, 0.9999996818659987)

Igual que antes, trabajar con np es mucho mejor

In [245]:
%timeit min(big_array)
%timeit np.min(big_array)

75.3 ms ± 1.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
419 µs ± 1.53 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


¡Además, los propios arrays tienen métodos equivalentes a esas funciones!

In [247]:
print(big_array.min(), big_array.max(), big_array.sum())

3.036509018672362e-07 0.9999996818659987 500399.3749649822


## Agregados multidimensionales

In [249]:
M = np.random.random((3, 4))
print(M)

[[0.36582707 0.26634073 0.35450865 0.40656006]
 [0.39652662 0.54691079 0.76064289 0.64116523]
 [0.42346028 0.63535308 0.46822299 0.31698244]]


In [253]:
M.sum() #suma agregada

5.582500836533692

In [254]:
M.min(axis=0) #mínimo de cada columna

array([0.36582707, 0.26634073, 0.35450865, 0.31698244])

In [255]:
M.max(axis=1) # máximo de cada fila

array([0.40656006, 0.76064289, 0.63535308])

### Listado de todas las agregaciones

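- `np.sum` (versión segura ante NaN: `np.nansum`): Compute sum of elements
- `np.prod` (versión segura ante NaN: `np.nanprod`): Compute product of elements
- `np.mean` (versión segura ante NaN: `np.nanmean`): Compute mean of elements
- `np.std` (versión segura ante NaN: `np.nanstd`): Compute standard deviation
- `np.var` (versión segura ante NaN: `np.nanvar`): Compute variance
- `np.min` (versión segura ante NaN: `np.nanmin`): Find minimum value
- `np.max` (versión segura ante NaN: `np.nanmax`): Find maximum value
- `np.argmin` (versión segura ante NaN: `np.nanargmin`): Find index of minimum value
- `np.argmax` (versión segura ante NaN: `np.nanargmax`): Find index of maximum value
- `np.median` (versión segura ante NaN: `np.nanmedian`): Compute median of elements
- `np.percentile` (versión segura ante NaN: `np.nanpercentile`): Compute rank-based statistics of elements
- `np.any` (sin versión segura ante NaN): Evaluate whether any elements are true
- `np.all` (sin versión segura ante NaN): Evaluate whether all elements are true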

## Ejercicio: Media de altura de los presidentes de USA

In [258]:
!head -4 president_heights.csv

order,name,height(cm)
1,George Washington,189
2,John Adams,170
3,Thomas Jefferson,189


In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import csv

In [4]:
import pandas as pd
data = pd.read_csv('president_heights.csv')
heights = np.array(data['height(cm)'])

In [5]:
print(heights)

[189 170 189 163 183 171 185 168 173 183 173 173 175 178 183 193 178 173
 174 183 183 168 170 178 182 180 183 178 182 188 175 179 183 193 182 183
 177 185 188 188 182 185]


In [6]:
print("Mean height:       ", heights.mean())
print("Standard deviation:", heights.std())
print("Minimum height:    ", heights.min())
print("Maximum height:    ", heights.max())

Mean height:        179.73809523809524
Standard deviation: 6.931843442745892
Minimum height:     163
Maximum height:     193


In [7]:
print("25th percentile:   ", np.percentile(heights, 25))
print("Median:            ", np.median(heights))
print("75th percentile:   ", np.percentile(heights, 75))

25th percentile:    174.25
Median:             182.0
75th percentile:    183.0
