## Comparing performance with list

In [2]:
import numpy as np

In [1]:
N = 100000000

In [10]:
%%time
list_ = list(range(N))
for i in range(N):
  list_[i] = list_[i] * list_[i]

CPU times: user 20.9 s, sys: 1.42 s, total: 22.3 s
Wall time: 22.3 s


In [2]:
%%time 
list_ = list(range(N))
list_ = [item * item for item in list_]

CPU times: user 8.83 s, sys: 4.23 s, total: 13.1 s
Wall time: 13.1 s


In [3]:
%%time
list_ = list(range(N))
item_ = map(lambda x: x * x, list_)

CPU times: user 2.63 s, sys: 1.78 s, total: 4.42 s
Wall time: 4.4 s


In [5]:
%%time
list_ = list(range(N))
list_sum = 0
for item in list_:
  list_sum += item


CPU times: user 12.6 s, sys: 3.33 s, total: 15.9 s
Wall time: 15.9 s


In [6]:
%%time
list_ = list(range(N))
list_sum = sum(list_)

CPU times: user 3.13 s, sys: 3.38 s, total: 6.51 s
Wall time: 6.59 s


In [10]:
%%time
arr = np.arange(N)
arr = arr * arr

CPU times: user 558 ms, sys: 705 ms, total: 1.26 s
Wall time: 1.28 s


In [11]:
%%time
arr = np.arange(N)
arr_sum = np.sum(arr)

CPU times: user 334 ms, sys: 36.6 ms, total: 370 ms
Wall time: 376 ms


## Creating up arrays

*   **np.arrange()**
*   **np.linspace()**
*   **np.array()**
*   **np.randon.randn()**
*   **np.randon.rand()**
*   **np.randon.randint()**
*   **np.ones()**
*   **np.zeros()**

In [2]:
arr = np.arange(5)
print(arr, type(arr))

[0 1 2 3 4] <class 'numpy.ndarray'>


In [4]:
arr = np.array([0, 1, 2, 3, 4])
print(arr, type(arr))

[0 1 2 3 4] <class 'numpy.ndarray'>


In [19]:
print("dtype", arr.dtype)
print("ndim", arr.ndim)
print("shape", arr.shape)
print("size", arr.size)
print("itemsize", arr.itemsize)  # int64 requires 8 bytes to store

dtype int64
ndim 1
shape (5,)
size 5
itemsize 8



In [20]:
arr = np.array([0., 1, 2, 3, 4])
print(arr, type(arr))
print("dtype", arr.dtype)
print("ndim", arr.ndim)
print("shape", arr.shape)
print("size", arr.size)
print("itemsize", arr.itemsize)  # int64 requires 8 bytes to store

[0. 1. 2. 3. 4.] <class 'numpy.ndarray'>
dtype float64
ndim 1
shape (5,)
size 5
itemsize 8


In [25]:
arr2d = np.array([
                   [1, 2, 3],
                   [4, 5, 6]
])
print(arr2d, type(arr2d))
print("dtype", arr2d.dtype)
print("ndim", arr2d.ndim)
print("shape", arr2d.shape)    # We move backwards dim_1 <- dim_2  => (2, 3)
print("size", arr2d.size)
print("itemsize", arr2d.itemsize) 

[[1 2 3]
 [4 5 6]] <class 'numpy.ndarray'>
dtype int64
ndim 2
shape (2, 3)
size 6
itemsize 8


In [26]:
arr3d = np.array([
                  [
                   [1, 2, 3],
                   [4, 5, 6]
                  ],
                  [
                   [7, 8, 9],
                   [10, 11, 12]
                  ]
])
print(arr3d, type(arr3d))
print("dtype", arr3d.dtype)
print("ndim", arr3d.ndim)
print("shape", arr3d.shape)    # We move backwards dim_0 <- dim_1 <- dim_2  => (2, 2, 3)
print("size", arr3d.size)
print("itemsize", arr3d.itemsize)
 

[[[ 1  2  3]
  [ 4  5  6]]

 [[ 7  8  9]
  [10 11 12]]] <class 'numpy.ndarray'>
dtype int64
ndim 3
shape (2, 2, 3)
size 12
itemsize 8


In [27]:
np.ones((3,4))

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

In [29]:
7 * np.ones((3, 2, 4))

array([[[7., 7., 7., 7.],
        [7., 7., 7., 7.]],

       [[7., 7., 7., 7.],
        [7., 7., 7., 7.]],

       [[7., 7., 7., 7.],
        [7., 7., 7., 7.]]])

In [33]:
np.zeros((2,3,4))

array([[[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]]])

In [35]:
np.random.randn(3,5)    # numbers sampled from normal distribution with mean -> 0, variance -> 1 (rand n-> denotes normal distribution)

array([[-0.18046979,  2.05720755, -1.15158523,  1.88360195, -0.53092506],
       [ 0.44580324,  0.64281046,  1.61658394,  0.14385076, -0.09909543],
       [ 0.84498516, -0.08147961,  0.33974856, -0.01269024, -0.00274298]])

In [36]:
np.random.rand(3,5)    # numbers uniformally sampled between 0 and 1

array([[0.8479349 , 0.1204364 , 0.16865522, 0.41918278, 0.55520041],
       [0.55237442, 0.9319078 , 0.98744012, 0.61321006, 0.23625287],
       [0.34102103, 0.85956913, 0.86330049, 0.33054649, 0.49919491]])

In [37]:
np.random.randint(0, 10, (2, 3))

array([[3, 8, 6],
       [7, 2, 8]])

In [38]:
np.arange(1, 10, 2)     # 2 is step size

array([1, 3, 5, 7, 9])

In [39]:
np.linspace(7, 70, 10)   # 10 numbers between 7-70

array([ 7., 14., 21., 28., 35., 42., 49., 56., 63., 70.])

In [40]:
np.array([True, False, True])

array([ True, False,  True])

In [44]:
str_array = np.array(['3.1', '2.1', '1.1'])

In [48]:
arr = np.array(str_array, dtype='float')
arr

array([3.1, 2.1, 1.1])

## Indexing and Slicing

In [50]:
arr3d = np.array([
                  [
                   [1, 2, 3],
                   [4, 5, 6]
                  ],
                  [
                   [7, 8, 9],
                   [10, 11, 12]
                  ]
])
print(arr3d)

[[[ 1  2  3]
  [ 4  5  6]]

 [[ 7  8  9]
  [10 11 12]]]


In [56]:
print(arr3d[0, 0, 0])   # Access first element
print(arr3d[1, 1, 2])   # Access last element

1
12


In [68]:
print(arr3d[0, :, :])   
print(10*'-')
print(arr3d[1, :, :])
print(10*'-')
print(arr3d[1, 0, :])
print(10*'-')
print(arr3d[1, :, 0:2])
print(10*'-')
print(arr3d[:, :, 0:2])

[[1 2 3]
 [4 5 6]]
----------
[[ 7  8  9]
 [10 11 12]]
----------
[7 8 9]
----------
[[ 7  8]
 [10 11]]
----------
[[[ 1  2]
  [ 4  5]]

 [[ 7  8]
  [10 11]]]


In [69]:
arr3d % 2 == 0       # to check even number in a array

array([[[False,  True, False],
        [ True, False,  True]],

       [[False,  True, False],
        [ True, False,  True]]])

In [72]:
print(arr3d[arr3d % 2 == 0] )    # find even numbers
print(arr3d[arr3d % 2 == 1]  )   # find odd numbers
print(arr3d[(arr3d % 2 == 1) & (arr3d > 3)] )    # find odd numbers greater than three

[ 2  4  6  8 10 12]
[ 1  3  5  7  9 11]
[ 5  7  9 11]


In [76]:
# If you modify slice that will reflect in the array
arr_slice = arr3d[:, :, 0:2]
print(arr_slice)
print(arr_slice[0,0,0])
arr_slice[0,0,0] = 1170
print(arr_slice)

[[[ 1  2]
  [ 4  5]]

 [[ 7  8]
  [10 11]]]
1
[[[1170    2]
  [   4    5]]

 [[   7    8]
  [  10   11]]]


In [77]:
print(arr3d)

[[[1170    2    3]
  [   4    5    6]]

 [[   7    8    9]
  [  10   11   12]]]


In [78]:
arr_slice = np.copy(arr3d[:, :, 0:2])   # deep copy
arr_slice[0, 0, 0] = 1
print(arr_slice)
print(arr3d)

[[[ 1  2]
  [ 4  5]]

 [[ 7  8]
  [10 11]]]
[[[1170    2    3]
  [   4    5    6]]

 [[   7    8    9]
  [  10   11   12]]]


In [81]:
arr = np.random.randint(1, 10, (7))
arr

array([1, 1, 3, 2, 4, 5, 3])

In [82]:
my_indices = [1, 3, 4]

In [83]:
arr[my_indices]

array([1, 2, 4])

## Operations

In [88]:
arr1 = np.random.rand(3,4)
arr2 = np.random.rand(3,4)
print(arr1)
print(10*"-")
print(arr2)

[[0.36157727 0.88764762 0.11416759 0.42388794]
 [0.74928292 0.33760497 0.40284372 0.96257455]
 [0.75679546 0.8298889  0.15421203 0.12083383]]
----------
[[0.57765781 0.6112036  0.84088791 0.07111772]
 [0.31811723 0.88514564 0.79515401 0.36433593]
 [0.9564157  0.23876053 0.11752022 0.87939222]]


In [93]:
print(arr1 + arr2)
print(10*"-")
print(arr1 * arr2)
print(10*"-")
print(arr1 / arr2)
print(10*"-")
print(np.exp(arr1))
print(10*"-")
print(np.log(np.exp(arr1)))
print(10*"-")
print(np.sqrt(arr1))

[[0.93923508 1.49885122 0.9550555  0.49500566]
 [1.06740015 1.22275061 1.19799773 1.32691048]
 [1.71321116 1.06864942 0.27173224 1.00022605]]
----------
[[0.20886793 0.54253342 0.09600214 0.03014594]
 [0.23835981 0.29882957 0.3203228  0.35070049]
 [0.72381106 0.19814471 0.01812303 0.10626033]]
----------
[[0.62593679 1.4522945  0.13577028 5.96037049]
 [2.35536728 0.38141177 0.50662352 2.64199733]
 [0.79128297 3.4758212  1.31221703 0.13740607]]
----------
[[1.43559195 2.42940804 1.12093996 1.52789037]
 [2.1154825  1.40158672 1.49607307 2.61842909]
 [2.13143499 2.29306396 1.16673824 1.12843738]]
----------
[[0.36157727 0.88764762 0.11416759 0.42388794]
 [0.74928292 0.33760497 0.40284372 0.96257455]
 [0.75679546 0.8298889  0.15421203 0.12083383]]
----------
[[0.60131296 0.94215053 0.33788694 0.65106677]
 [0.8656113  0.58103784 0.63469971 0.98110884]
 [0.86993992 0.91098238 0.39269839 0.3476116 ]]


## example of points outside of a circle

In [78]:
ndim = 2
npoints = 1000000
outside_points = 0

In [79]:
points = np.random.rand(npoints, ndim)
dfo = np.zeros((npoints, 1))

In [80]:
points[0:2,:]

array([[0.71583758, 0.54701165],
       [0.70612178, 0.56400189]])

In [81]:
%%time
for i in range(npoints):
  for j in range(ndim):
    dfo[i] += points[i, j] ** 2
  dfo[i] = np.sqrt(dfo[i])
  if dfo[i] > 1:
    outside_points += 1

print('Fraction of points outside is: ', outside_points/npoints)    

Fraction of points outside is:  0.214274
CPU times: user 10.8 s, sys: 13.7 ms, total: 10.8 s
Wall time: 10.8 s


In [83]:
%%time
sq_points = points * points
dfo = np.sqrt(np.sum(sq_points, axis=1))
outside_points = np.sum(dfo > 1)
print('Fraction of points outside is: ', outside_points/npoints)   

Fraction of points outside is:  0.214274
CPU times: user 25.5 ms, sys: 5.04 ms, total: 30.5 ms
Wall time: 32.2 ms


In [84]:
%%time
outside_points = np.sum(np.sqrt(np.sum(points * points, axis=1)) > 1)
print('Fraction of points outside is: ', outside_points/npoints)  

Fraction of points outside is:  0.214274
CPU times: user 24.8 ms, sys: 0 ns, total: 24.8 ms
Wall time: 28.7 ms


In [87]:
def area_outside_square(npoints, ndim):
  points = np.random.rand(npoints, ndim)
  outside_points = np.sum(np.sqrt(np.sum(points * points, axis=1)) > 1) / npoints
  return outside_points
   
area_outside_square(1000000, 2)

0.214543

In [88]:
for i in range(2, 11):
  print(i, area_outside_square(1000000, i))

2 0.214005
3 0.476638
4 0.691622
5 0.835432
6 0.918959
7 0.963109
8 0.98436
9 0.993521
10 0.997518


## Broadcasting

In [9]:
arr1 = np.arange(6)
print(arr1)
print("shape: :", arr1.shape)
print('*'*10)
# Rearrange arr1 into 2*3 matrix
arr1 = arr1.reshape((3, 2))
print(arr1)
print("shape: :", arr1.shape)

[0 1 2 3 4 5]
shape: : (6,)
**********
[[0 1]
 [2 3]
 [4 5]]
shape: : (3, 2)


In [19]:
arr2 = np.arange(6).reshape((3, 2))
print(arr2)

[[0 1]
 [2 3]
 [4 5]]


In [13]:
arr1 + arr2

array([[ 0,  2],
       [ 4,  6],
       [ 8, 10]])

In [17]:
arr2[0].reshape((1, 2))

array([[0, 1]])

In [18]:
arr1 + arr2[0].reshape((1, 2))      # (3, 2) + (1, 2)

array([[0, 2],
       [2, 4],
       [4, 6]])

In [21]:
arr2[:, 0].reshape((3, 1))

array([[0],
       [2],
       [4]])

In [22]:
arr1 + arr2[:, 0].reshape((3, 1))     # (3, 2) + (3, 1)

array([[0, 1],
       [4, 5],
       [8, 9]])

In [23]:
arr1 + 1     # (3, 2) + (1,)

array([[1, 2],
       [3, 4],
       [5, 6]])

In [25]:
arr1 = np.arange(24).reshape((2, 3, 4))
print(arr1)

[[[ 0  1  2  3]
  [ 4  5  6  7]
  [ 8  9 10 11]]

 [[12 13 14 15]
  [16 17 18 19]
  [20 21 22 23]]]


In [26]:
arr2 = np.ones((1, 4))

In [27]:
arr1 + arr2     # (2, 3, 4) + (1, 4)

array([[[ 1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.],
        [ 9., 10., 11., 12.]],

       [[13., 14., 15., 16.],
        [17., 18., 19., 20.],
        [21., 22., 23., 24.]]])

In [28]:
arr1 = np.arange(4)
arr2 = np.arange(5)

print(arr1.shape, arr2.shape)

(4,) (5,)


In [29]:
arr1 + arr2  # (4) + (5)

ValueError: ignored

In [30]:
arr1.reshape((4,1)) + arr2     # (4, 1) + (5)

array([[0, 1, 2, 3, 4],
       [1, 2, 3, 4, 5],
       [2, 3, 4, 5, 6],
       [3, 4, 5, 6, 7]])

## File Handling

In [7]:
planets_small = np.loadtxt("planets_small.txt")

ValueError: ignored

In [8]:
planets_small = np.loadtxt("planets_small.txt", skiprows=1)

ValueError: ignored

In [9]:
planets_small = np.loadtxt("planets_small.txt",
                           skiprows=1,
                           usecols=(1, 2, 3, 4, 5, 6, 7, 8, 9))

In [10]:
planets_small

array([[3.3000e-01, 4.8700e+00, 5.9700e+00, 6.4200e-01, 1.8980e+03,
        5.6800e+02, 8.6800e+01, 1.0200e+02, 1.4600e-02],
       [5.7900e+01, 1.0820e+02, 1.4960e+02, 2.2790e+02, 7.7860e+02,
        1.4335e+03, 2.8725e+03, 4.4951e+03, 5.9064e+03],
       [4.2226e+03, 2.8020e+03, 2.4000e+01, 2.4700e+01, 9.9000e+00,
        1.0700e+01, 1.7200e+01, 1.6100e+01, 1.5330e+02]])

In [11]:
print(planets_samll.ndim)
print(planets_samll.shape)

2
(3, 9)


In [12]:
planets = np.loadtxt("planets.txt",
                           skiprows=1,
                           usecols=(1, 2, 3, 4, 5, 6, 7, 8, 9))

ValueError: ignored

In [13]:
planets = np.genfromtxt("planets.txt",
                           skip_header=1,
                           usecols=(1, 2, 3, 4, 5, 6, 7, 8, 9))

In [14]:
planets

array([[ 3.30000e-01,  4.87000e+00,  5.97000e+00,  7.30000e-02,
         6.42000e-01,  1.89800e+03,  5.68000e+02,  8.68000e+01,
         1.02000e+02],
       [ 4.87900e+03,  1.21040e+04,  1.27560e+04,  3.47500e+03,
         6.79200e+03,  1.42984e+05,  1.20536e+05,  5.11180e+04,
         4.95280e+04],
       [ 5.42700e+03,  5.24300e+03,  5.51400e+03,  3.34000e+03,
         3.93300e+03,  1.32600e+03,  6.87000e+02,  1.27100e+03,
         1.63800e+03],
       [ 3.70000e+00,  8.90000e+00,  9.80000e+00,  1.60000e+00,
         3.70000e+00,  2.31000e+01,  9.00000e+00,  8.70000e+00,
         1.10000e+01],
       [ 4.30000e+00,  1.04000e+01,  1.12000e+01,  2.40000e+00,
         5.00000e+00,  5.95000e+01,  3.55000e+01,  2.13000e+01,
         2.35000e+01],
       [ 1.40760e+03, -5.83250e+03,  2.39000e+01,  6.55700e+02,
         2.46000e+01,  9.90000e+00,  1.07000e+01, -1.72000e+01,
         1.61000e+01],
       [ 4.22260e+03,  2.80200e+03,  2.40000e+01,  7.08700e+02,
         2.47000e+01,  9.90000

In [15]:
print(planets.shape)

(20, 9)


In [16]:
np.isnan(planets)

array([[False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, 

In [17]:
planets_new = np.nan_to_num(planets, nan=-1)

In [18]:
planets_new

array([[ 3.30000e-01,  4.87000e+00,  5.97000e+00,  7.30000e-02,
         6.42000e-01,  1.89800e+03,  5.68000e+02,  8.68000e+01,
         1.02000e+02],
       [ 4.87900e+03,  1.21040e+04,  1.27560e+04,  3.47500e+03,
         6.79200e+03,  1.42984e+05,  1.20536e+05,  5.11180e+04,
         4.95280e+04],
       [ 5.42700e+03,  5.24300e+03,  5.51400e+03,  3.34000e+03,
         3.93300e+03,  1.32600e+03,  6.87000e+02,  1.27100e+03,
         1.63800e+03],
       [ 3.70000e+00,  8.90000e+00,  9.80000e+00,  1.60000e+00,
         3.70000e+00,  2.31000e+01,  9.00000e+00,  8.70000e+00,
         1.10000e+01],
       [ 4.30000e+00,  1.04000e+01,  1.12000e+01,  2.40000e+00,
         5.00000e+00,  5.95000e+01,  3.55000e+01,  2.13000e+01,
         2.35000e+01],
       [ 1.40760e+03, -5.83250e+03,  2.39000e+01,  6.55700e+02,
         2.46000e+01,  9.90000e+00,  1.07000e+01, -1.72000e+01,
         1.61000e+01],
       [ 4.22260e+03,  2.80200e+03,  2.40000e+01,  7.08700e+02,
         2.47000e+01,  9.90000

In [19]:
np.savetxt('planets_new.txt', planets_new, delimiter=',')

In [20]:
np.save('planets_new', planets_new)

In [22]:
!ls -lh

total 24K
-rw-r--r-- 1 root root 1.6K Sep 14 07:30 planets_new.npy
-rw-r--r-- 1 root root 4.5K Sep 14 07:29 planets_new.txt
-rw-r--r-- 1 root root  254 Sep 14 07:14 planets_small.txt
-rw-r--r-- 1 root root 1.5K Sep 14 07:14 planets.txt
drwxr-xr-x 1 root root 4.0K Aug 27 16:39 sample_data


In [23]:
arr1 = np.random.rand(1000, 10)
arr2 = np.random.rand(2000, 5)
arr3 = np.random.rand(20, 10000)

In [24]:
np.savez("many_arrs", arr1, arr2, arr3)

In [33]:
!ls -lh

total 1.8M
-rw-r--r-- 1 root root 1.7M Sep 14 07:36 many_arrs.npz
-rw-r--r-- 1 root root 1.6K Sep 14 07:30 planets_new.npy
-rw-r--r-- 1 root root 4.5K Sep 14 07:29 planets_new.txt
-rw-r--r-- 1 root root  254 Sep 14 07:14 planets_small.txt
-rw-r--r-- 1 root root 1.5K Sep 14 07:14 planets.txt
drwxr-xr-x 1 root root 4.0K Aug 27 16:39 sample_data


In [25]:
arrs = np.load('many_arrs.npz')

In [26]:
type(arrs)

numpy.lib.npyio.NpzFile

In [27]:
arrs.files

['arr_0', 'arr_1', 'arr_2']

In [30]:
arrs['arr_0']

array([[0.6001878 , 0.15407126, 0.62480416, ..., 0.6638357 , 0.20630952,
        0.64575668],
       [0.79853725, 0.73532359, 0.08389687, ..., 0.20147766, 0.33732668,
        0.23227029],
       [0.33400328, 0.54010334, 0.11302655, ..., 0.58842882, 0.66767403,
        0.37146185],
       ...,
       [0.30717726, 0.89343261, 0.972147  , ..., 0.01136805, 0.74560646,
        0.83406113],
       [0.78570434, 0.77458509, 0.8308329 , ..., 0.43761671, 0.26204134,
        0.85086319],
       [0.48197353, 0.91697518, 0.86987418, ..., 0.73353933, 0.19899185,
        0.2438503 ]])

In [32]:
arrs['arr_0'].shape

(1000, 10)

In [34]:
np.savez_compressed("many_arrs_compressed", arr1, arr2, arr3)

In [35]:
!ls -lh

total 3.3M
-rw-r--r-- 1 root root 1.6M Sep 14 07:42 many_arrs_compressed.npz
-rw-r--r-- 1 root root 1.7M Sep 14 07:36 many_arrs.npz
-rw-r--r-- 1 root root 1.6K Sep 14 07:30 planets_new.npy
-rw-r--r-- 1 root root 4.5K Sep 14 07:29 planets_new.txt
-rw-r--r-- 1 root root  254 Sep 14 07:14 planets_small.txt
-rw-r--r-- 1 root root 1.5K Sep 14 07:14 planets.txt
drwxr-xr-x 1 root root 4.0K Aug 27 16:39 sample_data


In [36]:
arr1 = np.zeros((10000, 10000))

In [37]:
np.savez("zeros", arr1)

In [38]:
np.savez_compressed("zeros_copressed", arr1)

In [39]:
!ls -lh

total 767M
-rw-r--r-- 1 root root 1.6M Sep 14 07:42 many_arrs_compressed.npz
-rw-r--r-- 1 root root 1.7M Sep 14 07:36 many_arrs.npz
-rw-r--r-- 1 root root 1.6K Sep 14 07:30 planets_new.npy
-rw-r--r-- 1 root root 4.5K Sep 14 07:29 planets_new.txt
-rw-r--r-- 1 root root  254 Sep 14 07:14 planets_small.txt
-rw-r--r-- 1 root root 1.5K Sep 14 07:14 planets.txt
drwxr-xr-x 1 root root 4.0K Aug 27 16:39 sample_data
-rw-r--r-- 1 root root 760K Sep 14 07:44 zeros_copressed.npz
-rw-r--r-- 1 root root 763M Sep 14 07:43 zeros.npz


# Stats with NumPy


In [41]:
arr = np.random.randn(10000000,)      # Draws numbers from normal distribution
print("Minimum value: ", np.amin(arr))        
print("Maximum value: ", np.amax(arr))        
print("Mean value: ", np.mean(arr))           # mean will be 0 because of normal distribution
print("variance value: ", np.var(arr))        # variance and std will be 1
print("std value: ", np.std(arr)) 
print("Median value: ", np.median(arr)) 
print("percentile value: ", np.percentile(arr, 50)) 

Minimum value:  -5.225238254431168
Maximum value:  5.2863398097461225
Mean value:  0.0002030726905131512
variance value:  1.0004477829224923
std value:  1.0002238664031629
Median value:  -1.3952747006648546e-05
percentile value:  -1.3952747006648546e-05


In [18]:
arr = np.random.rand(10000000,)      # Draws numbers between 0 and 1

In [15]:
print("Minimum value: ", np.amin(arr))        # amin will be close to 0 because numbers are chosen between 0 and 1
print("Maximum value: ", np.amax(arr))        # amax will be close t0 1
print("Mean value: ", np.mean(arr)) 
print("variance value: ", np.var(arr)) 
print("std value: ", np.std(arr)) 
print("Median value: ", np.median(arr)) 
print("percentile value: ", np.percentile(arr, 50)) 

Minimum value:  2.1467187263368714e-06
Maximum value:  0.9999975188536331
Mean value:  0.49947000788140394
variance value:  0.08311069757216359
std value:  0.2882892602442269
Median value:  0.4979015621070905
percentile value:  0.4979015621070905


In [19]:
%%time
iqr = np.percentile(arr, 75) - np.percentile(arr, 25)
print("IQR: ", iqr)

IQR:  0.5001372303179574
CPU times: user 284 ms, sys: 41.4 ms, total: 325 ms
Wall time: 327 ms


In [20]:
%%time
quartiles = np.percentile(arr, [75, 25])
print(quartiles)
iqr = quartiles[0] - quartiles[1]
print(iqr)

[0.75011767 0.24998044]
0.5001372303179574
CPU times: user 211 ms, sys: 0 ns, total: 211 ms
Wall time: 210 ms


In [22]:
# z-score
(arr-np.mean(arr)) / np.std(arr)

array([ 0.2076759 , -1.61263066, -1.40088913, ..., -1.17242275,
       -0.394217  , -0.06273243])

In [23]:
np.histogram(arr)   # first is number of points in the bins
                    # Second is bins

(array([1000594, 1000116, 1000975,  998547,  998453,  999886, 1000937,
         999248,  999691, 1001553]),
 array([2.79968515e-08, 1.00000021e-01, 2.00000015e-01, 3.00000008e-01,
        4.00000002e-01, 4.99999995e-01, 5.99999989e-01, 6.99999982e-01,
        7.99999976e-01, 8.99999969e-01, 9.99999963e-01]))

In [24]:
np.histogram(arr, bins=5)

(array([2000710, 1999522, 1998339, 2000185, 2001244]),
 array([2.79968515e-08, 2.00000015e-01, 4.00000002e-01, 5.99999989e-01,
        7.99999976e-01, 9.99999963e-01]))

In [25]:
np.histogram(arr, bins=[0, 0.25, 0.5, 0.75, 1])

(array([2500188, 2498497, 2500161, 2501154]),
 array([0.  , 0.25, 0.5 , 0.75, 1.  ]))

In [26]:
bins=[0, 0.25, 0.5, 0.75, 1]      # first number is in 3rd bin (0.5 - 0.75)
np.digitize(arr, bins)            # second number is in 1st bin (0 - 0.25)

array([3, 1, 1, ..., 1, 2, 2])

In [29]:
arr1 = np.random.randint(0, 10, (10))
print(arr1)
bins = [0, 6, 10]                    # 1st bin (0 - 6) & 2nd bin (6 - 10)
np.digitize(arr1, bins)              # number 6 is in 2nd bin

[5 2 9 7 6 2 6 7 2 1]


array([1, 1, 2, 2, 2, 1, 2, 2, 1, 1])

In [30]:
print(arr1)
bins = [0, 6, 10]                    # 1st bin (0 - 6) & 2nd bin (6 - 10)
np.digitize(arr1, bins, right=True)  # number 6 is in 1st bin

[5 2 9 7 6 2 6 7 2 1]


array([1, 1, 2, 2, 1, 1, 1, 2, 1, 1])

In [31]:
arr1 = np.random.randint(50, 80, 100)      # weight
arr2 = np.random.randint(150, 185, 100)    # height
arr3 = np.random.randint(17, 22, 100)      # age

In [32]:
np.concatenate((arr1, arr2, arr3)).shape

(300,)

In [33]:
np.vstack((arr1, arr2, arr3)).shape

(3, 100)

In [35]:
arr2d = np.vstack((arr1, arr2, arr3))

In [36]:
np.amin(arr2d, axis=1)

array([ 50, 150,  17])

In [38]:
np.amax(arr2d, axis=1)

array([ 79, 184,  21])

In [40]:
np.mean(arr2d, axis=1)

array([ 65.07, 166.65,  19.13])

# Checking stats rules with NumPy

## mean subtracted array has 0 mean



In [45]:
arr = np.random.rand(1000)

In [46]:
mean = np.mean(arr)
arr1 = arr - mean
print(np.mean(arr1))

-1.7319479184152442e-17


## Computing mean with smallar set of values

In [52]:
arr = np.random.rand(1000)

In [None]:
for k in range(1, 50):
  arr1 = arr[0:k]
  print(k , np.mean(arr1)) 

In [56]:
means = np.cumsum(arr) / np.arange(1, 1001)
means[0:15]

array([0.63668385, 0.74673168, 0.70181951, 0.62117463, 0.52494719,
       0.43952086, 0.48902064, 0.5367312 , 0.49998348, 0.54883787,
       0.56932094, 0.58343522, 0.58883108, 0.58573912, 0.60668574])

## Effect of outliers on median and median

In [58]:
arr = np.random.randint(1, 100, 100)
print(np.mean(arr))
print(np.median(arr))

54.09
54.5


In [60]:
arr = np.append(arr, [1000, 2000])
print(np.mean(arr))                  # mean has changed after adding outliers
print(np.median(arr))                # median has not been affected

82.44117647058823
55.0


## Effect of scalling on mean and median

In [72]:
arr = np.random.rand(100)
print(np.mean(arr))
print(np.median(arr))

0.4741313479637832
0.44832454204308997


In [73]:
arr1 = 2.5 * arr + 0.65

In [74]:
print(np.mean(arr1), 2.5 * np.mean(arr) + 0.65)
print(np.mean(2.5 * arr + 0.65), 2.5 * np.mean(arr) + 0.65)

1.8353283699094578 1.8353283699094578
1.8353283699094578 1.8353283699094578


In [75]:
print(np.median(2.5 * arr + 0.65), 2.5 * np.median(arr) + 0.65)

1.770811355107725 1.7708113551077251


In [77]:
print(np.var(2.5 * arr + 0.65), 2.5*2.5 * np.var(arr))

0.555413197690125 0.5554131976901251


In [78]:
print(np.std(2.5 * arr + 0.65), 2.5 * np.std(arr))

0.7452604898222668 0.7452604898222669


In [80]:
arr1 = np.random.rand(100)
arr2 = np.random.rand(100)

In [81]:
print(np.mean(0.21 * arr1 - 0.75 * arr2),
      0.21 * np.mean(arr1) - 0.75 * np.mean(arr2))

-0.28474113172995724 -0.2847411317299572


# Case Study



1.   Find mean, median and IQR for Sachin, Rhual and India.
2.   Find the histogram of Sachin's scores with 10 bins.
3.   Find the mean of Sachin's scores grouped by 25 matches.
4.   Find the mean of Sachin's scores where he has scored a century.
5.   Find the mean of Sachin's scores when Rahul has scored less than 10.
6.   Find the mean of Sachin's scores based on which quartile India's score falls in.
7.   For every match find out who has scored more: Sachin or Rahul.
8.   How many more runs does Sachin scores on average after having scored x runs.
9.   How many matches did Sachin take to score first 1000 runs, next 1000 runs...



In [3]:
!head /content/cric_data-200320-181217.tsv

	Sachin Tendulkar	Rahul Dravid	India
0	100	78	342
1	11	62	191
2	8	85	252
3	71	24	307
4	104	17	229
5	18	104	246
6	8	76	226
7	86	74	288
8	12	60	216


In [4]:
cric = np.loadtxt("/content/cric_data-200320-181217.tsv", skiprows=1, usecols=[1, 2, 3] , delimiter="\t")

In [5]:
cric.shape

(225, 3)

In [6]:
cric[0:9, :]

array([[100.,  78., 342.],
       [ 11.,  62., 191.],
       [  8.,  85., 252.],
       [ 71.,  24., 307.],
       [104.,  17., 229.],
       [ 18., 104., 246.],
       [  8.,  76., 226.],
       [ 86.,  74., 288.],
       [ 12.,  60., 216.]])

In [7]:
# Find mean, median and IQR for Sachin, Rhual and India.
print("mean: ", np.mean(cric, axis=0))
print("median: ", np.median(cric, axis=0))

quartile = np.percentile(cric, [75, 25], axis=0)
print("IQR: ", quartile[0]-quartile[1])

mean:  [ 39.87555556  32.06222222 220.79555556]
median:  [ 27.  22. 216.]
IQR:  [57. 46. 98.]


In [8]:
# Find the histogram of Sachin's scores with 10 bins.
sachin_runs = cric[:, 0]
np.histogram(sachin_runs, bins=10)

(array([99, 36, 28, 16, 11, 17,  8,  8,  1,  1]),
 array([  0. ,  18.6,  37.2,  55.8,  74.4,  93. , 111.6, 130.2, 148.8,
        167.4, 186. ]))

In [9]:
# Find the mean of Sachin's scores grouped by 25 matches
print(sachin_runs.shape)
print(sachin_runs.reshape(9, 25).shape)

sachin_25s = sachin_runs.reshape(9, 25)
print(np.mean(sachin_25s, axis=1))

(225,)
(9, 25)
[33.96 49.4  38.48 40.16 39.36 38.2  44.6  39.52 35.2 ]


In [10]:
# Find the mean of Sachin's scores where he has scored a century.
sachin_runs = cric[:, 0]
print(sachin_runs[sachin_runs >= 100])
print(np.mean(sachin_runs[sachin_runs >= 100]))

[100. 104. 138. 141. 186. 146. 141. 123. 120. 122. 100. 152. 105. 122.
 100. 117. 141. 139. 114. 127. 110. 146. 101. 140. 113. 102.]
125.0


In [11]:
# Find the mean of Sachin's scores when Rahul has scored less than 10.
rahul_runs = cric[:, 1]
print(np.mean(sachin_runs[rahul_runs <= 10]))

40.2112676056338


In [13]:
# Find the mean of Sachin's scores based on which quartile India's score falls in.
india = cric[:, 2]
quartiles = np.percentile(india, [25, 50, 75, 100])
print(quartiles)

[175. 216. 273. 499.]


1. if India <= 175 sachin average is ...
2. if India <= 216 sachin average is ...
3. if India <= 273 sachin average is ...
4. if India <= 499 sachin average is ...

In [16]:
print(india.shape)
print(quartiles.shape)        # incompatible to broadcasting (225) & (4)

india < quartiles

(225,)
(4,)


ValueError: ignored

In [18]:
quartiles = quartiles.reshape(4, 1)   # now (225) & (4, 1) are compatible

In [24]:
indices = india < quartiles
indices.shape

(4, 225)

In [25]:
sachin_runs[indices[0, :]]

array([18., 14.,  0., 62., 46., 65.,  0., 39., 48.,  3., 11., 65., 27.,
       28.,  3.,  4., 15., 40.,  5.,  8., 89.,  0.,  0.,  1.,  0.,  0.,
       81., 13.,  2., 36., 12., 19.,  0.,  6., 35.,  0., 44.,  3., 47.,
       17., 35., 33.,  7.,  9.,  2., 11., 17.,  1., 10.,  0., 23.,  1.,
        2., 25.,  0.])

In [26]:
for i in range(4):
  print(i, np.mean(sachin_runs[indices[i, :]]))

0 19.672727272727272
1 28.18018018018018
2 31.688622754491018
3 39.799107142857146


In [38]:
# For every match find out who has scored more: Sachin or Rahul.
snr = cric[:, 0:2]
print(snr.shape)

is_rahul_higher = np.argmax(snr, axis=1)

(225, 2)


In [39]:
np.sum(is_rahul_higher) / 225     # 47 percent of the times rahul scored higher

0.4711111111111111

In [40]:
np.where(is_rahul_higher == 0, 'Sachin', 'Rahul')

array(['Sachin', 'Rahul', 'Rahul', 'Sachin', 'Sachin', 'Rahul', 'Rahul',
       'Sachin', 'Rahul', 'Sachin', 'Rahul', 'Rahul', 'Rahul', 'Sachin',
       'Sachin', 'Rahul', 'Rahul', 'Sachin', 'Rahul', 'Rahul', 'Sachin',
       'Rahul', 'Sachin', 'Sachin', 'Rahul', 'Sachin', 'Sachin', 'Sachin',
       'Sachin', 'Sachin', 'Sachin', 'Sachin', 'Rahul', 'Rahul', 'Sachin',
       'Rahul', 'Rahul', 'Sachin', 'Rahul', 'Sachin', 'Sachin', 'Sachin',
       'Sachin', 'Rahul', 'Sachin', 'Rahul', 'Rahul', 'Sachin', 'Rahul',
       'Sachin', 'Rahul', 'Rahul', 'Rahul', 'Rahul', 'Sachin', 'Rahul',
       'Sachin', 'Sachin', 'Rahul', 'Rahul', 'Rahul', 'Sachin', 'Rahul',
       'Sachin', 'Sachin', 'Sachin', 'Sachin', 'Rahul', 'Sachin', 'Rahul',
       'Rahul', 'Sachin', 'Sachin', 'Rahul', 'Rahul', 'Sachin', 'Rahul',
       'Rahul', 'Sachin', 'Rahul', 'Rahul', 'Rahul', 'Rahul', 'Rahul',
       'Rahul', 'Sachin', 'Sachin', 'Sachin', 'Rahul', 'Sachin', 'Sachin',
       'Sachin', 'Sachin', 'Sachin', 'Sachin'

In [44]:
# How many more runs does Sachin scores on average after having scored x runs.
x_arr = np.arange(0, 101, 5)
print(x_arr.shape)
print(sachin_runs.shape)      # not compatible (21,) & (225,)

(21,)
(225,)


In [47]:
x_arr = x_arr.reshape(x_arr.shape[0], 1)
print(x_arr.shape)

(21, 1)


In [49]:
indices = sachin_runs >= x_arr
print(indices.shape)

(21, 225)


In [52]:
for i in range(x_arr.shape[0]):
  print(x_arr[i, 0],
        np.mean(sachin_runs[indices[i, :]]) - x_arr[i, 0])

0 39.87555555555556
5 45.61363636363637
10 47.48026315789474
15 47.45255474452555
20 46.824
25 44.10084033613445
30 45.13461538461539
35 43.24742268041237
40 44.05882352941177
45 43.41558441558442
50 43.98529411764706
55 42.317460317460316
60 38.67213114754098
65 37.654545454545456
70 37.08163265306122
75 34.347826086956516
80 30.75
85 28.650000000000006
90 27.400000000000006
95 26.433333333333337
100 25.0


In [53]:
# How many matches did Sachin take to score first 1000 runs, next 1000 runs...
sachin_runs

array([100.,  11.,   8.,  71., 104.,  18.,   8.,  86.,  12.,  85.,  18.,
         4.,   7.,  37.,  14.,   0.,   4.,   0.,  21.,   1.,  62.,   0.,
       138.,  38.,   2.,  46.,  65.,   0.,  39.,  48., 141.,  62.,  12.,
         1.,  41.,  11.,   3., 186.,  11.,  27.,  27.,  51.,  18.,  32.,
       146.,   5.,  45., 141.,  12.,  65.,  27.,   7.,  16.,   2.,  28.,
         6., 123., 120.,   7.,   3.,   0.,  81.,   2.,  54., 122.,   4.,
        14.,   0., 100.,  15.,   0.,  57.,  99.,  37.,  38.,  32.,  21.,
        32.,  40.,   0.,   5.,   8.,   5.,   0.,  50.,  30.,  37.,  89.,
         4.,  98.,  83.,  93.,   0.,  52., 152.,   1.,   8.,  93.,  45.,
        26.,   0.,   1.,   0.,  16.,  47.,  89.,   3.,   1.,  53.,  16.,
         0.,  81.,  14.,  78.,   6., 105., 122.,   9.,   8.,  28.,  35.,
        69.,  13.,  97.,  93.,   2.,  36.,  39.,   2.,  29.,  12.,  19.,
        34.,   2., 100.,  44.,  82.,   0.,  79.,   6.,   9.,   8.,  23.,
        93.,  35.,  63.,  74.,   8., 117.,  39.,  4

In [54]:
sachin_cum_score = np.cumsum(sachin_runs)
print(sachin_cum_score)

[ 100.  111.  119.  190.  294.  312.  320.  406.  418.  503.  521.  525.
  532.  569.  583.  583.  587.  587.  608.  609.  671.  671.  809.  847.
  849.  895.  960.  960.  999. 1047. 1188. 1250. 1262. 1263. 1304. 1315.
 1318. 1504. 1515. 1542. 1569. 1620. 1638. 1670. 1816. 1821. 1866. 2007.
 2019. 2084. 2111. 2118. 2134. 2136. 2164. 2170. 2293. 2413. 2420. 2423.
 2423. 2504. 2506. 2560. 2682. 2686. 2700. 2700. 2800. 2815. 2815. 2872.
 2971. 3008. 3046. 3078. 3099. 3131. 3171. 3171. 3176. 3184. 3189. 3189.
 3239. 3269. 3306. 3395. 3399. 3497. 3580. 3673. 3673. 3725. 3877. 3878.
 3886. 3979. 4024. 4050. 4050. 4051. 4051. 4067. 4114. 4203. 4206. 4207.
 4260. 4276. 4276. 4357. 4371. 4449. 4455. 4560. 4682. 4691. 4699. 4727.
 4762. 4831. 4844. 4941. 5034. 5036. 5072. 5111. 5113. 5142. 5154. 5173.
 5207. 5209. 5309. 5353. 5435. 5435. 5514. 5520. 5529. 5537. 5560. 5653.
 5688. 5751. 5825. 5833. 5950. 5989. 6038. 6102. 6145. 6217. 6222. 6239.
 6239. 6304. 6324. 6465. 6493. 6537. 6564. 6624. 66

In [56]:
np.histogram(sachin_cum_score, bins=np.arange(0, 10000, 1000))

(array([29, 18, 26, 25, 26, 26, 23, 22, 30]),
 array([   0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]))

# Exercise:

In [20]:
# Write a program to multiply two matrices of size  (100,100)  in two methods: (a) by using np.dot(mat_1, mat_2) and (b) by using for-loops. Comapre the time of execution in both the case.

mat_1 = np.random.rand(100, 100)
mat_2 = np.random.rand(100, 100)
result = np.zeros([100, 100])

In [29]:
%%time
for i in range(100):
    for j in range(100):
        sum = 0
        for k in range(100):
            sum = sum + mat_1[i][k] * mat_2[k][j]
        result[i][j] = sum

CPU times: user 994 ms, sys: 0 ns, total: 994 ms
Wall time: 995 ms


In [30]:
result.sum()

248570.16739871414

In [31]:
%%time
result = np.dot(mat_1, mat_2)

CPU times: user 1.59 ms, sys: 760 µs, total: 2.35 ms
Wall time: 3.13 ms


In [32]:
result.sum()

248570.16739871414