# Daily Data Science 002i

This notebook covers some less frequently used NumPy functions that are still useful in data science.

In [1]:
# Importing the NumPy library
import numpy as np

# Numpy Extras

## Sorting

In [2]:
# np.sort returns a sorted copy of its input as a new ndarray (not a Python
# list); the array passed in is left unchanged.

a = np.random.randint(1, 100, 10)
a_sorted = np.sort(a)
print(a, "", a_sorted, type(a_sorted), "", sep="\n")

# For multi-dimensional arrays the axis parameter selects the direction of the sort:
# axis=0 orders the values within each column, axis=1 within each row.
b = np.random.randint(1, 100, (3, 3))
print(b)
for sort_axis in (0, 1):
    print()
    print(np.sort(b, axis=sort_axis))

[28 80 64 99 26 76 23 33 91 59]

[23 26 28 33 59 64 76 80 91 99]
<class 'numpy.ndarray'>

[[45 91 34]
 [76 47 27]
 [ 1 68  3]]

[[ 1 47  3]
 [45 68 27]
 [76 91 34]]

[[34 45 91]
 [27 47 76]
 [ 1  3 68]]


## Appending

In [3]:
# np.append(arr, values, axis=None) returns a new array with `values` added
# at the end; the array passed in is not modified.
c = np.array([1, 2, 3])
c_plus_one = np.append(c, 4)  # a scalar is appended as one extra element
print(c, "", c_plus_one, sep="\n")

[1 2 3]

[1 2 3 4]


In [4]:
# Appending to a 2-D array needs an explicit axis; without one, np.append
# flattens its inputs and returns a 1-D array.
# The results are bound to new names instead of being assigned back to `b`,
# so re-running this cell does not keep stacking extra rows/columns onto `b`.
b_with_row = np.append(b, [[10, 20, 30]], axis=0)  # Append a new row
print(b_with_row)
print()

# np.ones creates floats by default, so the combined array is upcast to float.
ones_column = np.ones((b_with_row.shape[0], 1))
b_with_col = np.append(b_with_row, ones_column, axis=1)  # Append a new column
print(b_with_col)

[[45 91 34]
 [76 47 27]
 [ 1 68  3]
 [10 20 30]]

[[45. 91. 34.  1.]
 [76. 47. 27.  1.]
 [ 1. 68.  3.  1.]
 [10. 20. 30.  1.]]


## Concatenation

In [5]:
# Two 2x3 arrays holding the consecutive integers 0-5 and 6-11.
c = np.arange(0, 6).reshape((2, 3))
d = np.arange(6, 12).reshape((2, 3))
print(c, "", d, sep="\n")

[[0 1 2]
 [3 4 5]]

[[ 6  7  8]
 [ 9 10 11]]


In [6]:
# np.concatenate joins a sequence of arrays along an existing axis (axis=0 when not specified).

np.concatenate([c, d], axis=0)  # Stack the rows of d beneath the rows of c

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [7]:
np.concatenate([c, d], axis=1)  # Place the columns of d to the right of the columns of c

array([[ 0,  1,  2,  6,  7,  8],
       [ 3,  4,  5,  9, 10, 11]])

## Unique

In [8]:
# np.unique returns the distinct values of an array, in sorted order.
a = np.array([1, 2, 3, 4, 5, 5, 6, 6, 7])
distinct_values = np.unique(a)
print(a, "", distinct_values, sep="\n")

[1 2 3 4 5 5 6 6 7]

[1 2 3 4 5 6 7]


## Expand Dimensions

In [9]:
# np.expand_dims returns the array with a new length-1 axis inserted at the
# position given by `axis`.
a = np.array([1, 2, 3])
print(a)
# axis=0 yields shape (1, 3), a single row; axis=1 yields shape (3, 1), a single column.
for new_axis in (0, 1):
    print()
    print(np.expand_dims(a, axis=new_axis))

[1 2 3]

[[1 2 3]]

[[1]
 [2]
 [3]]


## Where

In [10]:
a = np.arange(10)
# Called with only a condition, np.where returns a tuple of index arrays
# (one per dimension) locating the elements for which the condition is True.
even_idx = np.where(a % 2 == 0)  # Indices of the even numbers
# The index tuple can be used directly to pull out the matching elements.
print(a, "", even_idx, "", a[even_idx], sep="\n")

[0 1 2 3 4 5 6 7 8 9]

(array([0, 2, 4, 6, 8]),)

[0 2 4 6 8]


In [11]:
# Called with three arguments, np.where(condition, x, y) builds a new array that
# takes its value from x where the condition is True and from y where it is False.
is_even = a % 2 == 0
print(np.where(is_even, 'Even', 'Odd'))  # 'Even' for even numbers, 'Odd' for odd numbers

['Even' 'Odd' 'Even' 'Odd' 'Even' 'Odd' 'Even' 'Odd' 'Even' 'Odd']


## ArgMax

In [12]:
# np.argmax returns the position (index) of the largest value rather than the value itself.
a = np.array([[1, 2, 3], [4, 5, 6]])
print(a, "\n")
# axis=0    -> for each column, the row index of its maximum
# axis=1    -> for each row, the column index of its maximum
# axis=None -> index of the maximum in the flattened array (the default)
for axis in (0, 1, None):
    print(np.argmax(a, axis=axis))

[[1 2 3]
 [4 5 6]] 

[1 1 1]
[2 2]
5


In [13]:
# np.argmin is the counterpart of np.argmax: it returns the position of the smallest value.
# axis=0    -> for each column, the row index of its minimum
# axis=1    -> for each row, the column index of its minimum
# axis=None -> index of the minimum in the flattened array (the default)
for axis in (0, 1, None):
    print(np.argmin(a, axis=axis))

[0 0 0]
[0 0]
0


## Cumulative Sum

In [14]:
# np.cumsum returns the running total of the elements along the chosen axis.
a = np.array([[1, 2, 3], [4, 5, 6]])
print(a, "\n")
running_totals = np.cumsum(a, axis=0)  # Accumulate down each column
print(running_totals)

[[1 2 3]
 [4 5 6]] 

[[1 2 3]
 [5 7 9]]


In [15]:
# On a 1-D array no axis is needed: each output element is the sum of all
# input elements up to and including that position.
a = np.array([1, 2, 3, 4, 5])
print(a, "\n")
running_totals = np.cumsum(a)
print(running_totals)

[1 2 3 4 5] 

[ 1  3  6 10 15]


In [16]:
# np.cumprod is the multiplicative analogue of np.cumsum: it returns running
# products of the elements along the chosen axis.
a = np.array([[1, 2, 3], [4, 5, 6]])
down_columns = np.cumprod(a, axis=0)  # Running product down each column
across_rows = np.cumprod(a, axis=1)   # Running product across each row
print(a, "\n")
print(down_columns, "\n")
print(across_rows)

[[1 2 3]
 [4 5 6]] 

[[ 1  2  3]
 [ 4 10 18]] 

[[  1   2   6]
 [  4  20 120]]


## Percentile

np.percentile() computes the q-th percentile of the given data along the specified axis (or over the flattened array when no axis is given).

In [18]:
# np.percentile(a, q, axis=None) returns the q-th percentile of the data.
# No axis is passed here, so each percentile is taken over the flattened array.
a = np.array([[1, 2, 3], [4, 5, 6]])
print(a, "\n")
# Printed in this order: minimum (0), median (50), first quartile (25),
# third quartile (75), maximum (100).
for q in (0, 50, 25, 75, 100):
    print(np.percentile(a, q))

[[1 2 3]
 [4 5 6]] 

1.0
3.5
2.25
4.75
6.0


## Histogram

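np.histogram() computes the frequency distribution of a dataset: it returns the number of values that fall into each bin together with the bin edges. It does not draw anything; pass the result to a plotting library to visualize it.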

In [20]:
# np.histogram counts how many values fall into each bin. With an integer
# `bins`, the data range is divided into that many equal-width bins. The result
# is a (counts, bin_edges) pair, where bin_edges has one more entry than counts.
a = np.random.randint(low=1, high=100, size=100)
counts_and_edges = np.histogram(a, bins=10)
hist, bin_edges = counts_and_edges
for label, values in (("Histogram:", hist), ("Bin edges:", bin_edges)):
    print(label, values)

Histogram: [19 10  6  3 13 10 12  6 10 11]
Bin edges: [ 4.  13.4 22.8 32.2 41.6 51.  60.4 69.8 79.2 88.6 98. ]


In [21]:
# `bins` may also be an explicit sequence of bin edges instead of a bin count;
# here the edges run from 0 to 100 in steps of 20, giving five bins.
a = np.random.randint(low=1, high=100, size=100)
bins = list(range(0, 101, 20))
hist, bin_edges = np.histogram(a, bins)
for label, values in (("Histogram:", hist), ("Bin edges:", bin_edges)):
    print(label, values)

Histogram: [13 15 22 22 28]
Bin edges: [  0  20  40  60  80 100]


## Correlation Coefficient

np.corrcoef() returns the matrix of Pearson product-moment correlation coefficients.

In [22]:
# np.corrcoef(x, y) returns the 2x2 correlation matrix of the two variables.
# The diagonal entries are each variable's correlation with itself; the
# off-diagonal entry [0, 1] is the correlation between x and y.
salary = np.array([50000, 60000, 55000, 80000, 75000])
experience = np.array([1, 2, 3, 4, 5])
correlation_coefficient = np.corrcoef(salary, experience)
salary_vs_experience = correlation_coefficient[0][1]
print("Correlation coefficient between salary and experience:", salary_vs_experience)

Correlation coefficient between salary and experience: 0.8551861104941365
