##  Session - I 
### Topic - Basics of Numpy

NumPy is a Python library for working with arrays. Its core data structure is the N-dimensional array, `numpy.ndarray`.



In [1]:
import numpy as np
a = np.array([0,1,2,3]) #passing a list
print(a)

[0 1 2 3]


In [2]:
type(a)

numpy.ndarray

In [3]:
print(np.arange(10)) #creating numpy array for a range of values

[0 1 2 3 4 5 6 7 8 9]


#### How much faster is it?

In [4]:
# Baseline: square 1000 integers one at a time with a pure-Python list comprehension
L = range(1000)
%timeit [i**2 for i in L]

182 µs ± 3.75 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [5]:
# Same computation on a NumPy array: a**2 squares every element in one vectorized
# operation (about 140x faster than the list comprehension in the run recorded here)
a = np.arange(1000)
%timeit a**2

1.29 µs ± 11.1 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


## 1. Creating arrays

### Manual Construction of arrays

In [6]:
#1-D

a = np.array([0,1,2,3])

a

array([0, 1, 2, 3])

In [7]:
#printing dimension of array
a.ndim

1

In [8]:
#shape
a.shape

(4,)

In [9]:
#length
len(a)

4

In [10]:
# 2-D, 3-D

b = np.array([[0,1,2], [3,4,5]])

b

array([[0, 1, 2],
       [3, 4, 5]])

In [11]:
b.ndim

2

In [12]:
b.shape

(2, 3)

In [13]:
len(b) #returns the size of the first dimension

2

In [14]:
# 3-D: a list holding two 2x2 nested lists becomes an array of shape (2, 2, 2)
c = np.array([[[0,1], [3,4]], [[4,5], [6,7]]])

c

array([[[0, 1],
        [3, 4]],

       [[4, 5],
        [6, 7]]])

In [15]:
c.ndim

3

In [16]:
c.shape

(2, 2, 2)

In [17]:
#1.2 Functions for creating arrays

In [18]:
#using arange function
a = np.arange(10) #0, .. n-1
a

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [19]:
b = np.arange(1, 10, 2) #start, end (exclusive) , step

In [20]:
b

array([1, 3, 5, 7, 9])

In [21]:
#using linspace

a = np.linspace(0, 1, 6) #start, end (included, unlike arange), number of points

a

array([0. , 0.2, 0.4, 0.6, 0.8, 1. ])

In [22]:
#common arrays

a = np.ones((3, 3))

a

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [23]:
b = np.zeros((3, 3))

b

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [24]:
c = np.eye(3) #Return a 2-D array with ones on the diagonal and zeroes on other positions

In [25]:
c

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [26]:
d = np.eye(3,2) #number of rows is 3, and 2 columns

In [27]:
d

array([[1., 0.],
       [0., 1.],
       [0., 0.]])

In [28]:
#create array using diag function

a = np.diag([1,2,3,4]) #constructs a diagonal array

a

array([[1, 0, 0, 0],
       [0, 2, 0, 0],
       [0, 0, 3, 0],
       [0, 0, 0, 4]])

In [29]:
np.diag(a) #Extract diagonal

array([1, 2, 3, 4])

In [30]:
#Create array using random (uniform distribution over [0, 1))
#No seed is set in this notebook, so the values differ on every run
a = np.random.rand(4)

a

array([0.22535037, 0.87057711, 0.22767378, 0.90630539])

In [31]:
a = np.random.randn(4) #Return samples from standard normal distribution

a

array([-0.32029032, -0.73769782,  0.32103613, -1.91541881])

In [32]:
#Basic Datatypes

In [33]:
a = np.arange(10)

#The default integer dtype is platform dependent: int32 in the output recorded here,
#but it can be int64 on other platforms / NumPy versions
a.dtype

dtype('int32')

In [34]:
#Explicitly you can specify the data-type

a = np.arange(10, dtype='float64')

a

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

In [35]:
#The default data type is float for zeros and ones function

a = np.zeros((3, 3))

print(a)

a.dtype

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


dtype('float64')

<span style="color:green"> *Exercise* : Create NumPy arrays with a complex, a boolean, and a string data type.</span>

About data types
https://numpy.org/doc/stable/reference/arrays.dtypes.html

## 3. Indexing and Slicing

#### 3.1 Indexing

In [36]:
a = np.arange(10)
    
print(a[5]) #index starts at 0

5


In [37]:
#For multidimensional array

a = np.diag([1,2,3])

print(a[2,2])

3


In [38]:
a[2,1] = 5 #assigning value

a

array([[1, 0, 0],
       [0, 2, 0],
       [0, 5, 3]])

#### 3.2 Slicing

In [39]:
a = np.arange(10)

a

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [40]:
a[1:8:2] #[startindex: endindex(exclusive) : step]

array([1, 3, 5, 7])

In [41]:
#we can also combine assignment and slicing:

a = np.arange(10)
a[5:] = 10

a

array([ 0,  1,  2,  3,  4, 10, 10, 10, 10, 10])

<span style="color:green"> *Exercise 2* : Suppose we have b = [0, 1, 2, 3, 4]. What combination of assignment and slicing gives you the following output? 
array([0, 1, 2, 3, 4, 4, 3, 2, 1, 0])</span>

In [42]:
b = np.arange(5)

#Exercise: replace the placeholder below with one assignment that uses slicing
#? = ?

#then uncomment the next line to display the result
#a

### 4. Copies and Views


Slicing an array returns a view of the original data rather than a copy. When modifying the view, the original array is modified as well:

In [43]:
a = np.arange(10)

a

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [44]:
#every second element of a; slicing gives a view onto a's data, not a copy
b =a[::2]
b

array([0, 2, 4, 6, 8])

In [45]:
np.shares_memory(a, b)

True

In [46]:
b[0] = 10
b

array([10,  2,  4,  6,  8])

In [47]:
a #even though we modified b, a is updated too, because b is a view of a

array([10,  1,  2,  3,  4,  5,  6,  7,  8,  9])

In [48]:
a = np.arange(10)

c = a[::2].copy() #force a copy
c

array([0, 2, 4, 6, 8])

In [49]:
np.shares_memory(a, c)

False

In [50]:
c[0] = 10 #c is an independent copy, so this write does not reach a

a

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [51]:
#Another way of Indexing

In [52]:
a = np.random.randint(0, 20, 15)

a

array([19, 15, 18, 12,  5,  6,  6,  9,  0, 19,  1, 13, 18, 13, 12])

In [53]:
#boolean array with the same shape as a: True where the element is even
mask = (a % 2 == 0)

In [54]:
mask

array([False, False,  True,  True, False,  True,  True, False,  True,
       False, False, False,  True, False,  True])

In [55]:
extract_from_a = a[mask]

extract_from_a

array([18, 12,  6,  6,  0, 18, 12])

In [56]:
#Assigning through a mask: every element of a where mask is True is set to -1
a[mask] = -1
a

array([19, 15, -1, -1,  5, -1, -1,  9, -1, 19,  1, 13, -1, 13, -1])

In [57]:
#Indexing with an array of integers

In [58]:
a =np.arange(0, 100, 10)

a

array([ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90])

In [59]:
#the index list may repeat positions; the result has one element per index given
a[[2,3,2,4,2]]

array([20, 30, 20, 40, 20])

In [60]:
a[[9,7]] =-300
a

array([   0,   10,   20,   30,   40,   50,   60, -300,   80, -300])

In [61]:
#Numerical Operations

In [62]:
a =np.array([1,2,3,4])

In [63]:
a+1

array([2, 3, 4, 5])

In [64]:
a**2

array([ 1,  4,  9, 16], dtype=int32)

In [65]:
#More operations
#np.ones returns floats, so b is a float array and mixing it with the integer array a gives float results
b =np.ones(4) +1
b

array([2., 2., 2., 2.])

In [66]:
a - b

array([-1.,  0.,  1.,  2.])

In [67]:
a * b

array([2., 4., 6., 8.])

In [68]:
#Matrix Multiplication

c = np.diag([1,2,3,4])

#`*` multiplies element by element, while `.dot` computes the matrix product.
#The two results printed below coincide only because c is diagonal;
#for a general matrix they are different.
print(c * c)
print("--------------------")
print(c.dot(c))

[[ 1  0  0  0]
 [ 0  4  0  0]
 [ 0  0  9  0]
 [ 0  0  0 16]]
--------------------
[[ 1  0  0  0]
 [ 0  4  0  0]
 [ 0  0  9  0]
 [ 0  0  0 16]]


In [69]:
a==b

array([False,  True, False, False])

In [70]:
a > b

array([False, False,  True,  True])

In [71]:
#Shape Mismatch

#a is restated so the cell shows both operands and runs on its own
a = np.array([1, 2, 3, 4]) #shape (4,)
c = [3, 4, 5]              #length 3: cannot be broadcast against shape (4,)
try:
    a + c
except ValueError as err:
    #Display the error instead of letting it abort the notebook, so that
    #"Restart Kernel -> Run All" still reaches the cells that follow
    print("ValueError:", err)

ValueError: operands could not be broadcast together with shapes (4,) (3,) 

References 

https://numpy.org/doc/ \
https://docs.python.org/3/library/timeit.html \
https://cs231n.github.io/python-numpy-tutorial/#numpy \
https://scipy-lectures.org/intro/numpy/operations.html

### Data Loading

In [72]:
#load data into numpy array
#loadtxt reads a whitespace-delimited text file of numbers into a float array by default;
#the relative path means population.txt must be in the notebook's working directory
data = np.loadtxt('population.txt')

In [73]:
data

array([[ 1900., 30000.,  4000., 48300.],
       [ 1901., 47200.,  6100., 48200.],
       [ 1902., 70200.,  9800., 41500.],
       [ 1903., 77400., 35200., 38200.],
       [ 1904., 36300., 59400., 40600.],
       [ 1905., 20600., 41700., 39800.],
       [ 1906., 18100., 19000., 38600.],
       [ 1907., 21400., 13000., 42300.],
       [ 1908., 22000.,  8300., 44500.],
       [ 1909., 25400.,  9100., 42100.],
       [ 1910., 27100.,  7400., 46000.],
       [ 1911., 40300.,  8000., 46800.],
       [ 1912., 57000., 12300., 43800.],
       [ 1913., 76600., 19500., 40900.],
       [ 1914., 52300., 45700., 39400.],
       [ 1915., 19500., 51100., 39000.],
       [ 1916., 11200., 29700., 36700.],
       [ 1917.,  7600., 15800., 41800.],
       [ 1918., 14600.,  9700., 43300.],
       [ 1919., 16200., 10100., 41300.],
       [ 1920., 24700.,  8600., 47300.]])

In [74]:
#Transpose so that each of the four columns unpacks into its own 1-D array
year, hares, lynxes, carrots = data.T 

In [75]:
print(year)

[1900. 1901. 1902. 1903. 1904. 1905. 1906. 1907. 1908. 1909. 1910. 1911.
 1912. 1913. 1914. 1915. 1916. 1917. 1918. 1919. 1920.]


In [76]:
#Keep all rows but drop the first column (the year), leaving the three population columns
populations = data[:, 1:]
populations

array([[30000.,  4000., 48300.],
       [47200.,  6100., 48200.],
       [70200.,  9800., 41500.],
       [77400., 35200., 38200.],
       [36300., 59400., 40600.],
       [20600., 41700., 39800.],
       [18100., 19000., 38600.],
       [21400., 13000., 42300.],
       [22000.,  8300., 44500.],
       [25400.,  9100., 42100.],
       [27100.,  7400., 46000.],
       [40300.,  8000., 46800.],
       [57000., 12300., 43800.],
       [76600., 19500., 40900.],
       [52300., 45700., 39400.],
       [19500., 51100., 39000.],
       [11200., 29700., 36700.],
       [ 7600., 15800., 41800.],
       [14600.,  9700., 43300.],
       [16200., 10100., 41300.],
       [24700.,  8600., 47300.]])

## Pandas

In [77]:
#Pandas can be used for data loading
import pandas as pd
#read_csv returns a DataFrame; the relative path means iris.csv must be in the notebook's working directory
df = pd.read_csv('iris.csv')

In [78]:
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [79]:
df.head(10)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
5,5.4,3.9,1.7,0.4,Setosa
6,4.6,3.4,1.4,0.3,Setosa
7,5.0,3.4,1.5,0.2,Setosa
8,4.4,2.9,1.4,0.2,Setosa
9,4.9,3.1,1.5,0.1,Setosa


In [80]:
df.tail()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica
149,5.9,3.0,5.1,1.8,Virginica


In [81]:
df.describe()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [82]:
#Selecting a single column by name returns a pandas Series
v = df['sepal.length']
#End the cell with the variable so the Series is displayed (an assignment alone
#shows nothing); v.max() would give the largest sepal length instead
v

0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: sepal.length, Length: 150, dtype: float64

In [83]:
#Check for missing value
#isnull() returns a boolean frame of the same shape as df (True where a value is missing).
#The printed frame is truncated in the middle, so df.isnull().sum() is a quicker way
#to count the missing values in each column.
print(df.isnull())

     sepal.length  sepal.width  petal.length  petal.width  variety
0           False        False         False        False    False
1           False        False         False        False    False
2           False        False         False        False    False
3           False        False         False        False    False
4           False        False         False        False    False
..            ...          ...           ...          ...      ...
145         False        False         False        False    False
146         False        False         False        False    False
147         False        False         False        False    False
148         False        False         False        False    False
149         False        False         False        False    False

[150 rows x 5 columns]


In [84]:
#Viewing the column names
df.columns

Index(['sepal.length', 'sepal.width', 'petal.length', 'petal.width',
       'variety'],
      dtype='object')

In [85]:
#Information about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  150 non-null    float64
 1   sepal.width   150 non-null    float64
 2   petal.length  150 non-null    float64
 3   petal.width   150 non-null    float64
 4   variety       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


Documentation
https://pandas.pydata.org/docs/user_guide/index.html

Dealing with missing values \
https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html \
https://scikit-learn.org/stable/modules/impute.html