In [1]:
import numpy as np
import numpy.linalg as la ## used for linear algebra things like inverse matrix, etc.

In [2]:
# 2x2 integer matrix; np.intc is the C "int" type, i.e. the same dtype as the "i" type code.
a = np.array([[1, 2], [3, 4]], dtype=np.intc)

In [3]:
# print() shows the array's plain-text form: one bracketed row per line.
print(a)

[[1 2]
 [3 4]]


All Markdown blocks correspond to the code above them; only the headings are exceptions.

# Dtype and Dimensions

In [4]:
a.dtype # element type shared by every value in the array (a 32-bit integer in the output below)

dtype('int32')

In [5]:
a.ndim # number of axes (dimensions); 2 for this rows-by-columns array

2

#### *If you want to create a multi-dimensional array (like a 2D array), all the inner arrays must have the same number of elements.*

#### *For inner lists of unequal length, pass dtype=object; the result is then a 1-D array of Python lists rather than a true 2-D array.*


In [6]:
# Example that does not work: the inner lists have different lengths (2 vs. 3),
# so NumPy cannot build a rectangular 2-D array from them.
# Kept commented out so the notebook still runs from top to bottom.
# NOTE(review): recent NumPy releases raise ValueError here, older ones only warned
# and produced an object array -- confirm for the installed version.
# b = np.array([[1,2], [1, 2, 3]])
# b.ndim

In [7]:
# A 3-D array: two 2x2 blocks stacked along the first axis.
c = np.array(
    [
        [[1, 2], [54, 56]],
        [[50, 40], [30, 20]],
    ]
)
# One comma-separated index per axis: block 0, row 1, column 1.
print(c[0, 1, 1])

56


# Shape and Size

In [8]:
print(c.shape)  # tuple with the length of every axis: (2-D blocks, rows per block, elements per row)
print(c.shape[0]) # length of the first axis only, i.e. the number of 2-D blocks

(2, 2, 2)
2


In [9]:
c.size # total number of elements across all axes (2 * 2 * 2 = 8 for this array)

8

# Arange, Random, and Reshape

In [10]:
# np.arange(a, b) gives the integers a, a+1, ..., b-1 (b itself is excluded);
# with a single argument, np.arange(a) gives 0, 1, ..., a-1.
a = np.arange(20, 100)
a

array([20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
       37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
       54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
       88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [11]:
a = np.arange(20, 100, 3) # third argument is the step: 20, 23, 26, ... stopping before 100
a

array([20, 23, 26, 29, 32, 35, 38, 41, 44, 47, 50, 53, 56, 59, 62, 65, 68,
       71, 74, 77, 80, 83, 86, 89, 92, 95, 98])

In [12]:
# Returns a new array holding the values of `a` in random order; `a` itself keeps its order.
# No random seed is set in the cells above, so the order differs on every run.
b = np.random.permutation(a)
b

array([20, 65, 59, 47, 35, 62, 44, 23, 50, 86, 80, 53, 26, 98, 41, 56, 68,
       74, 83, 89, 92, 77, 38, 32, 29, 95, 71])

#### *Can create random numbers with np.random.rand() (uniform on [0, 1)) or np.random.randn() (standard normal distribution)*

#### *Can use np.zeros and np.ones to create arrays with zeroes or ones* 

In [13]:
# reshape(4, 25) lays the 100 values out as a 2-D array: 4 rows of 25 elements each.
# The element count has to match the new shape (4 * 25 == 100).
c = np.arange(100).reshape(4, 25)
c

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
        41, 42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
        66, 67, 68, 69, 70, 71, 72, 73, 74],
       [75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
        91, 92, 93, 94, 95, 96, 97, 98, 99]])

# Slicing and Sorting

#### *When you slice a NumPy array, the slice doesn’t make a copy of the data. Instead, it is a view that points to the same memory as the original array. This means that if you modify the slice, you’re also changing the original data. (Slicing a plain Python list, by contrast, does create a copy.)*

In [14]:
D = np.arange(100)

# Slice syntax is [start:stop:step]; the stop index is exclusive.
print(D[3:10])   # indices 3 through 9
print(D[::5])    # every fifth element, front to back
print(D[::-5])   # every fifth element, walking backwards from the last one

print()

# 5x4 matrix of random whole numbers between 0 and 10 (stored as floats).
E = (np.random.rand(5, 4) * 10).round()

print(E[:, 1:3])     # all rows, columns 1 and 2
print(E[1:3, 1:3])   # rows 1 and 2, columns 1 and 2

[3 4 5 6 7 8 9]
[ 0  5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95]
[99 94 89 84 79 74 69 64 59 54 49 44 39 34 29 24 19 14  9  4]

[[ 7.  4.]
 [ 2.  9.]
 [ 8. 10.]
 [10.  1.]
 [ 9.  0.]]
[[ 2.  9.]
 [ 8. 10.]]


#### *[::5] means “take every 5th element from start to end, without specifying a start or end index."*

#### *":" means everything*

In [15]:
print(E)

print()

print(E.T) ## transpose: rows become columns, so the 5x4 matrix prints as 4x5

[[ 7.  7.  4.  4.]
 [ 2.  2.  9.  6.]
 [ 6.  8. 10.  4.]
 [ 6. 10.  1.  4.]
 [ 3.  9.  0.  6.]]

[[ 7.  2.  6.  6.  3.]
 [ 7.  2.  8. 10.  9.]
 [ 4.  9. 10.  1.  0.]
 [ 4.  6.  4.  4.  6.]]


In [16]:
## axis=0 sorts down each column independently; axis=1 would sort along each row instead.
## ndarray.sort() works in place (E itself is modified) and accepts any valid axis of an n-D array.
E.sort(axis=0)
E

array([[ 2.,  2.,  0.,  4.],
       [ 3.,  7.,  1.,  4.],
       [ 6.,  8.,  4.,  4.],
       [ 6.,  9.,  9.,  6.],
       [ 7., 10., 10.,  6.]])

#### *.sort() can be used on 1-dimensional arrays, just don't give it an axis.*

# Masking and Broadcasting

#### *Indexing with a list of positions or with a boolean mask makes a copy of the data, so changing the result doesn't change the original array (unlike a slice)*

In [50]:
F = np.arange(20, 25)

## index with a list inside the brackets -> F[index_list]
print(F[[0, 3, 4]]) ## picks the elements at positions 0, 3 and 4

print(F[[True, False, True, False, False]]) ## boolean mask (one flag per element): keeps only the True positions

## a comparison such as F < 23 builds that boolean mask for you
print(F[F<23]) ## keeps only the values below 23

[20 23 24]
[20 22]
[20 21 22]


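#### *Combine array conditions with & (and) or | (or), wrapping each condition in parentheses, e.g. F[(F > 20) & (F < 24)]. Python's 'and' / 'or' only work on single True/False values and raise an error when used on whole arrays.*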

In [18]:
G = np.arange(40, 60).reshape(2, 10)
print(G + 5) ## broadcasting: the scalar 5 is added to every element, whatever the array's shape (arrays of compatible shape can be added the same way)

Z = np.arange(2, 8).reshape(2, 3)
W = np.arange(14, 20).reshape(2, 3)

print()

print(np.hstack((Z,W))) ## joins side by side (column-wise): two 2x3 arrays -> one 2x6 array

print()

print(np.vstack((Z,W))) ## stacks one on top of the other (row-wise): two 2x3 arrays -> one 4x3 array

[[45 46 47 48 49 50 51 52 53 54]
 [55 56 57 58 59 60 61 62 63 64]]

[[ 2  3  4 14 15 16]
 [ 5  6  7 17 18 19]]

[[ 2  3  4]
 [ 5  6  7]
 [14 15 16]
 [17 18 19]]


#### *Can use .concatenate() for joining on existing axis*

In [19]:
import pandas as pd

# Pandas Series and DataFrames

In [20]:
# A Series is a 1-D array of values paired with an index of labels;
# here the labels are custom strings instead of the default 0..n-1.
A = pd.Series(data=[1, 2, 3, 4, 5], index=list("abcde"))
print(A)

a    1
b    2
c    3
d    4
e    5
dtype: int64


In [21]:
print(A.values) # the underlying data as a NumPy array
print(A.index) # the labels, as a pandas Index object

[1 2 3 4 5]
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')


#### *When slicing by explicit labels (e.g. A["a":"c"]), the final label is included too. Slicing by position (e.g. A[0:2]) excludes the end, the same as in numpy.*

#### *Can create a series with a dictionary, just pass it to pd.Series()*

In [22]:
A = pd.Series([1,2,3,4,5], index=["a", "b", "c", "d", "e"]) 
B = pd.Series([10, 15, 25, 35, 600], index=["a", "b", "c", "d", "e"])

## A DataFrame is a 2-D labeled table. Built from a dict of Series: each key becomes a column name,
## each Series fills that column, and the rows are matched up by index label.
C = pd.DataFrame({"A": A, "B": B})
print(C)

print()

print(C.T) ## transpose: the row labels become the columns and vice versa

   A    B
a  1   10
b  2   15
c  3   25
d  4   35
e  5  600

    a   b   c   d    e
A   1   2   3   4    5
B  10  15  25  35  600


In [23]:
print(C.values) # the data as a 2-D NumPy array, without row/column labels
print(C.index) # row labels

print(C.values[2, 0]) ## .values is a plain NumPy array, so it takes positional [row, column] indexes

print(C.columns) # column labels

C['D'] = C['B'] / 90 ## assigning to a new key adds a column, like adding a key to a dictionary
print(C['D'])

del C["D"] ## removes the column again, like deleting a dictionary key

[[  1  10]
 [  2  15]
 [  3  25]
 [  4  35]
 [  5 600]]
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
3
Index(['A', 'B'], dtype='object')
a    0.111111
b    0.166667
c    0.277778
d    0.388889
e    6.666667
Name: D, dtype: float64


# Missing values and using .loc, .iloc

In [24]:
## Built from a list of dicts: each dict is one row, and the union of the keys forms the columns.
## A key missing from a row becomes NaN (row 0 has no "c", row 1 has no "a").
## NOTE: this rebinds D, which held a NumPy array in the slicing section.
D = pd.DataFrame([{'a': 1, "b": 4}, {'b': -3, "c": 9}])
D

Unnamed: 0,a,b,c
0,1.0,4,
1,,-3,9.0


In [25]:
D = D.fillna(2) ## replaces every NaN with the fixed value 2

## dropna(axis=1) drops every column that still contains a NaN (axis=0 would drop rows instead).
## Because fillna() already ran, there is nothing left to drop and all three columns survive.
D = D.dropna(axis=1)
D

Unnamed: 0,a,b,c
0,1.0,4,2.0
1,2.0,-3,9.0


In [26]:
# Integer labels deliberately out of order, so labels and positions disagree.
data = pd.Series(["a", "b", "c"], index=[3,0,5])

print(data.loc[0])   # label 0    -> "b"
print(data.iloc[0])  # position 0 -> "a"

print()

print(data.loc[5])   # label 5 -> "c"
try:
    print(data.iloc[5])  # position 5 does not exist in a 3-element Series
except IndexError:
    # Catch only the expected out-of-bounds error so that any other failure
    # (or a keyboard interrupt) is not silently swallowed.
    print("Error!")

print()

print(data.loc[0:5])   # label slice: from label 0 through label 5, end label included
print(data.iloc[0:5])  # positional slice: positions 0-4, clipped to the 3 rows that exist

b
a

c
Error!

0    b
5    c
dtype: object
3    a
0    b
5    c
dtype: object


#### *Use .loc for explicit indexing, and .iloc for implicit indexing (explicit --> created by you)*

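#### *Plain [] indexing is ambiguous when the index holds integers: data[0] looks up the label 0 (explicit), while a slice such as data[0:2] goes by position (implicit). Prefer .loc / .iloc so the intent is clear.*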

# Pandas Exercise

In [27]:
from sklearn.impute import SimpleImputer

In [28]:
## Loads the dataset from a path relative to the current working directory.
## The \' inside the double-quoted string is simply an apostrophe, so the folder is named CSV's.
covid_csv  = pd.read_csv("./CSV\'s/covid_19_data.csv")

In [None]:
covid_csv.head(10) ## shows the first n rows of the dataset (n = 10 here)

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0
5,6,01/22/2020,Guangdong,Mainland China,1/22/2020 17:00,26.0,0.0,0.0
6,7,01/22/2020,Guangxi,Mainland China,1/22/2020 17:00,2.0,0.0,0.0
7,8,01/22/2020,Guizhou,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
8,9,01/22/2020,Hainan,Mainland China,1/22/2020 17:00,4.0,0.0,0.0
9,10,01/22/2020,Hebei,Mainland China,1/22/2020 17:00,1.0,0.0,0.0


In [35]:
covid_csv.shape # (number of rows, number of columns)

(306429, 8)

In [None]:
print(covid_csv.columns) # column names
print(covid_csv.dtypes) # dtype of each column; the date columns show up as "object", i.e. they were not parsed as dates

Index(['SNo', 'ObservationDate', 'Province/State', 'Country/Region',
       'Last Update', 'Confirmed', 'Deaths', 'Recovered'],
      dtype='object')
SNo                  int64
ObservationDate     object
Province/State      object
Country/Region      object
Last Update         object
Confirmed          float64
Deaths             float64
Recovered          float64
dtype: object


In [None]:
print(covid_csv.isna().any()) ## per column: True if at least one value is NaN

print()

print(covid_csv.isna().sum()) ## per column: how many values are NaN (each True counts as 1)

SNo                False
ObservationDate    False
Province/State      True
Country/Region     False
Last Update        False
Confirmed          False
Deaths             False
Recovered          False
dtype: bool

SNo                    0
ObservationDate        0
Province/State     78103
Country/Region         0
Last Update            0
Confirmed              0
Deaths                 0
Recovered              0
dtype: int64


In [162]:
print(covid_csv["Deaths"].count())  # number of non-NaN values

print(covid_csv["Deaths"].mean())

print(covid_csv["Deaths"].max())

# NOTE(review): the minimum printed below is negative; a negative death count looks
# like a data-quality issue -- confirm before relying on these statistics.
print(covid_csv["Deaths"].min())

print() 

## .quantile() manual approach (linear interpolation, which is the pandas default)
# q = 0.25
# sorted_deaths = covid_csv["Deaths"].sort_values().reset_index(drop=True)
# position = (sorted_deaths.count() - 1) * q      # fractional position in the sorted data
# below = int(position)                           # index of the value just below that position
# above = min(below + 1, sorted_deaths.count() - 1)
# fraction = position - below                     # how far between the two neighbours
# calculating = sorted_deaths[below] + fraction * (sorted_deaths[above] - sorted_deaths[below])
# print(calculating)

#.quantile(q) --> the value that a fraction q of the data lies below (0.25 -> 25%)
print(covid_csv["Deaths"].quantile(0.25)) 

print(covid_csv['Deaths'].quantile(0.5))  # the median

print(covid_csv["Deaths"].quantile(0.75))

print()

## .std() manual approach (pandas uses the sample formula, dividing by n - 1)
# mean = covid_csv["Deaths"].mean()
# sqr_differences = (covid_csv["Deaths"] - mean) ** 2
# variance = sqr_differences.sum() / (covid_csv["Deaths"].count() - 1)
# std = variance ** 0.5

## standard deviation: the typical (root-mean-square) distance of the data points from the mean
print(covid_csv["Deaths"].std())


## Can just use .describe(): count, mean, std, min, quartiles and max for every numeric column
print(covid_csv.describe())

## It's good to do these for all numeric fields

306429
2036.4032679674574
112385.0
-178.0

13.0
192.0
1322.0

6410.9380477066725
                 SNo     Confirmed         Deaths     Recovered
count  306429.000000  3.064290e+05  306429.000000  3.064290e+05
mean   153215.000000  8.567091e+04    2036.403268  5.042029e+04
std     88458.577156  2.775516e+05    6410.938048  2.015124e+05
min         1.000000 -3.028440e+05    -178.000000 -8.544050e+05
25%     76608.000000  1.042000e+03      13.000000  1.100000e+01
50%    153215.000000  1.037500e+04     192.000000  1.751000e+03
75%    229822.000000  5.075200e+04    1322.000000  2.027000e+04
max    306429.000000  5.863138e+06  112385.000000  6.399531e+06


#### *Use sort_values() not sort() for Series*

#### *Use reset_index(drop=True) when you want to create a new sequential index*