In [18]:
import numpy as np
from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every bare expression statement in a cell, not only the
# last one, so a single cell can show several intermediate results.
InteractiveShell.ast_node_interactivity = "all"

In [19]:
# create a 2D array with shape (2, 3)
a = np.array([[1, 2, 3], [4, 5, 6]])
a
# a single row kept 2D (shape (1, 3)) so its columns line up with a's
b = np.array([[7, 8, 9]])
b
# join along axis 0: b is appended as a new row below a
np.concatenate((a, b), axis=0)

array([[1, 2, 3],
       [4, 5, 6]])

array([[7, 8, 9]])

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [20]:
# rebuild a as a 2x4 array holding the values 1..8
a = np.arange(1, 9).reshape(2, 4)
a
# split a into 3 sub arrays at column indices 3 and 5:
#   a1 = a[:, :3], a2 = a[:, 3:5], a3 = a[:, 5:]
# a has only 4 columns, so a2 ends up with one column and a3 is empty
a1, a2, a3 = np.split(a, [3, 5], axis=1)

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [21]:
# show the three pieces produced by the split, in order
a1
a2
a3

array([[1, 2, 3],
       [5, 6, 7]])

array([[4],
       [8]])

array([], shape=(2, 0), dtype=int64)

In [22]:
# first piece of the split: the first three columns of a
a1

array([[1, 2, 3],
       [5, 6, 7]])

In [23]:
# rebind a2 to a single row of shape (1, 3)
a2 = np.array([[7, 8, 9]])
a2

# broadcasting: a2's size-1 first axis is stretched to match a1's two rows,
# so the same row is added to every row of a1
a1 + a2

array([[7, 8, 9]])

array([[ 8, 10, 12],
       [12, 14, 16]])

In [24]:
# left operand of the broadcast: two rows, three columns
a1.shape
a1

(2, 3)

array([[1, 2, 3],
       [5, 6, 7]])

In [25]:
# right operand of the broadcast: one row, three columns
a2
a2.shape

array([[7, 8, 9]])

(1, 3)

In [26]:
# shapes (2, 3) + (1, 3): only a2's first (size-1) axis is broadcast;
# the second axis already has matching length 3, so nothing is stretched there
a1 + a2

array([[ 8, 10, 12],
       [12, 14, 16]])

In [27]:
# calculate the average of each row
# axis=-1 reduces over the last axis (the columns), leaving one value per row
np.average(a1 + a2, axis=-1)

array([10., 14.])

In [28]:
# rebind a2 to the first row of a (1-D result)
# NOTE(review): no later visible cell reads this a2 - confirm it is still needed
a2 = a[0, :]

In [29]:
# broadcasting on both axes: a is a row of shape (1, 2), b a column of shape (2, 1)
a = np.array([[1, 2]])
b = np.array([[3], [4]])
a
b
# each operand is stretched along its size-1 axis, giving a (2, 2) result
# where entry [i, j] is b[i, 0] + a[0, j]
a + b

array([[1, 2]])

array([[3],
       [4]])

array([[4, 5],
       [5, 6]])

In [30]:
# start from a 1-D vector of length 2
x = np.array([1, 2])
# prepend two length-1 axes in a single call -> shape (1, 1, 2)
y = np.expand_dims(x, axis=(0, 1))
y.shape
x.shape

# indexing with np.newaxis twice builds the same (1, 1, 2) shape
x[np.newaxis, np.newaxis, :].shape

(1, 1, 2)

(2,)

(1, 1, 2)

In [31]:
y.shape
# squeeze(axis=2) must fail here: axis 2 has length 2, and only length-1 axes
# can be squeezed out. The error is the point of this cell, so catch it and
# print it instead of letting it abort a "Restart & Run All".
try:
    y.squeeze(axis=2).shape
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

(1, 1, 2)

ValueError: cannot select an axis to squeeze out which has size not equal to one

In [32]:
# a is a stack of nine 5x3 matrices (3-D), c is a single 3x5 matrix (2-D)
a = np.ones([9, 5, 3])
c = np.ones([3, 5])

# with a 2-D right-hand operand, matmul and dot give the same result even
# though a is 3-D; the sum counts the elements where the two results differ
# NOTE: the two functions are not interchangeable in general - they can
# differ when both operands have more than two dimensions
np.sum(np.matmul(a, c) != np.dot(a, c), axis=None)

0

In [34]:
# 2x3 float array with one missing value (NaN) in each row
a = np.array([[2, 1, np.nan], [3, np.nan, 5]])
a

array([[ 2.,  1., nan],
       [ 3., nan,  5.]])

In [35]:
# calculate average without nan for each row
# axis=1 reduces across the columns; NaN entries are skipped, so each row
# is averaged over its two remaining values
np.nanmean(a, axis=1)
a.shape

array([1.5, 4. ])

(2, 3)

In [36]:
# calculate average without nan for each column
# axis=0 reduces down the rows; a column with one NaN is averaged over its
# single remaining value
np.nanmean(a, axis=0)

array([2.5, 1. , 5. ])

In [37]:
import pandas as pd

# same values as the NumPy NaN example, now held in a DataFrame with default
# integer row and column labels
# NOTE(review): this rebinds `a` from an ndarray to a DataFrame
a = pd.DataFrame([[2, 1, np.nan], [3, np.nan, 5]])
a

Unnamed: 0,0,1,2
0,2,1.0,
1,3,,5.0


In [38]:
# a[0] selects the column labelled 0, a.loc[0] selects the row labelled 0;
# both come back as a Series
type(a[0])
type(a.loc[0])

pandas.core.series.Series

pandas.core.series.Series

In [39]:
# load the product hierarchy table, using the first CSV column as the row index
df = pd.read_csv("data/product_hierarchy.csv", index_col=[0])
# preview the first five rows
df.head(5)

Unnamed: 0_level_0,product_length,product_depth,product_width,cluster_id,hierarchy1_id,hierarchy2_id,hierarchy3_id,hierarchy4_id,hierarchy5_id
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P0000,5.0,20.0,12.0,,H00,H0004,H000401,H00040105,H0004010534
P0001,13.5,22.0,20.0,cluster_5,H01,H0105,H010501,H01050100,H0105010006
P0002,22.0,40.0,22.0,cluster_0,H03,H0315,H031508,H03150800,H0315080028
P0004,2.0,13.0,4.0,cluster_3,H03,H0314,H031405,H03140500,H0314050003
P0005,16.0,30.0,16.0,cluster_9,H03,H0312,H031211,H03121109,H0312110917


In [40]:
# column labels of the frame, then the name(s) of its row index
df.columns
df.index.names

Index(['product_length', 'product_depth', 'product_width', 'cluster_id',
       'hierarchy1_id', 'hierarchy2_id', 'hierarchy3_id', 'hierarchy4_id',
       'hierarchy5_id'],
      dtype='object')

FrozenList(['product_id'])

In [41]:
# get the information of the data frame
# including the number of rows, columns, data types, and memory usage
df.info()

# count the missing values in each column
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 699 entries, P0000 to P0748
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   product_length  681 non-null    float64
 1   product_depth   683 non-null    float64
 2   product_width   683 non-null    float64
 3   cluster_id      649 non-null    object 
 4   hierarchy1_id   699 non-null    object 
 5   hierarchy2_id   699 non-null    object 
 6   hierarchy3_id   699 non-null    object 
 7   hierarchy4_id   699 non-null    object 
 8   hierarchy5_id   699 non-null    object 
dtypes: float64(3), object(6)
memory usage: 54.6+ KB


product_length    18
product_depth     16
product_width     16
cluster_id        50
hierarchy1_id      0
hierarchy2_id      0
hierarchy3_id      0
hierarchy4_id      0
hierarchy5_id      0
dtype: int64

In [42]:
# result of dataframe groupby can be iterated
# each iteration yields the group key and the sub-DataFrame of matching rows
# NOTE(review): printing every full group produces a long plain-text dump;
# consider showing only sub_df.head() or sub_df.shape
for key, sub_df in df.groupby("hierarchy1_id"):
    print(key)
    print(sub_df)

H00
            product_length  product_depth  product_width cluster_id   
product_id                                                            
P0000                  5.0           20.0           12.0        NaN  \
P0008                  5.0           16.0            5.0  cluster_0   
P0009                  5.0           18.0           14.0  cluster_6   
P0015                 10.0           33.0           10.0  cluster_1   
P0016                  2.0           18.0            8.0  cluster_0   
...                    ...            ...            ...        ...   
P0719                  6.0           17.0            6.0        NaN   
P0724                  6.0           24.0           10.0  cluster_0   
P0725                  NaN            NaN            NaN  cluster_0   
P0726                  1.0           15.0           10.0  cluster_0   
P0733                  2.0            4.0            9.0  cluster_7   

           hierarchy1_id hierarchy2_id hierarchy3_id hierarchy4_id   
pr

In [43]:
# small frame with a two-level row index: (bar|foo) x (one|two)
df = pd.DataFrame(
    {"A": [3, 5, 7, 6], "B": ["a", "b", "c", "d"]},
    index=pd.MultiIndex.from_product([["bar", "foo"], ["one", "two"]]),
)
df
# stack the DataFrame columns
# the column labels A/B become a third, innermost index level and the
# result is a single Series
sdf = df.stack()
sdf

Unnamed: 0,Unnamed: 1,A,B
bar,one,3,a
bar,two,5,b
foo,one,7,c
foo,two,6,d


bar  one  A    3
          B    a
     two  A    5
          B    b
foo  one  A    7
          B    c
     two  A    6
          B    d
dtype: object

In [44]:
# Turn the stacked Series into a flat, long-format DataFrame in one chain.
# Expects `sdf` to still be the three-level Series produced by df.stack();
# re-run that cell first before re-running this one.
sdf = (
    # wrap the values in a one-column frame and name that column directly
    sdf.to_frame(name="col")
    # set the names of the three index levels
    .rename_axis(index=["first", "second", "third"])
    # reset the index so the levels become ordinary dataframe columns
    .reset_index()
)

In [45]:
# long-format result: one row per (first, second, third) combination
sdf

Unnamed: 0,first,second,third,col
0,bar,one,A,3
1,bar,one,B,a
2,bar,two,A,5
3,bar,two,B,b
4,foo,one,A,7
5,foo,one,B,c
6,foo,two,A,6
7,foo,two,B,d


In [46]:
# long-format sample data used by the pivot and groupby examples
# NOTE(review): this rebinds `df`, replacing the frame from earlier cells
df = pd.DataFrame(
    {
        "foo": ["one", "one", "one", "two", "two", "two"],
        "bar": ["A", "B", "C", "A", "B", "C"],
        "baz": [1, 2, 3, 4, 5, 6],
        "zoo": ["x", "y", "z", "q", "w", "t"],
    }
)
df

Unnamed: 0,foo,bar,baz,zoo
0,one,A,1,x
1,one,B,2,y
2,one,C,3,z
3,two,A,4,q
4,two,B,5,w
5,two,C,6,t


In [47]:
# directly create a pivot table (pure reshape, no aggregation)
# row is foo, column is baz, value is zoo
# (foo, baz) combinations that do not occur in df are left as missing values

df.pivot(index="foo", columns="baz", values="zoo")

baz,1,2,3,4,5,6
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,x,y,z,,,
two,,,,q,w,t


In [48]:
# groupby(...).apply runs a function on each group's sub-frame and combines
# the results; here every foo group is ordered by baz, largest first, and the
# index is then renumbered from 0
(
    df.groupby("foo")
    .apply(lambda group: group.sort_values("baz", ascending=False))
    .reset_index(drop=True)
)

Unnamed: 0,foo,bar,baz,zoo
0,one,C,3,z
1,one,B,2,y
2,one,A,1,x
3,two,C,6,t
4,two,B,5,w
5,two,A,4,q
