In [4]:
# Numpy ndarray
import numpy as np

## Creation
### From list: wrap an existing Python list in a 1-D ndarray
lst = [1, 2, 3, 4, 5, 6]
npa_1 = np.array(lst)
print(npa_1)
### From range: the integers 1..6 laid out as 3 rows x 2 columns
npa_2 = np.reshape(np.arange(1, 7), (3, 2))
print(npa_2)
### From shape: pre-sized integer arrays built by zeros / empty / ones
shape = (3, 2)
# NOTE: np.empty does not initialise its memory, so the values it prints are
# arbitrary and may differ from run to run.
for make_filled in (np.zeros, np.empty, np.ones):
    print(make_filled(shape, dtype = int))
### From data (no example in this cell)

[1 2 3 4 5 6]
[[1 2]
 [3 4]
 [5 6]]
[[0 0]
 [0 0]
 [0 0]]
[[0 0]
 [0 0]
 [0 0]]
[[1 1]
 [1 1]
 [1 1]]


In [5]:
## Manipulation
### Single element: one index per axis
print(npa_1[0])
print(npa_2[0, 0])
### Multiple elements: slices, then a list of positions
print(npa_1[:4])
print(npa_2[:2, :])
picked_positions = [0, 1, 4]
print(npa_1[picked_positions])
### Counts: total number of elements, length of the first axis, full shape
print(npa_2.size)
print(len(npa_2))
print(npa_2.shape)
### Iteration: visit every element, first row-major ('C'), then column-major ('F')
for order in ('C', 'F'):
    for i in np.nditer(npa_2, order = order):
        print(i)

1
1
[1 2 3 4]
[[1 2]
 [3 4]]
[1 2 5]
6
3
(3, 2)
1
2
3
4
5
6
1
3
5
2
4
6


In [6]:
## Stats: np.<stat>(ndarray, axis), where <stat> stands for mean, median, min, max, ...
##   axis = 0 -> one result per column (computed over the rows)
##   axis = 1 -> one result per row (computed over the columns)
for row_stat in (np.mean, np.median, np.min, np.max):
    print(row_stat(npa_2, axis = 1))

[1.5 3.5 5.5]
[1.5 3.5 5.5]
[1 3 5]
[2 4 6]


In [7]:
## Masking: index an array with a Boolean array (bitmask) to keep the True positions
npa_3 = np.arange(5)
print(npa_3)
### Single mask
below_three = npa_3 < 3
print(npa_3[below_three])
### Joint mask: combine comparisons element-wise with &, each one parenthesised
between_one_and_four = (npa_3 > 1) & (npa_3 < 4)
print(npa_3[between_one_and_four])
### Aggregate comparison: np.any(); np.all()
equals_own_abs = npa_3 == np.abs(npa_3)
print(np.all(equals_own_abs))

[0 1 2 3 4]
[0 1 2]
[2 3]
True


In [8]:
# Pandas - Versatile ndarrays
import pandas as pd

## Series: a 1-D array of values (.values) paired with an index (.index), like dict values and keys
### Creation: pd.Series(data (List, Array, Dict), index)
# Without an index argument the labels default to 0..n-1.
print(pd.Series([1, 2, 3, 4, 5]))
# Labels can be given through the index argument ...
s_1 = pd.Series([1, 2, 3, 4, 5], index = list('abcde'))
# ... or taken from dict keys; this rebinds s_1 to a Series with the same labels and values.
s_1 = pd.Series(dict(zip('abcde', range(1, 6))))
for part in (s_1, s_1.values, s_1.index):
    print(part)
### Read access
s_2 = pd.Series(range(6), index = list('abcdef'))
print(s_2)
# The accessors are spelled out explicitly below. A bare s_2[0] on a
# string-labelled Series relies on [] falling back to positional lookup, which
# pandas has deprecated; .iloc (position) and .loc (label) are unambiguous.
#### Single element: s.iloc[i]; s.loc['i']
print(s_2.iloc[0])        # Implicit (positional) index
print(s_2.loc['a'])       # Explicit (label) index
#### Multiple elements: s.iloc[i:j]; s.loc['i':'j']
print(s_2.iloc[0:3])      # Implicit index, excluding the last
print(s_2.loc['a':'c'])   # Explicit index, including the last
#### Masking
print(s_2[s_2 == 3])                                 # Masking on values: keeps matching rows only
print(s_2.where(s_2 == 3))                           # Same mask, but keeps every row and fills non-matches with NaN
print(s_2[(s_2.index == 'a') | (s_2.index == 'e')])  # Masking on index

0    1
1    2
2    3
3    4
4    5
dtype: int64
a    1
b    2
c    3
d    4
e    5
dtype: int64
[1 2 3 4 5]
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
a    0
b    1
c    2
d    3
e    4
f    5
dtype: int64
0
0
a    0
b    1
c    2
dtype: int64
a    0
b    1
c    2
dtype: int64
d    3
dtype: int64
a    NaN
b    NaN
c    NaN
d    3.0
e    NaN
f    NaN
dtype: float64
a    0
e    4
dtype: int64


In [18]:
## DataFrame: 2-D values (.values) with row labels (.index) and column labels (.columns)
### Creation: pd.DataFrame(data, index, columns)
df_1 = pd.DataFrame(np.arange(15).reshape(5, 3),
                    index = list('abcde'),
                    columns = ['col{}'.format(k) for k in range(3)])
for part in (df_1, df_1.values, df_1.index, df_1.columns):
    print(part)
### Read access (summary only; no examples in this cell)
#### df[...] with column labels: [i] for one column, [[i, j]] for several
#### df[...] with column positions: not available (X)
#### df[...] with row labels: slicing only
#### df[...] with row positions: slicing only
#### DataFrame.loc[]: label-based; row(s), or row + column
#### DataFrame.iloc[]: position-based; row(s), or row + column

   col0  col1  col2
a     0     1     2
b     3     4     5
c     6     7     8
d     9    10    11
e    12    13    14
[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]
 [12 13 14]]
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
Index(['col0', 'col1', 'col2'], dtype='object')


In [21]:
### Missing values
# col0 was built from np.arange, so it is an integer column and cannot hold NaN.
# Cast it to float explicitly before inserting missing values: relying on a
# .loc assignment to upcast the column silently is deprecated in recent pandas.
df_1['col0'] = df_1['col0'].astype('float64')
df_1.loc['a', 'col0'] = None               # None is stored as NaN in a float column
df_1.loc['b', 'col0'] = np.nan
print(df_1.isna())                         # On DataFrame: Boolean frame, True where a value is missing
print(df_1[df_1['col0'].isna()])           # On Series: use the column's mask to select the affected rows
print(df_1.isna().sum())                   # Count of missing values per column
print(df_1['col0'].isna().sum())           # Count of missing values in one specific column
print(df_1.dropna(axis = 1, how = 'any'))  # Drop columns that have any missing values
### Data aggregation: df.groupby(grouping_variable)[column].aggregation_functions()
# Same six records as before, written column by column; `columns` pins the column order.
df_2 = pd.DataFrame({'ID':    [1, 1, 2, 2, 2, 3],
                     'Year':  [2010, 2011, 2010, 2011, 2012, 2012],
                     'Value': [50, 100, 60, 30, 10, 500]},
                    columns = ['ID', 'Year', 'Value'])
value_by_year = df_2.groupby('Year')['Value']
# One row per Year, one column per aggregate
print(value_by_year.agg(['sum', 'mean', 'std']))
### Data transformation: df.groupby(grouping_variable)[column].transform()
#### transform returns a Series indexed like the original frame, so the
#### per-group aggregate can be attached back to it as a new column
df_2['avg_year'] = value_by_year.transform('mean')
print(df_2)



    col0   col1   col2
a   True  False  False
b   True  False  False
c  False  False  False
d  False  False  False
e  False  False  False
   col0  col1  col2
a   NaN     1     2
b   NaN     4     5
col0    2
col1    0
col2    0
dtype: int64
2
   col1  col2
a     1     2
b     4     5
c     7     8
d    10    11
e    13    14
      sum  mean         std
Year                       
2010  110    55    7.071068
2011  130    65   49.497475
2012  510   255  346.482323
   ID  Year  Value  avg_year
0   1  2010     50        55
1   1  2011    100        65
2   2  2010     60        55
3   2  2011     30        65
4   2  2012     10       255
5   3  2012    500       255
