##### Importing Modules

In [1]:
import numpy as np
import pandas as pd


# **Series**

In [2]:
# A Series is a labelled 1-D array; mixing ints with one float upcasts everything to float64.
data = pd.Series([1, 2, 3.5, 10])
data

0     1.0
1     2.0
2     3.5
3    10.0
dtype: float64

In [3]:
# The underlying data, exposed as a NumPy array
data.values

array([ 1. ,  2. ,  3.5, 10. ])

In [4]:
# The index labels; no index was supplied, so pandas generated a default RangeIndex
data.index

RangeIndex(start=0, stop=4, step=1)

### Construction of Pandas Series Object

* #### From List

In [5]:
# Build a Series straight from a Python list (default integer index 0..n-1).
data = pd.Series([1, 2, 3.5, 10])
data

0     1.0
1     2.0
2     3.5
3    10.0
dtype: float64

* #### From NumPy array

In [6]:
# Five evenly spaced points on [0, 1], wrapped in a Series.
data = pd.Series(np.linspace(0, 1, num=5))
data

0    0.00
1    0.25
2    0.50
3    0.75
4    1.00
dtype: float64

* #### With Explicit Index

In [7]:
# Random values labelled 'a'..'e' instead of the default 0..4.
# (No seed is set, so the numbers differ on every run.)
data = pd.Series(np.random.random(5), index=list('abcde'))
data

a    0.870367
b    0.747016
c    0.102479
d    0.527021
e    0.271719
dtype: float64

* #### From Dictionary

In [8]:
# Dictionary keys become the index labels, dictionary values become the data.
data = pd.Series(dict(a=1, b=2, c=3, d=5, e=10))
data

a     1
b     2
c     3
d     5
e    10
dtype: int64

# **DataFrame**

### Construction

* #### From List of Lists

In [9]:
# Each inner list is one row; `columns` names the four positions.
data = pd.DataFrame(
    [
        [1, 2, 3.5, 10],
        [21, 20, 3, 0],
        [10, 2, 5, 1],
    ],
    columns=list('ABCD'),
)
data

Unnamed: 0,A,B,C,D
0,1,2,3.5,10
1,21,20,3.0,0
2,10,2,5.0,1


* #### With Explicit Index

In [10]:
# Same row-wise construction, but with row labels 10/20/30 instead of 0/1/2.
data = pd.DataFrame(
    [
        [1, 2, 3.5],
        [0, 3, 0],
        [2, 5, 1],
    ],
    index=[10, 20, 30],
    columns=list('ABC'),
)
data

Unnamed: 0,A,B,C
10,1,2,3.5
20,0,3,0.0
30,2,5,1.0


* #### From NumPy n-dimensional array

In [11]:
# A 3x4 array of random integers in [0, 20) becomes a 3-row, 4-column frame.
# (No seed is set, so the numbers differ on every run.)
data = pd.DataFrame(np.random.randint(0, 20, size=(3, 4)), columns=list('ABCD'))
data

Unnamed: 0,A,B,C,D
0,13,3,18,6
1,10,13,19,16
2,15,18,18,16


* #### From List of Dictionaries

In [12]:
# Each dictionary is one row; its keys supply the column names.
data = pd.DataFrame(
    [
        {'a': 1, 'b': 2, 'c': 3, 'd': 5, 'e': 10},
        {'a': 10, 'b': 12, 'c': 31, 'd': 25, 'e': 0},
    ]
)
data

Unnamed: 0,a,b,c,d,e
0,1,2,3,5,10
1,10,12,31,25,0


* #### From Dictionary of Lists

In [13]:
# Each key is a column name and each list holds that column's values.
data = pd.DataFrame(
    {
        'a': [1, 2, 3, 5, 4],
        'b': [12, 13, 14, 15, 16],
        'c': [31, 25, 20, 22, 23],
    }
)
data

Unnamed: 0,a,b,c
0,1,12,31
1,2,13,25
2,3,14,20
3,5,15,22
4,4,16,23


* #### From Dictionary of Numpy arrays

In [14]:
# One column per NumPy random generator, five draws each.
# (No seed is set, so the numbers differ on every run.)
data = pd.DataFrame(
    {
        'rand': np.random.rand(5),
        'randn': np.random.randn(5),
        'random': np.random.random(5),
    }
)
data

Unnamed: 0,rand,randn,random
0,0.946479,0.764694,0.663398
1,0.048147,-0.340867,0.820753
2,0.199457,-1.43634,0.348399
3,0.439314,-0.849128,0.106496
4,0.151398,0.532612,0.432759


* #### From Dictionary of Dictionaries

In [15]:
# Outer keys become columns; inner keys become the row labels.
df = pd.DataFrame(
    {
        'A': {'a': 1, 'b': 2, 'c': 3},
        'B': {'a': 10, 'b': 12, 'c': 31},
        'C': {'a': 0, 'b': 1, 'c': 9},
    }
)
df

Unnamed: 0,A,B,C
a,1,10,0
b,2,12,1
c,3,31,9


* #### From Array of Series

In [16]:
# Three independent random Series, drawn in order ser1 -> ser2 -> ser3.
ser1, ser2, ser3 = (pd.Series(np.random.rand(3)) for _ in range(3))

# Stacking the Series in an array makes each Series one *row* of the frame;
# `columns` then names the three positions within each Series.
stacked_rows = np.array([ser1, ser2, ser3])
df = pd.DataFrame(stacked_rows, columns=['alpha', 'beta', 'gamma'])
del stacked_rows  # keep the notebook namespace identical to the original cell
df

Unnamed: 0,alpha,beta,gamma
0,0.266028,0.562508,0.025017
1,0.209058,0.344811,0.168777
2,0.281032,0.898272,0.13172


* #### From Dictionary of Series

In [17]:
# Three Series sharing the row labels 'a', 'b', 'c'.
alpha = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
beta = pd.Series([10, 12, 31], index=['a', 'b', 'c'])
gamma = pd.Series([0, 1, 9], index=['a', 'b', 'c'])

# Each dictionary key names a column; rows are aligned on the Series' index.
data = pd.DataFrame(dict(alpha=alpha, beta=beta, gamma=gamma))
data

Unnamed: 0,alpha,beta,gamma
a,1,10,0
b,2,12,1
c,3,31,9


## Pandas Index object

In [18]:
# An Index can be built directly from any array-like; here six evenly spaced floats in [1, 5].
idx = pd.Index(np.linspace(1, 5, num=6))
idx

Float64Index([1.0, 1.8, 2.6, 3.4000000000000004, 4.2, 5.0], dtype='float64')

As immutable array

In [19]:
# An Index is immutable: item assignment raises TypeError.
# Catch only that exception, so an unrelated failure (for example a NameError
# because `idx` was never defined in this kernel) is not mis-reported as the
# expected immutability error.
try:
    idx[0] = 0
except TypeError:
    print("ERROR!")

ERROR!


## Indexing and Slicing

* #### Series as dictionary

In [20]:
# Integer values keyed by (sorted) single-letter labels; note 'e' and 'g' are deliberately absent.
data = pd.Series([0, 2, 3, 4, 12, 18], index=list('abcdfh'))
data

a     0
b     2
c     3
d     4
f    12
h    18
dtype: int64

In [21]:
# Dictionary-style lookup: fetch the value stored under label 'a'
data['a']

0

In [22]:
# Label slice whose end label 'e' is not in the index; the result covers 'c' and 'd'
data['c':'e']

c    3
d    4
dtype: int64

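**NOTE :** Slicing with a label that is not in the index (here `'e'`) only works when the index is sorted lexicographically; on an unsorted index it raises a `KeyError`.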

In [23]:
# Like dict.keys(): returns the index labels
data.keys()

Index(['a', 'b', 'c', 'd', 'f', 'h'], dtype='object')

In [24]:
# The values as a NumPy array (attribute, not a method call)
data.values

array([ 0,  2,  3,  4, 12, 18])

In [25]:
# Like dict.items(): iterate (label, value) pairs; list() materialises them for display
list(data.items())

[('a', 0), ('b', 2), ('c', 3), ('d', 4), ('f', 12), ('h', 18)]

In [26]:
# Dictionary-style assignment: overwrite the value stored under label 'f' (modifies `data` in place)
data['f'] = 5
data

a     0
b     2
c     3
d     4
f     5
h    18
dtype: int64

* #### Series as 1-D Array

Implicit Indexing

In [27]:
# Positional (implicit) access to the second element.
# Plain `data[1]` on a string-labelled Series relied on pandas falling back to a
# positional lookup; that fallback is deprecated in recent pandas releases, so
# use `.iloc` to request a position explicitly.
data.iloc[1]

2

Explicit Indexing

In [28]:
# Explicit access: look the value up by its index label
data['a']

0

Implicit Slicing

In [29]:
# Integer slice on a string-labelled Series: selects positions 2 and 3 (end exclusive)
data[2: 4]

c    3
d    4
dtype: int64

Explicit Slicing

In [30]:
# Label slice: both endpoints, 'b' and 'd', are included in the result
data['b':'d']

b    2
c    3
d    4
dtype: int64

NOTE: final index inclusive

Fancy Indexing

In [31]:
# Passing a list of labels returns a new Series containing just those entries
data[['a', 'b', 'f']]

a    0
b    2
f    5
dtype: int64

### Indexers

In [32]:
# Integer labels 2..6 that deliberately differ from the positions 0..4,
# so label-based and position-based access give different answers.
data = pd.Series(np.arange(5), index=np.arange(2, 7))
data

2    0
3    1
4    2
5    3
6    4
dtype: int64

* #### loc
for explicit indexing

In [33]:
# Used below as an index *label* for .loc
idx = 3

In [34]:
# .loc always interprets the key as a label: the value stored under label 3
data.loc[idx]

1

In [35]:
# Label slice from 3 to 5; with .loc the end label is included
data.loc[idx: idx + 2]  # [3:5]

3    1
4    2
5    3
dtype: int64

NOTE: final index inclusive

* #### iloc
for implicit indexing

In [36]:
# Used below as a *position* for .iloc
idx = 1

In [37]:
# .iloc always interprets the key as a position: the second element
data.iloc[idx]

1

In [38]:
# Positional slice covering positions 1 and 2; with .iloc the end position is excluded
data.iloc[idx: idx + 2] # [1:3]

3    1
4    2
dtype: int64

NOTE: final index exclusive

* ### DataFrame as Dictionary

In [39]:
# Rebuild the small example frame (the name `data` was reused for a Series above).
alpha = pd.Series(dict(a=1, b=2, c=3))
beta = pd.Series(dict(a=10, b=12, c=31))
gamma = pd.Series(dict(a=0, b=1, c=9))

# Column name -> Series; rows are aligned on the shared labels 'a', 'b', 'c'.
data = pd.DataFrame(
    {
        'alpha': alpha,
        'beta': beta,
        'gamma': gamma,
    }
)
data

Unnamed: 0,alpha,beta,gamma
a,1,10,0
b,2,12,1
c,3,31,9


In [40]:
# For a DataFrame, keys() returns the column labels
data.keys()

Index(['alpha', 'beta', 'gamma'], dtype='object')

In [41]:
# The whole table as a 2-D NumPy array (one inner array per row)
data.values

array([[ 1, 10,  0],
       [ 2, 12,  1],
       [ 3, 31,  9]])

In [42]:
# The column labels as an Index (same result as data.keys() above)
data.columns

Index(['alpha', 'beta', 'gamma'], dtype='object')

Indexing (columns)

In [43]:
# Dictionary-style lookup on a DataFrame selects a column and returns it as a Series
data['alpha']  # columns

a    1
b    2
c    3
Name: alpha, dtype: int64

Attribute-Style access

In [44]:
# Attribute access returns the same column as data['alpha']
data.alpha

a    1
b    2
c    3
Name: alpha, dtype: int64

Not recommended, because a column name can clash with an existing DataFrame attribute or method.

* ### DataFrame as 2-D Array

Slicing (rows)

In [45]:
# An integer slice (unlike a single key) selects rows by position: here rows 1 and 2
data[1:3]  # rows

Unnamed: 0,alpha,beta,gamma
b,2,12,1
c,3,31,9


Implicit

In [46]:
# Positional access: first row, first column
data.iloc[0, 0]

1

In [47]:
# Positional slices: rows from position 1 onward, first two columns
data.iloc[1:, :2]

Unnamed: 0,alpha,beta
b,2,12
c,3,31


Explicit

In [48]:
# Label access: row 'a', column 'alpha'
data.loc['a', 'alpha']

1

In [49]:
# Label slices: rows from 'b' onward, columns up to and including 'beta'
data.loc['b':, :'beta']

Unnamed: 0,alpha,beta
b,2,12
c,3,31


Almost all kinds of NumPy-style array operations are available, e.g. transposition

In [50]:
# Transpose: row labels become columns and column labels become rows
data.T

Unnamed: 0,a,b,c
alpha,1,2,3
beta,10,12,31
gamma,0,1,9


Fancy Indexing

In [51]:
# Lists of labels on both axes pick out the matching rows and columns
data.loc[['a', 'b'], ['beta', 'gamma']]

Unnamed: 0,beta,gamma
a,10,0
b,12,1


Adding new columns

In [52]:
# Assigning to a new key adds a column (this modifies `data` in place);
# the subtraction is element-wise, row by row
data['delta'] = data['beta'] - data['alpha']
data

Unnamed: 0,alpha,beta,gamma,delta
a,1,10,0,9
b,2,12,1,10
c,3,31,9,28


Boolean Masking

In [53]:
# The comparison yields a boolean Series; indexing with it keeps only the rows where it is True
data[data['gamma'] < 2]

Unnamed: 0,alpha,beta,gamma,delta
a,1,10,0,9
b,2,12,1,10


Reindexing

In [54]:
# Desired row order
new_idx = ['b', 'a', 'c']

# reindex returns a new frame with rows arranged to match `new_idx`; `data` itself is not reassigned
data.reindex(index=new_idx)

Unnamed: 0,alpha,beta,gamma,delta
b,2,12,1,10
a,1,10,0,9
c,3,31,9,28


## Operations on DataFrame

Unary `ufuncs` preserve the index; binary `ufuncs` align the indices of their operands (labels missing from either operand are filled with `NaN` unless prevented, e.g. with `fill_value`).

Index preserved

In [55]:
# A unary NumPy ufunc is applied element-wise; row and column labels are kept
np.sqrt(data)

Unnamed: 0,alpha,beta,gamma,delta
a,1.0,3.162278,0.0,3.0
b,1.414214,3.464102,1.0,3.162278
c,1.732051,5.567764,3.0,5.291503


Index Alignment

In [56]:
# Same rows in a different order ('b', 'a', 'c'), with every value halved
data2 = data.loc[['b', 'a', 'c']] / 2
print(data)
print(data2)

# Addition matches rows by label, not by position, so the differing row order does not matter
data + data2

   alpha  beta  gamma  delta
a      1    10      0      9
b      2    12      1     10
c      3    31      9     28
   alpha  beta  gamma  delta
b    1.0   6.0    0.5    5.0
a    0.5   5.0    0.0    4.5
c    1.5  15.5    4.5   14.0


Unnamed: 0,alpha,beta,gamma,delta
a,1.5,15.0,0.0,13.5
b,3.0,18.0,1.5,15.0
c,4.5,46.5,13.5,42.0


### Missing Values

In [90]:
# A sub-frame lacking row 'b' and column 'delta': rows 'a' and 'c', columns up to and including 'gamma'
data2 = data.loc[['a', 'c'], :'gamma']
data2

Unnamed: 0,alpha,beta,gamma
a,1,10,0
c,3,31,9


In [81]:
# Labels present in only one operand (row 'b', column 'delta') come out as NaN
data3 = data.add(data2) 
data3

Unnamed: 0,alpha,beta,delta,gamma
a,2.0,20.0,,0.0
b,,,,
c,6.0,62.0,,18.0


In [87]:
data3.isnull()   # boolean frame: True where the value is missing (NaN)

Unnamed: 0,alpha,beta,delta,gamma
a,False,False,True,False
b,True,True,True,True
c,False,False,True,False


In [88]:
data3.notnull()   # the inverse mask: True where a value is present

Unnamed: 0,alpha,beta,delta,gamma
a,True,True,False,True
b,False,False,False,False
c,True,True,False,True


#### Filling missing values

In [65]:
# fill_value=0 stands in for entries that exist in only one operand, so no NaN is produced here
data.add(data2, fill_value=0)

Unnamed: 0,alpha,beta,delta,gamma
a,2.0,20.0,9.0,0.0
b,2.0,12.0,10.0,1.0
c,6.0,62.0,28.0,18.0


Using `df.fillna()`

In [84]:
# Replace every NaN with the constant 0 (returns a new frame; data3 is unchanged)
data3.fillna(0)

Unnamed: 0,alpha,beta,delta,gamma
a,2.0,20.0,0.0,0.0
b,0.0,0.0,0.0,0.0
c,6.0,62.0,0.0,18.0


_forward-fill_

In [112]:
# Forward-fill down each column: a missing value takes the last valid value above it.
# `DataFrame.ffill()` replaces `fillna(method='ffill')`, whose `method` argument is
# deprecated in recent pandas releases. The call already returns a new object,
# so the extra `.copy()` was unnecessary.
filled_data = data3.ffill()
filled_data

Unnamed: 0,alpha,beta,delta,gamma
a,2.0,20.0,,0.0
b,2.0,20.0,,0.0
c,6.0,62.0,,18.0


In [113]:
# Forward-fill along each row (axis=1): a missing value takes the value to its left.
# `ffill(axis=1)` replaces the deprecated `fillna(axis=1, method='ffill')`.
filled_data.ffill(axis=1)

Unnamed: 0,alpha,beta,delta,gamma
a,2.0,20.0,20.0,0.0
b,2.0,20.0,20.0,0.0
c,6.0,62.0,62.0,18.0


_back-fill_

In [114]:
# Back-fill up each column: a missing value takes the next valid value below it.
# `DataFrame.bfill()` replaces `fillna(method='bfill')`, whose `method` argument is
# deprecated in recent pandas releases. The call already returns a new object,
# so the extra `.copy()` was unnecessary.
filled_data = data3.bfill()
filled_data

Unnamed: 0,alpha,beta,delta,gamma
a,2.0,20.0,,0.0
b,6.0,62.0,,18.0
c,6.0,62.0,,18.0


In [115]:
# Back-fill along each row (axis=1): a missing value takes the value to its right.
# `bfill(axis=1)` replaces the deprecated `fillna(axis=1, method='bfill')`.
filled_data.bfill(axis=1)

Unnamed: 0,alpha,beta,delta,gamma
a,2.0,20.0,0.0,0.0
b,6.0,62.0,18.0,18.0
c,6.0,62.0,18.0,18.0


#### Dropping Missing Values

Drop every row that contains at least one `NaN`

In [89]:
# Default: drop every row containing at least one NaN.
# Column 'delta' is NaN in every row of data3, so nothing is left.
data3.dropna()

Unnamed: 0,alpha,beta,delta,gamma


Drop every column that contains at least one `NaN`

In [91]:
# axis=1: drop every column containing at least one NaN.
# Row 'b' is NaN in every column of data3, so only the row labels remain.
data3.dropna(axis=1)

a
b
c


`how`

In [94]:
# how='any': a row is dropped as soon as it has one NaN
data3.dropna(how='any')

Unnamed: 0,alpha,beta,delta,gamma


In [93]:
# how='any' along columns: a column is dropped as soon as it has one NaN
data3.dropna(axis=1, how='any')

a
b
c


In [96]:
# how='all': drop a row only if every value in it is NaN (here: row 'b')
data3.dropna(how='all')

Unnamed: 0,alpha,beta,delta,gamma
a,2.0,20.0,,0.0
c,6.0,62.0,,18.0


In [97]:
# how='all' along columns: drop a column only if every value in it is NaN (here: 'delta')
data3.dropna(axis=1, how='all')

Unnamed: 0,alpha,beta,gamma
a,2.0,20.0,0.0
b,,,
c,6.0,62.0,18.0


`thresh`

In [103]:
# thresh=3: keep only rows that have at least 3 non-NaN values
data3.dropna(thresh=3)

Unnamed: 0,alpha,beta,delta,gamma
a,2.0,20.0,,0.0
c,6.0,62.0,,18.0


In [104]:
# thresh=2 along columns: keep only columns that have at least 2 non-NaN values
data3.dropna(axis=1, thresh=2)

Unnamed: 0,alpha,beta,gamma
a,2.0,20.0,0.0
b,,,
c,6.0,62.0,18.0


#### Broadcasting

In [76]:
# All rows of column 'alpha', as a Series indexed by the row labels
col = data.loc[:, 'alpha']
col

a    1
b    2
c    3
Name: alpha, dtype: int64

In [79]:
print(data)
# axis=0 matches `col` against the row labels, so the same Series is added to every column
data.add(col, axis=0)

   alpha  beta  gamma  delta
a      1    10      0      9
b      2    12      1     10
c      3    31      9     28


Unnamed: 0,alpha,beta,gamma,delta
a,2,11,1,10
b,4,14,3,12
c,6,34,12,31


## Hierarchical Indexing on Panel Data

### Pandas MultiIndex

In [121]:
# `levels` lists the distinct labels of each level; `codes` gives, for every row,
# the position of its label inside the corresponding level.
idx = pd.MultiIndex(
    levels=[['a', 'b', 'c'], [10, 20]],
    codes=[
        [0, 0, 1, 1, 2, 2],  # outer level: a, a, b, b, c, c
        [0, 1, 0, 1, 0, 1],  # inner level: 10, 20, 10, 20, 10, 20
    ],
)
idx

MultiIndex([('a', 10),
            ('a', 20),
            ('b', 10),
            ('b', 20),
            ('c', 10),
            ('c', 20)],
           )

In [122]:
# Column labels for the hierarchical example frame
cols = list('ABC')
cols

['A', 'B', 'C']

In [124]:
# Raw values for the example: 6 rows x 3 columns of random integers in [0, 100).
# Note that `data` is a plain NumPy array here, not a DataFrame.
data = np.random.randint(low=0, high=100, size=(6, 3))
data

array([[26, 39, 45],
       [74, 14, 97],
       [67, 37, 24],
       [63, 44, 73],
       [76, 93, 67],
       [50, 37, 23]])

In [131]:
# Combine the array, the two-level row index and the column labels into one frame
df = pd.DataFrame(data=data, index=idx, columns=cols)
df

Unnamed: 0,Unnamed: 1,A,B,C
a,10,26,39,45
a,20,74,14,97
b,10,67,37,24
b,20,63,44,73
c,10,76,93,67
c,20,50,37,23


Convert multiply-indexed `Series` into conventionally indexed `DataFrame` and vice versa

In [165]:
# Selecting one column gives a Series that keeps the two-level row index
seriesA = df['A']
seriesA

a  10    26
   20    74
b  10    67
   20    63
c  10    76
   20    50
Name: A, dtype: int64

`unstack`

In [166]:
# unstack moves the inner index level (10, 20) into the columns, giving a DataFrame
dfA = seriesA.unstack()
dfA

Unnamed: 0,10,20
a,26,74
b,67,63
c,76,50


`stack`

In [167]:
# stack reverses the step above: the columns go back into the index as its inner level
new_seriesA = dfA.stack()
new_seriesA

a  10    26
   20    74
b  10    67
   20    63
c  10    76
   20    50
dtype: int64

On dataframe

In [171]:
# On a DataFrame, unstack moves the inner row level under each existing column,
# producing two-level column labels
df2 = df.unstack()
df2

Unnamed: 0_level_0,A,A,B,B,C,C
Unnamed: 0_level_1,10,20,10,20,10,20
a,26,74,39,14,45,97
b,67,63,37,44,24,73
c,76,50,93,37,67,23


In [175]:
# With only one row level left, unstack yields a Series indexed by (column, inner level, row label)
df2.unstack()

A  10  a    26
       b    67
       c    76
   20  a    74
       b    63
       c    50
B  10  a    39
       b    37
       c    93
   20  a    14
       b    44
       c    37
C  10  a    45
       b    24
       c    67
   20  a    97
       b    73
       c    23
dtype: int64

Note the conversion from series to data frame and vice versa

Indexing

In [155]:
# On the multi-indexed Series: every outer label, inner label 10 only
df['A'].loc[:, 10]

a    26
b    67
c    76
Name: A, dtype: int64

### Construction Methods of MultiIndex

* #### From nested Lists

In [177]:
# Two parallel lists, one per index level; pandas turns them into a MultiIndex.
idx = [list('aabb'), [10, 20] * 2]
# (No seed is set, so the values differ on every run.)
df = pd.DataFrame(np.random.rand(4, 2), index=idx, columns=['col1', 'col2'])
df

Unnamed: 0,Unnamed: 1,col1,col2
a,10,0.304352,0.264352
a,20,0.421823,0.995558
b,10,0.883201,0.546828
b,20,0.679922,0.469713


* #### From dictionary with tuple keys

In [186]:
# Tuple keys are interpreted as multi-level labels: each (outer, inner) pair
# names one column, and each list holds that column's values.
tup_data = {
    ('a', 10): [12, 13],
    ('a', 20): [2, 3],
    ('b', 10): [22, 31],
    ('b', 20): [20, 5],
}

df = pd.DataFrame(data=tup_data)
df

Unnamed: 0_level_0,a,a,b,b
Unnamed: 0_level_1,10,20,10,20
0,12,2,22,20
1,13,3,31,5
