<h1> 10 minutes of Pandas </h1>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

<h2> Object Creation in Pandas </h2>

In [4]:
# One-dimensional labeled array; the missing entry is represented by np.nan.
series = pd.Series(
    data=[1, 3, 5, np.nan, 6, 8],
)

# Echo the Series so the notebook renders it.
series

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [6]:
# Six consecutive dates beginning on 2013-01-01.
dates = pd.date_range(start='20130101', periods=6)

# Echo the index so the notebook renders it.
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

### We can create a DataFrame from a NumPy array, with a datetime index and labeled columns

In [7]:
# 6x4 frame of random normal values: rows are labelled by `dates`,
# columns by the letters A-D.
# NOTE(review): no random seed is set in the visible cells, so re-running
# this cell will produce values different from the output shown below.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)

# Echo the frame so the notebook renders it.
df

Unnamed: 0,A,B,C,D
2013-01-01,0.258781,-0.141228,-1.584546,0.363434
2013-01-02,0.5215,-0.574329,-0.739023,0.264204
2013-01-03,-0.594747,-0.860002,1.519501,-0.688696
2013-01-04,2.581689,0.106394,-1.599839,0.19003
2013-01-05,0.157918,-0.727874,-1.704831,0.407655
2013-01-06,-0.004549,-0.612659,1.152531,1.670256


### DataFrames can also be created using a dict

In [8]:
# Each keyword becomes a column. Scalars (A, B, F) are repeated for every row;
# the length-4 entries (C, D, E) supply one value per row.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp('20130102'),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        D=np.full(4, 3, dtype='int32'),
        E=pd.Categorical(['test', 'train', 'test', 'train']),
        F='foo',
    )
)

# Echo the frame so the notebook renders it.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


## Viewing Data in Pandas

In [10]:
df.head()  # Show the first rows of the DataFrame (with no argument, 5 rows are returned)

Unnamed: 0,A,B,C,D
2013-01-01,0.258781,-0.141228,-1.584546,0.363434
2013-01-02,0.5215,-0.574329,-0.739023,0.264204
2013-01-03,-0.594747,-0.860002,1.519501,-0.688696
2013-01-04,2.581689,0.106394,-1.599839,0.19003
2013-01-05,0.157918,-0.727874,-1.704831,0.407655


In [11]:
df.tail(3)  # Show the last 3 rows of the DataFrame

Unnamed: 0,A,B,C,D
2013-01-04,2.581689,0.106394,-1.599839,0.19003
2013-01-05,0.157918,-0.727874,-1.704831,0.407655
2013-01-06,-0.004549,-0.612659,1.152531,1.670256


In [12]:
df.index  # Row labels of the DataFrame (here the DatetimeIndex built from `dates`)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [13]:
df.columns  # Column labels of the DataFrame

Index(['A', 'B', 'C', 'D'], dtype='object')

In [14]:
# NOTE(review): recent pandas documentation recommends df.to_numpy() for this;
# confirm the pandas version in use before switching.
df.values  # The underlying data as a NumPy array, without row or column labels

array([[ 0.25878124, -0.14122797, -1.58454579,  0.36343411],
       [ 0.52150009, -0.57432891, -0.7390227 ,  0.26420364],
       [-0.5947473 , -0.86000225,  1.51950099, -0.68869552],
       [ 2.58168928,  0.10639404, -1.59983932,  0.19003019],
       [ 0.15791803, -0.72787437, -1.70483107,  0.40765497],
       [-0.00454911, -0.61265872,  1.15253118,  1.67025599]])

In [15]:
df.T  # Transpose: columns A-D become the rows and the dates become the columns

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.258781,0.5215,-0.594747,2.581689,0.157918,-0.004549
B,-0.141228,-0.574329,-0.860002,0.106394,-0.727874,-0.612659
C,-1.584546,-0.739023,1.519501,-1.599839,-1.704831,1.152531
D,0.363434,0.264204,-0.688696,0.19003,0.407655,1.670256


In [16]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.486765,-0.468283,-0.492701,0.367814
std,1.091887,0.371521,1.463117,0.755846
min,-0.594747,-0.860002,-1.704831,-0.688696
25%,0.036068,-0.69907,-1.596016,0.208574
50%,0.20835,-0.593494,-1.161784,0.313819
75%,0.45582,-0.249503,0.679643,0.3966
max,2.581689,0.106394,1.519501,1.670256


In [17]:
df.sort_index(axis=1, ascending=False)  # Sort by the column labels (axis=1) in descending order

Unnamed: 0,D,C,B,A
2013-01-01,0.363434,-1.584546,-0.141228,0.258781
2013-01-02,0.264204,-0.739023,-0.574329,0.5215
2013-01-03,-0.688696,1.519501,-0.860002,-0.594747
2013-01-04,0.19003,-1.599839,0.106394,2.581689
2013-01-05,0.407655,-1.704831,-0.727874,0.157918
2013-01-06,1.670256,1.152531,-0.612659,-0.004549


In [18]:
df.sort_values(by='B')  # Sort the rows by the values in column B (smallest first)

Unnamed: 0,A,B,C,D
2013-01-03,-0.594747,-0.860002,1.519501,-0.688696
2013-01-05,0.157918,-0.727874,-1.704831,0.407655
2013-01-06,-0.004549,-0.612659,1.152531,1.670256
2013-01-02,0.5215,-0.574329,-0.739023,0.264204
2013-01-01,0.258781,-0.141228,-1.584546,0.363434
2013-01-04,2.581689,0.106394,-1.599839,0.19003


### Selecting a single column yields a Series; `df['A']` is equivalent to `df.A`

In [19]:
# Column A as a Series, indexed by the dates.
df['A']

2013-01-01    0.258781
2013-01-02    0.521500
2013-01-03   -0.594747
2013-01-04    2.581689
2013-01-05    0.157918
2013-01-06   -0.004549
Freq: D, Name: A, dtype: float64

In [20]:
df[0:3]  # Slicing with [] selects rows: 0:3 gives the first three rows

Unnamed: 0,A,B,C,D
2013-01-01,0.258781,-0.141228,-1.584546,0.363434
2013-01-02,0.5215,-0.574329,-0.739023,0.264204
2013-01-03,-0.594747,-0.860002,1.519501,-0.688696


#### We can also get a cross-section using a label

In [21]:
# The row labelled with the first date, returned as a Series keyed by column name.
df.loc[dates[0]]

A    0.258781
B   -0.141228
C   -1.584546
D    0.363434
Name: 2013-01-01 00:00:00, dtype: float64

#### Selecting on multiple axes by label

In [22]:
# All rows (:), restricted to columns A and B.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,0.258781,-0.141228
2013-01-02,0.5215,-0.574329
2013-01-03,-0.594747,-0.860002
2013-01-04,2.581689,0.106394
2013-01-05,0.157918,-0.727874
2013-01-06,-0.004549,-0.612659


#### Label slicing: both endpoints are included

In [24]:
# Rows labelled 2013-01-02 through 2013-01-04 (both included), columns A and B.
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,0.5215,-0.574329
2013-01-03,-0.594747,-0.860002
2013-01-04,2.581689,0.106394


In [25]:
# A single row label with a list of columns returns a Series instead of a DataFrame.
df.loc['20130102', ['A', 'B']]

A    0.521500
B   -0.574329
Name: 2013-01-02 00:00:00, dtype: float64

In [26]:
# A single row label plus a single column label returns one scalar value.
df.loc[dates[0], 'A']

0.25878124317715734