# Pandas Overview

In [48]:
import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last
# one; the multi-expression cells below rely on this to show several outputs.
InteractiveShell.ast_node_interactivity = "all"

### Basic Series

In [50]:
# The single missing value (np.nan) among the integers makes pandas store the
# whole Series as float64; with no index given, a default RangeIndex is used.
my_series = pd.Series(data=[1, 4, 5, 7, np.nan, 8, 9])
my_series.index
my_series

RangeIndex(start=0, stop=7, step=1)

0    1.0
1    4.0
2    5.0
3    7.0
4    NaN
5    8.0
6    9.0
dtype: float64

### Date Time Object

In [111]:
# Six consecutive calendar days starting on 2017-01-15 (daily frequency).
my_date_index = pd.date_range(start='20170115', periods=6, freq='D')
my_date_index

DatetimeIndex(['2017-01-15', '2017-01-16', '2017-01-17', '2017-01-18',
               '2017-01-19', '2017-01-20'],
              dtype='datetime64[ns]', freq='D')

### Sample NumPy Data

In [112]:
# The integers 0..23 arranged as 3 blocks of 2 rows x 4 columns.
# np.arange already returns an ndarray, so wrapping it in np.array() only
# made a redundant copy and has been dropped.
sample_numpy_data = np.arange(24).reshape(3, 2, 4)
sample_numpy_data

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]],

       [[16, 17, 18, 19],
        [20, 21, 22, 23]]])

In [113]:
# Collapse the three 2x4 blocks into one 6x4 table. reshape() is used rather
# than assigning to `.shape`, which the NumPy documentation discourages.
sample_numpy_data = sample_numpy_data.reshape(6, 4)

In [114]:
# Display the array to confirm it is now laid out as 6 rows x 4 columns.
sample_numpy_data

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23]])

In [115]:
# This sets the shape of a temporary array that is discarded as soon as the
# statement finishes, so no named variable is affected (the next cell shows
# that a freshly created np.arange(24) is still one-dimensional).
np.arange(24).shape = (4, 6)

In [116]:
# Every call to np.arange builds a new 1-D array, so its shape is (24,).
np.arange(24).shape

(24,)

In [117]:
# The named array still has the 6x4 shape it was given earlier.
sample_numpy_data.shape

(6, 4)

In [118]:
# Total of all 24 elements (no axis given, so the whole array is reduced).
sample_numpy_data.sum()

276

In [119]:
# Display the array again: computing the sum did not modify it.
sample_numpy_data

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23]])

In [120]:
# Re-display the six-day DatetimeIndex used as row labels for the DataFrame below.
my_date_index

DatetimeIndex(['2017-01-15', '2017-01-16', '2017-01-17', '2017-01-18',
               '2017-01-19', '2017-01-20'],
              dtype='datetime64[ns]', freq='D')

### Sample DataFrame

In [122]:
# Wrap the 6x4 array in a DataFrame: one row per date, columns labelled A-D.
sample_df = pd.DataFrame(
    data=sample_numpy_data,
    index=my_date_index,
    columns=list('ABCD'),
)
type(sample_df)
sample_df

pandas.core.frame.DataFrame

Unnamed: 0,A,B,C,D
2017-01-15,0,1,2,3
2017-01-16,4,5,6,7
2017-01-17,8,9,10,11
2017-01-18,12,13,14,15
2017-01-19,16,17,18,19
2017-01-20,20,21,22,23


In [123]:
# Column labels are held in an Index object.
sample_df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

### Inspecting the DataFrame

In [125]:
# head(n) returns the first n rows; with n=6 that is every row of this frame.
sample_df.head(6)

Unnamed: 0,A,B,C,D
2017-01-15,0,1,2,3
2017-01-16,4,5,6,7
2017-01-17,8,9,10,11
2017-01-18,12,13,14,15
2017-01-19,16,17,18,19
2017-01-20,20,21,22,23


In [126]:
# tail(n) returns the last n rows -- here the final two dates.
sample_df.tail(2)

Unnamed: 0,A,B,C,D
2017-01-19,16,17,18,19
2017-01-20,20,21,22,23


In [127]:
# Extract the underlying data as a NumPy array. to_numpy() is the accessor the
# pandas documentation recommends in place of the older `.values` attribute.
sample_df.to_numpy()

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23]])

In [128]:
# Row labels: the DatetimeIndex supplied when the frame was built.
sample_df.index

DatetimeIndex(['2017-01-15', '2017-01-16', '2017-01-17', '2017-01-18',
               '2017-01-19', '2017-01-20'],
              dtype='datetime64[ns]', freq='D')

In [129]:
# Column labels, shown next to the row index for comparison.
sample_df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [130]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
sample_df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,10.0,11.0,12.0,13.0
std,7.48,7.48,7.48,7.48
min,0.0,1.0,2.0,3.0
25%,5.0,6.0,7.0,8.0
50%,10.0,11.0,12.0,13.0
75%,15.0,16.0,17.0,18.0
max,20.0,21.0,22.0,23.0


In [None]:
# Render floating-point values with two decimal places in pandas output.
# NOTE(review): the describe() output recorded in the preceding cell already
# appears rounded to two decimals, so this option may have been set before
# that cell ran -- consider moving it to the setup cell at the top; confirm.
pd.options.display.precision = 2

In [None]:
# Same summary as before, re-run after the display precision was set to 2.
sample_df.describe()

In [131]:
# Transpose: the column labels become the row index and the dates become columns.
sample_df.T

Unnamed: 0,2017-01-15 00:00:00,2017-01-16 00:00:00,2017-01-17 00:00:00,2017-01-18 00:00:00,2017-01-19 00:00:00,2017-01-20 00:00:00
A,0,4,8,12,16,20
B,1,5,9,13,17,21
C,2,6,10,14,18,22
D,3,7,11,15,19,23


In [132]:
# axis=1 sorts by column label; descending order gives D, C, B, A.
sample_df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2017-01-15,3,2,1,0
2017-01-16,7,6,5,4
2017-01-17,11,10,9,8
2017-01-18,15,14,13,12
2017-01-19,19,18,17,16
2017-01-20,23,22,21,20


In [133]:
# axis=0 sorts by row label; descending order puts the latest date first.
sample_df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2017-01-20,20,21,22,23
2017-01-19,16,17,18,19
2017-01-18,12,13,14,15
2017-01-17,8,9,10,11
2017-01-16,4,5,6,7
2017-01-15,0,1,2,3


In [134]:
# Order rows by the values in column 'B', largest first.
sample_df.sort_values(by='B', ascending=False)

Unnamed: 0,A,B,C,D
2017-01-20,20,21,22,23
2017-01-19,16,17,18,19
2017-01-18,12,13,14,15
2017-01-17,8,9,10,11
2017-01-16,4,5,6,7
2017-01-15,0,1,2,3


# Selection

Selecting data from a DataFrame.

In [135]:
# Show the full frame as a reference point for the selections that follow.
sample_df

Unnamed: 0,A,B,C,D
2017-01-15,0,1,2,3
2017-01-16,4,5,6,7
2017-01-17,8,9,10,11
2017-01-18,12,13,14,15
2017-01-19,16,17,18,19
2017-01-20,20,21,22,23


In [136]:
# A single column label inside [] returns that column as a Series named 'C'.
sample_df['C']

2017-01-15     2
2017-01-16     6
2017-01-17    10
2017-01-18    14
2017-01-19    18
2017-01-20    22
Freq: D, Name: C, dtype: int64

In [137]:
# A plain slice inside [] selects rows by position: rows 1, 2 and 3
# (the end of the slice is exclusive).
sample_df[1:4]

Unnamed: 0,A,B,C,D
2017-01-16,4,5,6,7
2017-01-17,8,9,10,11
2017-01-18,12,13,14,15


In [138]:
# Label-based row selection: .loc receives the index labels themselves,
# here the second and third dates taken from my_date_index.
sample_df.loc[my_date_index[1:3]]

Unnamed: 0,A,B,C,D
2017-01-16,4,5,6,7
2017-01-17,8,9,10,11


In [139]:
# Slicing the index by position yields a shorter DatetimeIndex (two dates).
my_date_index[1:3]

DatetimeIndex(['2017-01-16', '2017-01-17'], dtype='datetime64[ns]', freq='D')

In [140]:
# .loc[rows, columns]: ':' keeps every row; the list picks columns A and B.
sample_df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2017-01-15,0,1
2017-01-16,4,5
2017-01-17,8,9
2017-01-18,12,13
2017-01-19,16,17
2017-01-20,20,21


In [141]:
# Arithmetic on a selection is element-wise: every value in A and B is
# multiplied by 6.
sample_df.loc[:, ['A', 'B']] * 6

Unnamed: 0,A,B
2017-01-15,0,6
2017-01-16,24,30
2017-01-17,48,54
2017-01-18,72,78
2017-01-19,96,102
2017-01-20,120,126


In [143]:
# ':' on both axes selects every row and every column.
sample_df.loc[:, :]

Unnamed: 0,A,B,C,D
2017-01-15,0,1,2,3
2017-01-16,4,5,6,7
2017-01-17,8,9,10,11
2017-01-18,12,13,14,15
2017-01-19,16,17,18,19
2017-01-20,20,21,22,23


In [149]:
# A single integer passed to .iloc picks one row by position; the row comes
# back as a Series.
type(sample_df.iloc[3])

pandas.core.series.Series

In [150]:
# Row position 3, column position 1 -> the scalar stored at 2017-01-18 / 'B'.
sample_df.iloc[3, 1]

13

In [154]:
# Element-wise comparison produces a boolean DataFrame of the same shape.
sample_df >= 11

Unnamed: 0,A,B,C,D
2017-01-15,False,False,False,False
2017-01-16,False,False,False,False
2017-01-17,False,False,False,True
2017-01-18,True,True,True,True
2017-01-19,True,True,True,True
2017-01-20,True,True,True,True


In [152]:
# Indexing with a boolean frame keeps values where the mask is True and
# replaces the rest with NaN, so the result is displayed as floats.
sample_df[sample_df >= 11]

Unnamed: 0,A,B,C,D
2017-01-15,,,,
2017-01-16,,,,
2017-01-17,,,,11.0
2017-01-18,12.0,13.0,14.0,15.0
2017-01-19,16.0,17.0,18.0,19.0
2017-01-20,20.0,21.0,22.0,23.0
