# Pandas Overview

In [5]:
import numpy as np
import pandas as pd

### Basic Series

In [6]:
# A Series built from a plain list; pandas supplies the default 0..n-1 index.
# The np.nan entry marks a missing value.
my_series = pd.Series(data=[1, 4, 5, 7, np.nan, 8, 9])
my_series

0    1.0
1    4.0
2    5.0
3    7.0
4    NaN
5    8.0
6    9.0
dtype: float64

### Date Range (DatetimeIndex)

In [9]:
# Six consecutive calendar days starting 2017-01-01; used later as the
# DataFrame's row index.
my_date_index = pd.date_range(start='20170101', periods=6, freq='D')
my_date_index

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06'],
              dtype='datetime64[ns]', freq='D')

### Sample NumPy Data

In [8]:
# The integers 0..23 laid out row by row as a 6x4 matrix.
# np.arange already returns an ndarray, so no np.array(...) wrapper is needed.
sample_numpy_data = np.arange(24).reshape(6, 4)
sample_numpy_data

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23]])

### Sample DataFrame

In [10]:
# Label the 6x4 array: one row per date in my_date_index, columns named A-D.
sample_df = pd.DataFrame(
    data=sample_numpy_data,
    index=my_date_index,
    columns=['A', 'B', 'C', 'D'],
)
sample_df

Unnamed: 0,A,B,C,D
2017-01-01,0,1,2,3
2017-01-02,4,5,6,7
2017-01-03,8,9,10,11
2017-01-04,12,13,14,15
2017-01-05,16,17,18,19
2017-01-06,20,21,22,23


### Inspecting the DataFrame

In [11]:
# First five rows (five is head's default, spelled out here for clarity).
sample_df.head(n=5)

Unnamed: 0,A,B,C,D
2017-01-01,0,1,2,3
2017-01-02,4,5,6,7
2017-01-03,8,9,10,11
2017-01-04,12,13,14,15
2017-01-05,16,17,18,19


In [12]:
# Last two rows of the frame.
sample_df.tail(n=2)

Unnamed: 0,A,B,C,D
2017-01-05,16,17,18,19
2017-01-06,20,21,22,23


In [14]:
# Underlying data as a NumPy array (row/column labels are dropped).
# to_numpy() is the accessor the pandas docs recommend over .values.
sample_df.to_numpy()

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23]])

In [15]:
# Row labels of the frame: the DatetimeIndex it was constructed with.
sample_df.index

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06'],
              dtype='datetime64[ns]', freq='D')

In [16]:
# Column labels of the frame.
sample_df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [17]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
sample_df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,10.0,11.0,12.0,13.0
std,7.483315,7.483315,7.483315,7.483315
min,0.0,1.0,2.0,3.0
25%,5.0,6.0,7.0,8.0
50%,10.0,11.0,12.0,13.0
75%,15.0,16.0,17.0,18.0
max,20.0,21.0,22.0,23.0


In [19]:
# Show floats with two decimal places. This is a global display option, so it
# affects every output rendered after this cell.
pd.options.display.precision = 2

In [20]:
# Same summary as before; with display.precision set to 2, std now renders as 7.48.
sample_df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,10.0,11.0,12.0,13.0
std,7.48,7.48,7.48,7.48
min,0.0,1.0,2.0,3.0
25%,5.0,6.0,7.0,8.0
50%,10.0,11.0,12.0,13.0
75%,15.0,16.0,17.0,18.0
max,20.0,21.0,22.0,23.0


In [21]:
# Transpose: the dates become the columns and A-D become the row labels.
sample_df.T

Unnamed: 0,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00
A,0,4,8,12,16,20
B,1,5,9,13,17,21
C,2,6,10,14,18,22
D,3,7,11,15,19,23


In [22]:
# Sort by column label in descending order (D, C, B, A); the rows stay in place.
sample_df.sort_index(axis='columns', ascending=False)

Unnamed: 0,D,C,B,A
2017-01-01,3,2,1,0
2017-01-02,7,6,5,4
2017-01-03,11,10,9,8
2017-01-04,15,14,13,12
2017-01-05,19,18,17,16
2017-01-06,23,22,21,20


In [23]:
# Sort by row label in descending order, i.e. latest date first.
sample_df.sort_index(axis='index', ascending=False)

Unnamed: 0,A,B,C,D
2017-01-06,20,21,22,23
2017-01-05,16,17,18,19
2017-01-04,12,13,14,15
2017-01-03,8,9,10,11
2017-01-02,4,5,6,7
2017-01-01,0,1,2,3


In [24]:
# Sort rows by the values in column B, largest first. B grows with the date in
# this frame, so the result matches the descending sort on the index.
sample_df.sort_values('B', ascending=False)

Unnamed: 0,A,B,C,D
2017-01-06,20,21,22,23
2017-01-05,16,17,18,19
2017-01-04,12,13,14,15
2017-01-03,8,9,10,11
2017-01-02,4,5,6,7
2017-01-01,0,1,2,3


# Selection

Selecting data from a DataFrame.

In [25]:
# Redisplay the full frame as a reference for the selection examples that follow.
sample_df

Unnamed: 0,A,B,C,D
2017-01-01,0,1,2,3
2017-01-02,4,5,6,7
2017-01-03,8,9,10,11
2017-01-04,12,13,14,15
2017-01-05,16,17,18,19
2017-01-06,20,21,22,23


In [26]:
# Select the single column C; the result is a Series that keeps the date index.
sample_df.loc[:, 'C']

2017-01-01     2
2017-01-02     6
2017-01-03    10
2017-01-04    14
2017-01-05    18
2017-01-06    22
Freq: D, Name: C, dtype: int64

In [27]:
# Rows at positions 1, 2 and 3 (the end of the slice is exclusive).
# .iloc makes the positional intent explicit; a bare sample_df[1:4] slices rows
# while sample_df['C'] selects a column, which is easy to misread.
sample_df.iloc[1:4]

Unnamed: 0,A,B,C,D
2017-01-02,4,5,6,7
2017-01-03,8,9,10,11
2017-01-04,12,13,14,15


In [29]:
# Label-based row selection: my_date_index[1:3] holds 2017-01-02 and 2017-01-03.
sample_df.loc[my_date_index[1:3], :]

Unnamed: 0,A,B,C,D
2017-01-02,4,5,6,7
2017-01-03,8,9,10,11


In [30]:
# All rows, restricted to columns A and B.
sample_df[['A', 'B']]

Unnamed: 0,A,B
2017-01-01,0,1
2017-01-02,4,5
2017-01-03,8,9
2017-01-04,12,13
2017-01-05,16,17
2017-01-06,20,21


In [32]:
# Arithmetic is applied element-wise: every value in columns A and B is
# multiplied by 6. sample_df itself is not modified.
sample_df[['A', 'B']] * 6

Unnamed: 0,A,B
2017-01-01,0,6
2017-01-02,24,30
2017-01-03,48,54
2017-01-04,72,78
2017-01-05,96,102
2017-01-06,120,126


In [34]:
# Fourth row by position (2017-01-04), returned as a Series keyed by column name.
sample_df.iloc[3]

A    12
B    13
C    14
D    15
Name: 2017-01-04 00:00:00, dtype: int64

In [35]:
# Positional block: rows 1-2 and columns 2-3 (C and D); slice ends are exclusive.
# The frame has only four columns, so the column slice ends at 4 instead of
# relying on iloc silently clipping an out-of-range bound.
sample_df.iloc[1:3, 2:4]

Unnamed: 0,C,D
2017-01-02,6,7
2017-01-03,10,11


In [36]:
# Boolean Series: True for the rows whose C value is at least 14.
sample_df['C'] >= 14

2017-01-01    False
2017-01-02    False
2017-01-03    False
2017-01-04     True
2017-01-05     True
2017-01-06     True
Freq: D, Name: C, dtype: bool

In [37]:
# Keep the values that are >= 11 and replace everything else with NaN; the
# frame keeps its shape. Introducing NaN turns the integer columns into floats.
sample_df.where(sample_df >= 11)

Unnamed: 0,A,B,C,D
2017-01-01,,,,
2017-01-02,,,,
2017-01-03,,,,11.0
2017-01-04,12.0,13.0,14.0,15.0
2017-01-05,16.0,17.0,18.0,19.0
2017-01-06,20.0,21.0,22.0,23.0
