In [2]:
# import numpy and pandas
import numpy as np
import pandas as pd

# used for dates
import datetime
from datetime import datetime, date

# Configure how pandas renders objects in cell output:
# plain-text reprs, at most 8 columns and 10 rows, 60 characters wide
DISPLAY_OPTIONS = {
    'display.notebook_repr_html': False,
    'display.max_columns': 8,
    'display.max_rows': 10,
    'display.width': 60,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

# bring in matplotlib for graphics
import matplotlib.pyplot as plt
%matplotlib inline

# Load the S&P 500 data, keeping only the CSV columns at positions
# 0, 2, 3 and 7 and using the Symbol column as the row index
columns_to_read = [0, 2, 3, 7]
sp500 = pd.read_csv("sp500.csv", usecols=columns_to_read, index_col='Symbol')

In [3]:
# Build a 10,000-row frame: seeded random floats in 'foo' and
# consecutive integer keys 100..10099 in 'key'
np.random.seed(12345)
n_rows = 10000
df = pd.DataFrame({'foo': np.random.random(n_rows),
                   'key': range(100, 100 + n_rows)})
df.head()

        foo  key
0  0.929616  100
1  0.316376  101
2  0.183919  102
3  0.204560  103
4  0.567725  104

In [4]:
# boolean selection: keep the rows whose 'key' column equals 10099
df.loc[df['key'] == 10099]

           foo    key
9999  0.760378  10099

In [6]:
# time the boolean-mask selection on the 'key' column
%timeit df[df.key==10099]

949 µs ± 47.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [7]:
# Move 'key' into the index so a row can be looked up by its label
# instead of scanning the column with a boolean mask
df_key_index = df.set_index(['key'])
# look up the row labelled 10099 through the index
df_key_index.loc[10099]

foo    0.760378
Name: 10099, dtype: float64

# The fundamental index type: Index

In [9]:
# Build a DataFrame from a dict that maps column names to column values
temps = pd.DataFrame({
    'City': ['Missoula', 'Philadelphia'],
    'Temperature': [70, 80],
})
temps

           City  Temperature
0      Missoula           70
1  Philadelphia           80

In [10]:
temps.columns # the column labels (the dict keys) are themselves stored in an Index

Index(['City', 'Temperature'], dtype='object')

In [12]:
# With no explicit index, pandas labels the rows with consecutive integers
# starting at 0; recent versions store these as a RangeIndex, which is an
# optimization of Int64Index
temps.index

RangeIndex(start=0, stop=2, step=1)

# Integer index labels using Int64Index and RangeIndex


In [22]:
# Passing an integer array as index= makes pandas build an integer index
# from those labels instead of the default RangeIndex
row_labels = np.arange(0, 10)
df_i64 = pd.DataFrame(np.arange(10, 20), index=row_labels)
df_i64.head()

    0
0  10
1  11
2  12
3  13
4  14

In [14]:
# view the index that was built from the explicit integer labels
df_i64.index

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

In [17]:
# when no index= is supplied, the frame gets the default RangeIndex
range_values = np.arange(10, 15)
df_range = pd.DataFrame(data=range_values)
df_range

    0
0  10
1  11
2  12
3  13
4  14

# Floating point labels using Float64Index

In [21]:
# 200 rows of data labelled by the floats 0.0, 0.5, ..., 99.5
float_labels = np.arange(0.0, 100.0, 0.5)
df_64 = pd.DataFrame(data=np.arange(0, 1000, 5), index=float_labels)
df_64

        0
0.0     0
0.5     5
1.0    10
1.5    15
2.0    20
...   ...
97.5  975
98.0  980
98.5  985
99.0  990
99.5  995

[200 rows x 1 columns]

In [23]:
# the float labels are held in a float64 index
df_64.index

Float64Index([ 0.0,  0.5,  1.0,  1.5,  2.0,  2.5,  3.0,
               3.5,  4.0,  4.5,
              ...
              95.0, 95.5, 96.0, 96.5, 97.0, 97.5, 98.0,
              98.5, 99.0, 99.5],
             dtype='float64', length=200)

# Representing discrete intervals using IntervalIndex

In [27]:
# Label each row with an interval built from consecutive break points;
# the intervals are closed on the right, e.g. (0.0, 0.5]
interval_breaks = [0, 0.5, 1.0, 1.5, 2.0]
df_interval = pd.DataFrame({'A': [1, 2, 3, 4]},
                           index=pd.IntervalIndex.from_breaks(interval_breaks))
df_interval

            A
(0.0, 0.5]  1
(0.5, 1.0]  2
(1.0, 1.5]  3
(1.5, 2.0]  4

# Categorical values as an index: CategoricalIndex

In [30]:
# Start from a frame whose column B holds repeated string labels;
# B is the column that gets turned into a categorical index
df_categorical = pd.DataFrame({
    'A': np.arange(6),
    'B': list('aaadbe'),
})
df_categorical

   A  B
0  0  a
1  1  a
2  2  a
3  3  d
4  4  b
5  5  e

In [32]:
# Convert column B to the 'category' dtype, then move it into the index
df_categorical = (
    df_categorical
    .astype({'B': 'category'})
    .set_index('B')
)
df_categorical


   A
B   
a  0
a  1
a  2
d  3
b  4
e  5

In [33]:
# look up all rows whose index label is the category 'a'
df_categorical.loc['a'] # .loc selects rows by index label

   A
B   
a  0
a  1
a  2

# Indexing by dates and times using DatetimeIndex

In [36]:
# Create a DatetimeIndex of 5 hourly timestamps starting at midnight on
# 15 April 2020 (hours 0 through 4). The start date is written in
# ISO 8601 form (YYYY-MM-DD) so it cannot be misread as month-first,
# and the hourly frequency uses the lowercase 'h' alias.
rng = pd.date_range('2020-04-15', periods=5, freq='h')
# Series of random normal values labelled by those timestamps
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts

2020-04-15 00:00:00    0.465777
2020-04-15 01:00:00   -0.430910
2020-04-15 02:00:00   -0.795362
2020-04-15 03:00:00    0.783361
2020-04-15 04:00:00    0.580506
Freq: H, dtype: float64

In [37]:
# the Series is indexed by a DatetimeIndex
ts.index

DatetimeIndex(['2020-04-15 00:00:00',
               '2020-04-15 01:00:00',
               '2020-04-15 02:00:00',
               '2020-04-15 03:00:00',
               '2020-04-15 04:00:00'],
              dtype='datetime64[ns]', freq='H')

# Selecting values using an index

In [38]:
# Series of the integers 0-4 labelled with the letters a, b, d, t, e
labels = list('abdte')
s = pd.Series(np.arange(0, 5), index=labels)
s

a    0
b    1
d    2
t    3
e    4
dtype: int32

In [39]:
# look up a value by its index label using []
s['b']

1

In [40]:
# explicit label-based lookup with .loc
s.loc['b']

1

In [42]:
# 2x2 frame with rows labelled 'v', 'w' and columns labelled 'a', 'b'
row_values = [np.arange(10, 12), np.arange(12, 14)]
df = pd.DataFrame(row_values, index=list('vw'), columns=list('ab'))
df

    a   b
v  10  11
w  12  13

In [43]:
# on a DataFrame, [] with a label returns the column 'a'
df['a']

v    10
w    12
Name: a, dtype: int64

In [57]:
# return the row 'v' by label
df.loc['v']

a    10
b    11
Name: v, dtype: int64

In [51]:
# create a Series with consecutive labels a-e to demonstrate label slicing
# Series of the integers 0-4 labelled a through e
series_labels = list('abcde')
s = pd.Series(np.arange(0, 5), index=series_labels)
s

a    0
b    1
c    2
d    3
e    4
dtype: int32

In [58]:
s['b':'d'] # label-based slice, equivalent to s.loc['b':'d']; both end labels ('b' and 'd') are included

b    1
c    2
d    3
dtype: int32

# Moving data to and from the index

In [59]:
# examine some of the sp500 data
sp500[:5]

                        Sector   Price  Book Value
Symbol                                            
MMM                Industrials  141.14      26.668
ABT                Health Care   39.60      15.573
ABBV               Health Care   53.95       2.954
ACN     Information Technology   79.79       8.326
ACE                 Financials  102.91      86.897

In [61]:
# .reset_index() moves the index (Symbol) back into an ordinary column
# and replaces it with a default integer index
index_reset = sp500.reset_index()
index_reset.head()

  Symbol                  Sector   Price  Book Value
0    MMM             Industrials  141.14      26.668
1    ABT             Health Care   39.60      15.573
2   ABBV             Health Care   53.95       2.954
3    ACN  Information Technology   79.79       8.326
4    ACE              Financials  102.91      86.897

In [62]:
# .set_index() moves a column (here Sector) into the index of the DataFrame
col_move_index = index_reset.set_index('Sector')
col_move_index.head()

                       Symbol   Price  Book Value
Sector                                           
Industrials               MMM  141.14      26.668
Health Care               ABT   39.60      15.573
Health Care              ABBV   53.95       2.954
Information Technology    ACN   79.79       8.326
Financials                ACE  102.91      86.897

In [63]:
# Reindexing makes the DataFrame conform to a new index: data is aligned
# from the old index to the new one, and NaN is filled in where alignment fails.
#   - FOO is not in the original data, so its row is all NaN
#   - labels that are not listed in the new index (e.g. ABT, ACN) are dropped
wanted_symbols = ['MMM', 'ABBV', 'FOO']
reindexed = sp500.reindex(index=wanted_symbols)
reindexed

             Sector   Price  Book Value
Symbol                                 
MMM     Industrials  141.14      26.668
ABBV    Health Care   53.95       2.954
FOO             NaN     NaN         NaN

In [64]:
# reindex the columns: columns not listed are dropped, and the
# previously unknown NewCol is created and filled with NaN
wanted_columns = ['Price', 'Book Value', 'NewCol']
sp500.reindex(columns=wanted_columns).head()

         Price  Book Value  NewCol
Symbol                            
MMM     141.14      26.668     NaN
ABT      39.60      15.573     NaN
ABBV     53.95       2.954     NaN
ACN      79.79       8.326     NaN
ACE     102.91      86.897     NaN

# Hierarchical indexing

In [67]:
# A pandas index that has multiple levels of hierarchy is referred to as a MultiIndex

In [66]:
# first, push Symbol out of the index and into a column
# (note: this rebinds the name `reindexed`, which an earlier cell
# used for the result of sp500.reindex(...))
reindexed = sp500.reset_index()

In [68]:
# and now index the data by Sector (level 0) and Symbol (level 1)
hierarchy_keys = ['Sector', 'Symbol']
multi_fi = reindexed.set_index(hierarchy_keys)
multi_fi.head()

                                Price  Book Value
Sector                 Symbol                    
Industrials            MMM     141.14      26.668
Health Care            ABT      39.60      15.573
                       ABBV     53.95       2.954
Information Technology ACN      79.79       8.326
Financials             ACE     102.91      86.897

In [69]:
# setting two columns as the index produces a MultiIndex
type(multi_fi.index)

pandas.core.indexes.multi.MultiIndex

In [70]:
# the MultiIndex has two levels: one per column passed to set_index
len(multi_fi.index.levels)

2

In [71]:
# NOTE(review): the displayed labels include near-duplicates such as
# 'Consumer Discretionary ' (trailing space) and 'Industries' -- the raw
# Sector values may need cleaning before grouping on them
multi_fi.index.levels[0] # each level is itself an Index; level 0 holds the distinct Sector labels

Index(['Consumer Discretionary', 'Consumer Discretionary ',
       'Consumer Staples', 'Consumer Staples ', 'Energy',
       'Financials', 'Health Care', 'Industrials',
       'Industries', 'Information Technology', 'Materials',
       'Telecommunications Services', 'Utilities'],
      dtype='object', name='Sector')

In [72]:
multi_fi.index.levels[1] # level 1 holds the distinct Symbol labels

Index(['A', 'AA', 'AAPL', 'ABBV', 'ABC', 'ABT', 'ACE',
       'ACN', 'ACT', 'ADBE',
       ...
       'XLNX', 'XOM', 'XRAY', 'XRX', 'XYL', 'YHOO', 'YUM',
       'ZION', 'ZMH', 'ZTS'],
      dtype='object', name='Symbol', length=500)

In [77]:
# to select from a hierarchical index, use .xs(); calls can be chained
# Select all rows whose level 0 (Sector) label is 'Industrials'; the
# selected level is dropped from the result. Without level=, .xs()
# matches the key against the first level, so both spellings select
# the same rows.
industrials_by_level = multi_fi.xs('Industrials', level=0)
industrials_default = multi_fi.xs('Industrials')
# verify the two spellings agree before displaying one of them
assert industrials_by_level.equals(industrials_default)
industrials_default[:5]

         Price  Book Value
Symbol                    
MMM     141.14      26.668
ALLE     52.46       0.000
APH      95.71      18.315
AVY      48.20      15.616
BA      132.41      19.870

In [76]:
multi_fi.xs('ALLE', level=1)  # select the rows whose level 1 (Symbol) label is 'ALLE';
                              # the Symbol level is dropped, leaving Sector as the index

             Price  Book Value
Sector                        
Industrials  52.46         0.0

In [79]:
multi_fi.xs('ALLE', level = 1, drop_level=False) # drop_level=False keeps the Symbol level in the result's index

                    Price  Book Value
Sector      Symbol                   
Industrials ALLE    52.46         0.0

In [89]:
multi_fi.xs('Industrials').xs('UPS') # chain .xs(): first 'Industrials' in Sector, then 'UPS' in Symbol

Price         102.73
Book Value      6.79
Name: UPS, dtype: float64