<a href="https://colab.research.google.com/github/PJdin/PJdin/blob/main/AIDL220325.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Configuring panadas

In [3]:
!wget https://raw.githubusercontent.com/PacktPublishing/Learning-Pandas-Second-Edition/master/data/sp500.csv

--2022-03-25 06:13:47--  https://raw.githubusercontent.com/PacktPublishing/Learning-Pandas-Second-Edition/master/data/sp500.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 83629 (82K) [text/plain]
Saving to: ‘sp500.csv’


2022-03-25 06:13:47 (6.32 MB/s) - ‘sp500.csv’ saved [83629/83629]



In [7]:
# import numpy and pandas
import numpy as np
import pandas as pd

# used for dates
import datetime
from datetime import datetime, date


# Set some pandas options controlling output format
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 10)
pd.set_option('display.width', 60)

# bring in matplotlib for graphics
import matplotlib.pyplot as plt
%matplotlib inline

# read in the data and print the first five rows
# use the Symbol column as the index, and 
# only read in columns in positions 0, 2, 3, 7
sp500 = pd.read_csv("./sp500.csv", 
                    index_col='Symbol', 
                    usecols=[0, 2, 3, 7])

# The importance of indexes

In [13]:
# create DataFame of random numbers and a key column
# random.seed 產生第一次執行的亂數 之後執行都輸出同樣的參數
np.random.seed(123456)
df = pd.DataFrame({'foo':np.random.random(10000), 'key':range(100, 10100)})
df[:5]

In [12]:
# boolean select where key is 10099
df[df.key==10099]

In [14]:
# time the select
%timeit df[df.key==10099]

The slowest run took 4.44 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 5: 310 µs per loop


In [15]:
# move key to the index
df_with_index = df.set_index(['key'])
df_with_index[:5]

In [16]:
# now can lookup with the index
# 用索引找尋
df_with_index.loc[10099]

foo    0.272283
Name: 10099, dtype: float64

In [17]:
# and this is a lot faster
%timeit df_with_index.loc[10099]

The slowest run took 6.18 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 5: 66.9 µs per loop


# The fundamental index type: Index

In [18]:
# show that the columns are actually an index
temps = pd.DataFrame({ "City": ["Missoula", "Philadelphia"],
                       "Temperature": [70, 80] })
temps

In [21]:
# we can see columns is an index
# 查看列與類型
temps.columns

Index(['City', 'Temperature'], dtype='object')

# Integer index labels using Int64Index and RangeIndex

In [20]:
# explicitly create an Int64Index
df_i64 = pd.DataFrame(np.arange(10, 20), index=np.arange(0, 10))
df_i64[:5]

Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


In [22]:
# view the index
df_i64.index

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

In [26]:
# by default we are given a RangeIndex
# 欄位給予數字10~15
df_range = pd.DataFrame(np.arange(10, 15))
df_range[:5]

In [27]:
# start 起點 stop 停止 step間隔
df_range.index

RangeIndex(start=0, stop=5, step=1)

# Floating point labels using Float64Index

In [28]:
# indexes using a Float64Index
# pd.DateFrame(欄，列)
# arange(起始數字，結束數字，間隔)
df_f64 = pd.DataFrame(np.arange(0, 1000, 5), 
                      np.arange(0.0, 100.0, 0.5))
df_f64.iloc[:5] # need iloc to slice first five

In [29]:
# 輸出index欄位
df_f64.index

Float64Index([ 0.0,  0.5,  1.0,  1.5,  2.0,  2.5,  3.0,
               3.5,  4.0,  4.5,
              ...
              95.0, 95.5, 96.0, 96.5, 97.0, 97.5, 98.0,
              98.5, 99.0, 99.5],
             dtype='float64', length=200)

# Representing discrete intervals using IntervalIndex

In [30]:
# a DataFrame with an IntervalIndex
# IntervalIndex.from_breaks(數字) 從拆分數組構造一個區間索引
df_interval = pd.DataFrame({ "A": [1, 2, 3, 4]},
                    index = pd.IntervalIndex.from_breaks(
                        [0, 0.5, 1.0, 1.5, 2.0]))
df_interval

In [31]:
# 列出index欄位
df_interval.index

IntervalIndex([(0.0, 0.5], (0.5, 1.0], (1.0, 1.5], (1.5, 2.0]], dtype='interval[float64, right]')

# Categorical values as an index: CategoricalIndex

In [32]:
# 可能版本問題有錯誤
# create a DataFrame with a Categorical coulmn
df_categorical = pd.DataFrame({'A': np.arange(6),
                               'B': list('aabbca')})
df_categorical['B'] = df_categorical['B'].astype('category', 
              categories=list('cab'))
df_categorical

TypeError: ignored

In [35]:
# shift the categorical column to the index
df_categorical = df_categorical.set_index('B')
df_categorical.index

KeyError: ignored

In [34]:
# lookup values in category 'a'
df_categorical.loc['a']

# Indexing by dates and times using DatetimeIndex

In [38]:
# create a DatetimeIndex from a date range
# date_range(開始時間,periods=時期,freq=頻率) 指定的頻率生成時間點
rng = pd.date_range('5/1/2017', periods=5, freq='H')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts

2017-05-01 00:00:00   -0.449276
2017-05-01 01:00:00    2.472977
2017-05-01 02:00:00   -0.716023
2017-05-01 03:00:00   -0.032915
2017-05-01 04:00:00   -0.337780
Freq: H, dtype: float64

In [40]:
# 列出index欄位與類型
ts.index

DatetimeIndex(['2017-05-01 00:00:00',
               '2017-05-01 01:00:00',
               '2017-05-01 02:00:00',
               '2017-05-01 03:00:00',
               '2017-05-01 04:00:00'],
              dtype='datetime64[ns]', freq='H')

# Indexing periods of time using PeriodIndex

In [41]:
# explicily create a PeriodIndex
PeriodIndex([時間],frep=頻率)
periods = pd.PeriodIndex(['2017-1', '2017-2', '2017-3'], freq='M')
periods

PeriodIndex(['2017-01', '2017-02', '2017-03'], dtype='period[M]')

In [42]:
# use the index in a Series
# Series類似陣列的物件
period_series = pd.Series(np.random.randn(len(periods)), 
                          index=periods)
period_series

2017-01   -0.437051
2017-02    0.533249
2017-03   -0.819218
Freq: M, dtype: float64

# Creating and using an index with a Series or DataFrame

In [None]:
# create a DatetimeIndex
date_times = pd.DatetimeIndex(pd.date_range('5/1/2017', 
                                            periods=5, 
                                            freq='H'))
date_times

DatetimeIndex(['2017-05-01 00:00:00',
               '2017-05-01 01:00:00',
               '2017-05-01 02:00:00',
               '2017-05-01 03:00:00',
               '2017-05-01 04:00:00'],
              dtype='datetime64[ns]', freq='H')

In [None]:
# create a DataFrame using the index
df_date_times = pd.DataFrame(np.arange(0, len(date_times)), 
                             index=date_times)
df_date_times

                     0
2017-05-01 00:00:00  0
2017-05-01 01:00:00  1
2017-05-01 02:00:00  2
2017-05-01 03:00:00  3
2017-05-01 04:00:00  4

In [None]:
# set the index of a DataFrame
df_date_times.index = pd.DatetimeIndex(pd.date_range('6/1/2017', 
                                                     periods=5, 
                                                     freq='H'))
df_date_times

                     0
2017-06-01 00:00:00  0
2017-06-01 01:00:00  1
2017-06-01 02:00:00  2
2017-06-01 03:00:00  3
2017-06-01 04:00:00  4

# Selecting values using an index

In [None]:
# create a series
s = pd.Series(np.arange(0, 5), index=list('abcde'))
s

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [None]:
# lookup by index label
s['b']

1

In [None]:
# explicit lookup by label
s.loc['b']

1

In [None]:
# create a DataFrame with two columns
df = pd.DataFrame([ np.arange(10, 12), 
                    np.arange(12, 14)], 
                  columns=list('ab'), 
                  index=list('vw'))
df

    a   b
v  10  11
w  12  13

In [None]:
# this returns the column 'a'
df['a']

v    10
w    12
Name: a, dtype: int64

In [None]:
# return the row 'w' by label
df.loc['w']

a    12
b    13
Name: w, dtype: int64

In [None]:
# slices the Series from index label b to d
s['b':'d']

b    1
c    2
d    3
dtype: int64

In [None]:
# this explicitly slices from label b to d
s.loc['b':'d']

b    1
c    2
d    3
dtype: int64

In [None]:
# and this looks up rows by label
s.loc[['a', 'c', 'e']]

a    0
c    2
e    4
dtype: int64

# Moving data to and from the index 

In [None]:
# examine asome of the sp500 data
sp500[:5]

                        Sector   Price  Book Value
Symbol                                            
MMM                Industrials  141.14      26.668
ABT                Health Care   39.60      15.573
ABBV               Health Care   53.95       2.954
ACN     Information Technology   79.79       8.326
ACE                 Financials  102.91      86.897

In [None]:
# reset the index which moves the values in the index to a column
index_moved_to_col = sp500.reset_index()
index_moved_to_col[:5]

  Symbol                  Sector   Price  Book Value
0    MMM             Industrials  141.14      26.668
1    ABT             Health Care   39.60      15.573
2   ABBV             Health Care   53.95       2.954
3    ACN  Information Technology   79.79       8.326
4    ACE              Financials  102.91      86.897

In [None]:
# and now set the Sector column to be the index
index_moved_to_col.set_index('Sector')[:5]

                       Symbol   Price  Book Value
Sector                                           
Industrials               MMM  141.14      26.668
Health Care               ABT   39.60      15.573
Health Care              ABBV   53.95       2.954
Information Technology    ACN   79.79       8.326
Financials                ACE  102.91      86.897

In [None]:
# reindex to have MMM, ABBV, and FOO index labels
reindexed = sp500.reindex(index=['MMM', 'ABBV', 'FOO'])
# note that ABT and ACN are dropped and FOO has NaN values
reindexed

             Sector   Price  Book Value
Symbol                                 
MMM     Industrials  141.14      26.668
ABBV    Health Care   53.95       2.954
FOO             NaN     NaN         NaN

In [None]:
# reindex columns
sp500.reindex(columns=['Price', 
                       'Book Value', 
                       'NewCol'])[:5]

         Price  Book Value  NewCol
Symbol                            
MMM     141.14      26.668     NaN
ABT      39.60      15.573     NaN
ABBV     53.95       2.954     NaN
ACN      79.79       8.326     NaN
ACE     102.91      86.897     NaN

# Hierarchical indexing

In [None]:
# first, push symbol into a column
reindexed = sp500.reset_index()
# and now index sp500 by sector and symbol
multi_fi = reindexed.set_index(['Sector', 'Symbol'])
multi_fi[:5]

                                Price  Book Value
Sector                 Symbol                    
Industrials            MMM     141.14      26.668
Health Care            ABT      39.60      15.573
                       ABBV     53.95       2.954
Information Technology ACN      79.79       8.326
Financials             ACE     102.91      86.897

In [None]:
# the index is a MultiIndex
type(multi_fi.index)

pandas.core.indexes.multi.MultiIndex

In [None]:
# this has two levels
len(multi_fi.index.levels)

2

In [None]:
# each index level is an index
multi_fi.index.levels[0]

Index(['Consumer Discretionary', 'Consumer Discretionary ',
       'Consumer Staples', 'Consumer Staples ', 'Energy',
       'Financials', 'Health Care', 'Industrials',
       'Industries', 'Information Technology', 'Materials',
       'Telecommunications Services', 'Utilities'],
      dtype='object', name='Sector')

In [None]:
# each index level is an index
multi_fi.index.levels[1]

Index(['A', 'AA', 'AAPL', 'ABBV', 'ABC', 'ABT', 'ACE',
       'ACN', 'ACT', 'ADBE',
       ...
       'XLNX', 'XOM', 'XRAY', 'XRX', 'XYL', 'YHOO', 'YUM',
       'ZION', 'ZMH', 'ZTS'],
      dtype='object', name='Symbol', length=500)

In [None]:
# values of index level 0
multi_fi.index.get_level_values(0)

Index(['Industrials', 'Health Care', 'Health Care',
       'Information Technology', 'Financials',
       'Health Care', 'Information Technology',
       'Utilities', 'Health Care', 'Financials',
       ...
       'Utilities', 'Information Technology',
       'Information Technology', 'Financials',
       'Industrials', 'Information Technology',
       'Consumer Discretionary', 'Health Care',
       'Financials', 'Health Care'],
      dtype='object', name='Sector', length=500)

In [None]:
# get all stocks that are Industrials
# note the result drops level 0 of the index
multi_fi.xs('Industrials')[:5]

         Price  Book Value
Symbol                    
MMM     141.14      26.668
ALLE     52.46       0.000
APH      95.71      18.315
AVY      48.20      15.616
BA      132.41      19.870

In [None]:
# select rows where level 1 (Symbol) is ALLE
# note that the Sector level is dropped from the result
multi_fi.xs('ALLE', level=1)

             Price  Book Value
Sector                        
Industrials  52.46         0.0

In [None]:
# Industrials, without dropping the level
multi_fi.xs('Industrials', drop_level=False)[:5]

                     Price  Book Value
Sector      Symbol                    
Industrials MMM     141.14      26.668
            ALLE     52.46       0.000
            APH      95.71      18.315
            AVY      48.20      15.616
            BA      132.41      19.870

In [None]:
# drill through the levels
multi_fi.xs('Industrials').xs('UPS')

Price         102.73
Book Value      6.79
Name: UPS, dtype: float64

In [None]:
# drill through using tuples
multi_fi.xs(('Industrials', 'UPS'))

Price         102.73
Book Value      6.79
Name: (Industrials, UPS), dtype: float64