In [0]:
# Reference: "10 minutes to pandas" guide, https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html

In [0]:
import pandas as pd
import numpy as np

In [0]:
# Object creation
# A Series built from a plain list gets a default integer index (0..5).
# The np.nan entry marks a missing value, and the result is float64.
s = pd.Series(
    data=[1, 3, 5, np.nan, 6, 8],
)
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [0]:
# Six consecutive daily timestamps starting 2013-01-01; used as the row
# index of the frames built in later cells.
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [0]:
# Seed NumPy's global generator so Restart & Run All rebuilds the same frame
# (and therefore the same downstream outputs) on every run. The seed value
# itself is arbitrary.
np.random.seed(42)

# 6x4 frame of standard-normal draws: one row per date, columns A-D.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,1.534878,-0.048425,-1.283282,0.350047
2013-01-02,-1.302898,-0.679378,0.586519,0.109681
2013-01-03,0.527272,-0.237457,0.505365,-0.083227
2013-01-04,0.29203,0.532315,-0.078534,-0.562857
2013-01-05,0.25788,0.734306,0.680984,-0.874495
2013-01-06,-1.047663,-1.38289,-0.069737,-1.815099


In [0]:
# Build a frame from a dict: one column per entry. The scalar entries
# ('A', 'B', 'F') are repeated for each of the four rows, and every column
# keeps its own dtype.
df2 = pd.DataFrame({
    "A": 1.0,
    "B": pd.Timestamp("20130102"),
    "C": pd.Series(1, index=list(range(4)), dtype="float32"),
    "D": np.full(4, 3, dtype="int32"),
    "E": pd.Categorical(["test", "train", "test", "train"]),
    "F": "foo",
})
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [0]:
# Each column of df2 has its own dtype; list them per column.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [0]:
# Viewing data
# First rows of the frame; with no argument, head() shows five rows.
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,1.534878,-0.048425,-1.283282,0.350047
2013-01-02,-1.302898,-0.679378,0.586519,0.109681
2013-01-03,0.527272,-0.237457,0.505365,-0.083227
2013-01-04,0.29203,0.532315,-0.078534,-0.562857
2013-01-05,0.25788,0.734306,0.680984,-0.874495


In [0]:
# Last three rows of the frame.
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.29203,0.532315,-0.078534,-0.562857
2013-01-05,0.25788,0.734306,0.680984,-0.874495
2013-01-06,-1.047663,-1.38289,-0.069737,-1.815099


In [0]:
# Row labels: the DatetimeIndex that was passed in as `dates`.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [0]:
# Column labels.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [0]:
# The values as a plain NumPy array; index and column labels are not part
# of the result.
df.to_numpy()

array([[ 1.53487823, -0.04842454, -1.28328215,  0.35004692],
       [-1.30289775, -0.67937752,  0.58651931,  0.10968116],
       [ 0.52727183, -0.23745675,  0.50536534, -0.08322688],
       [ 0.29202972,  0.53231522, -0.07853355, -0.56285739],
       [ 0.25787958,  0.7343056 ,  0.68098432, -0.87449497],
       [-1.04766345, -1.38288969, -0.0697371 , -1.81509936]])

In [0]:
# Per-column summary statistics: count, mean, std, min, quartiles, max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.043583,-0.180255,0.056886,-0.479325
std,1.055405,0.78234,0.73506,0.792352
min,-1.302898,-1.38289,-1.283282,-1.815099
25%,-0.721278,-0.568897,-0.076334,-0.796586
50%,0.274955,-0.142941,0.217814,-0.323042
75%,0.468461,0.38713,0.566231,0.061454
max,1.534878,0.734306,0.680984,0.350047


In [0]:
# Sort along axis=1, i.e. by column label, in descending order (D, C, B, A).
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.350047,-1.283282,-0.048425,1.534878
2013-01-02,0.109681,0.586519,-0.679378,-1.302898
2013-01-03,-0.083227,0.505365,-0.237457,0.527272
2013-01-04,-0.562857,-0.078534,0.532315,0.29203
2013-01-05,-0.874495,0.680984,0.734306,0.25788
2013-01-06,-1.815099,-0.069737,-1.38289,-1.047663


In [0]:
# Reorder the rows by the values in column B (smallest first).
df.sort_values(by = 'B')

Unnamed: 0,A,B,C,D
2013-01-06,-1.047663,-1.38289,-0.069737,-1.815099
2013-01-02,-1.302898,-0.679378,0.586519,0.109681
2013-01-03,0.527272,-0.237457,0.505365,-0.083227
2013-01-01,1.534878,-0.048425,-1.283282,0.350047
2013-01-04,0.29203,0.532315,-0.078534,-0.562857
2013-01-05,0.25788,0.734306,0.680984,-0.874495


In [0]:
# Getting
# Selecting a single column by name yields a Series named after that column.
df['A']

2013-01-01    1.534878
2013-01-02   -1.302898
2013-01-03    0.527272
2013-01-04    0.292030
2013-01-05    0.257880
2013-01-06   -1.047663
Freq: D, Name: A, dtype: float64

In [0]:
# An integer slice inside [] selects rows by position: here the first three.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,1.534878,-0.048425,-1.283282,0.350047
2013-01-02,-1.302898,-0.679378,0.586519,0.109681
2013-01-03,0.527272,-0.237457,0.505365,-0.083227


In [0]:
# A slice of date strings inside [] selects rows by label. Both endpoints
# are included, so 2013-01-04 is part of the result.
df['20130102': '20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-1.302898,-0.679378,0.586519,0.109681
2013-01-03,0.527272,-0.237457,0.505365,-0.083227
2013-01-04,0.29203,0.532315,-0.078534,-0.562857


In [0]:
# Selection by label
# The first label of the index, as a Timestamp.
dates[0]

Timestamp('2013-01-01 00:00:00', freq='D')

In [0]:
# One row looked up by its label; the result is a Series keyed by column
# name and named after the row label.
df.loc[dates[0]]

A    1.534878
B   -0.048425
C   -1.283282
D    0.350047
Name: 2013-01-01 00:00:00, dtype: float64

In [0]:
# All rows (`:`), restricted to columns A and B.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,1.534878,-0.048425
2013-01-02,-1.302898,-0.679378
2013-01-03,0.527272,-0.237457
2013-01-04,0.29203,0.532315
2013-01-05,0.25788,0.734306
2013-01-06,-1.047663,-1.38289


In [0]:
# Label slice on rows (both endpoints included) plus a list of columns.
df.loc['20130102': '20130104', ['A', 'B']] # returns a pd.DataFrame

Unnamed: 0,A,B
2013-01-02,-1.302898,-0.679378
2013-01-03,0.527272,-0.237457
2013-01-04,0.29203,0.532315


In [0]:
# A single row label with a list of columns drops the row dimension.
df.loc['20130102', ['A', 'B']] # returns a pd.Series

A   -1.302898
B   -0.679378
Name: 2013-01-02 00:00:00, dtype: float64

In [0]:
# A single row label and a single column label give one scalar value.
df.loc[dates[0], 'A'] # returns a np.float64

1.534878231440857

In [0]:
# Selection by position
# Show the whole frame (only six rows) as a reference for the positional
# lookups that follow.
df

Unnamed: 0,A,B,C,D
2013-01-01,1.534878,-0.048425,-1.283282,0.350047
2013-01-02,-1.302898,-0.679378,0.586519,0.109681
2013-01-03,0.527272,-0.237457,0.505365,-0.083227
2013-01-04,0.29203,0.532315,-0.078534,-0.562857
2013-01-05,0.25788,0.734306,0.680984,-0.874495
2013-01-06,-1.047663,-1.38289,-0.069737,-1.815099


In [0]:
# Row at integer position 3 (zero-based), i.e. the fourth row, 2013-01-04.
df.iloc[3]

A    0.292030
B    0.532315
C   -0.078534
D   -0.562857
Name: 2013-01-04 00:00:00, dtype: float64

In [0]:
# Rows at positions 3 and 4, first two columns. Positional slices exclude
# the end position, unlike the label slices used with .loc.
df.iloc[3:5, :2]

Unnamed: 0,A,B
2013-01-04,0.29203,0.532315
2013-01-05,0.25788,0.734306


In [0]:
# Boolean indexing
# Keep only the rows where column A is positive.
df[df['A'] > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.534878,-0.048425,-1.283282,0.350047
2013-01-03,0.527272,-0.237457,0.505365,-0.083227
2013-01-04,0.29203,0.532315,-0.078534,-0.562857
2013-01-05,0.25788,0.734306,0.680984,-0.874495


In [0]:
# Element-wise mask: the shape is kept, and every value that is not
# positive is replaced with NaN.
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.534878,,,0.350047
2013-01-02,,,0.586519,0.109681
2013-01-03,0.527272,,0.505365,
2013-01-04,0.29203,0.532315,,
2013-01-05,0.25788,0.734306,0.680984,
2013-01-06,,,,


In [0]:
# Copy of df with an extra string column E; df itself is left untouched.
# NOTE: this rebinds df2, which earlier in the notebook held the
# mixed-dtype example frame.
df2 = df.assign(E=['one', 'one', 'two', 'three', 'four', 'three'])
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,1.534878,-0.048425,-1.283282,0.350047,one
2013-01-02,-1.302898,-0.679378,0.586519,0.109681,one
2013-01-03,0.527272,-0.237457,0.505365,-0.083227,two
2013-01-04,0.29203,0.532315,-0.078534,-0.562857,three
2013-01-05,0.25788,0.734306,0.680984,-0.874495,four
2013-01-06,-1.047663,-1.38289,-0.069737,-1.815099,three


In [0]:
# Keep the rows whose E value is one of the listed strings.
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.527272,-0.237457,0.505365,-0.083227,two
2013-01-05,0.25788,0.734306,0.680984,-0.874495,four


In [0]:
# Missing data
# Reindex to the first four dates and add a new column E. Reindexing
# returns a new frame, and the new column starts out entirely NaN.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, 'E'])
# Fill E for the first two dates only, so the last two rows keep NaN.
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,1.534878,-0.048425,-1.283282,0.350047,1.0
2013-01-02,-1.302898,-0.679378,0.586519,0.109681,1.0
2013-01-03,0.527272,-0.237457,0.505365,-0.083227,
2013-01-04,0.29203,0.532315,-0.078534,-0.562857,


In [0]:
# Drop every row that has at least one missing value; df1 is not modified.
df1.dropna(how = 'any') # plain dropna() also works (how='any' is the default)

Unnamed: 0,A,B,C,D,E
2013-01-01,1.534878,-0.048425,-1.283282,0.350047,1.0
2013-01-02,-1.302898,-0.679378,0.586519,0.109681,1.0


In [0]:
# Replace each missing value with 5 in the returned frame; df1 is not modified.
df1.fillna(value= 5)

Unnamed: 0,A,B,C,D,E
2013-01-01,1.534878,-0.048425,-1.283282,0.350047,1.0
2013-01-02,-1.302898,-0.679378,0.586519,0.109681,1.0
2013-01-03,0.527272,-0.237457,0.505365,-0.083227,5.0
2013-01-04,0.29203,0.532315,-0.078534,-0.562857,5.0


In [0]:
# Boolean frame of the same shape as df1: True where a value is missing.
pd.isna(df1)

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True
