# 발표해야할 내용 164 ~ 185p

###### 인덱스 관련 객체

In [2]:
import pandas as pd

ind = pd.Index([1, 3, 5, 7, 9, 11])
ind

Int64Index([1, 3, 5, 7, 9, 11], dtype='int64')

In [2]:
ind[::2]

Int64Index([1, 5, 9], dtype='int64')

In [3]:
ind.ndim, ind.shape

(1, (6,))

In [4]:
ind[1] = 6
# An Index cannot be modified in place! This is for safety [a difference between a NumPy array and a pandas Index object]

TypeError: Index does not support mutable operations

###### pandas.Index 클래스

In [5]:
pd.Index([1, 2, 3])

Int64Index([1, 2, 3], dtype='int64')

In [6]:
pd.Index(list('abc'))

Index(['a', 'b', 'c'], dtype='object')

###### pandas.RangeIndex 클래스

In [4]:
import numpy as np
df = pd.DataFrame(np.arange(12).reshape(2, 6), columns=list('ABCDEF'))
df

Unnamed: 0,A,B,C,D,E,F
0,0,1,2,3,4,5
1,6,7,8,9,10,11


In [8]:
df.index

RangeIndex(start=0, stop=2, step=1)

###### pandas.CategoricalIndex 클래스

In [9]:
ser = pd.Series(['ha', 'hi'] * 1000)
ser

0       ha
1       hi
2       ha
3       hi
4       ha
        ..
1995    hi
1996    ha
1997    hi
1998    ha
1999    hi
Length: 2000, dtype: object

In [10]:
ser.nbytes

16000

In [13]:
ser.astype('category')

0       ha
1       hi
2       ha
3       hi
4       ha
        ..
1995    hi
1996    ha
1997    hi
1998    ha
1999    hi
Length: 2000, dtype: category
Categories (2, object): ['ha', 'hi']

In [14]:
ser.astype('category').nbytes

2016

# pandas.Categorical 클래스

In [15]:
s1 = pd.Categorical([1, 2, 3, 1, 2, 3])
s1

[1, 2, 3, 1, 2, 3]
Categories (3, int64): [1, 2, 3]

In [16]:
type(s1)

pandas.core.arrays.categorical.Categorical

In [17]:
s1.dtype

CategoricalDtype(categories=[1, 2, 3], ordered=False)

In [18]:
s2 = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
s2

['a', 'b', 'c', 'a', 'b', 'c']
Categories (3, object): ['a', 'b', 'c']

In [19]:
s3 = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True, categories=['c', 'b', 'a'])
s3

['a', 'b', 'c', 'a', 'b', 'c']
Categories (3, object): ['c' < 'b' < 'a']

In [20]:
s3.min(), s3.max()

('c', 'a')

In [21]:
ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
ser

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [22]:
# Build a one-column frame of strings, then add a categorical copy of that column as 'B'.
df = pd.DataFrame({'A': ['a', 'b', 'c', 'a']})
df['B'] = df['A'].astype('category')
df  # display the frame so the cell actually produces the table recorded below it

Unnamed: 0,A,B
0,a,a
1,b,b
2,c,c
3,a,a


In [23]:
df.dtypes

A      object
B    category
dtype: object

In [24]:
df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}, dtype='category')
df.dtypes

A    category
B    category
dtype: object

In [25]:
df['A']

0    a
1    b
2    c
3    a
Name: A, dtype: category
Categories (3, object): ['a', 'b', 'c']

In [26]:
df['B']

0    b
1    c
2    c
3    d
Name: B, dtype: category
Categories (3, object): ['b', 'c', 'd']

In [27]:
df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')})
df_cat = df.astype('category')
df_cat.dtypes

A    category
B    category
dtype: object

# pandas.MultiIndex 클래스

In [28]:
arr = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
pd.MultiIndex.from_arrays(arr, names=('number', 'color'))

MultiIndex([(1,  'red'),
            (1, 'blue'),
            (2,  'red'),
            (2, 'blue')],
           names=['number', 'color'])

In [29]:
arr = [['ha', 'ha', 'hi', 'hi', 'ho', 'ho',], ['one', 'two', 'one', 'two', 'one', 'two']]
tuples = list(zip(*arr))
tuples

[('ha', 'one'),
 ('ha', 'two'),
 ('hi', 'one'),
 ('hi', 'two'),
 ('ho', 'one'),
 ('ho', 'two')]

In [30]:
ind = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
ind

MultiIndex([('ha', 'one'),
            ('ha', 'two'),
            ('hi', 'one'),
            ('hi', 'two'),
            ('ho', 'one'),
            ('ho', 'two')],
           names=['first', 'second'])

In [31]:
ser = pd.Series(np.random.randn(6), index=ind)
ser

first  second
ha     one      -0.421701
       two      -0.461009
hi     one       2.082543
       two       0.543260
ho     one       2.311307
       two       0.457764
dtype: float64

In [32]:
df = pd.DataFrame(np.random.randn(6, 4), index=arr)
df

Unnamed: 0,Unnamed: 1,0,1,2,3
ha,one,-0.154016,-1.260265,-2.10469,0.048513
ha,two,-0.464683,0.365688,-0.823887,-1.831991
hi,one,1.428056,0.153533,2.124914,-1.567984
hi,two,-1.167322,1.654742,-0.860694,0.223
ho,one,-2.568918,-0.792255,-0.685042,1.798935
ho,two,-0.008176,-2.443939,-1.530672,0.280292


In [33]:
df.index

MultiIndex([('ha', 'one'),
            ('ha', 'two'),
            ('hi', 'one'),
            ('hi', 'two'),
            ('ho', 'one'),
            ('ho', 'two')],
           )

In [34]:
df = pd.DataFrame(np.random.randn(3, 6), index=['A', 'B', 'C'], columns=ind)
df

first,ha,ha,hi,hi,ho,ho
second,one,two,one,two,one,two
A,-0.834212,-0.444929,0.151346,2.44748,2.342726,0.21793
B,1.407437,2.087038,2.256959,-0.59979,-1.103206,1.743151
C,-1.774702,1.263259,-0.534233,0.736537,1.933427,0.223267


In [35]:
pd.DataFrame(np.random.randn(3, 4), index=ind[:3], columns=ind[:4])

Unnamed: 0_level_0,first,ha,ha,hi,hi
Unnamed: 0_level_1,second,one,two,one,two
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
ha,one,0.585418,-0.557083,0.034642,-0.615098
ha,two,-0.755366,0.9669,-0.532651,-0.230589
hi,one,0.301932,-0.612602,1.382173,-1.415736


In [36]:
pd.Series(np.random.randn(6), index=tuples)

(ha, one)    1.271616
(ha, two)    0.665417
(hi, one)   -0.521027
(hi, two)    0.354884
(ho, one)    1.432611
(ho, two)    0.413577
dtype: float64

# 판다스 주요 기능 p172 ~ 185 시작!

In [37]:
ser = pd.Series(np.random.randn(100))
ser.head()

0   -1.036675
1    0.327099
2   -1.348636
3   -0.704630
4    0.573889
dtype: float64

In [38]:
ser.tail()

95    0.585943
96    1.349764
97    0.286696
98   -0.375945
99    0.996041
dtype: float64

In [39]:
ind = pd.date_range('1/1/2021', periods=5)
ser = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
df = pd.DataFrame(np.random.randn(5, 3), index=ind, columns=['A', 'B', 'C'])
df[:2]

Unnamed: 0,A,B,C
2021-01-01,-0.576127,0.152781,0.906484
2021-01-02,0.748642,-0.583751,0.316044


##### 판다스 객체 이진 연산

In [40]:
df = pd.DataFrame({'one': pd.Series(np.random.randn(2), index=['a', 'b']), 
                   'two': pd.Series(np.random.randn(3), index=['a', 'b', 'c']), 
                   'three': pd.Series(np.random.randn(2), index=['b', 'c'])})
df

Unnamed: 0,one,two,three
a,-0.180737,0.496561,
b,-0.00081,-1.167937,0.885787
c,,-0.693145,1.285867


In [41]:
df.iloc[1]

one     -0.000810
two     -1.167937
three    0.885787
Name: b, dtype: float64

In [42]:
df['two']

a    0.496561
b   -1.167937
c   -0.693145
Name: two, dtype: float64

In [43]:
row = df.iloc[1]
col = df['two']

In [44]:
df.sub(row, axis='columns')

Unnamed: 0,one,two,three
a,-0.179927,1.664498,
b,0.0,0.0,0.0
c,,0.474791,0.40008


In [45]:
df.sub(col, axis=0)

Unnamed: 0,one,two,three
a,-0.677299,0.0,
b,1.167127,0.0,2.053724
c,,0.0,1.979013


In [46]:
d = {'one': [1., 2., np.nan], 'two': [3., 2., 1.], 'three': [np.nan, 1., 1.]}
df = pd.DataFrame(d, index=list('abc'))
df

Unnamed: 0,one,two,three
a,1.0,3.0,
b,2.0,2.0,1.0
c,,1.0,1.0


In [47]:
# Column data as a dict of Series. 'one' has no entry for label 'c', so after
# index alignment that cell is NaN in the resulting frame.
d1 = {'one': pd.Series([1., 2.], index=['a', 'b']),
      'two': pd.Series([1., 1., 1.], index=['a', 'b', 'c']),
      'three': pd.Series([2., 2., 2.], index=['a', 'b', 'c'])}
# Construct the frame once from the dict rather than wrapping an existing
# DataFrame in a second pd.DataFrame call.
df1 = pd.DataFrame(d1)
df1

Unnamed: 0,one,two,three
a,1.0,1.0,2.0
b,2.0,1.0,2.0
c,,1.0,2.0


In [48]:
df + df1

Unnamed: 0,one,two,three
a,2.0,4.0,
b,4.0,3.0,3.0
c,,2.0,3.0


In [49]:
df.add(df1, fill_value=0)

Unnamed: 0,one,two,three
a,2.0,4.0,2.0
b,4.0,3.0,3.0
c,,2.0,3.0


In [50]:
df = pd.DataFrame({'angles': [0, 3, 4], 'degrees': [360, 180, 360]},
                  index=['circle', 'triangle', 'rectangle'])
df

Unnamed: 0,angles,degrees
circle,0,360
triangle,3,180
rectangle,4,360


In [51]:
df + 1

Unnamed: 0,angles,degrees
circle,1,361
triangle,4,181
rectangle,5,361


In [52]:
df - [1, 2]

Unnamed: 0,angles,degrees
circle,-1,358
triangle,2,178
rectangle,3,358


In [53]:
df.sub([1, 2], axis = 'columns')

Unnamed: 0,angles,degrees
circle,-1,358
triangle,2,178
rectangle,3,358


In [54]:
df1 = df.sub(pd.Series([1, 2, 3], index = ['circle', 'triangle', 'rectangle']), axis='index')
df1

Unnamed: 0,angles,degrees
circle,-1,359
triangle,1,178
rectangle,1,357


### 요약과 통계 연산

In [55]:
d = {'one': [1., 2., np.nan], 'two': [3., 2., 1.], 'three': [np.nan, 1., 1.]}
df = pd.DataFrame(d, index=list('abc'))
df

Unnamed: 0,one,two,three
a,1.0,3.0,
b,2.0,2.0,1.0
c,,1.0,1.0


In [56]:
df.mean(0)

one      1.5
two      2.0
three    1.0
dtype: float64

In [57]:
df.mean(1)

a    2.000000
b    1.666667
c    1.000000
dtype: float64

In [58]:
df.sum(0, skipna=False)

one      NaN
two      6.0
three    NaN
dtype: float64

In [59]:
df.sum(1, skipna=True)

a    4.0
b    5.0
c    2.0
dtype: float64

In [60]:
df.std()

one      0.707107
two      1.000000
three    0.000000
dtype: float64

In [61]:
df.std(axis=1)

a    1.414214
b    0.577350
c    0.000000
dtype: float64

In [62]:
np.std(df, axis=1)

a    1.000000
b    0.471405
c    0.000000
dtype: float64

In [63]:
np.std(df, ddof=1, axis=1)

a    1.414214
b    0.577350
c    0.000000
dtype: float64

In [64]:
df[['one', 'two', 'three']].std()

one      0.707107
two      1.000000
three    0.000000
dtype: float64

In [65]:
df.cumsum()

Unnamed: 0,one,two,three
a,1.0,3.0,
b,3.0,5.0,1.0
c,,6.0,2.0


In [66]:
np.mean(df['one'])

1.5

In [70]:
ser = pd.Series(np.random.randn(500))
ser[20:500] = np.nan
ser[10:20] = 5
ser.nunique()

11

In [68]:
# Summarise a Series in which every other value is missing.
ser = pd.Series(np.random.randn(1000))
ser[::2] = np.nan  # blank out the even positions, leaving 500 non-missing values
ser.describe()

count    500.000000
mean       0.022739
std        0.952707
min       -3.842591
25%       -0.619831
50%        0.012767
75%        0.676262
max        2.975367
dtype: float64

In [76]:
df = pd.DataFrame(np.random.randn(1000, 4), columns = ['a', 'b', 'c', 'd'])
df.iloc[::2] = np.nan
df.describe()

Unnamed: 0,a,b,c,d
count,500.0,500.0,500.0,500.0
mean,0.031246,0.03623,0.064487,0.001702
std,1.022098,0.999186,1.057886,0.971094
min,-3.621393,-3.375471,-3.156073,-2.679372
25%,-0.627388,-0.611267,-0.635101,-0.610224
50%,-0.024723,-0.01149,0.066992,-0.035623
75%,0.668733,0.739111,0.70978,0.650707
max,3.501505,3.217087,3.302933,3.047904


In [77]:
ser.describe(percentiles = [0.05, 0.25, .75, .95])

count    20.000000
mean      2.348326
std       2.761440
min      -1.909121
5%       -0.906388
25%       0.018147
50%       2.643828
75%       5.000000
95%       5.000000
max       5.000000
dtype: float64

In [78]:
ser = pd.Series(['a', 'a', 'b', 'c', 'c', np.nan, 'c', 'd'])
ser.describe()

count     7
unique    4
top       c
freq      3
dtype: object

In [79]:
df = pd.DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)})
df.describe()

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [80]:
df.describe(include=['object'])
# Why does top come out as 'Yes' here when the book shows 'No'?
# NOTE(review): column 'a' holds 'Yes' twice and 'No' twice (freq is 2), so the two
# values tie; presumably pandas picks one of the tied values arbitrarily and the
# pick differs between versions — confirm against the DataFrame.describe docs.

Unnamed: 0,a
count,4
unique,2
top,Yes
freq,2


In [81]:
df.describe(include=['number'])

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [82]:
df.describe(include='all')

Unnamed: 0,a,b
count,4,4.0
unique,2,
top,Yes,
freq,2,
mean,,1.5
std,,1.290994
min,,0.0
25%,,0.75
50%,,1.5
75%,,2.25


In [83]:
ser = pd.Series(np.random.randn(5))
ser

0    1.316043
1   -0.551202
2   -1.136636
3    1.724347
4    0.861967
dtype: float64

In [84]:
ser.idxmin(), ser.idxmax()

(2, 3)

In [85]:
df = pd.DataFrame(np.random.randn(4, 3), columns = ['A', 'B', 'C'])
df

Unnamed: 0,A,B,C
0,-0.905864,0.550861,0.537133
1,-0.079408,1.008777,-2.419176
2,1.209594,1.730597,-0.169425
3,-0.426134,0.189107,1.10414


In [86]:
df.idxmin(axis=0)

A    0
B    3
C    1
dtype: int64

In [87]:
df.idxmin()

A    0
B    3
C    1
dtype: int64

In [88]:
df.idxmax(axis=1)

0    B
1    B
2    B
3    C
dtype: object

In [89]:
df1 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba'))
df1

Unnamed: 0,A
e,2.0
d,1.0
c,1.0
b,3.0
a,


In [90]:
df1['A'].idxmin()

'd'

In [91]:
data = np.random.randint(0, 7, size=30)
data

array([4, 3, 2, 0, 0, 2, 5, 4, 5, 2, 6, 2, 5, 2, 6, 6, 0, 2, 1, 4, 0, 6,
       1, 2, 1, 3, 1, 2, 2, 4])

In [92]:
ser1 = pd.Series(data)
ser1.value_counts()

2    9
4    4
0    4
6    4
1    4
5    3
3    2
dtype: int64

In [93]:
# Count occurrences with the top-level pd.value_counts function, passing the NumPy array directly.
# NOTE(review): newer pandas releases reportedly deprecate pd.value_counts in favour of
# pd.Series(data).value_counts() — confirm for the installed pandas version.
pd.value_counts(data)

2    9
4    4
0    4
6    4
1    4
5    3
3    2
dtype: int64

In [94]:
pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)

[(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (0.994, 3.0]]
Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]]

In [95]:
pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)

([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (0.994, 3.0]]
 Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]],
 array([0.994, 3.   , 5.   , 7.   ]))

In [96]:
pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, labels=['bad', 'medium', 'good'])

['bad', 'good', 'medium', 'medium', 'good', 'bad']
Categories (3, object): ['bad' < 'medium' < 'good']

In [97]:
pd.cut([0, 1, 1, 2], bins=4, labels=False)

array([0, 1, 1, 3])

In [98]:
ser = pd.Series(np.array([2, 4, 6, 8, 10]), index=['a', 'b', 'c', 'd', 'e'])
pd.cut(ser, 3)

a    (1.992, 4.667]
b    (1.992, 4.667]
c    (4.667, 7.333]
d     (7.333, 10.0]
e     (7.333, 10.0]
dtype: category
Categories (3, interval[float64, right]): [(1.992, 4.667] < (4.667, 7.333] < (7.333, 10.0]]

In [104]:
pd.qcut(range(5), 4)

[(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
Categories (4, interval[float64, right]): [(-0.001, 1.0] < (1.0, 2.0] < (2.0, 3.0] < (3.0, 4.0]]

In [105]:
pd.qcut(range(5), 3, labels=['good', 'medium', 'bad'])

['good', 'good', 'medium', 'bad', 'bad']
Categories (3, object): ['good' < 'medium' < 'bad']

In [106]:
pd.qcut(range(5), 4, labels=False)

array([0, 0, 1, 2, 3])

In [5]:
pd.cut(np.random.randn(25), 5).value_counts()

(-2.107, -1.314]    3
(-1.314, -0.525]    9
(-0.525, 0.264]     7
(0.264, 1.053]      3
(1.053, 1.842]      3
dtype: int64

In [108]:
pd.qcut(np.random.randn(25), 5).value_counts()

(-2.375, -0.98]    5
(-0.98, -0.426]    5
(-0.426, 0.424]    5
(0.424, 0.823]     5
(0.823, 2.053]     5
dtype: int64