## pandas 기초

- pandas는 R의 핵심 데이터 구조인 시리즈와 프레임을 파이썬에 추가한 것이다
- numpy를 기반으로 구현되었고, numpy 대비 기능을 더 확장하여 재구현한 것
- Python Data analysis Library
- https://pandas.pydata.org

In [1]:
import numpy as np
import pandas as pd

- 파이썬의 자료구조
> 수치형, 문자열, 리스트, 딕셔너리, 튜플, 집합, 불린
- numpy의 자료구조
> ndarray(배열) : 배열의 데이터는 모두 같은 타입이다.
- pandas의 자료구조
> Series(시리즈), DataFrame(데이터프레임)
> DataFrame의 인덱싱 -> Series의 인덱싱 -> 값(스칼라): 수치, 문자, 불린, NaN 이 등장
> Series: 인덱스와 데이터만 존재하는 컬럼이 없는 자료구조
> DataFrame: 인덱스와 컬럼이 존재하는 자료구조
> NaN: 데이터가 없다. (난,넌), Not a Number => np.nan

In [5]:
# Series: a one-dimensional pandas structure, built here from a plain list.
# The numbers are written as integers, but the list also contains np.nan
# (a float), and the dtype displayed below is float64.
a = pd.Series( [1,3,5,np.nan,6,8] )
a

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [6]:
# Attribute: element dtype of the Series
a.dtype

dtype('float64')

In [8]:
# Attribute: shape (size)
a.shape  # one-dimensional data, 6 elements in total

(6,)

In [14]:
# DataFrame
# A structure that has both an index and columns
cols = 'ABCD' 
# Turn the string into a list of one-character labels: ['A', 'B', 'C', 'D']
cols = list('ABCD')
# Seven consecutive dates starting at 2019-08-12, used as the row index
indexs = pd.date_range('20190812', periods=7)
# 4 column labels, 7 index entries
cols,indexs

(['A', 'B', 'C', 'D'],
 DatetimeIndex(['2019-08-12', '2019-08-13', '2019-08-14', '2019-08-15',
                '2019-08-16', '2019-08-17', '2019-08-18'],
               dtype='datetime64[ns]', freq='D'))

In [17]:
# The data must have shape (7, 4): number of index entries first, number of columns second
datas = np.random.randn(7,4)
datas, datas.shape

(array([[-0.33043964, -0.99615366,  0.46530276, -0.27684994],
        [ 0.83423625, -0.0694506 , -0.93741214, -0.43618738],
        [-0.72058705, -0.62864201, -0.01114932,  1.08823353],
        [-0.13242627, -0.18732976,  0.40462458,  1.84195197],
        [-0.38020317, -0.06874349, -0.15976487, -0.45920546],
        [-0.72155494,  0.90859108, -0.94873165,  0.08819295],
        [ 0.58938677, -0.42004079, -2.43708461,  0.70920791]]), (7, 4))

In [20]:
# Create df from the random array, the date index and the column labels
df = pd.DataFrame( datas,index=indexs,columns=cols)
df

Unnamed: 0,A,B,C,D
2019-08-12,-0.33044,-0.996154,0.465303,-0.27685
2019-08-13,0.834236,-0.069451,-0.937412,-0.436187
2019-08-14,-0.720587,-0.628642,-0.011149,1.088234
2019-08-15,-0.132426,-0.18733,0.404625,1.841952
2019-08-16,-0.380203,-0.068743,-0.159765,-0.459205
2019-08-17,-0.721555,0.908591,-0.948732,0.088193
2019-08-18,0.589387,-0.420041,-2.437085,0.709208


- **데이터가 로드된 후 DataFrame을 만든 뒤 점검할 사항**

In [21]:
# Column labels of df
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [23]:
# Row index of df (a DatetimeIndex here)
df.index

DatetimeIndex(['2019-08-12', '2019-08-13', '2019-08-14', '2019-08-15',
               '2019-08-16', '2019-08-17', '2019-08-18'],
              dtype='datetime64[ns]', freq='D')

In [25]:
# The data of df as a plain 2-D array, without index or column labels
df.values

array([[-0.33043964, -0.99615366,  0.46530276, -0.27684994],
       [ 0.83423625, -0.0694506 , -0.93741214, -0.43618738],
       [-0.72058705, -0.62864201, -0.01114932,  1.08823353],
       [-0.13242627, -0.18732976,  0.40462458,  1.84195197],
       [-0.38020317, -0.06874349, -0.15976487, -0.45920546],
       [-0.72155494,  0.90859108, -0.94873165,  0.08819295],
       [ 0.58938677, -0.42004079, -2.43708461,  0.70920791]])

In [26]:
# df.values is a NumPy ndarray
type(df.values)

numpy.ndarray

In [27]:
# (number of rows, number of columns)
df.shape

(7, 4)

In [28]:
# dtype of each column
df.dtypes

A    float64
B    float64
C    float64
D    float64
dtype: object

In [29]:
# Overview of df: index range, non-null count and dtype per column, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7 entries, 2019-08-12 to 2019-08-18
Freq: D
Data columns (total 4 columns):
A    7 non-null float64
B    7 non-null float64
C    7 non-null float64
D    7 non-null float64
dtypes: float64(4)
memory usage: 280.0 bytes


In [31]:
# Summary statistics: count, mean, standard deviation, min, 25%, 50%, 75%, max
df.describe()

Unnamed: 0,A,B,C,D
count,7.0,7.0,7.0,7.0
mean,-0.123084,-0.208824,-0.517745,0.365049
std,0.612161,0.595661,1.020431,0.878706
min,-0.721555,-0.996154,-2.437085,-0.459205
25%,-0.550395,-0.524341,-0.943072,-0.356519
50%,-0.33044,-0.18733,-0.159765,0.088193
75%,0.22848,-0.069097,0.196738,0.898721
max,0.834236,0.908591,0.465303,1.841952


In [34]:
# Sort the rows by column B, in descending order
df.sort_values(by='B', ascending=False)

Unnamed: 0,A,B,C,D
2019-08-17,-0.721555,0.908591,-0.948732,0.088193
2019-08-16,-0.380203,-0.068743,-0.159765,-0.459205
2019-08-13,0.834236,-0.069451,-0.937412,-0.436187
2019-08-15,-0.132426,-0.18733,0.404625,1.841952
2019-08-18,0.589387,-0.420041,-2.437085,0.709208
2019-08-14,-0.720587,-0.628642,-0.011149,1.088234
2019-08-12,-0.33044,-0.996154,0.465303,-0.27685


In [37]:
# View a single column => indexing => dimension reduction (DataFrame -> Series)
df['C'], type(df['C'])

(2019-08-12    0.465303
 2019-08-13   -0.937412
 2019-08-14   -0.011149
 2019-08-15    0.404625
 2019-08-16   -0.159765
 2019-08-17   -0.948732
 2019-08-18   -2.437085
 Freq: D, Name: C, dtype: float64, pandas.core.series.Series)

In [39]:
# Slicing: dimensionality is kept (the result is still a DataFrame)
# NOTE(review): the original note calls this "same as a copy"; a full slice
# may share data with df, so prefer df.copy() for an independent copy — confirm
df[:] # full slice: every row is returned

Unnamed: 0,A,B,C,D
2019-08-12,-0.33044,-0.996154,0.465303,-0.27685
2019-08-13,0.834236,-0.069451,-0.937412,-0.436187
2019-08-14,-0.720587,-0.628642,-0.011149,1.088234
2019-08-15,-0.132426,-0.18733,0.404625,1.841952
2019-08-16,-0.380203,-0.068743,-0.159765,-0.459205
2019-08-17,-0.721555,0.908591,-0.948732,0.088193
2019-08-18,0.589387,-0.420041,-2.437085,0.709208


In [40]:
# The result comes out as a sub-frame, because slicing keeps the dimensionality
# a <= x < b  (the end position is excluded)
df[1:3] # 1:3 are row positions (0-based), not index labels

Unnamed: 0,A,B,C,D
2019-08-13,0.834236,-0.069451,-0.937412,-0.436187
2019-08-14,-0.720587,-0.628642,-0.011149,1.088234


In [44]:
# Slice by the actual index labels (dates) instead of row positions
# a <= x <= b  (both end labels are included)
df['2019-08-13':'2019-08-15']

Unnamed: 0,A,B,C,D
2019-08-13,0.834236,-0.069451,-0.937412,-0.436187
2019-08-14,-0.720587,-0.628642,-0.011149,1.088234
2019-08-15,-0.132426,-0.18733,0.404625,1.841952


- 전통적인 인덱싱과 슬라이싱을 진행하면 표현의 한계점에 도달.
- 이를 극복하기 위해서 pandas 만의 데이터 추출법이 추가가 되었다.
- loc, iloc <= 2개를 주로 사용한다.
- 연속데이터에 대한 추출  <-> 비연속 데이터들의 추출법(팬시 인덱싱, 쿼리 수행 등등)

### loc

In [52]:
# loc: selects data by label (index / column names) and supports slicing
# Extracting data through loc
# df.loc[ index_label ]  -> a single row, returned as a Series
df.loc['2019-08-12'], type(df.loc['2019-08-12'])

(A   -0.330440
 B   -0.996154
 C    0.465303
 D   -0.276850
 Name: 2019-08-12 00:00:00, dtype: float64, pandas.core.series.Series)

In [53]:
# Full-range loc slice: shows every row and column of the original frame

df.loc[:]

Unnamed: 0,A,B,C,D
2019-08-12,-0.33044,-0.996154,0.465303,-0.27685
2019-08-13,0.834236,-0.069451,-0.937412,-0.436187
2019-08-14,-0.720587,-0.628642,-0.011149,1.088234
2019-08-15,-0.132426,-0.18733,0.404625,1.841952
2019-08-16,-0.380203,-0.068743,-0.159765,-0.459205
2019-08-17,-0.721555,0.908591,-0.948732,0.088193
2019-08-18,0.589387,-0.420041,-2.437085,0.709208


In [60]:
# Keep every row (first axis) and only columns A and C (second axis)
df.loc[ :,['A','C'] ]    # order is [index, columns]: before the comma = index, after = columns


Unnamed: 0,A,C
2019-08-12,-0.33044,0.465303
2019-08-13,0.834236,-0.937412
2019-08-14,-0.720587,-0.011149
2019-08-15,-0.132426,0.404625
2019-08-16,-0.380203,-0.159765
2019-08-17,-0.721555,-0.948732
2019-08-18,0.589387,-2.437085


In [66]:
# Dimension reduction: a single column label yields a Series

df.loc[: ,'A'] , type(df.loc[: ,'A'])

(2019-08-12   -0.330440
 2019-08-13    0.834236
 2019-08-14   -0.720587
 2019-08-15   -0.132426
 2019-08-16   -0.380203
 2019-08-17   -0.721555
 2019-08-18    0.589387
 Freq: D, Name: A, dtype: float64, pandas.core.series.Series)

In [65]:
# Dimension kept: a list of column labels yields a DataFrame

df.loc[: ,['A']] , type(df.loc[: ,['A']])


(                   A
 2019-08-12 -0.330440
 2019-08-13  0.834236
 2019-08-14 -0.720587
 2019-08-15 -0.132426
 2019-08-16 -0.380203
 2019-08-17 -0.721555
 2019-08-18  0.589387, pandas.core.frame.DataFrame)

In [68]:
df.loc[ '2019-08-13':'2019-08-15',['A','C'] ] # dimension kept = DataFrame

Unnamed: 0,A,C
2019-08-13,0.834236,-0.937412
2019-08-14,-0.720587,-0.011149
2019-08-15,-0.132426,0.404625


In [81]:
# Dimension reduction: a single row label yields a Series
df.loc[ '2019-08-13', ['A','C']]

A    0.834236
C   -0.937412
Name: 2019-08-13 00:00:00, dtype: float64

In [83]:
# Dimension kept: a one-row label slice still yields a DataFrame
df.loc[ '2019-08-13':'2019-08-13', ['A','C']]

Unnamed: 0,A,C
2019-08-13,0.834236,-0.937412


In [80]:
# Dimension reduced twice (row label and column label) -> scalar value
df.loc['2019-08-12','A']

-0.33043963588637304

### iloc 

- 팬시 인덱싱과 유사하다.
- 행과 열의 번호를 이용하여 데이터를 접근하는 방식
- i -> integer (정수 위치 기반)

In [85]:
# Extract the data for 2019-08-13
# 1 => the row at integer position 1
df.iloc[ 1 ]

A    0.834236
B   -0.069451
C   -0.937412
D   -0.436187
Name: 2019-08-13 00:00:00, dtype: float64

In [94]:
# iloc slicing
# a <= row position < b, c <= column position < d
df.iloc[ 1:3, 1:3 ]

Unnamed: 0,B,C
2019-08-13,-0.069451,-0.937412
2019-08-14,-0.628642,-0.011149


In [96]:
# iloc + fancy indexing (list non-contiguous row and column positions, in any order)
df.iloc[ [1,4,2] , [1,0,2] ]

Unnamed: 0,B,A,C
2019-08-13,-0.069451,0.834236,-0.937412
2019-08-16,-0.068743,-0.380203,-0.159765
2019-08-14,-0.628642,-0.720587,-0.011149


In [101]:
# Extract only the rows that satisfy a condition
# Column names can also be used as attributes of the frame (df.C here;
# typing "df." + Tab in the notebook lists A, B, C, D)
# Keep only the positive values of column C (positive -> True, 0 or below -> False)
# Boolean indexing: build boolean data from the condition and keep only the True rows
# Here the mask is [T, F, F, T, F, F, F], so only the two True rows remain
# Writing an expression against df applies it to every element
# matrix <op> value => same as applying the operation to each element one by one
df[df.C > 0]

Unnamed: 0,A,B,C,D
2019-08-12,-0.33044,-0.996154,0.465303,-0.27685
2019-08-15,-0.132426,-0.18733,0.404625,1.841952


In [102]:
# Compare every element of the frame with 0; elements that are <= 0 are replaced by NaN
df[df > 0]

Unnamed: 0,A,B,C,D
2019-08-12,,,0.465303,
2019-08-13,0.834236,,,
2019-08-14,,,,1.088234
2019-08-15,,,0.404625,1.841952
2019-08-16,,,,
2019-08-17,,0.908591,,0.088193
2019-08-18,0.589387,,,0.709208


In [104]:
# Copy: df.copy() shown next to the full slice df[:] — both display the same contents
df.copy(), df[:]

(                   A         B         C         D
 2019-08-12 -0.330440 -0.996154  0.465303 -0.276850
 2019-08-13  0.834236 -0.069451 -0.937412 -0.436187
 2019-08-14 -0.720587 -0.628642 -0.011149  1.088234
 2019-08-15 -0.132426 -0.187330  0.404625  1.841952
 2019-08-16 -0.380203 -0.068743 -0.159765 -0.459205
 2019-08-17 -0.721555  0.908591 -0.948732  0.088193
 2019-08-18  0.589387 -0.420041 -2.437085  0.709208,
                    A         B         C         D
 2019-08-12 -0.330440 -0.996154  0.465303 -0.276850
 2019-08-13  0.834236 -0.069451 -0.937412 -0.436187
 2019-08-14 -0.720587 -0.628642 -0.011149  1.088234
 2019-08-15 -0.132426 -0.187330  0.404625  1.841952
 2019-08-16 -0.380203 -0.068743 -0.159765 -0.459205
 2019-08-17 -0.721555  0.908591 -0.948732  0.088193
 2019-08-18  0.589387 -0.420041 -2.437085  0.709208)

In [107]:
# Add a new column to the existing df (very important) => derived variable
# The new data must have as many entries as df has rows (its first dimension)
# A list works; the original note says a Series is accepted as well
new_data = ['one','one','two','three','four','five','five']
# Adding data: target[ new_column_name ] = data
df['E'] = new_data

In [108]:
# Display df, which now includes column E
df

Unnamed: 0,A,B,C,D,E
2019-08-12,-0.33044,-0.996154,0.465303,-0.27685,one
2019-08-13,0.834236,-0.069451,-0.937412,-0.436187,one
2019-08-14,-0.720587,-0.628642,-0.011149,1.088234,two
2019-08-15,-0.132426,-0.18733,0.404625,1.841952,three
2019-08-16,-0.380203,-0.068743,-0.159765,-0.459205,four
2019-08-17,-0.721555,0.908591,-0.948732,0.088193,five
2019-08-18,0.589387,-0.420041,-2.437085,0.709208,five


In [113]:
# Inspect the data
# Is each value one of the listed values? => element-wise True / False
df['E'].isin(['two','four'])

2019-08-12    False
2019-08-13    False
2019-08-14     True
2019-08-15    False
2019-08-16     True
2019-08-17    False
2019-08-18    False
Freq: D, Name: E, dtype: bool

In [114]:
# Use the isin result as a boolean mask: keep only rows where E is 'two' or 'four'
df[df['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2019-08-14,-0.720587,-0.628642,-0.011149,1.088234,two
2019-08-16,-0.380203,-0.068743,-0.159765,-0.459205,four


In [116]:
# 누적값 
# apply ( 함수를 표현 ) => 맴버들을 다 건드린다.
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,E
2019-08-12,-0.33044,-0.996154,0.465303,-0.27685,one
2019-08-13,0.503797,-1.065604,-0.472109,-0.713037,oneone
2019-08-14,-0.21679,-1.694246,-0.483259,0.375196,oneonetwo
2019-08-15,-0.349217,-1.881576,-0.078634,2.217148,oneonetwothree
2019-08-16,-0.72942,-1.95032,-0.238399,1.757943,oneonetwothreefour
2019-08-17,-1.450975,-1.041728,-1.187131,1.846136,oneonetwothreefourfive
2019-08-18,-0.861588,-1.461769,-3.624215,2.555344,oneonetwothreefourfivefive
