# Pandas
파이썬 데이터 분석의 3대 라이브러리: 
* Numpy : 배열, 행렬을 다룸
* Pandas : 엑셀과 같은 데이터프레임을 다룸
* Matplotlib : 시각화

In [0]:
# pandas 사용하기
import numpy as np #numpy 도 함께 사용
import pandas as pd

In [0]:
# 2. Pandas 자료구조
# Pandas 에서는 기본적으로 정의되는 자료구조인 Series와 DataFrame을 사용한다.
# 이 자료구조들은 빅 데이터 분석에 있어서 높은 수준의 성능을 보여준다.

In [0]:
# 2-1. Series

In [4]:
# Series 정의하기
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
# Series의 값만 확인하기
obj.values

array([ 4,  7, -5,  3])

In [6]:
# Series의 인덱스만 확인하기
obj.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
# Series의 자료형 확인하기
obj.dtypes

dtype('int64')

In [8]:
# 인덱스를 바꿀 수 있다
obj2 = pd.Series([4,7,-5,3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [9]:
# python의 dictionary 자료형을 Series data로 만들 수 있다.
# dictionary의 key가 Series의 index가 된다
sdata = {'kim':35000, 'park':67000, 'john':12000, 'choi':4000}
obj3 = pd.Series(sdata)
obj3

kim     35000
park    67000
john    12000
choi     4000
dtype: int64

In [10]:
# index 변경
obj3.index = ['A','B','C','D']
obj3
# 특정 인덱스만 바꾸는 법: obj3.rename(index={'A': 'a'}) 처럼 바꿀 인덱스만 지정한다 (새 Series를 반환한다).

A    35000
B    67000
C    12000
D     4000
dtype: int64

In [0]:
# 2-2. Data Frame

In [12]:
# Data Frame 정의하기
# 이전에 DataFrame에 들어갈 데이터를 정의해주어야 하는데,
# 이는 python의 dictionary 또는 numpy의 array로 정의할 수 있다.

data = {'name':['lee', 'lee', 'lee', 'kim', 'park'],
       'year':[2013, 2014, 2015, 2016, 2015],
       'points':[1.5, 1.7, 3.6, 2.4, 2.9]}

df = pd.DataFrame(data)
df

Unnamed: 0,name,year,points
0,lee,2013,1.5
1,lee,2014,1.7
2,lee,2015,3.6
3,kim,2016,2.4
4,park,2015,2.9


In [0]:
# 행과 열의 구조를 가진 데이터가 생긴다.

In [14]:
# 행 번호
df.index

RangeIndex(start=0, stop=5, step=1)

In [15]:
# 열 이름
df.columns

Index(['name', 'year', 'points'], dtype='object')

In [16]:
df

Unnamed: 0,name,year,points
0,lee,2013,1.5
1,lee,2014,1.7
2,lee,2015,3.6
3,kim,2016,2.4
4,park,2015,2.9


In [17]:
df.head(3)

Unnamed: 0,name,year,points
0,lee,2013,1.5
1,lee,2014,1.7
2,lee,2015,3.6


In [18]:
df.tail(2)

Unnamed: 0,name,year,points
3,kim,2016,2.4
4,park,2015,2.9


In [19]:
# 값 얻기
df.values

array([['lee', 2013, 1.5],
       ['lee', 2014, 1.7],
       ['lee', 2015, 3.6],
       ['kim', 2016, 2.4],
       ['park', 2015, 2.9]], dtype=object)

In [20]:
# DataFrame을 만들면서 columns와 index를 설정할 수 있다.
df2 = pd.DataFrame(data, columns=['year', 'name','points', 'penalty'], index = ['one', 'two', 'three', 'four', 'five'])
df2

Unnamed: 0,year,name,points,penalty
one,2013,lee,1.5,
two,2014,lee,1.7,
three,2015,lee,3.6,
four,2016,kim,2.4,
five,2015,park,2.9,


In [0]:
# DataFrame을 정의하면서, data로 들어가는 python dictionary와 columns의
# 순서가 달라도 알아서 맞춰서 정의된다.
# 하지만 data에 포함되어 있지 않은 열(여기서는 'penalty')의 값은
# NaN(Not a Number)으로 나타나게 된다.
# 이는 null(결측값)과 같은 개념이다.
# NaN 값은 그대로 두면 계산에서 제외되거나 결과를 NaN으로 만드는 데이터이다.
# 따라서 올바른 데이터 처리를 위해 값을 채워 넣거나(fillna) 제거(dropna)해 주어야 한다.

In [22]:
# describe() 함수는 DataFrame의 계산 가능한 값들에 대한
# 요약통계 값을 보여준다.
df2.describe()

Unnamed: 0,year,points
count,5.0,5.0
mean,2014.6,2.42
std,1.140175,0.864292
min,2013.0,1.5
25%,2014.0,1.7
50%,2015.0,2.4
75%,2015.0,2.9
max,2016.0,3.6


In [23]:
# 데이터 요약
print(df2.info())

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, one to five
Data columns (total 4 columns):
year       5 non-null int64
name       5 non-null object
points     5 non-null float64
penalty    0 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 200.0+ bytes
None


In [0]:
# 3. DataFrame Indexing

In [25]:
data = {'name':['lee', 'lee', 'lee', 'park', 'park'],
       'year': [2014, 2015, 2016, 2015, 2016],
       'points':[1.5, 1.7, 3.6, 2.4, 2.9]}

df = pd.DataFrame(data, 
                   columns=['year', 'name','points', 'penalty'],
                   index = ['one', 'two', 'three', 'four', 'five']
                  )
df

Unnamed: 0,year,name,points,penalty
one,2014,lee,1.5,
two,2015,lee,1.7,
three,2016,lee,3.6,
four,2015,park,2.4,
five,2016,park,2.9,


In [0]:
# 3-1. DataFrame 에서 열을 선택하고 조작하기

In [27]:
df['year']

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [28]:
df['name']

one       lee
two       lee
three     lee
four     park
five     park
Name: name, dtype: object

In [29]:
# 동일한 의미를 갖는, 다른 방법
df.year

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [30]:
df.name

one       lee
two       lee
three     lee
four     park
five     park
Name: name, dtype: object

In [31]:
df[['year', 'points']]

Unnamed: 0,year,points
one,2014,1.5
two,2015,1.7
three,2016,3.6
four,2015,2.4
five,2016,2.9


In [0]:
df['penalty']=0.5

In [33]:
df['win']=1
df

Unnamed: 0,year,name,points,penalty,win
one,2014,lee,1.5,0.5,1
two,2015,lee,1.7,0.5,1
three,2016,lee,3.6,0.5,1
four,2015,park,2.4,0.5,1
five,2016,park,2.9,0.5,1


In [34]:
# 이렇게 대입하면 사본이 만들어지지 않는다: df2와 df는 같은 객체를 가리키므로
# df2를 바꾸면 df도 함께 바뀐다. 독립된 사본이 필요하면 df.copy()를 사용한다.
df2=df
df2['year']=0.5
df2


Unnamed: 0,year,name,points,penalty,win
one,0.5,lee,1.5,0.5,1
two,0.5,lee,1.7,0.5,1
three,0.5,lee,3.6,0.5,1
four,0.5,park,2.4,0.5,1
five,0.5,park,2.9,0.5,1


In [35]:
df = pd.DataFrame(data, 
                   columns=['year', 'name','points', 'penalty'],
                   index = ['one', 'two', 'three', 'four', 'five']
                  )
df

Unnamed: 0,year,name,points,penalty
one,2014,lee,1.5,
two,2015,lee,1.7,
three,2016,lee,3.6,
four,2015,park,2.4,
five,2016,park,2.9,


In [0]:
df['penalty']=0.5

In [0]:
df['penalty']=[0.1, 0.2, 0.3, 0.4, 0.5]

In [38]:
df

Unnamed: 0,year,name,points,penalty
one,2014,lee,1.5,0.1
two,2015,lee,1.7,0.2
three,2016,lee,3.6,0.3
four,2015,park,2.4,0.4
five,2016,park,2.9,0.5


In [0]:
# 특정 열에 대해 위와 같이 선택하고, 우리가 원하는 값을 대입할 수 있다.


In [0]:
# 또는
# python의 list나 numpy의 array를 대입할 수도 있다 (길이는 행의 개수와 같아야 한다).


In [0]:
# 새로운 열을 추가하기 (열 이름은 'zeros'이지만 np.arange(5)이므로 값은 0~4가 들어간다)
df['zeros']=np.arange(5)

In [40]:
df

Unnamed: 0,year,name,points,penalty,zeros
one,2014,lee,1.5,0.1,0
two,2015,lee,1.7,0.2,1
three,2016,lee,3.6,0.3,2
four,2015,park,2.4,0.4,3
five,2016,park,2.9,0.5,4


In [0]:
# Series를 추가할 수도 있다.
val = pd.Series([-1.2, -1.5, -1.7], 
                index=['two', 'four', 'five'])

In [0]:
df['debt']=val

In [43]:
df

Unnamed: 0,year,name,points,penalty,zeros,debt
one,2014,lee,1.5,0.1,0,
two,2015,lee,1.7,0.2,1,-1.2
three,2016,lee,3.6,0.3,2,
four,2015,park,2.4,0.4,3,-1.5
five,2016,park,2.9,0.5,4,-1.7


In [0]:
# 하지만 Series로 넣을 때는 val와 같이 넣으려는 data의 index에 맞춰서 
# 데이터가 들어가고, 일치하는 index가 없는 행은 NaN이 된다.
# 이점이 python list나 numpy array로 데이터를 넣을 때와 가장 큰 차이점이다.

In [45]:
df['net_points']=df['points']-df['penalty']
df

Unnamed: 0,year,name,points,penalty,zeros,debt,net_points
one,2014,lee,1.5,0.1,0,,1.4
two,2015,lee,1.7,0.2,1,-1.2,1.5
three,2016,lee,3.6,0.3,2,,3.3
four,2015,park,2.4,0.4,3,-1.5,2.0
five,2016,park,2.9,0.5,4,-1.7,2.4


In [46]:
df['high_points'] = df['net_points'] > 2.0
df

Unnamed: 0,year,name,points,penalty,zeros,debt,net_points,high_points
one,2014,lee,1.5,0.1,0,,1.4,False
two,2015,lee,1.7,0.2,1,-1.2,1.5,False
three,2016,lee,3.6,0.3,2,,3.3,True
four,2015,park,2.4,0.4,3,-1.5,2.0,False
five,2016,park,2.9,0.5,4,-1.7,2.4,True


In [47]:
# 열 삭제하기
del df['high_points']
df

Unnamed: 0,year,name,points,penalty,zeros,debt,net_points
one,2014,lee,1.5,0.1,0,,1.4
two,2015,lee,1.7,0.2,1,-1.2,1.5
three,2016,lee,3.6,0.3,2,,3.3
four,2015,park,2.4,0.4,3,-1.5,2.0
five,2016,park,2.9,0.5,4,-1.7,2.4


In [0]:
del df['net_points']
del df['zeros']

In [49]:
df

Unnamed: 0,year,name,points,penalty,debt
one,2014,lee,1.5,0.1,
two,2015,lee,1.7,0.2,-1.2
three,2016,lee,3.6,0.3,
four,2015,park,2.4,0.4,-1.5
five,2016,park,2.9,0.5,-1.7


In [0]:
# 3-2. DataFrame에서 행을 선택하고 조작하기
# Pandas에서는 DataFrame에서 행을 인덱싱하는 방법이 무수히 많다.
# 물론 위에서 소개했던 열을 선택하는 방법도 수많은 방법 중에 하나에 불과하다.

In [51]:
# 0번째 부터 2(3-1)번째 까지 가져온다.
# 뒤에 써준 숫자번째의 행은 뺀다.
df[0:3]

Unnamed: 0,year,name,points,penalty,debt
one,2014,lee,1.5,0.1,
two,2015,lee,1.7,0.2,-1.2
three,2016,lee,3.6,0.3,


In [52]:
# 아래 방법을 권장한다.
# .loc 또는 .iloc 함수를 사용하는 방법
df.loc['two']

year       2015
name        lee
points      1.7
penalty     0.2
debt       -1.2
Name: two, dtype: object

In [53]:
df.loc['two':'four']

Unnamed: 0,year,name,points,penalty,debt
two,2015,lee,1.7,0.2,-1.2
three,2016,lee,3.6,0.3,
four,2015,park,2.4,0.4,-1.5


In [54]:
df.loc['two':'four', 'points']

two      1.7
three    3.6
four     2.4
Name: points, dtype: float64

In [55]:
df.loc[:,'year']

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [56]:
## df.copy()는 값만 복사한 독립된 사본을 만든다 (numpy의 np.copy와 같은 개념). df2를 바꿔도 df는 바뀌지 않는다.
df2=df.copy()
df2

Unnamed: 0,year,name,points,penalty,debt
one,2014,lee,1.5,0.1,
two,2015,lee,1.7,0.2,-1.2
three,2016,lee,3.6,0.3,
four,2015,park,2.4,0.4,-1.5
five,2016,park,2.9,0.5,-1.7


In [0]:
# 새로운 행 삽입하기
df.loc['six',:] = [2013, 'june', 4.0, 0.1, 2.1]

In [58]:
df

Unnamed: 0,year,name,points,penalty,debt
one,2014.0,lee,1.5,0.1,
two,2015.0,lee,1.7,0.2,-1.2
three,2016.0,lee,3.6,0.3,
four,2015.0,park,2.4,0.4,-1.5
five,2016.0,park,2.9,0.5,-1.7
six,2013.0,june,4.0,0.1,2.1


In [59]:
# .iloc 사용 :: 행/열의 정수 위치(0부터 시작하는 번호)를 사용한다.
df.iloc[3]

year       2015
name       park
points      2.4
penalty     0.4
debt       -1.5
Name: four, dtype: object

In [60]:
df.iloc[3:5, 0:2]

Unnamed: 0,year,name
four,2015.0,park
five,2016.0,park


In [61]:
df.iloc[[1,3,4], [1,2]]

Unnamed: 0,name,points
two,lee,1.7
four,park,2.4
five,park,2.9


In [62]:
df.iloc[:,1:4]
#df.iloc[3,]


Unnamed: 0,name,points,penalty
one,lee,1.5,0.1
two,lee,1.7,0.2
three,lee,3.6,0.3
four,park,2.4,0.4
five,park,2.9,0.5
six,june,4.0,0.1


In [63]:
df.iloc[5,1]

'june'

In [0]:
# 4.DataFrame에서의 boolean Indexing

In [65]:
df

Unnamed: 0,year,name,points,penalty,debt
one,2014.0,lee,1.5,0.1,
two,2015.0,lee,1.7,0.2,-1.2
three,2016.0,lee,3.6,0.3,
four,2015.0,park,2.4,0.4,-1.5
five,2016.0,park,2.9,0.5,-1.7
six,2013.0,june,4.0,0.1,2.1


In [66]:
# year가 2014보다 큰 boolean data
df['year']>2014

one      False
two       True
three     True
four      True
five      True
six      False
Name: year, dtype: bool

In [67]:
# year가 2014보다 큰 모든 행의 값
df.loc[df['year']>2014,:]
df[df['year']>2014]

Unnamed: 0,year,name,points,penalty,debt
two,2015.0,lee,1.7,0.2,-1.2
three,2016.0,lee,3.6,0.3,
four,2015.0,park,2.4,0.4,-1.5
five,2016.0,park,2.9,0.5,-1.7


In [68]:
df.loc[df['name']=='lee',['points', 'penalty']]

Unnamed: 0,points,penalty
one,1.5,0.1
two,1.7,0.2
three,3.6,0.3


In [69]:
# numpy에서와 같이 논리연산을 응용할 수 있다.
# points<1.7 or points>2.9인 행을 모두 가져오세요
df.loc[(df['points']<1.7) | (df['points']>2.9),:]
###########

Unnamed: 0,year,name,points,penalty,debt
one,2014.0,lee,1.5,0.1,
three,2016.0,lee,3.6,0.3,
six,2013.0,june,4.0,0.1,2.1


In [0]:
# 새로운 값을 대입할 수도 있다.
# 예: df.loc[df['points'] > 3, 'penalty'] = 0


In [0]:
# 5.Data

In [0]:
import numpy as np
import pandas as pd

In [72]:
# DataFrame을 만들 때 index, column을 설정하지 않으면 
# 기본 값으로 0부터 시작하는 정수형 숫자로 입력된다.
df = pd.DataFrame(np.random.randn(6,4))
df

Unnamed: 0,0,1,2,3
0,1.320535,-0.08021,1.184606,1.114754
1,0.302935,-0.701178,0.156197,1.212739
2,0.770264,0.837936,0.55474,-0.708336
3,0.664644,1.011985,-0.540256,-0.402834
4,-2.342765,0.513484,0.838313,0.320525
5,3.195288,-1.984049,-2.348448,-0.140945


In [73]:
# pandas에서 제공하는 date_range 함수는 datetime 자료형으로 구성된,
# 날짜, 시각 등을 알 수 있는 자료형을 만드는 함수
df.columns = ['A','B','C','D']
df.index = pd.date_range('20190911', periods=6)
df.index

DatetimeIndex(['2019-09-11', '2019-09-12', '2019-09-13', '2019-09-14',
               '2019-09-15', '2019-09-16'],
              dtype='datetime64[ns]', freq='D')

In [74]:
df

Unnamed: 0,A,B,C,D
2019-09-11,1.320535,-0.08021,1.184606,1.114754
2019-09-12,0.302935,-0.701178,0.156197,1.212739
2019-09-13,0.770264,0.837936,0.55474,-0.708336
2019-09-14,0.664644,1.011985,-0.540256,-0.402834
2019-09-15,-2.342765,0.513484,0.838313,0.320525
2019-09-16,3.195288,-1.984049,-2.348448,-0.140945


In [75]:
# np.nan은 NaN값을 의미한다.
df['F'] = [1.0, np.nan, 3.5, 6.1, np.nan, 7.0]
df

Unnamed: 0,A,B,C,D,F
2019-09-11,1.320535,-0.08021,1.184606,1.114754,1.0
2019-09-12,0.302935,-0.701178,0.156197,1.212739,
2019-09-13,0.770264,0.837936,0.55474,-0.708336,3.5
2019-09-14,0.664644,1.011985,-0.540256,-0.402834,6.1
2019-09-15,-2.342765,0.513484,0.838313,0.320525,
2019-09-16,3.195288,-1.984049,-2.348448,-0.140945,7.0


In [0]:
# NaN 없애기

In [77]:
# 행의 값 중 하나라도 nan인 경우 그 행을 없앤다.
# 지운결과를 df에 적용:df.dropna(how='any', inplace=True)
df.dropna(how='any')

Unnamed: 0,A,B,C,D,F
2019-09-11,1.320535,-0.08021,1.184606,1.114754,1.0
2019-09-13,0.770264,0.837936,0.55474,-0.708336,3.5
2019-09-14,0.664644,1.011985,-0.540256,-0.402834,6.1
2019-09-16,3.195288,-1.984049,-2.348448,-0.140945,7.0


In [78]:
df.dropna(how='all')

Unnamed: 0,A,B,C,D,F
2019-09-11,1.320535,-0.08021,1.184606,1.114754,1.0
2019-09-12,0.302935,-0.701178,0.156197,1.212739,
2019-09-13,0.770264,0.837936,0.55474,-0.708336,3.5
2019-09-14,0.664644,1.011985,-0.540256,-0.402834,6.1
2019-09-15,-2.342765,0.513484,0.838313,0.320525,
2019-09-16,3.195288,-1.984049,-2.348448,-0.140945,7.0


In [79]:
# (위 셀) dropna(how='all')은 행의 값이 모두 nan인 경우에만 그 행을 없앤다.
# 주의: 아래처럼 괄호 없이 df.drop만 쓰면 함수가 호출되지 않고 메서드 객체 자체가 출력된다.
df.drop

<bound method DataFrame.drop of                    A         B         C         D    F
2019-09-11  1.320535 -0.080210  1.184606  1.114754  1.0
2019-09-12  0.302935 -0.701178  0.156197  1.212739  NaN
2019-09-13  0.770264  0.837936  0.554740 -0.708336  3.5
2019-09-14  0.664644  1.011985 -0.540256 -0.402834  6.1
2019-09-15 -2.342765  0.513484  0.838313  0.320525  NaN
2019-09-16  3.195288 -1.984049 -2.348448 -0.140945  7.0>

In [0]:
# 주의 drop함수는 특정 행 또는 열을 drop하고 난 DataFrame을 반환한다.
# 즉, 반환을 받지 않으면 기존의 DataFrame은 그대로이다.
# 아니면, inplace = True 라는 인자를 추가하여, 반환을 받지 않고서도
# 기존의 DataFrame이 변경되도록 한다.

In [81]:
# nan 값에 값 넣기
df.fillna(value=0.5)

Unnamed: 0,A,B,C,D,F
2019-09-11,1.320535,-0.08021,1.184606,1.114754,1.0
2019-09-12,0.302935,-0.701178,0.156197,1.212739,0.5
2019-09-13,0.770264,0.837936,0.55474,-0.708336,3.5
2019-09-14,0.664644,1.011985,-0.540256,-0.402834,6.1
2019-09-15,-2.342765,0.513484,0.838313,0.320525,0.5
2019-09-16,3.195288,-1.984049,-2.348448,-0.140945,7.0


In [82]:
# nan 값인지 확인하기
df.isnull()

Unnamed: 0,A,B,C,D,F
2019-09-11,False,False,False,False,False
2019-09-12,False,False,False,False,True
2019-09-13,False,False,False,False,False
2019-09-14,False,False,False,False,False
2019-09-15,False,False,False,False,True
2019-09-16,False,False,False,False,False


In [83]:
df.isnull()['F']

2019-09-11    False
2019-09-12     True
2019-09-13    False
2019-09-14    False
2019-09-15     True
2019-09-16    False
Freq: D, Name: F, dtype: bool

In [84]:
# F열에서 nan값을 포함하는 행만 추출하기
df.loc[df.isnull()['F'],:]

Unnamed: 0,A,B,C,D,F
2019-09-12,0.302935,-0.701178,0.156197,1.212739,
2019-09-15,-2.342765,0.513484,0.838313,0.320525,


In [85]:
pd.to_datetime('20190911')

Timestamp('2019-09-11 00:00:00')

In [86]:
# 특정 행 drop 하기, 20190911행을 삭제
df.drop(pd.to_datetime('20190911'))

Unnamed: 0,A,B,C,D,F
2019-09-12,0.302935,-0.701178,0.156197,1.212739,
2019-09-13,0.770264,0.837936,0.55474,-0.708336,3.5
2019-09-14,0.664644,1.011985,-0.540256,-0.402834,6.1
2019-09-15,-2.342765,0.513484,0.838313,0.320525,
2019-09-16,3.195288,-1.984049,-2.348448,-0.140945,7.0


In [0]:
## df.drop(df['A'])

In [88]:
# 2개 이상도 가능
df.drop([pd.to_datetime('20190912'), pd.to_datetime('20190914') ])

Unnamed: 0,A,B,C,D,F
2019-09-11,1.320535,-0.08021,1.184606,1.114754,1.0
2019-09-13,0.770264,0.837936,0.55474,-0.708336,3.5
2019-09-15,-2.342765,0.513484,0.838313,0.320525,
2019-09-16,3.195288,-1.984049,-2.348448,-0.140945,7.0


In [89]:
# 특정 열 삭제하기
df.drop('F', axis=1)

Unnamed: 0,A,B,C,D
2019-09-11,1.320535,-0.08021,1.184606,1.114754
2019-09-12,0.302935,-0.701178,0.156197,1.212739
2019-09-13,0.770264,0.837936,0.55474,-0.708336
2019-09-14,0.664644,1.011985,-0.540256,-0.402834
2019-09-15,-2.342765,0.513484,0.838313,0.320525
2019-09-16,3.195288,-1.984049,-2.348448,-0.140945


In [90]:
# 2개 이상의 열도 가능
df.drop(['B', 'D'], axis=1)

Unnamed: 0,A,C,F
2019-09-11,1.320535,1.184606,1.0
2019-09-12,0.302935,0.156197,
2019-09-13,0.770264,0.55474,3.5
2019-09-14,0.664644,-0.540256,6.1
2019-09-15,-2.342765,0.838313,
2019-09-16,3.195288,-2.348448,7.0


In [0]:
# 6.Data 분석용 함수들

In [0]:
data = [[1.4, np.nan],
       [7.1, -4.5],
       [np.nan, np.nan],
       [0.75, -1.3]]

df = pd.DataFrame(data, columns = ['one', 'two'],
                 index=['a','b','c','d'])

In [93]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [94]:
# 행 기준의 합(즉, 각 열의 합)
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [95]:
# 열 기준의 합(즉, 각 행의 합)
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [0]:
# 이 때, 위에서 볼 수 있듯이 NaN값은 배제하고 계산한다.
# NaN 값을 배제하지 않고 계산하려면 아래와 같이 skipna=False를 지정해 준다.

In [97]:
df.sum(axis=1, skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [98]:
# 특정 행 또는 특정 열에서만 계산하기
df['one'].sum()
df.loc[:,'one'].sum()
df.one.sum()

9.25

In [99]:
df.loc['b',:].sum()

2.5999999999999996

In [0]:
# pandas에서 DataFrame에 적용되는 함수들
# sum()함수 이외에도 pandas에서 DataFrame에 적용되는 함수는 다음의 것들이 있다.
# count 전체 성분의(NaN이 아닌) 값의 갯수를 계산
# min, max 전체 성분의 최솟, 최댓값을 계산
# argmin, argmax 최솟값, 최댓값이 위치한 (정수)위치를 반환 (Series에서 사용)
# idxmin, idxmax 최솟값, 최댓값이 위치한 인덱스(레이블)를 반환
# quantile 전체 성분의 특정 사분위수에 해당하는 값을 반환 (0~1 사이)
# sum 전체 성분의 합을 계산
# mean 전체 성분의 평균을 계산
# median 전체 성분의 중간값을 반환
# mad 전체 성분의 평균값으로부터의 절대 편차(absolute deviation)의 평균을 계산 (pandas 2.0부터 제거됨)
# std, var 전체 성분의 표준편차, 분산을 계산
# cumsum 맨 첫 번째 성분부터 각 성분까지의 누적합을 계산(0에서부터 계속 더해짐)
# cumprod 맨 첫번째 성분부터 각 성분까지의 누적곱을 계산(1에서부터 계속 곱해짐)

In [101]:
df2 = pd.DataFrame(np.random.randn(6,4),
                  columns=['A', 'B', 'C', 'D'],
                  index = pd.date_range('20160701', periods=6))
df2

Unnamed: 0,A,B,C,D
2016-07-01,-0.253484,-1.296295,0.223411,-0.598723
2016-07-02,-0.288425,1.54089,-0.052655,1.385532
2016-07-03,1.956314,-0.918725,0.766867,1.791733
2016-07-04,-1.588016,0.283566,-1.080766,1.122526
2016-07-05,0.036468,-0.582144,0.848848,0.474378
2016-07-06,-0.715391,-0.320414,1.093023,-1.317094


In [102]:
# A열과 B열의 상관계수 구하기
df2['A'].corr(df2['B'])

-0.39304525263946344

In [0]:
# 정렬함수 및 기타함수

In [104]:
dates = df2.index
random_dates = np.random.permutation(dates)
df2 = df2.reindex(index=random_dates, columns = ['D', 'B', 'C', 'A'])
df2

Unnamed: 0,D,B,C,A
2016-07-06,-1.317094,-0.320414,1.093023,-0.715391
2016-07-05,0.474378,-0.582144,0.848848,0.036468
2016-07-04,1.122526,0.283566,-1.080766,-1.588016
2016-07-03,1.791733,-0.918725,0.766867,1.956314
2016-07-02,1.385532,1.54089,-0.052655,-0.288425
2016-07-01,-0.598723,-1.296295,0.223411,-0.253484


In [105]:
# index와 column의 순서가 섞여있다.
# 이 때 index가 오름차순이 되도록 정렬해보자
df2.sort_index(axis=0)

Unnamed: 0,D,B,C,A
2016-07-01,-0.598723,-1.296295,0.223411,-0.253484
2016-07-02,1.385532,1.54089,-0.052655,-0.288425
2016-07-03,1.791733,-0.918725,0.766867,1.956314
2016-07-04,1.122526,0.283566,-1.080766,-1.588016
2016-07-05,0.474378,-0.582144,0.848848,0.036468
2016-07-06,-1.317094,-0.320414,1.093023,-0.715391


In [106]:
# column을 기준으로?
df2.sort_index(axis=1)

Unnamed: 0,A,B,C,D
2016-07-06,-0.715391,-0.320414,1.093023,-1.317094
2016-07-05,0.036468,-0.582144,0.848848,0.474378
2016-07-04,-1.588016,0.283566,-1.080766,1.122526
2016-07-03,1.956314,-0.918725,0.766867,1.791733
2016-07-02,-0.288425,1.54089,-0.052655,1.385532
2016-07-01,-0.253484,-1.296295,0.223411,-0.598723


In [107]:
# 내림차순으로는?
df2.sort_index(axis=0, ascending=False)

Unnamed: 0,D,B,C,A
2016-07-06,-1.317094,-0.320414,1.093023,-0.715391
2016-07-05,0.474378,-0.582144,0.848848,0.036468
2016-07-04,1.122526,0.283566,-1.080766,-1.588016
2016-07-03,1.791733,-0.918725,0.766867,1.956314
2016-07-02,1.385532,1.54089,-0.052655,-0.288425
2016-07-01,-0.598723,-1.296295,0.223411,-0.253484


In [108]:
# 값 기준 정렬하기
# D열의 값이 오름차순이 되도록 정렬하기
df2.sort_values(by='D')

Unnamed: 0,D,B,C,A
2016-07-06,-1.317094,-0.320414,1.093023,-0.715391
2016-07-01,-0.598723,-1.296295,0.223411,-0.253484
2016-07-05,0.474378,-0.582144,0.848848,0.036468
2016-07-04,1.122526,0.283566,-1.080766,-1.588016
2016-07-02,1.385532,1.54089,-0.052655,-0.288425
2016-07-03,1.791733,-0.918725,0.766867,1.956314


In [109]:
# B열의 값이 내림차순이 되도록 정렬하기
df2.sort_values(by='B', ascending=False)

Unnamed: 0,D,B,C,A
2016-07-02,1.385532,1.54089,-0.052655,-0.288425
2016-07-04,1.122526,0.283566,-1.080766,-1.588016
2016-07-06,-1.317094,-0.320414,1.093023,-0.715391
2016-07-05,0.474378,-0.582144,0.848848,0.036468
2016-07-03,1.791733,-0.918725,0.766867,1.956314
2016-07-01,-0.598723,-1.296295,0.223411,-0.253484


In [110]:
df2['E'] = np.random.randint(0, 6, size=6)
df2['F'] = ['a', 'b', 'g', 'g', 'a', 'g']
df2

Unnamed: 0,D,B,C,A,E,F
2016-07-06,-1.317094,-0.320414,1.093023,-0.715391,4,a
2016-07-05,0.474378,-0.582144,0.848848,0.036468,1,b
2016-07-04,1.122526,0.283566,-1.080766,-1.588016,0,g
2016-07-03,1.791733,-0.918725,0.766867,1.956314,1,g
2016-07-02,1.385532,1.54089,-0.052655,-0.288425,4,a
2016-07-01,-0.598723,-1.296295,0.223411,-0.253484,5,g


In [111]:
# E열과 F열을 동시에 고려하여, 오름차순으로 하려면?
df2.sort_values(by=['E', 'F'])

Unnamed: 0,D,B,C,A,E,F
2016-07-04,1.122526,0.283566,-1.080766,-1.588016,0,g
2016-07-05,0.474378,-0.582144,0.848848,0.036468,1,b
2016-07-03,1.791733,-0.918725,0.766867,1.956314,1,g
2016-07-06,-1.317094,-0.320414,1.093023,-0.715391,4,a
2016-07-02,1.385532,1.54089,-0.052655,-0.288425,4,a
2016-07-01,-0.598723,-1.296295,0.223411,-0.253484,5,g


In [112]:
# 지정한 행 또는 열에서 중복값을 제외한 유니크한 값만 얻기
df2['F'].unique()

array(['a', 'b', 'g'], dtype=object)

In [113]:
# 지정한 행 또는 열에서 값에 따른 개수 얻기
df2['F'].value_counts()

g    3
a    2
b    1
Name: F, dtype: int64

In [114]:
# 지정한 행 또는 열에서 입력한 값이 있는지 확인하기
df2['F'].isin(['a','b'])

2016-07-06     True
2016-07-05     True
2016-07-04    False
2016-07-03    False
2016-07-02     True
2016-07-01    False
Name: F, dtype: bool

In [116]:
# F열의 값이 'a'나 'b'인 모든 행 구하기
df2.loc[df2['F'].isin(['a','b']),:]

Unnamed: 0,D,B,C,A,E,F
2016-07-06,-1.317094,-0.320414,1.093023,-0.715391,4,a
2016-07-05,0.474378,-0.582144,0.848848,0.036468,1,b
2016-07-02,1.385532,1.54089,-0.052655,-0.288425,4,a


In [0]:
# 사용자가 직접 만든 함수를 적용하기
del df2['F']

In [125]:
def func(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` must provide ``max()`` and ``min()`` methods (for example a
    pandas Series or a numpy array).
    """
    return x.max() - x.min()
df2.apply(func, axis=0)  

D    3.108827
B    2.837185
C    2.173788
A    3.544330
E    5.000000
dtype: float64