## 1. Pandas란?

Pandas는 파이썬에서 사용하는 데이터 분석 라이브러리로, 행과 열로 이루어진 데이터 객체를 만들어 다룰 수 있으며 대용량의 데이터를 보다 안정적이고 편리하게 처리할 수 있는 도구이다.<br><br>
먼저 Pandas를 사용하기 위해서는 pandas를 설치한 이후에 아래와 같이 import 해야 한다.

In [3]:
# pandas 사용하기
import numpy as np
import pandas as pd

## 2. Pandas 자료구조
Pandas에서는 기본적으로 정의되는 자료구조인 Series와 DataFrame을 사용한다.<br>
이 자료구조들은 빅 데이터 분석에 있어 높은 수준의 성능을 보여준다.

## 2-1. Series

In [25]:
# Define a Series.
# A Series pairs each value with an index label; when no index is given the
# labels default to 0..n-1 (see the obj.index cell below).

obj = pd.Series([4, 7, -5, 3])
# Only the last expression of a cell is echoed, so this bare `obj` displays
# nothing here (the recorded output shows only the array).
obj

# A plain NumPy array with no index labels.
# NOTE(review): presumably included to contrast with the Series — confirm intent.
a = np.array([1,2,3,4])
a

array([1, 2, 3, 4])

In [4]:
# Series의 값만 확인하기
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [5]:
# Series의 인덱스만 확인하기
obj.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
# Series의 자료형 확인하기
obj.dtype

dtype('int64')

In [9]:
# 인덱스를 바꿀 수도 있다.
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [3]:
# A Python dict can be turned directly into a Series:
# the dict keys become the index labels and the dict values become the data.
sdata = {
    'kim': 35000,
    'Beone': 67000,
    'Jin': 12000,
    'Choi': 40000,
}
obj3 = pd.Series(sdata)

# Show the raw dict, one blank separator line, then the resulting Series.
print(sdata, end="\n\n")
print(obj3)

{'kim': 35000, 'Beone': 67000, 'Jin': 12000, 'Choi': 40000}

kim      35000
Beone    67000
Jin      12000
Choi     40000
dtype: int64


In [16]:
# Series name과 index name 주기

obj3.name = "Salary"
obj3.index.name = "Names"

print(obj3.index)
print(obj3.values)
print(obj3.name)

print()

print(obj3)

Index(['kim', 'Beone', 'Jin', 'Choi'], dtype='object', name='Names')
[35000 67000 12000 40000]
Salary

Names
kim      35000
Beone    67000
Jin      12000
Choi     40000
Name: Salary, dtype: int64


In [20]:
# index 변경

obj3.index = ['A', 'B', 'C', 'D',]
obj3

A    35000
B    67000
C    12000
D    40000
Name: Salary, dtype: int64

## 2-2. Data Frame


In [5]:
# Define a DataFrame.
# The data that goes into the DataFrame has to be defined first; it can be
# given as a Python dictionary or as a NumPy array.

# Each dict key becomes a column name; each list holds that column's row values.

# NOTE(review): the last year, 205, looks like a typo for a four-digit year,
# but every output recorded in this notebook was produced with 205 — confirm
# before changing it.
data = {
    "name": ["Beone", "Beone", "Beone", "Kim", "Park"],
    "year": [2013, 2014, 2015, 2016, 205],
    "points": [1.5, 1.7, 3.6, 2.4, 2.9],
}

# The result is a table with a row/column structure.
df = pd.DataFrame(data=data)

df

Unnamed: 0,name,year,points
0,Beone,2013,1.5
1,Beone,2014,1.7
2,Beone,2015,3.6
3,Kim,2016,2.4
4,Park,205,2.9


In [21]:
# 행 방향의 index

df.index

RangeIndex(start=0, stop=5, step=1)

In [None]:
# 열 방향의 index

In [22]:
df.columns

Index(['name', 'year', 'points'], dtype='object')

In [24]:
# 값
df.values

array([['Beone', 2013, 1.5],
       ['Beone', 2014, 1.7],
       ['Beone', 2015, 3.6],
       ['Kim', 2016, 2.4],
       ['Park', 205, 2.9]], dtype=object)

In [25]:
# 각 인덱스에 대한 이름 설정
df.index.name = 'Num'
df.columns.name = 'Info'
df

Info,name,year,points
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Beone,2013,1.5
1,Beone,2014,1.7
2,Beone,2015,3.6
3,Kim,2016,2.4
4,Park,205,2.9


In [6]:
# DataFrame을 만들어서 columns와 index를 설정할 수 있다.
df2 = pd.DataFrame(data, columns=['year', 'name', 'points', 'penalty'],
                       index=['one', 'two', 'three', 'four', 'five']
               )
df2

Unnamed: 0,year,name,points,penalty
one,2013,Beone,1.5,
two,2014,Beone,1.7,
three,2015,Beone,3.6,
four,2016,Kim,2.4,
five,205,Park,2.9,


DataFrame을 정의하면서, data로 들어가는 python dictionary와 columns의 순서가 달라도 알아서 맞춰서 정의된다.<br>
하지만 data에 포함되어 있지 않은 값은 NaN(Not a Number)으로 나타나게 되는데, 이는 null과 같은 개념이다.<br>
NaN 값은 그대로 두면 이후의 계산에서 올바르게 처리되지 않는 데이터이다. 따라서 올바른 데이터 처리를 위해 추가적으로 값을 넣어줘야 한다.

In [7]:
# describe() 함수는 DataFrame의 계산 가능한 값들에 대한 다양한 계산 값을 보여준다.

df2.describe()

Unnamed: 0,year,points
count,5.0,5.0
mean,1652.6,2.42
std,809.233773,0.864292
min,205.0,1.5
25%,2013.0,1.7
50%,2014.0,2.4
75%,2015.0,2.9
max,2016.0,3.6


In [9]:
#df2.info
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, one to five
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   year     5 non-null      int64  
 1   name     5 non-null      object 
 2   points   5 non-null      float64
 3   penalty  0 non-null      object 
dtypes: float64(1), int64(1), object(2)
memory usage: 200.0+ bytes


## 3. DataFrame Indexing

In [10]:
# Sample records for the indexing examples.
data = {
    "names": ["Beone", "Beone", "Beone", "Kim", "Kim"],
    "year": [2013, 2014, 2015, 2016, 205],
    "points": [1.5, 1.7, 3.6, 2.4, 2.9],
}

# 'penalty' is not a key of `data`, so that column is created with missing
# values; the rows are labelled 'one' .. 'five' instead of 0 .. 4.
df = pd.DataFrame(
    data=data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['year', 'names', 'points', 'penalty'],
)

df



Unnamed: 0,year,names,points,penalty
one,2013,Beone,1.5,
two,2014,Beone,1.7,
three,2015,Beone,3.6,
four,2016,Kim,2.4,
five,205,Kim,2.9,


## 3-1. DataFrame에서 열을 선택하고 조작하기

In [41]:
df['year']

one      2013
two      2014
three    2015
four     2016
five      205
Name: year, dtype: int64

In [42]:
df.year

one      2013
two      2014
three    2015
four     2016
five      205
Name: year, dtype: int64

In [43]:
df[['year', 'points']]

Unnamed: 0,year,points
one,2013,1.5
two,2014,1.7
three,2015,3.6
four,2016,2.4
five,205,2.9


In [None]:
# 특정 열에 대해 선택하고, 원하는 값을 대입할 수 있다.

In [44]:
df['penalty'] = 0.5

In [45]:
df

Unnamed: 0,year,names,points,penalty
one,2013,Beone,1.5,0.5
two,2014,Beone,1.7,0.5
three,2015,Beone,3.6,0.5
four,2016,Kim,2.4,0.5
five,205,Kim,2.9,0.5


In [40]:
df['penalty'] = [0.1, 0.2, 0.3, 0.4, 0.5]
df

Unnamed: 0,names,year,points,penalty
one,Beone,2013,1.5,0.1
two,Beone,2014,1.7,0.2
three,Beone,2015,3.6,0.3
four,Kim,2016,2.4,0.4
five,Kim,205,2.9,0.5


In [41]:
# 새로운 열 추가하기
df['zeros'] = np.arange(5)
df

Unnamed: 0,names,year,points,penalty,zeros
one,Beone,2013,1.5,0.1,0
two,Beone,2014,1.7,0.2,1
three,Beone,2015,3.6,0.3,2
four,Kim,2016,2.4,0.4,3
five,Kim,205,2.9,0.5,4


In [42]:
# Series를 추가할 수도 있다.
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
df['debt'] = val
df

Unnamed: 0,names,year,points,penalty,zeros,debt
one,Beone,2013,1.5,0.1,0,
two,Beone,2014,1.7,0.2,1,-1.2
three,Beone,2015,3.6,0.3,2,
four,Kim,2016,2.4,0.4,3,-1.5
five,Kim,205,2.9,0.5,4,-1.7


Series로 넣을 때는 val처럼 Series에 지정된 index에 맞춰서 데이터가 들어가고, 대응하는 index가 없는 행은 NaN이 된다.<br>
이 점이 index와 상관없이 순서대로 값이 들어가는 python의 list나 numpy의 array로 넣을 때와의 차이점이다.

In [43]:
df['net_points'] = df['points'] - df['penalty']

In [44]:
df['high_points'] = df['net_points']  > 2.0

In [45]:
df

Unnamed: 0,names,year,points,penalty,zeros,debt,net_points,high_points
one,Beone,2013,1.5,0.1,0,,1.4,False
two,Beone,2014,1.7,0.2,1,-1.2,1.5,False
three,Beone,2015,3.6,0.3,2,,3.3,True
four,Kim,2016,2.4,0.4,3,-1.5,2.0,False
five,Kim,205,2.9,0.5,4,-1.7,2.4,True


In [71]:
# 열 삭제하기
del df['high_points']


In [72]:
df

Unnamed: 0,year,names,points,penalty,zeros,debt,net_points
one,2013,Beone,1.5,0.1,0,,1.4
two,2014,Beone,1.7,0.2,1,-1.2,1.5
three,2015,Beone,3.6,0.3,2,,3.3
four,2016,Kim,2.4,0.4,3,-1.5,2.0
five,205,Kim,2.9,0.5,4,-1.7,2.4


In [76]:
del df['net_points']
del df['zeros']

In [77]:
df

Unnamed: 0,year,names,points,penalty,debt
one,2013,Beone,1.5,0.1,
two,2014,Beone,1.7,0.2,-1.2
three,2015,Beone,3.6,0.3,
four,2016,Kim,2.4,0.4,-1.5
five,205,Kim,2.9,0.5,-1.7


In [78]:
df.columns

Index(['year', 'names', 'points', 'penalty', 'debt'], dtype='object')

In [80]:
df.index.name = 'Order'
df.columns.name = 'Info'
df

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2013,Beone,1.5,0.1,
two,2014,Beone,1.7,0.2,-1.2
three,2015,Beone,3.6,0.3,
four,2016,Kim,2.4,0.4,-1.5
five,205,Kim,2.9,0.5,-1.7


## 3-2. DataFrame에서 행을 선택하고 조작하기
pandas에서는 DataFrame에서 행을 인덱싱하는 방법이 무수히 많다.<br>
물론 위에서 소개했던 열을 선택하는 방법도 수많은 방법중에 하나에 불과하다.


In [81]:
# 0 번째 부터 3- 1 (2) 번째까지 가져온다.
# 뒤에 써준 숫자 번째의 행을 뺀다.
df[0:3]

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2013,Beone,1.5,0.1,
two,2014,Beone,1.7,0.2,-1.2
three,2015,Beone,3.6,0.3,


In [82]:
# two 라는 행부터 four이라는 행까지 가져온다.
# 뒤에 써준 이름의 행을 빼지 않는다. 하지만 추천하지는 않는다.
df['two' : 'four']

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
two,2014,Beone,1.7,0.2,-1.2
three,2015,Beone,3.6,0.3,
four,2016,Kim,2.4,0.4,-1.5


In [83]:
# 아래 방법을 권장한다.
# .loc or .iloc 함수를 사용하는 방법

# loc는 "index"를 기준으로 뽑는다.
df.loc['two']

Info
year        2014
names      Beone
points       1.7
penalty      0.2
debt        -1.2
Name: two, dtype: object

In [48]:
# Label-based slice: with .loc BOTH endpoints are included ('two' through 'four').
# This is not the last expression of the cell, so its result is not displayed.
df.loc['two' : 'four']
# .loc works on index labels, and this index holds strings, so the integer
# slice below raises the TypeError shown in the output.
# For position-based slicing use .iloc (as in the df.iloc[3:5, 0:2] cell further down).
df.loc[:3]

TypeError: cannot do slice indexing on Index with these indexers [3] of type int

In [85]:
# df.loc['two' : 'four']의 points 가져오기
df.loc['two' : 'four', 'points']

Order
two      1.7
three    3.6
four     2.4
Name: points, dtype: float64

In [86]:
df.loc[ :, 'year'] # == df['year']

Order
one      2013
two      2014
three    2015
four     2016
five      205
Name: year, dtype: int64

In [89]:
df.loc[ :, ['year', 'names']] # == df[['year', 'names']]

Info,year,names
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,2013,Beone
two,2014,Beone
three,2015,Beone
four,2016,Kim
five,205,Kim


In [91]:
df.loc['three' : 'five', 'year' : 'penalty'] 

Info,year,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
three,2015,Beone,3.6,0.3
four,2016,Kim,2.4,0.4
five,205,Kim,2.9,0.5


In [92]:
# 새로운 행 삽입하기
df.loc['six', :] = [2013, 'Jun', 4.0, 0.1, 2.1]

In [93]:
df

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2013.0,Beone,1.5,0.1,
two,2014.0,Beone,1.7,0.2,-1.2
three,2015.0,Beone,3.6,0.3,
four,2016.0,Kim,2.4,0.4,-1.5
five,205.0,Kim,2.9,0.5,-1.7
six,2013.0,Jun,4.0,0.1,2.1


In [94]:
# 위치 3의 행(0부터 세므로 네 번째 행) 가져오기

# iloc는 행 번호를 통해 가져온다.
df.iloc[3]

Info
year       2016.0
names         Kim
points        2.4
penalty       0.4
debt         -1.5
Name: four, dtype: object

In [96]:
df.iloc[3:5, 0:2]

Info,year,names
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
four,2016.0,Kim
five,205.0,Kim


In [97]:
df.iloc[[0, 1, 3], [1, 2]]

Info,names,points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,Beone,1.5
two,Beone,1.7
four,Kim,2.4


## 4. DataFrame에서의 boolean Indexing

In [98]:
df

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2013.0,Beone,1.5,0.1,
two,2014.0,Beone,1.7,0.2,-1.2
three,2015.0,Beone,3.6,0.3,
four,2016.0,Kim,2.4,0.4,-1.5
five,205.0,Kim,2.9,0.5,-1.7
six,2013.0,Jun,4.0,0.1,2.1


In [49]:
# year가 2014보다 큰 boolean data
df['year'] > 2014

Order
one      False
two      False
three     True
four      True
five     False
six      False
Name: year, dtype: bool

In [99]:
# year가 2014보다 큰 모든 행의 값
df.loc[df['year'] > 2014, :]

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
three,2015.0,Beone,3.6,0.3,
four,2016.0,Kim,2.4,0.4,-1.5


In [102]:
df.loc[df['names'] == 'Beone', ['names', 'points']]

Info,names,points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,Beone,1.5
two,Beone,1.7
three,Beone,3.6


In [103]:
# numpy에서와 같이 논리연산을 응용할 수 있다.
df.loc[(df['points'] > 2) & (df['points'] < 3), :]

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
four,2016.0,Kim,2.4,0.4,-1.5
five,205.0,Kim,2.9,0.5,-1.7


In [104]:
# 새로운 값을 대입할 수도 있다.
df.loc[df['points'] > 3, 'penalty'] = 0

In [105]:
df

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2013.0,Beone,1.5,0.1,
two,2014.0,Beone,1.7,0.2,-1.2
three,2015.0,Beone,3.6,0.0,
four,2016.0,Kim,2.4,0.4,-1.5
five,205.0,Kim,2.9,0.5,-1.7
six,2013.0,Jun,4.0,0.0,2.1


## 5. Data

In [53]:
# DataFrame을 만들 때 index, column을 설정하지 않으면 기본값으로 0부터 시작하는 정수형 숫자로 입력된다.
df = pd.DataFrame(np.random.randn(6, 4))
df

Unnamed: 0,0,1,2,3
0,1.641797,-0.089163,0.460845,0.63769
1,-0.982068,0.184405,-0.975307,-0.371418
2,-0.802731,1.209022,-0.018558,0.912375
3,1.042689,-2.222481,-1.617364,-1.350829
4,0.093143,0.639452,-1.216908,1.332612
5,0.330796,-0.650498,-0.009931,1.126996


In [11]:
# Label the four columns.
df.columns = ['A', 'B', 'C', 'D']
# pd.date_range builds a DatetimeIndex (dates / timestamps) starting at the
# given date. The number of periods is taken from the frame itself rather than
# hard-coded: assigning an index whose length differs from the number of rows
# raises "ValueError: Length mismatch".
df.index = pd.date_range('20160701', periods=len(df))
df.index


DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',
               '2016-07-05', '2016-07-06'],
              dtype='datetime64[ns]', freq='D')

In [55]:
df

Unnamed: 0,A,B,C,D
2016-07-01,1.641797,-0.089163,0.460845,0.63769
2016-07-02,-0.982068,0.184405,-0.975307,-0.371418
2016-07-03,-0.802731,1.209022,-0.018558,0.912375
2016-07-04,1.042689,-2.222481,-1.617364,-1.350829
2016-07-05,0.093143,0.639452,-1.216908,1.332612
2016-07-06,0.330796,-0.650498,-0.009931,1.126996


In [12]:
# np.nan은 NaN을 의미한다.
df['F'] = [1.0, np.nan, 3.5, 6.1, np.nan, 7.0]
df

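Unnamed: 0,A,B,C,D,F
2016-07-01,1.641797,-0.089163,0.460845,0.63769,1.0
2016-07-02,-0.982068,0.184405,-0.975307,-0.371418,
2016-07-03,-0.802731,1.209022,-0.018558,0.912375,3.5
2016-07-04,1.042689,-2.222481,-1.617364,-1.350829,6.1
2016-07-05,0.093143,0.639452,-1.216908,1.332612,
2016-07-06,0.330796,-0.650498,-0.009931,1.126996,7.0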

In [114]:
# 행의 값 중 하나라도 nan인 경우 그 행을 없앤다.
df.dropna(how='any')

Unnamed: 0,A,B,C,D,F
2016-07-01,-0.465512,-0.494371,-0.326439,1.86443,1.0
2016-07-03,-0.47548,-1.14463,-2.492511,1.285889,3.5
2016-07-04,-0.372031,-1.042399,-1.21774,0.131508,6.1
2016-07-06,0.793704,0.190004,0.396708,-0.782454,7.0


In [115]:
# 행의 값 모두 nan인 경우 그 행을 없앤다.
df.dropna(how='all')

Unnamed: 0,A,B,C,D,F
2016-07-01,-0.465512,-0.494371,-0.326439,1.86443,1.0
2016-07-02,-0.441491,0.313819,-0.243551,-0.386109,
2016-07-03,-0.47548,-1.14463,-2.492511,1.285889,3.5
2016-07-04,-0.372031,-1.042399,-1.21774,0.131508,6.1
2016-07-05,-0.070778,-0.184434,-0.13126,0.991134,
2016-07-06,0.793704,0.190004,0.396708,-0.782454,7.0


<strong>주의</strong><br>
drop 함수는 특정 행 또는 열을 drop하고 난 DataFrame을 반환한다.<br>
즉, 반환을 받지 않으면 기존의 DataFrame은 그대로이다.<br>
아니면, inplace = True라는 인자를 추가하여 반환을 받지 않고서도 기존의 DataFrame이 변경되도록 한다.

In [116]:
# nan값에 값 넣기
df.fillna(value=0.5)

Unnamed: 0,A,B,C,D,F
2016-07-01,-0.465512,-0.494371,-0.326439,1.86443,1.0
2016-07-02,-0.441491,0.313819,-0.243551,-0.386109,0.5
2016-07-03,-0.47548,-1.14463,-2.492511,1.285889,3.5
2016-07-04,-0.372031,-1.042399,-1.21774,0.131508,6.1
2016-07-05,-0.070778,-0.184434,-0.13126,0.991134,0.5
2016-07-06,0.793704,0.190004,0.396708,-0.782454,7.0


In [118]:
# nan값인지 확인하기
df.isnull()

Unnamed: 0,A,B,C,D,F
2016-07-01,False,False,False,False,False
2016-07-02,False,False,False,False,True
2016-07-03,False,False,False,False,False
2016-07-04,False,False,False,False,False
2016-07-05,False,False,False,False,True
2016-07-06,False,False,False,False,False


In [119]:
# F열에서 nan값을 포함하는 행만 추출하기
df.loc[df.isnull()['F'], :]

Unnamed: 0,A,B,C,D,F
2016-07-02,-0.441491,0.313819,-0.243551,-0.386109,
2016-07-05,-0.070778,-0.184434,-0.13126,0.991134,


In [121]:
pd.to_datetime('20160701')

Timestamp('2016-07-01 00:00:00')

In [58]:
# 특정 행 drop하기
df.drop(pd.to_datetime('20160701'))

Unnamed: 0,A,B,C,D
2016-07-02,-0.982068,0.184405,-0.975307,-0.371418
2016-07-03,-0.802731,1.209022,-0.018558,0.912375
2016-07-04,1.042689,-2.222481,-1.617364,-1.350829
2016-07-05,0.093143,0.639452,-1.216908,1.332612
2016-07-06,0.330796,-0.650498,-0.009931,1.126996


In [56]:
# 여러 행 drop하기
df.drop([pd.to_datetime('20160702'), pd.to_datetime('20160704')])

Unnamed: 0,A,B,C,D
2016-07-01,1.641797,-0.089163,0.460845,0.63769
2016-07-03,-0.802731,1.209022,-0.018558,0.912375
2016-07-05,0.093143,0.639452,-1.216908,1.332612
2016-07-06,0.330796,-0.650498,-0.009931,1.126996


In [60]:
# 특정 열 삭제하기
df.drop('D', axis=1)

Unnamed: 0,A,B,C
2016-07-01,1.641797,-0.089163,0.460845
2016-07-02,-0.982068,0.184405,-0.975307
2016-07-03,-0.802731,1.209022,-0.018558
2016-07-04,1.042689,-2.222481,-1.617364
2016-07-05,0.093143,0.639452,-1.216908
2016-07-06,0.330796,-0.650498,-0.009931


In [61]:
# 여러 열 삭제하기
df.drop(['B', 'D'], axis=1)

Unnamed: 0,A,C
2016-07-01,1.641797,0.460845
2016-07-02,-0.982068,-0.975307
2016-07-03,-0.802731,-0.018558
2016-07-04,1.042689,-1.617364
2016-07-05,0.093143,-1.216908
2016-07-06,0.330796,-0.009931


## 6. Data 분석용 함수들

In [133]:
# A small 4x2 table containing missing values (np.nan).
data = [
    [1.4, np.nan],
    [7.1, -4.5],
    [np.nan, np.nan],
    [0.75, -1.3],
]

# Show the raw nested list followed by one blank line.
print(data, end="\n\n")

# Rows are labelled 'a'..'d', columns 'one' and 'two'.
df = pd.DataFrame(data, index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
df

[[1.4, nan], [7.1, -4.5], [nan, nan], [0.75, -1.3]]



Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [135]:
# 각 행의 합(axis=1: 열 방향, 즉 가로로 더한다)
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [136]:
# 각 열의 합(axis=0: 행 방향, 즉 세로로 더한다)
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [137]:
df.sum(axis=1, skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [138]:
# 특정 행 또는 열에서만 계산하기
df['one'].sum()

9.25

In [139]:
df.loc['b'].sum()

2.5999999999999996

<strong>pandas에서 DataFrame에 적용되는 함수들</strong><br>
sum()함수 이외에도 pandas에서 DataFrame에 적용되는 함수는 다음의 것들이 있다.<br>
- count : 전체 성분의(NaN이 아닌) 값의 개수를 계산
- min, max : 전체 성분의 최솟값, 최댓값을 계산
- argmin, argmax : 전체 성분의 최솟값, 최댓값이 위치한 인덱스(정수)를 반환
- idxmin, idxmax : 전체 성분의 최솟값, 최댓값이 위치한 인덱스(라벨)를 반환
- quantile : 전체 성분의 특정 분위수에 해당하는 값을 반환(분위는 0 ~ 1 사이의 값으로 지정)
- sum : 전체 성분의 합을 계산
- mean : 전체 성분의 평균을 계산
- median : 전체 성분의 중간값을 반환
- mad : 전체 성분의 평균값으로부터의 절대 편차(absolute deviation)의 평균을 계산
- std, var : 전체 성분의 표준편차, 분산을 계산
- cumsum : 맨 첫 번째 성분부터 각 성분까지의 누적합을 계산(0 에서부터 계속 더해짐)
- cumprod : 맨 첫 번째 성분부터 각 성분까지의 누적곱을 계산(1 에서부터 계속 곱해짐)

In [145]:
# Six consecutive dates starting 2016-07-01, with random values drawn from
# np.random.randn in columns A-D.
df2 = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=pd.date_range('20160701', periods=6),
    columns=['A', 'B', 'C', 'D'],
)
df2

Unnamed: 0,A,B,C,D
2016-07-01,0.616345,1.782325,0.631155,0.126684
2016-07-02,0.00524,0.333488,1.912416,0.077316
2016-07-03,0.178285,1.08746,0.702623,-0.165415
2016-07-04,-0.82048,0.090101,0.881887,1.64379
2016-07-05,1.332414,0.497388,0.169174,1.050388
2016-07-06,-1.614792,-0.940818,-0.519718,-0.63695


In [146]:
# A열과 B열의 상관계수 구하기
df2['A'].corr(df2['B'])

0.7360048960509359

In [147]:
# B열과 C열의 공분산 구하기
df2['B'].cov(df2['C'])

0.2770071635434532

### 정렬함수 및 기타 함수

In [148]:
dates = df2.index
random_dates = np.random.permutation(dates)
df2 = df2.reindex(index=random_dates, columns=['D', 'B', 'C', 'A',])
df2

Unnamed: 0,D,B,C,A
2016-07-03,-0.165415,1.08746,0.702623,0.178285
2016-07-02,0.077316,0.333488,1.912416,0.00524
2016-07-06,-0.63695,-0.940818,-0.519718,-1.614792
2016-07-05,1.050388,0.497388,0.169174,1.332414
2016-07-04,1.64379,0.090101,0.881887,-0.82048
2016-07-01,0.126684,1.782325,0.631155,0.616345


In [156]:
# index와 column의 순서가 섞여있다.
# 이때 index가 오름차순이 되도록 정렬해보자

df2.sort_index(axis=0)

Unnamed: 0,D,B,C,A
2016-07-01,0.126684,1.782325,0.631155,0.616345
2016-07-02,0.077316,0.333488,1.912416,0.00524
2016-07-03,-0.165415,1.08746,0.702623,0.178285
2016-07-04,1.64379,0.090101,0.881887,-0.82048
2016-07-05,1.050388,0.497388,0.169174,1.332414
2016-07-06,-0.63695,-0.940818,-0.519718,-1.614792


In [150]:
# 내림차순
df2.sort_index(axis=0, ascending = False)

Unnamed: 0,D,B,C,A
2016-07-06,-0.63695,-0.940818,-0.519718,-1.614792
2016-07-05,1.050388,0.497388,0.169174,1.332414
2016-07-04,1.64379,0.090101,0.881887,-0.82048
2016-07-03,-0.165415,1.08746,0.702623,0.178285
2016-07-02,0.077316,0.333488,1.912416,0.00524
2016-07-01,0.126684,1.782325,0.631155,0.616345


In [154]:
# 값 기준 정렬하기
# D열의 값이 오름차순이 되도록 정렬하기
df2.sort_values(by='D')

Unnamed: 0,D,B,C,A
2016-07-06,-0.63695,-0.940818,-0.519718,-1.614792
2016-07-03,-0.165415,1.08746,0.702623,0.178285
2016-07-02,0.077316,0.333488,1.912416,0.00524
2016-07-01,0.126684,1.782325,0.631155,0.616345
2016-07-05,1.050388,0.497388,0.169174,1.332414
2016-07-04,1.64379,0.090101,0.881887,-0.82048


In [157]:
# B열의 값이 내림차순이 되도록 정렬하기
df2.sort_values(by='B', ascending=False)

Unnamed: 0,D,B,C,A
2016-07-01,0.126684,1.782325,0.631155,0.616345
2016-07-03,-0.165415,1.08746,0.702623,0.178285
2016-07-05,1.050388,0.497388,0.169174,1.332414
2016-07-02,0.077316,0.333488,1.912416,0.00524
2016-07-04,1.64379,0.090101,0.881887,-0.82048
2016-07-06,-0.63695,-0.940818,-0.519718,-1.614792


In [158]:
df2['E'] = np.random.randint(0, 6, size=6)
df2['F'] = ["alpha", 'beta', 'gamma', 'gamma', 'alpha', 'gamma']
df2

Unnamed: 0,D,B,C,A,E,F
2016-07-03,-0.165415,1.08746,0.702623,0.178285,5,alpha
2016-07-02,0.077316,0.333488,1.912416,0.00524,2,beta
2016-07-06,-0.63695,-0.940818,-0.519718,-1.614792,0,gamma
2016-07-05,1.050388,0.497388,0.169174,1.332414,4,gamma
2016-07-04,1.64379,0.090101,0.881887,-0.82048,5,alpha
2016-07-01,0.126684,1.782325,0.631155,0.616345,5,gamma


In [159]:
# E열과 F열을 동시에 고려하여 오름차순으로 정렬
df2.sort_values(by=['E', 'F'])

Unnamed: 0,D,B,C,A,E,F
2016-07-06,-0.63695,-0.940818,-0.519718,-1.614792,0,gamma
2016-07-02,0.077316,0.333488,1.912416,0.00524,2,beta
2016-07-05,1.050388,0.497388,0.169174,1.332414,4,gamma
2016-07-03,-0.165415,1.08746,0.702623,0.178285,5,alpha
2016-07-04,1.64379,0.090101,0.881887,-0.82048,5,alpha
2016-07-01,0.126684,1.782325,0.631155,0.616345,5,gamma


In [160]:
# 지정한 행 또는 열에서 중복값을 제외한 유니크한 값만 얻기
df2['F'].unique()

array(['alpha', 'beta', 'gamma'], dtype=object)

In [163]:
# 지정한 행 또는 열에서 값에 따른 개수 얻기
df2['F'].value_counts()

gamma    3
alpha    2
beta     1
Name: F, dtype: int64

In [162]:
# 지정한 행 또는 열에서 입력한 값이 있는지 확인하기
df2['F'].isin(['alpha', 'beta'])

2016-07-03     True
2016-07-02     True
2016-07-06    False
2016-07-05    False
2016-07-04     True
2016-07-01    False
Name: F, dtype: bool

In [164]:
# F열의 값이 alpha나 beta인 모든 행 구하기
df2.loc[df2['F'].isin(['alpha', 'beta']), :]

Unnamed: 0,D,B,C,A,E,F
2016-07-03,-0.165415,1.08746,0.702623,0.178285,5,alpha
2016-07-02,0.077316,0.333488,1.912416,0.00524,2,beta
2016-07-04,1.64379,0.090101,0.881887,-0.82048,5,alpha


사용자가 직접 만든 함수 사용하기

In [166]:
# A 4x3 frame of random values (np.random.randn) with city names as row
# labels and 'b', 'd', 'e' as column labels.
df3 = pd.DataFrame(
    data=np.random.randn(4, 3),
    index=['seoul', 'incheon', 'busan', 'daegu'],
    columns=['b', 'd', 'e'],
)
df3

Unnamed: 0,b,d,e
seoul,0.893903,1.900252,0.62214
incheon,-0.358966,-1.835112,0.514437
busan,-0.145203,0.015594,1.024259
daegu,0.411021,0.873647,3.291167


In [167]:
def func(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` must provide ``max()`` and ``min()`` methods. When passed to
    ``DataFrame.apply(func, axis=0)``, as done in the next cell, it is
    called once per column.
    """
    return x.max() - x.min()

In [168]:
df3.apply(func, axis=0)

b    1.252869
d    3.735363
e    2.776730
dtype: float64

In [169]:
df3.to_csv('./df3_pandas.csv')

In [170]:
# Read the CSV back in. Without index_col, the row labels that were written to
# the file come back as an ordinary column named 'Unnamed: 0' and a fresh
# 0..n-1 index is added (see the output below); passing index_col=0 to
# read_csv would restore that column as the index instead.
df = pd.read_csv('./df3_pandas.csv')
df

Unnamed: 0.1,Unnamed: 0,b,d,e
0,seoul,0.893903,1.900252,0.62214
1,incheon,-0.358966,-1.835112,0.514437
2,busan,-0.145203,0.015594,1.024259
3,daegu,0.411021,0.873647,3.291167
