> Pandas
- list 형태의 data : series 라고 부름
- 함수 뒤에 ? 붙이면 설명과 예시 나옴

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

# Series / DataFrame
> Series
- 각 column의 type은 series
- series는 list로 만들 수 있음.

> Dataframe
- dictionary, list
- column의 순서가 key의 입력순서랑 다를 수 있음 (구버전 Python/pandas 한정; Python 3.7+에서는 dict가 입력 순서를 유지함)
- OrderedDict : key의 순서 보장

In [2]:
# list
s1 = pd.Series([1,2,3])
s2 = pd.Series(['one', 'two', 'three'])

pd.DataFrame(data = dict(num = s1, word = s2))

Unnamed: 0,num,word
0,1,one
1,2,two
2,3,three


In [3]:
# dictionary 담은 list로 Dataframe 만들기
# key의 순서 보장하지 않으므로 columns 순서 지정하면 됨.

dict_list = [
    {'name' : 'John', 'age' : 25, 'job' : 'student'},
    {'name' : 'Nate', 'age' : 30, 'job' : 'teacher'}
]

df = pd.DataFrame(dict_list)
df = df[['name', 'age', 'job']]
df

Unnamed: 0,name,age,job
0,John,25,student
1,Nate,30,teacher


In [4]:
# OrderedDict (key의 순서 보장)

from collections import OrderedDict

ordered_dict = OrderedDict(
    [
        ('name', ['John', 'Nate']),
        ('age', [25, 30]),
        ('job', ['student', 'teacher'])  
    ]
)
df = pd.DataFrame.from_dict(ordered_dict)
df

Unnamed: 0,name,age,job
0,John,25,student
1,Nate,30,teacher


In [5]:
# index 지정 안해주면 0부터 시작
df = pd.DataFrame(
    {"a" : [4 ,5, 6],
    "b" : [7, 8, 9],
    "c" : [10, 11, 12]},
    index = [1, 2, 3])
df

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12


In [6]:
df = pd.DataFrame(
    [[4, 7, 10],
    [5, 8, 11],
    [6, 9, 12]], 
    index=[1, 2, 3], 
    columns=['a', 'b', 'c'])
df

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12


In [7]:
# index 2개
df = pd.DataFrame(
    {"a" : [4 ,5, 6], 
    "b" : [7, 8, 9], 
    "c" : [10, 11, 12]}, 
    index = pd.MultiIndex.from_tuples([('d',1),('d',2),('e',2)], names=['n','v']))
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,2,6,9,12


# File

## read file
> pd.read_csv
1. csv, txt (쉼표(,) 구분 파일)
2. tab 구분 파일
3. header 없는 경우

In [8]:
# csv 파일
df_csv = pd.read_csv('practice_data/friend_list.csv')
df_csv

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [9]:
# txt 파일
df_txt = pd.read_csv('practice_data/friend_list.txt')
df_txt

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [10]:
# tab 구분
df_tab = pd.read_csv('practice_data/friend_list_tab.txt', delimiter='\t')
df_tab

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [11]:
# header 없는 경우
df_noheader = pd.read_csv('practice_data/friend_list_no_head.csv',
                          header=None)
df_noheader.columns = ['name', 'age', 'job']
df_noheader

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [12]:
df_header = pd.read_csv('practice_data/friend_list_no_head.csv',
                        header = None, 
                        names = ['name', 'age', 'job'])
df_header

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


## write file
- 같은 폴더(위치)에 작성(저장).
- None 값은 빈칸으로 들어감.

In [13]:
# default : index = True, header = True
# index : row id (행 순서)
# header : column name (열 이름)
# na_rep : na (null 값) 대신 입력

df = df_header.copy()
df.to_csv('friends.csv',na_rep='-')

# Indexing / Operator

> 연산자 (비교 연산자)
- .isin([ ])
- .isnull() / .notnull()
- & : and
- | : or
- ~ : not
- ^ : xor
- df.any() : any
- df.all() : all

In [14]:
df = pd.DataFrame(
    [[4, 7, 10],
    [5, 8, 11],
    [6, 9, 12]], 
    index=[1, 2, 3], 
    columns=['a', 'b', 'c'])
df

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12


## isin

In [15]:
# 무조건 리스트 형태로 입력해야함
df.a.isin([5])

1    False
2     True
3    False
Name: a, dtype: bool

In [16]:
df['a'].isin([5])

1    False
2     True
3    False
Name: a, dtype: bool

## isnull / isna / notnull

In [17]:
df = pd.DataFrame(
    [[4, 7, 10, np.nan],
    [5, 8,np.nan, 11],
    [np.nan,6, 9, 12]], 
    index=[1, 2, 3], 
    columns=['a', 'b', 'c', 'd'])
df

Unnamed: 0,a,b,c,d
1,4.0,7,10.0,
2,5.0,8,,11.0
3,,6,9.0,12.0


In [18]:
df.isna()

Unnamed: 0,a,b,c,d
1,False,False,False,True
2,False,False,True,False
3,True,False,False,False


In [19]:
df.isnull()

Unnamed: 0,a,b,c,d
1,False,False,False,True
2,False,False,True,False
3,True,False,False,False


In [20]:
pd.isnull(df)

Unnamed: 0,a,b,c,d
1,False,False,False,True
2,False,False,True,False
3,True,False,False,False


In [21]:
df['a'].isnull()

1    False
2    False
3     True
Name: a, dtype: bool

In [22]:
df.isnull().sum()

a    1
b    0
c    1
d    1
dtype: int64

In [23]:
df.notnull()

Unnamed: 0,a,b,c,d
1,True,True,True,False
2,True,True,False,True
3,False,True,True,True


In [24]:
df.notnull().sum()

a    2
b    3
c    2
d    2
dtype: int64

In [25]:
df.a.notnull()

1     True
2     True
3    False
Name: a, dtype: bool

# Subset
> Subset Observations (Rows)
- .drop_duplicates()
- .head() / .tail()
- .sample()
- .iloc[]
- .nlargest() / .nsmallest()

> Subset Variables (Columns)
- [ ]
- .filter()
- .loc[]
- .iloc[]

> loc, iloc 차이
>> loc : 
- 행(index), 열 이름으로 불러옴
>
>> iloc : 
- 이름은 상관없음
- index, columns 순서(index)로 불러옴


## Rows

### duplicate

#### duplicated

In [26]:
df = pd.DataFrame(
    [[4, 7, 10],
    [5, 8, 11],
    [6, 9, 12],
    [6, 9, 12],
    [6, 13, 14]], 
    index=[1, 2, 3, 4, 5], 
    columns=['a', 'b', 'c'])
df

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12
4,6,9,12
5,6,13,14


In [27]:
df.drop_duplicates?

In [28]:
# 완전히 겹치는 행만 True
df.duplicated()

1    False
2    False
3    False
4     True
5    False
dtype: bool

In [29]:
# a가 같으면 같다고 판단
df.duplicated(['a'])

1    False
2    False
3    False
4     True
5     True
dtype: bool

#### drop_duplicates
- 중복된 행 제거

In [30]:
# 완전히 겹치는 행만 제거
df.drop_duplicates()

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12
5,6,13,14


In [31]:
# a가 겹치는 것 중 처음것만 놔두고 제거
df.drop_duplicates(['a'], keep = 'first')

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12


In [32]:
# a가 겹치는 것 중 마지막 것만 놔두고 제거
df.drop_duplicates(['a'], keep = 'last')

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
5,6,13,14


In [33]:
# inplace = True : 원래 dataframe 수정 (별로 선호하지 않음)
df.drop_duplicates(inplace = True)

### head, tail

In [34]:
df = pd.DataFrame(
    [[4, 9, 14],
    [5, 10, 15],
    [6, 11, 16],
    [7, 12, 17],
    [8, 13, 18]], 
    index = [1,2,3,4,5],
    columns=['a', 'b', 'c'])
df

Unnamed: 0,a,b,c
1,4,9,14
2,5,10,15
3,6,11,16
4,7,12,17
5,8,13,18


In [35]:
df.head(2)

Unnamed: 0,a,b,c
1,4,9,14
2,5,10,15


In [36]:
df.tail(2)

Unnamed: 0,a,b,c
4,7,12,17
5,8,13,18


### sample

In [37]:
df

Unnamed: 0,a,b,c
1,4,9,14
2,5,10,15
3,6,11,16
4,7,12,17
5,8,13,18


In [38]:
# frac : 가져오는 비율
# 행 순서가 섞여서 불러와짐
df.sample(frac=0.5)

Unnamed: 0,a,b,c
3,6,11,16
2,5,10,15


In [39]:
df.sample(frac = 1)

Unnamed: 0,a,b,c
3,6,11,16
2,5,10,15
4,7,12,17
5,8,13,18
1,4,9,14


In [40]:
# n : 가져오고싶은 행의 수
df.sample(n=2)

Unnamed: 0,a,b,c
1,4,9,14
5,8,13,18


### [ ]
- python method
- 첫 번째 index 포함, 마지막 index 포함 x

In [41]:
df

Unnamed: 0,a,b,c
1,4,9,14
2,5,10,15
3,6,11,16
4,7,12,17
5,8,13,18


In [42]:
df[1:3]

Unnamed: 0,a,b,c
2,5,10,15
3,6,11,16


### loc

In [43]:
df

Unnamed: 0,a,b,c
1,4,9,14
2,5,10,15
3,6,11,16
4,7,12,17
5,8,13,18


In [44]:
# index 1 (이름)
df.loc[1]

a     4
b     9
c    14
Name: 1, dtype: int64

In [45]:
df.loc[[1,3]]

Unnamed: 0,a,b,c
1,4,9,14
3,6,11,16


In [46]:
# 행,열 순서
df.loc[3,'a']

6

In [47]:
df.loc[[1,2], ['a', 'b']]

Unnamed: 0,a,b
1,4,9
2,5,10


### iloc
- 행을 가져옴
- index 이름과는 별개로 index 순서로 판단함

In [48]:
# index 0,1 가져옴
df.iloc[:2]

Unnamed: 0,a,b,c
1,4,9,14
2,5,10,15


In [49]:
# index 2부터 끝까지
df.iloc[2:]

Unnamed: 0,a,b,c
3,6,11,16
4,7,12,17
5,8,13,18


In [50]:
# 끝에서 2개부터 끝까지
df.iloc[-2:]

Unnamed: 0,a,b,c
4,7,12,17
5,8,13,18


### nlargest / nsmallest

In [51]:
df.nlargest?

In [52]:
df = pd.DataFrame({'population': [59000000, 65000000, 434000,434000,
                                  434000, 337000, 11300,11300, 11300],
                   'GDP': [1937894, 2583560 , 12011, 4520, 12128,
                           17036, 182, 38, 311],
                   'alpha-2': ["IT", "FR", "MT", "MV", "BN",
                               "IS", "NR", "TV", "AI"]},
                  index=["Italy", "France", "Malta",
                         "Maldives", "Brunei", "Iceland",
                         "Nauru", "Tuvalu", "Anguilla"])
df

Unnamed: 0,population,GDP,alpha-2
Italy,59000000,1937894,IT
France,65000000,2583560,FR
Malta,434000,12011,MT
Maldives,434000,4520,MV
Brunei,434000,12128,BN
Iceland,337000,17036,IS
Nauru,11300,182,NR
Tuvalu,11300,38,TV
Anguilla,11300,311,AI


In [53]:
# population 열에서 큰 순서로 3개
df.nlargest(3,'population')

Unnamed: 0,population,GDP,alpha-2
France,65000000,2583560,FR
Italy,59000000,1937894,IT
Malta,434000,12011,MT


In [54]:
# population 열에서 작은 순서로 2개
df.nsmallest(2, 'population')

Unnamed: 0,population,GDP,alpha-2
Nauru,11300,182,NR
Tuvalu,11300,38,TV


In [55]:
df.nlargest(2, 'GDP')

Unnamed: 0,population,GDP,alpha-2
France,65000000,2583560,FR
Italy,59000000,1937894,IT


In [56]:
df.nsmallest(2, 'GDP')

Unnamed: 0,population,GDP,alpha-2
Tuvalu,11300,38,TV
Nauru,11300,182,NR


## Columns

### [ ]

In [57]:
df = sns.load_dataset("iris")
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [58]:
df.sepal_length

0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: sepal_length, Length: 150, dtype: float64

In [59]:
df['sepal_length']

0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: sepal_length, Length: 150, dtype: float64

In [60]:
df.sepal_length

0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: sepal_length, Length: 150, dtype: float64

In [61]:
df['sepal_length'] >= 5

0       True
1      False
2      False
3      False
4       True
       ...  
145     True
146     True
147     True
148     True
149     True
Name: sepal_length, Length: 150, dtype: bool

In [62]:
df[df['sepal_length'] >= 5]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
7,5.0,3.4,1.5,0.2,setosa
10,5.4,3.7,1.5,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [63]:
columns = ['sepal_width', 'sepal_length', 'species']
df[columns]

Unnamed: 0,sepal_width,sepal_length,species
0,3.5,5.1,setosa
1,3.0,4.9,setosa
2,3.2,4.7,setosa
3,3.1,4.6,setosa
4,3.6,5.0,setosa
...,...,...,...
145,3.0,6.7,virginica
146,2.5,6.3,virginica
147,3.0,6.5,virginica
148,3.4,6.2,virginica


### query

In [64]:
df.query('sepal_length >= 6')

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor
54,6.5,2.8,4.6,1.5,versicolor
56,6.3,3.3,4.7,1.6,versicolor
...,...,...,...,...,...
144,6.7,3.3,5.7,2.5,virginica
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica


### filter
- items = : columns 명 그대로
- like = : 문자 포함 
- regex = : 문자 포함, 시작, 끝

In [65]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [66]:
# 무조건 list 형태로 입력해야함
df.filter(items=['sepal_length'])

Unnamed: 0,sepal_length
0,5.1
1,4.9
2,4.7
3,4.6
4,5.0
...,...
145,6.7
146,6.3
147,6.5
148,6.2


In [67]:
df.filter(like = 'sepal')

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
...,...,...
145,6.7,3.0
146,6.3,2.5
147,6.5,3.0
148,6.2,3.4


In [68]:
# _ 있는 columns만 가져옴
df.filter(regex = '_')

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [69]:
# length로 끝나는 columns
df.filter(regex = 'length$')

Unnamed: 0,sepal_length,petal_length
0,5.1,1.4
1,4.9,1.4
2,4.7,1.3
3,4.6,1.5
4,5.0,1.4
...,...,...
145,6.7,5.2
146,6.3,5.0
147,6.5,5.2
148,6.2,5.4


In [70]:
# sepal로 시작하는 columns
df.filter(regex = '^sepal')

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
...,...,...
145,6.7,3.0
146,6.3,2.5
147,6.5,3.0
148,6.2,3.4


In [71]:
# Keep every column whose name is not exactly "species".
# ^ anchors the match at the start of the column name; (?!species$) is a
# negative lookahead that rejects a name consisting solely of "species".
df.filter(regex = '^(?!species$).*')

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


### loc

In [72]:
# 행, 열 지정
df.loc[:, 'sepal_width':'petal_width']

Unnamed: 0,sepal_width,petal_length,petal_width
0,3.5,1.4,0.2
1,3.0,1.4,0.2
2,3.2,1.3,0.2
3,3.1,1.5,0.2
4,3.6,1.4,0.2
...,...,...,...
145,3.0,5.2,2.3
146,2.5,5.0,1.9
147,3.0,5.2,2.0
148,3.4,5.4,2.3


In [73]:
df.loc[df['sepal_length'] > 5, ['sepal_length','sepal_width']]

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
5,5.4,3.9
10,5.4,3.7
14,5.8,4.0
15,5.7,4.4
...,...,...
145,6.7,3.0
146,6.3,2.5
147,6.5,3.0
148,6.2,3.4


### iloc

In [74]:
df.iloc[:,[1,3]]

Unnamed: 0,sepal_width,petal_width
0,3.5,0.2
1,3.0,0.2
2,3.2,0.2
3,3.1,0.2
4,3.6,0.2
...,...,...
145,3.0,2.3
146,2.5,1.9
147,3.0,2.0
148,3.4,2.3


# Summarize Data

In [75]:
df = sns.load_dataset('iris')

## shape

In [76]:
df.shape

(150, 5)

In [77]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


## value_counts

In [78]:
df['species'].value_counts()

versicolor    50
setosa        50
virginica     50
Name: species, dtype: int64

In [79]:
pd.DataFrame(df['species'].value_counts())

Unnamed: 0,species
versicolor,50
setosa,50
virginica,50


In [80]:
pd.DataFrame(df['petal_width'].value_counts()).head()

Unnamed: 0,petal_width
0.2,29
1.3,13
1.5,12
1.8,12
1.4,8


## unique

In [81]:
df['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

## nunique
- unique한 이름의 갯수

In [82]:
df['species'].nunique()

3

In [83]:
df['sepal_length'].nunique()

35

## len

In [84]:
# 행 수
len(df)

150

## describe

In [85]:
df.describe(include = 'all')

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
count,150.0,150.0,150.0,150.0,150
unique,,,,,3
top,,,,,versicolor
freq,,,,,50
mean,5.843333,3.057333,3.758,1.199333,
std,0.828066,0.435866,1.765298,0.762238,
min,4.3,2.0,1.0,0.1,
25%,5.1,2.8,1.6,0.3,
50%,5.8,3.0,4.35,1.3,
75%,6.4,3.3,5.1,1.8,


In [86]:
# Summarize only the object-dtype (string) columns.
# Use the builtin `object`: the `np.object` alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where accessing it raises AttributeError.
df.describe(include=[object])

Unnamed: 0,species
count,150
unique,3
top,versicolor
freq,50


In [87]:
# 수치형 타입만
df.describe(include=[np.number])

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


## numerical function
- sum
- count
- median
- mean
- quantile
- max
- min
- var
- std

In [88]:
df.sum()

sepal_length                                                876.5
sepal_width                                                 458.6
petal_length                                                563.7
petal_width                                                 179.9
species         setosasetosasetosasetosasetosasetosasetosaseto...
dtype: object

In [89]:
df.count()

sepal_length    150
sepal_width     150
petal_length    150
petal_width     150
species         150
dtype: int64

In [90]:
# Median of each numeric column. numeric_only=True skips the string column
# `species`; pandas >= 2.0 no longer drops it silently and raises TypeError.
df.median(numeric_only=True)

sepal_length    5.80
sepal_width     3.00
petal_length    4.35
petal_width     1.30
dtype: float64

In [91]:
# Mean of each numeric column. numeric_only=True skips the string column
# `species`; pandas >= 2.0 no longer drops it silently and raises TypeError.
df.mean(numeric_only=True)

sepal_length    5.843333
sepal_width     3.057333
petal_length    3.758000
petal_width     1.199333
dtype: float64

In [92]:
# First and third quartiles of each numeric column. numeric_only=True skips
# the string column `species`, which pandas >= 2.0 would otherwise reject.
df.quantile([0.25, 0.75], numeric_only=True)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0.25,5.1,2.8,1.6,0.3
0.75,6.4,3.3,5.1,1.8


In [93]:
df.max()

sepal_length          7.9
sepal_width           4.4
petal_length          6.9
petal_width           2.5
species         virginica
dtype: object

In [94]:
df.min()

sepal_length       4.3
sepal_width          2
petal_length         1
petal_width        0.1
species         setosa
dtype: object

In [95]:
# Sample variance of each numeric column. numeric_only=True skips the string
# column `species`; pandas >= 2.0 no longer drops it silently and raises TypeError.
df.var(numeric_only=True)

sepal_length    0.685694
sepal_width     0.189979
petal_length    3.116278
petal_width     0.581006
dtype: float64

In [96]:
# Sample standard deviation of each numeric column. numeric_only=True skips the
# string column `species`; pandas >= 2.0 no longer drops it silently and raises TypeError.
df.std(numeric_only=True)

sepal_length    0.828066
sepal_width     0.435866
petal_length    1.765298
petal_width     0.762238
dtype: float64

## apply(function)
- 원하는 함수 만들어서 적용 가능 (행, 열 수정할 때 유용)
- lambda : lambda함수(익명함수) 사용

In [97]:
# lambda 함수 사용
# 3글자만 가져옴
df['species_3'] = df['species'].apply(lambda x : x[:3])
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_3
0,5.1,3.5,1.4,0.2,setosa,set
1,4.9,3.0,1.4,0.2,setosa,set
2,4.7,3.2,1.3,0.2,setosa,set
3,4.6,3.1,1.5,0.2,setosa,set
4,5.0,3.6,1.4,0.2,setosa,set
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,vir
146,6.3,2.5,5.0,1.9,virginica,vir
147,6.5,3.0,5.2,2.0,virginica,vir
148,6.2,3.4,5.4,2.3,virginica,vir


In [98]:
# Helper that extracts the last three characters of a string.
def smp(x):
    """Return the trailing three items of ``x`` (all of ``x`` if it is shorter)."""
    tail = slice(-3, None)
    return x[tail]

In [99]:
df['species_-3'] = df['species'].apply(smp)
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_3,species_-3
0,5.1,3.5,1.4,0.2,setosa,set,osa
1,4.9,3.0,1.4,0.2,setosa,set,osa
2,4.7,3.2,1.3,0.2,setosa,set,osa
3,4.6,3.1,1.5,0.2,setosa,set,osa
4,5.0,3.6,1.4,0.2,setosa,set,osa
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,vir,ica
146,6.3,2.5,5.0,1.9,virginica,vir,ica
147,6.5,3.0,5.2,2.0,virginica,vir,ica
148,6.2,3.4,5.4,2.3,virginica,vir,ica


# Handling Missing Data

In [100]:
df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
                   "toy": [np.nan, 'Batmobile', 'Bullwhip'],
                   "born": [pd.NaT, pd.Timestamp("1940-04-25"),pd.NaT]})
df

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


## dropna
- axis = 0 or 1
- how = 'all' or 'any'

In [101]:
# axis = 1 : 열 기준
# how = 'all' : 모두 결측치일 때 column drop
df.dropna(axis = 1, how = 'all')

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


In [102]:
# how = 'any' : 하나라도 결측치이면 column drop
df.dropna(axis=1, how = 'any')

Unnamed: 0,name
0,Alfred
1,Batman
2,Catwoman


## fillna

In [103]:
df.fillna?

In [104]:
df = pd.DataFrame([[np.nan, 2, np.nan, 0],
                   [3, 4, np.nan, 1],
                   [np.nan, np.nan, np.nan, 5],
                   [np.nan, 3, np.nan, 4]],
                  columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,,3.0,,4


In [105]:
# 결측치 다 0으로 채워라
df.fillna(0)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,0.0,1
2,0.0,0.0,0.0,5
3,0.0,3.0,0.0,4


In [106]:
# Forward-fill: replace each missing value with the last valid value above it.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,3.0,4.0,,5
3,3.0,3.0,,4


In [107]:
# Backward-fill: replace each missing value with the next valid value below it.
# DataFrame.bfill() replaces fillna(method='backfill'), whose `method` argument
# is deprecated since pandas 2.1.
df.bfill()

Unnamed: 0,A,B,C,D
0,3.0,2.0,,0
1,3.0,4.0,,1
2,,3.0,,5
3,,3.0,,4


In [108]:
values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
df.fillna(value=values)

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,2.0,1
2,0.0,1.0,2.0,5
3,0.0,3.0,2.0,4


# Make New Columns

In [109]:
df = pd.DataFrame({'A' : range(1,11), 'B' : np.random.randn(10)})
df

Unnamed: 0,A,B
0,1,-0.773801
1,2,1.359437
2,3,-0.080831
3,4,-0.391189
4,5,-1.555192
5,6,-1.027686
6,7,0.570515
7,8,0.96423
8,9,-0.5384
9,10,0.899085


## [ ]

In [110]:
df['ln_A'] = np.log(df.A)
df

Unnamed: 0,A,B,ln_A
0,1,-0.773801,0.0
1,2,1.359437,0.693147
2,3,-0.080831,1.098612
3,4,-0.391189,1.386294
4,5,-1.555192,1.609438
5,6,-1.027686,1.791759
6,7,0.570515,1.94591
7,8,0.96423,2.079442
8,9,-0.5384,2.197225
9,10,0.899085,2.302585


## assign

In [111]:
df.assign(square_A = lambda df : df.A * df.A)

Unnamed: 0,A,B,ln_A,square_A
0,1,-0.773801,0.0,1
1,2,1.359437,0.693147,4
2,3,-0.080831,1.098612,9
3,4,-0.391189,1.386294,16
4,5,-1.555192,1.609438,25
5,6,-1.027686,1.791759,36
6,7,0.570515,1.94591,49
7,8,0.96423,2.079442,64
8,9,-0.5384,2.197225,81
9,10,0.899085,2.302585,100


## qcut
> qcut
- numerical -> categorical
- 범위 나눠서 범주형으로 바꾸고 싶을 때 사용

In [112]:
pd.qcut(df.A, 3, labels = ['good', 'medium', 'bad'])

0      good
1      good
2      good
3      good
4    medium
5    medium
6    medium
7       bad
8       bad
9       bad
Name: A, dtype: category
Categories (3, object): [good < medium < bad]

In [113]:
pd.qcut(df.B, 3, labels = ['good', 'medium', 'bad'])

0      good
1       bad
2    medium
3    medium
4      good
5      good
6    medium
7       bad
8      good
9       bad
Name: B, dtype: category
Categories (3, object): [good < medium < bad]

## max / min

In [114]:
# axis=0 reduces along the row axis, so this yields the maximum of each column.
df.max(axis = 0)

A       10.000000
B        1.359437
ln_A     2.302585
dtype: float64

In [115]:
# axis=1 reduces along the column axis, so this yields the maximum of each row.
df.max(axis = 1)

0     1.0
1     2.0
2     3.0
3     4.0
4     5.0
5     6.0
6     7.0
7     8.0
8     9.0
9    10.0
dtype: float64

In [116]:
df.min(axis = 0)

A       1.000000
B      -1.555192
ln_A    0.000000
dtype: float64

In [117]:
df.min(axis = 1)

0   -0.773801
1    0.693147
2   -0.080831
3   -0.391189
4   -1.555192
5   -1.027686
6    0.570515
7    0.964230
8   -0.538400
9    0.899085
dtype: float64

## clip
- 범위 지정해서 범위 내의 값으로 변경

In [118]:
df["A"].clip(lower = 2, upper = 8)

0    2
1    2
2    3
3    4
4    5
5    6
6    7
7    8
8    8
9    8
Name: A, dtype: int64

## abs
- 절댓값

In [119]:
df['B']

0   -0.773801
1    1.359437
2   -0.080831
3   -0.391189
4   -1.555192
5   -1.027686
6    0.570515
7    0.964230
8   -0.538400
9    0.899085
Name: B, dtype: float64

In [120]:
df['B'].abs()

0    0.773801
1    1.359437
2    0.080831
3    0.391189
4    1.555192
5    1.027686
6    0.570515
7    0.964230
8    0.538400
9    0.899085
Name: B, dtype: float64

## map
- apply와 동일하게 함수 적용 가능
- text를 숫자로 바꿀때 유용 (원하는 값으로)

In [121]:
df = pd.DataFrame([{'age': 20, 'job': 'student'},
         {'age': 30, 'job': 'developer'},
         {'age': 30, 'job': 'teacher'}])
df

Unnamed: 0,age,job
0,20,student
1,30,developer
2,30,teacher


In [122]:
df.job = df.job.map({'student':1, 'developer':2, 'teacher':3})
df

Unnamed: 0,age,job
0,20,1
1,30,2
2,30,3


## applymap
- 모든 columns에 적용하고 싶을 때

In [123]:
df = pd.DataFrame({'A' : range(1,11), 'B' : np.random.randn(10)})
df

Unnamed: 0,A,B
0,1,-0.642421
1,2,0.705942
2,3,1.541312
3,4,-2.205768
4,5,-0.327494
5,6,0.364878
6,7,0.437572
7,8,0.038152
8,9,0.942243
9,10,1.091024


In [124]:
# Apply np.around to every element of the frame and keep the rounded result.
# NOTE(review): applymap is reported as deprecated in newer pandas (2.1+) in
# favor of DataFrame.map — confirm against the pandas version in use.
df = df.applymap(np.around)
df

Unnamed: 0,A,B
0,1,-1.0
1,2,1.0
2,3,2.0
3,4,-2.0
4,5,-0.0
5,6,0.0
6,7,0.0
7,8,0.0
8,9,1.0
9,10,1.0


# Reshaping Data

## sort_values

In [125]:
df = sns.load_dataset('mpg')
df.shape

(398, 9)

In [126]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [127]:
# mpg 기준으로 오름차순으로 sort(정렬)
# by = : 쓰지 않아도 됨.
df.sort_values(by = 'mpg')

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
28,9.0,8,304.0,193.0,4732,18.5,70,usa,hi 1200d
25,10.0,8,360.0,215.0,4615,14.0,70,usa,ford f250
26,10.0,8,307.0,200.0,4376,15.0,70,usa,chevy c20
103,11.0,8,400.0,150.0,4997,14.0,73,usa,chevrolet impala
124,11.0,8,350.0,180.0,3664,11.0,73,usa,oldsmobile omega
...,...,...,...,...,...,...,...,...,...
326,43.4,4,90.0,48.0,2335,23.7,80,europe,vw dasher (diesel)
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
325,44.3,4,90.0,48.0,2085,21.7,80,europe,vw rabbit c (diesel)
329,44.6,4,91.0,67.0,1850,13.8,80,japan,honda civic 1500 gl


In [128]:
# mpg 기준으로 내림차순으로 sort(정렬)
# ascending = True : default
df.sort_values(by = 'mpg', ascending=False)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
322,46.6,4,86.0,65.0,2110,17.9,80,japan,mazda glc
329,44.6,4,91.0,67.0,1850,13.8,80,japan,honda civic 1500 gl
325,44.3,4,90.0,48.0,2085,21.7,80,europe,vw rabbit c (diesel)
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
326,43.4,4,90.0,48.0,2335,23.7,80,europe,vw dasher (diesel)
...,...,...,...,...,...,...,...,...,...
103,11.0,8,400.0,150.0,4997,14.0,73,usa,chevrolet impala
67,11.0,8,429.0,208.0,4633,11.0,72,usa,mercury marquis
25,10.0,8,360.0,215.0,4615,14.0,70,usa,ford f250
26,10.0,8,307.0,200.0,4376,15.0,70,usa,chevy c20


## rename

In [129]:
# 원래 dataframe이 변경되지는 않으므로 다시 초기화
df = df.rename(columns = {'model_year' : 'year'})
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


## sort_index
- index 순서로 정렬

In [130]:
df.sort_index()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


## reset_index
- 기존 인덱스가 열로 들어가고 인덱스가 새로 생김
- index 없을 때, index 만들고 싶으면 사용

In [131]:
df.reset_index()

Unnamed: 0,index,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...,...
393,393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


## drop
- columns = : 열 삭제
- axis = : 1(열), 0(행)
- inplace = True 또는 결과를 다시 할당 : 원래 dataframe 변경 됨.

In [132]:
df.drop([0,2])

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
5,15.0,8,429.0,198.0,4341,10.0,70,usa,ford galaxie 500
6,14.0,8,454.0,220.0,4354,9.0,70,usa,chevrolet impala
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


In [133]:
# index로 삭제
# 행 이름 0, 2를 삭제하는 것이 아니라 0번째, 2번째 row (index) 삭제
df.drop(df.index[[0,2]])

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
5,15.0,8,429.0,198.0,4341,10.0,70,usa,ford galaxie 500
6,14.0,8,454.0,220.0,4354,9.0,70,usa,chevrolet impala
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


In [134]:
df.drop(columns = ['mpg', 'year'])

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,origin,name
0,8,307.0,130.0,3504,12.0,usa,chevrolet chevelle malibu
1,8,350.0,165.0,3693,11.5,usa,buick skylark 320
2,8,318.0,150.0,3436,11.0,usa,plymouth satellite
3,8,304.0,150.0,3433,12.0,usa,amc rebel sst
4,8,302.0,140.0,3449,10.5,usa,ford torino
...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,usa,ford mustang gl
394,4,97.0,52.0,2130,24.6,europe,vw pickup
395,4,135.0,84.0,2295,11.6,usa,dodge rampage
396,4,120.0,79.0,2625,18.6,usa,ford ranger


## melt
- columns에 있던 값을 row로 이동
- 열 이름을 값으로 녹인다고 생각
- pivot이랑 반대

In [135]:
df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
                   'B': {0: 1, 1: 3, 2: 5},
                   'C': {0: 2, 1: 4, 2: 6}})
df

Unnamed: 0,A,B,C
0,a,1,2
1,b,3,4
2,c,5,6


In [136]:
pd.melt(df, id_vars=['A'], value_vars = ['B'])

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5


In [137]:
pd.melt(df, id_vars=['A'], value_vars = ['B', 'C'])

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5
3,a,C,2
4,b,C,4
5,c,C,6


In [138]:
pd.melt(df, value_vars = ['A', 'B', 'C']).rename(
    columns={'variable' : 'var','value' : 'val'})

Unnamed: 0,var,val
0,A,a
1,A,b
2,A,c
3,B,1
4,B,3
5,B,5
6,C,2
7,C,4
8,C,6


## pivot
- row 값들이 columns이 됨.
- 표를 만들어준다고 생각하면 될 듯
- melt랑 반대

In [139]:
df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two','two'],
                   'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'baz': [1, 2, 3, 4, 5, 6]})
df

Unnamed: 0,foo,bar,baz
0,one,A,1
1,one,B,2
2,one,C,3
3,two,A,4
4,two,B,5
5,two,C,6


In [140]:
df2 = df.pivot(index = 'foo', columns = 'bar', values = 'baz').reset_index()
df2

bar,foo,A,B,C
0,one,1,2,3
1,two,4,5,6


In [141]:
df2.melt(id_vars=['foo'], value_vars=['A', 'B', 'C']).sort_values(
    ['foo', 'bar']).rename(
    columns = {'value':'baz'})

Unnamed: 0,foo,bar,baz
0,one,A,1
2,one,B,2
4,one,C,3
1,two,A,4
3,two,B,5
5,two,C,6


## merge(병합)

### concat
- dataframe 합치기
- 위아래로 합치는 느낌 (동일한 열에 행 추가)

#### series

In [142]:
s1 = pd.Series(['a', 'b'])
s2 = pd.Series(['c', 'd'])

In [143]:
pd.concat([s1,s2])

0    a
1    b
0    c
1    d
dtype: object

In [144]:
pd.concat([s1,s2], ignore_index = True)

0    a
1    b
2    c
3    d
dtype: object

In [145]:
pd.concat([s1,s2], keys = ['s1', 's2'], names = ['Series name', 'Row ID'])

Series name  Row ID
s1           0         a
             1         b
s2           0         c
             1         d
dtype: object

#### dataframe

In [146]:
df1 = pd.DataFrame([['a', 1], ['b', 2]],
                   columns=['letter', 'number'])
df1

Unnamed: 0,letter,number
0,a,1
1,b,2


In [147]:
df2 = pd.DataFrame([['c', 3], ['d', 4]],
                   columns=['letter', 'number'])
df2

Unnamed: 0,letter,number
0,c,3
1,d,4


In [148]:
pd.concat([df1, df2])

Unnamed: 0,letter,number
0,a,1
1,b,2
0,c,3
1,d,4


In [149]:
df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
                   columns=['letter', 'number', 'animal'])
df3

Unnamed: 0,letter,number,animal
0,c,3,cat
1,d,4,dog


In [150]:
pd.concat([df1, df3])

Unnamed: 0,letter,number,animal
0,a,1,
1,b,2,
0,c,3,cat
1,d,4,dog


In [151]:
# join='inner' keeps only the columns that exist in both frames
pd.concat([df1, df3], join = 'inner')

Unnamed: 0,letter,number
0,a,1
1,b,2
0,c,3
1,d,4


In [152]:
# Frame of animals and their names.
df4 = pd.DataFrame(
    {
        'animal': ['bird', 'monkey'],
        'name': ['polly', 'george'],
    }
)
df4

Unnamed: 0,animal,name
0,bird,polly
1,monkey,george


In [153]:
df5 = pd.DataFrame([1], index = ['a'])
df5

Unnamed: 0,0
a,1


In [154]:
df6 = pd.DataFrame([2], index = ['a'])
df6

Unnamed: 0,0
a,2


In [155]:
pd.concat([df5, df6])

Unnamed: 0,0
a,1
a,2


In [156]:
# Raises ValueError because the index label 'a' appears in both frames.
# verify_integrity=True is the option to use when checking for overlapping index values.
pd.concat([df5, df6], verify_integrity=True)

ValueError: Indexes have overlapping values: Index(['a'], dtype='object')

### append (pandas 2.0에서 제거됨 → pd.concat 사용)

In [None]:
df1

In [None]:
df2

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat with ignore_index=True stacks the rows of df2 under df1 and
# renumbers the index from 0, which is what append(..., ignore_index=True) did.
pd.concat([df1, df2], ignore_index=True)

### merge
- 공통 key 열을 기준으로 서로 다른 열을 가진 dataframe을 좌우로 합침
- how 옵션('left', 'right', 'inner', 'outer')으로 왼쪽/오른쪽 중 어느 쪽 key를 남길지 정함

#### example 1

In [None]:
# Left-hand frame for the merge examples: key column x1 plus a numeric x2.
adf = pd.DataFrame(
    [('A', 1), ('B', 2), ('C', 3)],
    columns=['x1', 'x2'],
)
adf

In [None]:
# Right-hand frame for the merge examples: key column x1 plus a flag column x3.
bdf = pd.DataFrame(
    [('A', 'T'), ('B', 'F'), ('D', 'T')],
    columns=['x1', 'x3'],
)
bdf

In [None]:
# adf is the left frame, bdf is the right frame.
# how='left' keeps every key found in adf.x1; keys that exist only in bdf.x1 are not brought in.
# on: names the column used as the join key.
pd.merge(adf, bdf, how='left', on='x1')

In [None]:
pd.merge(adf, bdf, how='right', on='x1')

In [None]:
# how='inner': keep only the keys present in both frames
pd.merge(adf, bdf, how='inner', on='x1')

In [None]:
# how='outer': keep every key from either frame
pd.merge(adf, bdf, how='outer', on='x1')

In [None]:
adf[adf.x1.isin(bdf.x1)]

In [None]:
adf[~adf.x1.isin(bdf.x1)]

#### example 2

In [None]:
# First frame for the second merge example (both frames share x1 and x2).
ydf = pd.DataFrame(list(zip('ABC', [1, 2, 3])), columns=['x1', 'x2'])
ydf

In [None]:
# Second frame for the second merge example; overlaps ydf on rows B and C.
zdf = pd.DataFrame(list(zip('BCD', [2, 3, 4])), columns=['x1', 'x2'])
zdf

In [None]:
pd.merge(ydf, zdf)

In [None]:
pd.merge(ydf, zdf, how = 'outer')

In [None]:
pd.merge(ydf, zdf, how = 'outer', indicator = True)

In [None]:
# Outer-merge with the _merge indicator column, then keep the rows that
# came from ydf only.
(
    pd.merge(ydf, zdf, how='outer', indicator=True)
    .query('_merge == "left_only"')
)

In [None]:
# Same left-only selection as above, with the helper _merge column removed
# afterwards.
(
    pd.merge(ydf, zdf, how='outer', indicator=True)
    .query('_merge == "left_only"')
    .drop(columns=['_merge'])
)

# Group Data

## groupby

### example 1

In [None]:
df = sns.load_dataset("mpg")
df.head()

In [None]:
df.groupby(by = 'origin')

In [None]:
df.groupby(by = 'origin').size()

In [None]:
df["origin"].value_counts()

In [None]:
# Since pandas 2.0, numeric_only defaults to False, so mean() raises on
# non-numeric columns. Restrict the aggregation to the numeric columns.
# NOTE(review): assumes the mpg dataset has at least one string column
# besides 'origin' (e.g. the car name) — confirm with df.dtypes.
df.groupby(by="origin").mean(numeric_only=True)

In [None]:
df.groupby(by = "origin")['cylinders'].mean()

In [None]:
pd.DataFrame(df.groupby(['model_year', 'origin'])['cylinders'].mean())

### example 2

In [None]:
# Sample roster: one dict per student with name, major and sex.
student_list = [
    dict(name=student, major=major, sex=sex)
    for student, major, sex in [
        ('John', "Computer Science", "male"),
        ('Nate', "Computer Science", "male"),
        ('Abraham', "Physics", "male"),
        ('Brian', "Psychology", "male"),
        ('Janny', "Economics", "female"),
        ('Yuna', "Economics", "female"),
        ('Jeniffer', "Computer Science", "female"),
        ('Edward', "Computer Science", "male"),
        ('Zara', "Psychology", "female"),
        ('Wendy', "Economics", "female"),
        ('Sera', "Psychology", "female"),
    ]
]
# Passing columns explicitly fixes the column order.
df = pd.DataFrame(student_list, columns=['name', 'major', 'sex'])
df

In [None]:
# Group the students by major to see how many belong to each department.
groupby_major = df.groupby('major')
groupby_major.groups

In [None]:
# Print each group key with its row count, followed by the rows themselves.
for name, group in groupby_major:
    # An f-string avoids manual str() concatenation and also works when the
    # group key is not a string.
    print(f"{name} : {len(group)}")
    print(group)
    print()

## rank
- 각 값의 순위(rank)를 알려줌

In [None]:
df = sns.load_dataset("mpg")
df.head()

In [None]:
df['weight'].rank(method = 'min')

In [None]:
df['model_year'].rank(method = 'min').value_counts()

In [None]:
# method='first': tied values are ranked in the order they appear in the data
df['model_year'].rank(method = 'first')

In [None]:
df['model_year'].rank(pct = True)

## shift
- 행을 지정한 칸만큼 위/아래로 밀어줌 (비는 자리는 NaN)
- 결측치 처리할 때 유용하게 쓰신다고 말씀하심

In [None]:
# 3x3 integer frame with index labels 1..3, built column by column.
df = pd.DataFrame(
    {
        'a': [4, 5, 6],
        'b': [7, 8, 9],
        'c': [10, 11, 12],
    },
    index=[1, 2, 3],
)
df

In [None]:
# Rows move down by one position
df.shift(1)

In [None]:
# Rows move up by one position
df.shift(-1)

In [None]:
df['a'].shift(2)

## cumulative
- cumsum : cumulative sum (누적합)
- cummax : cumulative max (누적 최대값)
- cummin : cumulative min (누적 최소값)
- cumprod : cumulative product (누적곱)

In [None]:
df.cumsum()

In [None]:
df.cummax()

In [None]:
df.cummin()

In [None]:
df.cumprod()

# Expanding & Rolling
- For Time series data (시계열 분석을 위한)
- https://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.expanding.html
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.rolling.html

In [None]:
# 그래프가 표시되도록
%matplotlib inline

In [None]:
# 1000 random draws from np.random.randn, indexed by a 1000-period date range
# starting at 1/1/2021, then plotted.
s = pd.Series(np.random.randn(1000),
             index = pd.date_range('1/1/2021', periods = 1000))
s.plot()

In [None]:
s = s.cumsum()
s.plot()

## Rolling
- window size를 지정해주고, 그 크기의 window를 한 칸씩 이동하면서 계산

In [None]:
# Slide a window of 60 observations along the series.
# Used here to compute a moving average.
r = s.rolling(window = 60)
r.mean()

In [None]:
# Original note: the trend line is pushed back by 60.
# NOTE(review): presumably because the rolling mean has no value until a full
# 60-observation window is available — confirm against the r.mean() output.
s.plot(style = 'k--')
r.mean().plot(style='k')

## expanding
- 시작점부터 현재 위치까지 window를 늘려가며 누적해서 계산

### example 1

In [None]:
# Single-column frame 'B' containing one missing value.
dfe = pd.DataFrame([0, 1, 2, np.nan, 4], columns=['B'])
dfe

In [None]:
# The line is drawn with a gap because of the missing value
dfe.plot()

In [None]:
dfe.expanding(2).sum()

In [None]:
dfe.expanding(2).sum().plot()

### example 2
- rolling & expanding

In [None]:
# Random 1000x4 frame with columns A-D over a 1000-period date range
# starting at 1/1/2021.
df = pd.DataFrame(
    data=np.random.randn(1000, 4),
    index=pd.date_range(start='1/1/2021', periods=1000),
    columns=list('ABCD'),
)
df

In [None]:
df = df.cumsum()
df

In [None]:
df.plot()

In [None]:
# rolling: sum over a 60-row window, drawn with subplots=True
df.rolling(window=60).sum().plot(subplots = True)

In [None]:
# rolling: the window spans the whole frame (len(df)) and min_periods=1,
# so a mean is produced from the first row onward
df.rolling(window = len(df), min_periods = 1).mean().plot()

In [None]:
# expanding: mean over a window that grows from the first row, min_periods=1
df.expanding(min_periods=1).mean().plot()