# Numpy & Pandas basic

### Reference
https://numpy.org/doc/stable/reference/routines.html  
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html  
https://github.com/PacktPublishing/Hands-On-Data-Analysis-with-NumPy-and-pandas

## numpy.array
* numpy = numerical python
* **array**를 사용하여 벡터, 행렬 연산 속도 빠름
* 저장이 효율적

In [11]:
import numpy as np

In [2]:
# one dimensional array
a = np.array([1,2,3])
print(a)

[1 2 3]


In [3]:
# two dimensional array
b = np.array([[1, 2], [3, 4]])
print(b)

[[1 2]
 [3 4]]


In [4]:
# minimum dimensions 
d = np.array([1, 2, 3, 4, 5], ndmin = 2)
print(d)

[[1 2 3 4 5]]


In [5]:
# dtype parameter 

e = np.array([1, 2, 3], dtype = float) #complex
print(e)

[1. 2. 3.]


### Array attributes
ndarray.shape, ndarray.size, ndarray.ndim, ndarray.dtype, reshape function, np.arange

In [6]:
x1 = np.array([[1,2,3],[4,5,6]]) 
print(x1.shape)
print(x1.size) # elements 갯수

(2, 3)
6


In [7]:
x3 = np.arange(24) 
print(x3)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]


In [8]:
x4 = x3.reshape(2,4,3)  #가장 바깥쪽부터 생각
print(x4)

[[[ 0  1  2]
  [ 3  4  5]
  [ 6  7  8]
  [ 9 10 11]]

 [[12 13 14]
  [15 16 17]
  [18 19 20]
  [21 22 23]]]


In [9]:
x4.ndim

3

In [10]:
x4.dtype

dtype('int32')

### zeros, ones

In [11]:
print(np.ones(5))
print(np.zeros(5))
print(np.ones([2,2], dtype=int))

[1. 1. 1. 1. 1.]
[0. 0. 0. 0. 0.]
[[1 1]
 [1 1]]


### arange, linspace function
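The `arange` function creates a sequence of evenly stepped numbers. It is analogous to Python's built-in `range`, but returns an array instead of a list.  
The `linspace` function returns a given number of evenly spaced values over a specified interval (the endpoint is included by default).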

In [12]:
np.arange(10, 20, 2)  

array([10, 12, 14, 16, 18])

In [13]:
np.linspace(2.0, 4.0, 5)

array([2. , 2.5, 3. , 3.5, 4. ])

### Array manipulation

In [14]:
import numpy as np
from numpy.random import randn

# randn samples the standard normal distribution (mean 0, std 1), so the values
# are NOT limited to the range -1..1; they are scaled by 10 and cast to int8.
arr1 = np.array(randn(4, 4)*10, dtype = np.int8)  # 4x4 standard-normal samples * 10, stored as int8
print(arr1)

[[  0 -10 -11  -2]
 [ -6  16  -2 -11]
 [ 14  -4  -5   2]
 [-15   8 -13  -6]]


In [15]:
arr1.tolist()    # Turn arr1 to a list

[[0, -10, -11, -2], [-6, 16, -2, -11], [14, -4, -5, 2], [-15, 8, -13, -6]]

In [16]:
arr1.flatten()    # Make a 1D array

array([  0, -10, -11,  -2,  -6,  16,  -2, -11,  14,  -4,  -5,   2, -15,
         8, -13,  -6], dtype=int8)

In [17]:
arr1.sum() #모든 원소의 합

-45

In [18]:
arr1.sum(axis=0) #열 기준

array([ -7,  10, -31, -17])

In [19]:
arr1.sum(axis=1) #행 기준

array([-23,  -3,   7, -26])

In [20]:
arr1.cumsum(axis=0)

array([[  0, -10, -11,  -2],
       [ -6,   6, -13, -13],
       [  8,   2, -18, -11],
       [ -7,  10, -31, -17]], dtype=int32)

In [21]:
arr1.mean(axis=0)

array([-1.75,  2.5 , -7.75, -4.25])

### Arithmetic operations

In [22]:
z1 = np.array([10,20,30,40,50])
z2 = np.arange(5)
print(z1)
print(z2)

[10 20 30 40 50]
[0 1 2 3 4]


In [23]:
# addition
z_add = z1 + z2
print(z_add)

[10 21 32 43 54]


In [24]:
# subtraction
z_sub = z1 - z2
print(z_sub)

[10 19 28 37 46]


In [25]:
# multiplication - elementwise product
z_mult = z1 * z2
print(z_mult)

[  0  20  60 120 200]


In [26]:
# division
z_div = z1/2
print(z_div)

[ 5. 10. 15. 20. 25.]


In [27]:
# comparison operator (applied element-wise, returns a boolean array)
z1 < 35

array([ True,  True,  True, False, False])

### Indexing and Slicing

In [28]:
arr = np.arange(1,17)
arr = arr.reshape(1,4,4) 
#arr = arr.reshape(1,4,-1)
print(arr)

[[[ 1  2  3  4]
  [ 5  6  7  8]
  [ 9 10 11 12]
  [13 14 15 16]]]


In [29]:
print('shape: ', arr.shape)
print('data type: ', arr.dtype)
print('number of dimensions: ', arr.ndim)

shape:  (1, 4, 4)
data type:  int32
number of dimensions:  3


In [30]:
#index
print(arr[0, 1])
print(arr[-1, -1])

[5 6 7 8]
[13 14 15 16]


In [31]:
#slicing
arr[:,:,1:3:]

array([[[ 2,  3],
        [ 6,  7],
        [10, 11],
        [14, 15]]])

### np.array_split

In [32]:
arr = np.array([1, 2, 3, 4, 5, 6])
newarr = np.array_split(arr, 3)
print(newarr)

[array([1, 2]), array([3, 4]), array([5, 6])]


In [33]:
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15], [16, 17, 18]])
print(np.array_split(arr, 3))
print(np.array_split(arr, 3, axis=1))

[array([[1, 2, 3],
       [4, 5, 6]]), array([[ 7,  8,  9],
       [10, 11, 12]]), array([[13, 14, 15],
       [16, 17, 18]])]
[array([[ 1],
       [ 4],
       [ 7],
       [10],
       [13],
       [16]]), array([[ 2],
       [ 5],
       [ 8],
       [11],
       [14],
       [17]]), array([[ 3],
       [ 6],
       [ 9],
       [12],
       [15],
       [18]])]


### np.where
np.where(조건식): 조건 만족하는 **인덱스** 반환

In [34]:
a = np.arange(4, 20, 3)  
b = a.reshape(2,-1)
print(a)
print(b)

[ 4  7 10 13 16 19]
[[ 4  7 10]
 [13 16 19]]


In [35]:
np.where(a%2==0) #짝수인 원소의 인덱스를 반환

(array([0, 2, 4], dtype=int64),)

In [36]:
np.where(b%2==0) #axis=0(행)기준, axis=1(열) 기준 -> (0,0) (0,2) (1,1)

(array([0, 0, 1], dtype=int64), array([0, 2, 1], dtype=int64))

## Pandas

In [37]:
import pandas as pd

### Series

In [38]:
ser1 = pd.Series([1, 2, 3, 4])
print(ser1)

0    1
1    2
2    3
3    4
dtype: int64


In [39]:
# Create a pandas Index
idx = pd.Index(["New York", "Los Angeles", "Chicago",
                "Houston", "Philadelphia", "Phoenix", "San Antonio",
                "San Diego", "Dallas"])
print(idx)

Index(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Philadelphia',
       'Phoenix', 'San Antonio', 'San Diego', 'Dallas'],
      dtype='object')


In [40]:
pops = pd.Series([8550, 3972, 2721, 2296, 1567, np.nan, 1470, 1395, 1300],
              index=idx, name="Population")
print(pops)

New York        8550.0
Los Angeles     3972.0
Chicago         2721.0
Houston         2296.0
Philadelphia    1567.0
Phoenix            NaN
San Antonio     1470.0
San Diego       1395.0
Dallas          1300.0
Name: Population, dtype: float64


### Dataframe

In [41]:
pd.DataFrame({'Yes': [50, 21], 'No': [131, 2]}) #소문자로 하면 에러

Unnamed: 0,Yes,No
0,50,131
1,21,2


In [42]:
pd.DataFrame({'Bob': ['I liked it.', 'It was awful.'], 
              'Sue': ['Pretty good.', 'Bland.']},
             index=['Product A', 'Product B'])

Unnamed: 0,Bob,Sue
Product A,I liked it.,Pretty good.
Product B,It was awful.,Bland.


### 데이터 설명하기

In [None]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# 두 개의 행을 확인합니다.
dataframe.head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [None]:
dataframe.tail(3)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
1310,"Zenni, Mr Philip",3rd,22.0,male,0,0
1311,"Lievens, Mr Rene",3rd,24.0,male,0,0
1312,"Zimmerman, Leo",3rd,29.0,male,0,0


In [None]:
# 차원을 확인합니다.
dataframe.shape

(1313, 6)

In [None]:
# 통곗값을 확인합니다.
dataframe.describe()

Unnamed: 0,Age,Survived,SexCode
count,756.0,1313.0,1313.0
mean,30.397989,0.342727,0.351866
std,14.259049,0.474802,0.477734
min,0.17,0.0,0.0
25%,21.0,0.0,0.0
50%,28.0,0.0,0.0
75%,39.0,1.0,1.0
max,71.0,1.0,1.0


In [55]:
# NOTE(review): df2 is not created in any cell visible in this notebook — confirm where it is defined.
df2.iloc[:, 1] = 0  # set every row of the column at position 1 (CCC in the output below) to 0
df2

Unnamed: 0,BBB,CCC
alpha,1,0
gamma,7,0
zeta,16,0


### 탐색하기

iloc과 loc의 차이
* **iloc**: integer location의 약어로, 데이터 프레임의 행이나 칼럼의 순서를 나타내는 정수로 특정 값을 추출해오는 방법
* '0번 행, 2번 칼럼' -> df.iloc[0,2]
* iloc는 컴퓨터가 읽기 좋은 방법으로(숫자로) 데이터가 있는 위치(순서)에 접근
* **loc**는 칼럼명을 직접 적거나 특정 조건식을 써줌으로써 사람이 읽기 좋은 방법으로 데이터에 접근하는 방법

In [1]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# 첫 번째 행을 선택합니다.
dataframe.iloc[0]

Name        Allen, Miss Elisabeth Walton
PClass                               1st
Age                                 29.0
Sex                               female
Survived                               1
SexCode                                1
Name: 0, dtype: object

In [2]:
# 세 개의 행을 선택합니다.
dataframe.iloc[1:4]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1


In [3]:
# 네 개의 행을 선택합니다.
dataframe.loc[1:4]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0


In [None]:
# 네 개의 행을 선택합니다.
dataframe.iloc[:4]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1


In [None]:
# 인덱스를 설정합니다.
dataframe = dataframe.set_index(dataframe['Name'])

# 행을 확인합니다.
dataframe.loc['Allen, Miss Elisabeth Walton']

Name        Allen, Miss Elisabeth Walton
PClass                               1st
Age                                   29
Sex                               female
Survived                               1
SexCode                                1
Name: Allen, Miss Elisabeth Walton, dtype: object

In [None]:
# Select rows up to AND INCLUDING 'Allison, Miss Helen Loraine', keeping only the
# columns from 'Age' through 'Sex'. Label slices in .loc include both endpoints.
dataframe.loc[:'Allison, Miss Helen Loraine', 'Age':'Sex']

Unnamed: 0_level_0,Age,Sex
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Allen, Miss Elisabeth Walton",29.0,female
"Allison, Miss Helen Loraine",2.0,female


In [None]:
# dataframe[:2]와 동일합니다.
dataframe[:'Allison, Miss Helen Loraine']

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Allen, Miss Elisabeth Walton","Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
"Allison, Miss Helen Loraine","Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [None]:
dataframe['Name']

Name
Allen, Miss Elisabeth Walton                                      Allen, Miss Elisabeth Walton
Allison, Miss Helen Loraine                                        Allison, Miss Helen Loraine
Allison, Mr Hudson Joshua Creighton                        Allison, Mr Hudson Joshua Creighton
Allison, Mrs Hudson JC (Bessie Waldo Daniels)    Allison, Mrs Hudson JC (Bessie Waldo Daniels)
Allison, Master Hudson Trevor                                    Allison, Master Hudson Trevor
                                                                     ...                      
Zakarian, Mr Artun                                                          Zakarian, Mr Artun
Zakarian, Mr Maprieder                                                  Zakarian, Mr Maprieder
Zenni, Mr Philip                                                              Zenni, Mr Philip
Lievens, Mr Rene                                                              Lievens, Mr Rene
Zimmerman, Leo                               

In [None]:
dataframe[['Age', 'Sex']].head(2)

Unnamed: 0_level_0,Age,Sex
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Allen, Miss Elisabeth Walton",29.0,female
"Allison, Miss Helen Loraine",2.0,female


### 조건에 따라 행 선택

In [None]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# Show the first two rows whose 'Sex' column equals 'female'.
dataframe[dataframe['Sex'] == 'female'].head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [None]:
# 행을 필터링합니다.
dataframe[(dataframe['Sex'] == 'female') & (dataframe['Age'] >= 65)]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
73,"Crosby, Mrs Edward Gifford (Catherine Elizabet...",1st,69.0,female,1,1


In [None]:
# For each Name, return the position where 'Allison' first occurs (-1 when it is absent).
# This yields a Series of integers, not the matching rows; to keep only rows whose
# Name contains 'Allison', filter with dataframe[dataframe['Name'].str.contains('Allison')].
dataframe['Name'].str.find('Allison')

0      -1
1       0
2       0
3       0
4       0
       ..
1308   -1
1309   -1
1310   -1
1311   -1
1312   -1
Name: Name, Length: 1313, dtype: int64

### 최솟값, 최댓값, 합, 평균 계산 및 개수 세기

In [None]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# 통곗값을 계산합니다.
print('최댓값:', dataframe['Age'].max())
print('최솟값:', dataframe['Age'].min())
print('평균:', dataframe['Age'].mean())
print('합:', dataframe['Age'].sum())
print('카운트:', dataframe['Age'].count())

최댓값: 71.0
최솟값: 0.17
평균: 30.397989417989415
합: 22980.88
카운트: 756


In [None]:
# 카운트를 출력합니다.
dataframe.count()

Name        1313
PClass      1313
Age          756
Sex         1313
Survived    1313
SexCode     1313
dtype: int64

### Dataframe sort, group by

#### sort_index: 인덱스 따라 정렬

In [12]:
df = pd.DataFrame(np.round(np.random.randn(7, 3) * 10),
               columns=["AAA", "BBB", "CCC"],
               index=list("defcabg"))
df

Unnamed: 0,AAA,BBB,CCC
d,-6.0,-4.0,-1.0
e,5.0,-4.0,-5.0
f,4.0,7.0,-1.0
c,-9.0,-10.0,13.0
a,-6.0,11.0,-9.0
b,-17.0,16.0,24.0
g,14.0,-1.0,-3.0


In [57]:
df.sort_index() #기본: 오름차순

Unnamed: 0,AAA,BBB,CCC
a,5.0,-16.0,3.0
b,-18.0,2.0,-8.0
c,5.0,-4.0,-9.0
d,0.0,0.0,13.0
e,10.0,10.0,-12.0
f,11.0,-3.0,-6.0
g,4.0,2.0,11.0


In [58]:
df.sort_index(axis=1, ascending=False)    # Sorting columns by index, opposite order (ccc,bbb,aaa)

Unnamed: 0,CCC,BBB,AAA
d,13.0,0.0,0.0
e,-12.0,10.0,10.0
f,-6.0,-3.0,11.0
c,-9.0,-4.0,5.0
a,3.0,-16.0,5.0
b,-8.0,2.0,-18.0
g,11.0,2.0,4.0


#### sort_values: 값을 기준으로 정렬

In [59]:
df.sort_values(by='AAA')    # According to contents of AAA

Unnamed: 0,AAA,BBB,CCC
b,-18.0,2.0,-8.0
d,0.0,0.0,13.0
g,4.0,2.0,11.0
c,5.0,-4.0,-9.0
a,5.0,-16.0,3.0
e,10.0,10.0,-12.0
f,11.0,-3.0,-6.0


In [60]:
df.sort_values(by=['BBB', 'CCC'])    # Arrange first by BBB, breaking ties with CCC

Unnamed: 0,AAA,BBB,CCC
a,5.0,-16.0,3.0
c,5.0,-4.0,-9.0
f,11.0,-3.0,-6.0
d,0.0,0.0,13.0
b,-18.0,2.0,-8.0
g,4.0,2.0,11.0
e,10.0,10.0,-12.0


#### groupby()
같은 값을 하나로 묶어 **통계 결과(평균, max, min, ...)** 를 얻기 위해 사용

In [61]:
df = pd.DataFrame({
    'city': ['부산', '부산', '부산', '부산', '서울', '서울', '서울'],
    'fruits': ['apple', 'orange', 'banana', 'banana', 'apple', 'apple', 'banana'],
    'price': [100, 200, 250, 300, 150, 200, 400],
    'quantity': [1, 2, 3, 4, 5, 6, 7]
})
df

Unnamed: 0,city,fruits,price,quantity
0,부산,apple,100,1
1,부산,orange,200,2
2,부산,banana,250,3
3,부산,banana,300,4
4,서울,apple,150,5
5,서울,apple,200,6
6,서울,banana,400,7


groupby를 사용하면 기본으로 그룹 라벨이 index가 되는데, 그룹 라벨을 index로 사용하고 싶지 않은 경우에는 as_index=False 를 설정하면 됩니다.

In [62]:
# Average the numeric columns (price, quantity) per city.
# numeric_only=True is required on pandas >= 2.0: without it, mean() tries to
# average the string column 'fruits' and raises TypeError.
df.groupby('city', as_index=False).mean(numeric_only=True)

Unnamed: 0,city,price,quantity
0,부산,212.5,2.5
1,서울,250.0,6.0


In [63]:
df.groupby(['city', 'fruits'], as_index=False).mean()

Unnamed: 0,city,fruits,price,quantity
0,부산,apple,100.0,1.0
1,부산,banana,275.0,3.5
2,부산,orange,200.0,2.0
3,서울,apple,175.0,5.5
4,서울,banana,400.0,7.0


### 모든 열 원소에 함수 적용

In [13]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# 함수를 만듭니다.
def uppercase(x):
    """Return the upper-cased form of ``x`` as produced by ``x.upper()``."""
    upper_cased = x.upper()
    return upper_cased

# 함수를 적용하고 두 개의 행을 출력합니다.
dataframe['Name'].apply(uppercase)[0:2]

0    ALLEN, MISS ELISABETH WALTON
1     ALLISON, MISS HELEN LORAINE
Name: Name, dtype: object

In [None]:
# Survived 열의 1을 Live로, 0을 Dead로 바꿉니다.
dataframe['Survived'].map({1:'Live', 0:'Dead'})[:5]

0    Live
1    Dead
2    Dead
3    Dead
4    Live
Name: Survived, dtype: object

In [None]:
# 함수의 매개변수(age)를 apply 메서드를 호출할 때 전달할 수 있습니다.
dataframe['Age'].apply(lambda x, age: x < age, age=30)[:5]

0     True
1     True
2    False
3     True
4     True
Name: Age, dtype: bool

### 데이터프레임 연결

In [15]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터프레임을 만듭니다.
data_a = {'id': ['1', '2', '3'],
          'first': ['Alex', 'Amy', 'Allen'],
          'last': ['Anderson', 'Ackerman', 'Ali']}
dataframe_a = pd.DataFrame(data_a, columns = ['id', 'first', 'last'])

# 데이터프레임을 만듭니다.
data_b = {'id': ['4', '5', '6'],
          'first': ['Billy', 'Brian', 'Bran'],
          'last': ['Bonder', 'Black', 'Balwner']}
dataframe_b = pd.DataFrame(data_b, columns = ['id', 'first', 'last'])

# 행 방향으로 데이터프레임을 연결합니다.
pd.concat([dataframe_a, dataframe_b], axis=0)

Unnamed: 0,id,first,last
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner


In [16]:
dataframe_a

Unnamed: 0,id,first,last
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali


In [17]:
dataframe_b

Unnamed: 0,id,first,last
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner


In [None]:
# 열 방향으로 데이터프레임을 연결합니다.
pd.concat([dataframe_a, dataframe_b], axis=1)

Unnamed: 0,id,first,last,id.1,first.1,last.1
0,1,Alex,Anderson,4,Billy,Bonder
1,2,Amy,Ackerman,5,Brian,Black
2,3,Allen,Ali,6,Bran,Balwner


In [None]:
# Build the new row as a Series labelled with the target column names.
row = pd.Series([10, 'Chris', 'Chillon'], index=['id', 'first', 'last'])

# Add the row to the frame. DataFrame.append was removed in pandas 2.0, so the
# Series is turned into a one-row DataFrame and concatenated instead;
# ignore_index=True renumbers the resulting rows 0..n-1.
pd.concat([dataframe_a, row.to_frame().T], ignore_index=True)

Unnamed: 0,id,first,last
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,10,Chris,Chillon


## 결측치 다루기 (dropna, fillna)

In [25]:
data_frame = pd.DataFrame([[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1], [np.nan, np.nan, np.nan, 5],\
                   [3, 4, np.nan, 1], [3, 4, 0, 1]], columns=list('ABCD'))
data_frame

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,3.0,4.0,,1
4,3.0,4.0,0.0,1


In [26]:
data_frame.dropna()

Unnamed: 0,A,B,C,D
4,3.0,4.0,0.0,1


In [27]:
data_frame.dropna(axis=1)

Unnamed: 0,D
0,0
1,1
2,5
3,1
4,1


In [31]:
#A, B열에 NaN이 포함된 행을 삭제
data_frame.dropna(subset=['A','B'])

Unnamed: 0,A,B,C,D
1,3.0,4.0,,1
3,3.0,4.0,,1
4,3.0,4.0,0.0,1


In [32]:
#2행과 4행에 NaN이 포함된 열을 삭제
data_frame.dropna(axis=1, subset=[2, 4])

Unnamed: 0,D
0,0
1,1
2,5
3,1
4,1


In [33]:
#모든 NaN을 0으로 치환한다
data_frame.fillna(0)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,0.0,1
2,0.0,0.0,0.0,5
3,3.0,4.0,0.0,1
4,3.0,4.0,0.0,1


In [34]:
data_frame.isnull()

Unnamed: 0,A,B,C,D
0,True,False,True,False
1,False,False,True,False
2,True,True,True,False
3,False,False,True,False
4,False,False,False,False


In [35]:
# Replace each NaN with the nearest preceding non-NaN value in the same column
# (forward fill); leading NaNs with no earlier value stay NaN.
# fillna(method='ffill') is deprecated in recent pandas; ffill() is the equivalent call.
data_frame.ffill()

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,3.0,4.0,,5
3,3.0,4.0,,1
4,3.0,4.0,0.0,1


In [36]:
# Replace each NaN with the nearest following non-NaN value in the same column
# (backward fill); trailing NaNs with no later value stay NaN.
# fillna(method='bfill') is deprecated in recent pandas; bfill() is the equivalent call.
data_frame.bfill()

Unnamed: 0,A,B,C,D
0,3.0,2.0,0.0,0
1,3.0,4.0,0.0,1
2,3.0,4.0,0.0,5
3,3.0,4.0,0.0,1
4,3.0,4.0,0.0,1
