# 판다스 기초
- pandas는 통상 pd로 import하고
- 수치해석적 함수가 많은 numpy는 통상 np로 import 한다.

In [1]:
import pandas as pd
import numpy as np

### Pandas의 데이터형을 구성하는 기본은 Series이다.
- index와 value로 이루어져 있다.
- 한 가지 데이터 타입만 가질 수 있다.

In [2]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
pd.Series([1, 3, 5, np.nan, 6, 8], dtype = str)

0      1
1      3
2      5
3    NaN
4      6
5      8
dtype: object

### 날짜와 시간을 이용

In [4]:
dates = pd.date_range('20240421', periods = 6) # six consecutive daily timestamps starting on 2024-04-21
dates

DatetimeIndex(['2024-04-21', '2024-04-22', '2024-04-23', '2024-04-24',
               '2024-04-25', '2024-04-26'],
              dtype='datetime64[ns]', freq='D')

## DataFrame
- pd.Series()
    - index, value
- pd.DataFrame()
    - index, value, columns


### 넘파이를 이용한 데이터 프레임 생성
- 판다스에서 가장 많이 사용되는 데이터형은 DataFrame이다.
- index와 columns를 지정하면 된다.

### 랜덤 함수를 이용해 6행 4열 데이터프레임 만들기
- 변수명 = pd.DataFrame(np.random.randn(6, 4), index = dates, columns = ['A', 'B', 'C', 'D'])

In [5]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2024-04-21,0.961694,-0.95708,-0.137731,1.527023
2024-04-22,-1.465913,-0.313452,0.918381,-0.063176
2024-04-23,1.616467,0.680522,1.09705,-0.657434
2024-04-24,-0.235558,-0.270103,0.601442,0.509783
2024-04-25,0.162072,1.428508,0.943812,-0.416488
2024-04-26,0.669794,0.411276,1.486239,1.64489


In [6]:
# 앞 5개만 조회
df.head()

Unnamed: 0,A,B,C,D
2024-04-21,0.961694,-0.95708,-0.137731,1.527023
2024-04-22,-1.465913,-0.313452,0.918381,-0.063176
2024-04-23,1.616467,0.680522,1.09705,-0.657434
2024-04-24,-0.235558,-0.270103,0.601442,0.509783
2024-04-25,0.162072,1.428508,0.943812,-0.416488


In [7]:
# 뒤 5개 조회
df.tail()

Unnamed: 0,A,B,C,D
2024-04-22,-1.465913,-0.313452,0.918381,-0.063176
2024-04-23,1.616467,0.680522,1.09705,-0.657434
2024-04-24,-0.235558,-0.270103,0.601442,0.509783
2024-04-25,0.162072,1.428508,0.943812,-0.416488
2024-04-26,0.669794,0.411276,1.486239,1.64489


In [8]:
# index만 조회
df.index

DatetimeIndex(['2024-04-21', '2024-04-22', '2024-04-23', '2024-04-24',
               '2024-04-25', '2024-04-26'],
              dtype='datetime64[ns]', freq='D')

In [9]:
# Show only the column labels
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [10]:
# value만 조회
df.values

array([[ 0.96169372, -0.95707971, -0.13773051,  1.52702343],
       [-1.46591329, -0.31345187,  0.91838064, -0.06317563],
       [ 1.61646728,  0.68052198,  1.09705017, -0.65743437],
       [-0.2355579 , -0.27010259,  0.6014425 ,  0.50978278],
       [ 0.16207205,  1.42850832,  0.94381231, -0.4164878 ],
       [ 0.66979367,  0.41127592,  1.48623874,  1.64488967]])

In [11]:
# DataFrame의 기본 정보 확인
# 여기서는 각 컬럼의 크기와 데이터 형태를 확인하는 경우가 많다
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2024-04-21 to 2024-04-26
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 240.0 bytes


In [12]:
# DataFrame의 통계적 기본 정보 확인
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.284759,0.163279,0.818199,0.4241
std,1.070568,0.848545,0.549661,0.982634
min,-1.465913,-0.95708,-0.137731,-0.657434
25%,-0.13615,-0.302615,0.680677,-0.32816
50%,0.415933,0.070587,0.931096,0.223304
75%,0.888719,0.61321,1.058741,1.272713
max,1.616467,1.428508,1.486239,1.64489


## 데이터 정렬 : sort_values
- df.sort_values(by = 'B', ascending = False)  # ascending = False : 내림차순으로 정렬.

In [13]:
df.sort_values(by = 'B', ascending = False)  # ascending = False : 내림차순으로 정렬.

Unnamed: 0,A,B,C,D
2024-04-25,0.162072,1.428508,0.943812,-0.416488
2024-04-23,1.616467,0.680522,1.09705,-0.657434
2024-04-26,0.669794,0.411276,1.486239,1.64489
2024-04-24,-0.235558,-0.270103,0.601442,0.509783
2024-04-22,-1.465913,-0.313452,0.918381,-0.063176
2024-04-21,0.961694,-0.95708,-0.137731,1.527023


## 데이터 선택

In [14]:
# 특정 컬럼만 읽기
df['A']

2024-04-21    0.961694
2024-04-22   -1.465913
2024-04-23    1.616467
2024-04-24   -0.235558
2024-04-25    0.162072
2024-04-26    0.669794
Freq: D, Name: A, dtype: float64

## offset index
- [n:m] : n 부터 m - 1 까지
- 그러나 인덱스나 컬럼의 이름으로 slice하는 경우는 끝을 포함


In [15]:
df

Unnamed: 0,A,B,C,D
2024-04-21,0.961694,-0.95708,-0.137731,1.527023
2024-04-22,-1.465913,-0.313452,0.918381,-0.063176
2024-04-23,1.616467,0.680522,1.09705,-0.657434
2024-04-24,-0.235558,-0.270103,0.601442,0.509783
2024-04-25,0.162072,1.428508,0.943812,-0.416488
2024-04-26,0.669794,0.411276,1.486239,1.64489


In [16]:
df[0:3]

Unnamed: 0,A,B,C,D
2024-04-21,0.961694,-0.95708,-0.137731,1.527023
2024-04-22,-1.465913,-0.313452,0.918381,-0.063176
2024-04-23,1.616467,0.680522,1.09705,-0.657434


In [17]:
# 인덱스 이름으로 슬라이스 한경우 끝이 포함되는걸 볼 수 있음
df['20240423':'20240426']

Unnamed: 0,A,B,C,D
2024-04-23,1.616467,0.680522,1.09705,-0.657434
2024-04-24,-0.235558,-0.270103,0.601442,0.509783
2024-04-25,0.162072,1.428508,0.943812,-0.416488
2024-04-26,0.669794,0.411276,1.486239,1.64489


## pandas slice - option loc
- loc : location
- index 이름으로 특정 행, 열을 선택 한다.
- 이름으로도 사용 가능
- pandas의 보편적인 slice 옵션

In [18]:
df

Unnamed: 0,A,B,C,D
2024-04-21,0.961694,-0.95708,-0.137731,1.527023
2024-04-22,-1.465913,-0.313452,0.918381,-0.063176
2024-04-23,1.616467,0.680522,1.09705,-0.657434
2024-04-24,-0.235558,-0.270103,0.601442,0.509783
2024-04-25,0.162072,1.428508,0.943812,-0.416488
2024-04-26,0.669794,0.411276,1.486239,1.64489


In [19]:
# With loc, always index as [row selection, column selection]
df.loc[:,['A', 'B']] # ':' selects every row; ['A', 'B'] keeps only columns A and B

Unnamed: 0,A,B
2024-04-21,0.961694,-0.95708
2024-04-22,-1.465913,-0.313452
2024-04-23,1.616467,0.680522
2024-04-24,-0.235558,-0.270103
2024-04-25,0.162072,1.428508
2024-04-26,0.669794,0.411276


In [20]:
df.loc['20240423':'20240426',['A', 'B']]

Unnamed: 0,A,B
2024-04-23,1.616467,0.680522
2024-04-24,-0.235558,-0.270103
2024-04-25,0.162072,1.428508
2024-04-26,0.669794,0.411276


In [21]:
df.loc['20240423',['A', 'B']]

A    1.616467
B    0.680522
Name: 2024-04-23 00:00:00, dtype: float64

In [22]:
df.loc['20240421':'20240423',['A', 'D']]

Unnamed: 0,A,D
2024-04-21,0.961694,1.527023
2024-04-22,-1.465913,-0.063176
2024-04-23,1.616467,-0.657434


## pandas slice - option iloc
- iloc : integer location
    - 컴퓨터가 인식하는 인덱스 값
- iloc 옵션을 이용해서 번호로만 접근

In [23]:
df

Unnamed: 0,A,B,C,D
2024-04-21,0.961694,-0.95708,-0.137731,1.527023
2024-04-22,-1.465913,-0.313452,0.918381,-0.063176
2024-04-23,1.616467,0.680522,1.09705,-0.657434
2024-04-24,-0.235558,-0.270103,0.601442,0.509783
2024-04-25,0.162072,1.428508,0.943812,-0.416488
2024-04-26,0.669794,0.411276,1.486239,1.64489


In [24]:
df.iloc[3]

A   -0.235558
B   -0.270103
C    0.601442
D    0.509783
Name: 2024-04-24 00:00:00, dtype: float64

In [25]:
df.iloc[3, 2]

0.6014424961440628

In [26]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2024-04-24,-0.235558,-0.270103
2024-04-25,0.162072,1.428508


In [27]:
df.iloc[[1, 2, 4], [0,2]]

Unnamed: 0,A,C
2024-04-22,-1.465913,0.918381
2024-04-23,1.616467,1.09705
2024-04-25,0.162072,0.943812


In [28]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2024-04-21,-0.95708,-0.137731
2024-04-22,-0.313452,0.918381
2024-04-23,0.680522,1.09705
2024-04-24,-0.270103,0.601442
2024-04-25,1.428508,0.943812
2024-04-26,0.411276,1.486239


## pandas slice under condition
- df[condition]과 같이 사용하는 것이 일반적
- pandas의 버전에 따라 조금씩 허용되는 문법이 다르다.
- 인터넷에서 확보한 소스코드를 돌릴 때는 pandas의 버전을 확인하는 것이 필요하다.

In [29]:
df

Unnamed: 0,A,B,C,D
2024-04-21,0.961694,-0.95708,-0.137731,1.527023
2024-04-22,-1.465913,-0.313452,0.918381,-0.063176
2024-04-23,1.616467,0.680522,1.09705,-0.657434
2024-04-24,-0.235558,-0.270103,0.601442,0.509783
2024-04-25,0.162072,1.428508,0.943812,-0.416488
2024-04-26,0.669794,0.411276,1.486239,1.64489


In [30]:
df[df['A'] > 0] # keep only the rows where column 'A' is greater than 0

Unnamed: 0,A,B,C,D
2024-04-21,0.961694,-0.95708,-0.137731,1.527023
2024-04-23,1.616467,0.680522,1.09705,-0.657434
2024-04-25,0.162072,1.428508,0.943812,-0.416488
2024-04-26,0.669794,0.411276,1.486239,1.64489


In [31]:
df[df > 0]  # a condition on the whole frame keeps its shape; cells that fail the condition become NaN

Unnamed: 0,A,B,C,D
2024-04-21,0.961694,,,1.527023
2024-04-22,,,0.918381,
2024-04-23,1.616467,0.680522,1.09705,
2024-04-24,,,0.601442,0.509783
2024-04-25,0.162072,1.428508,0.943812,
2024-04-26,0.669794,0.411276,1.486239,1.64489


## 컬럼 추가
- 기존 컬럼이 없으면 추가
- 기존 컬럼이 있으면 수정

In [32]:
df['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df

Unnamed: 0,A,B,C,D,E
2024-04-21,0.961694,-0.95708,-0.137731,1.527023,one
2024-04-22,-1.465913,-0.313452,0.918381,-0.063176,one
2024-04-23,1.616467,0.680522,1.09705,-0.657434,two
2024-04-24,-0.235558,-0.270103,0.601442,0.509783,three
2024-04-25,0.162072,1.428508,0.943812,-0.416488,four
2024-04-26,0.669794,0.411276,1.486239,1.64489,three


In [33]:
# 특정 요소가 있는지 확인
df['E'].isin(['two', 'four'])       # 있으면 True 없으면 False가 나옴

2024-04-21    False
2024-04-22    False
2024-04-23     True
2024-04-24    False
2024-04-25     True
2024-04-26    False
Freq: D, Name: E, dtype: bool

In [34]:
# To keep only the rows that contain those values, pass the boolean mask to df inside square brackets.
df[df['E'].isin(['two', 'four'])] # only rows where the mask is True are selected

Unnamed: 0,A,B,C,D,E
2024-04-23,1.616467,0.680522,1.09705,-0.657434,two
2024-04-25,0.162072,1.428508,0.943812,-0.416488,four


## 특정 컬럼 제거

In [35]:
df

Unnamed: 0,A,B,C,D,E
2024-04-21,0.961694,-0.95708,-0.137731,1.527023,one
2024-04-22,-1.465913,-0.313452,0.918381,-0.063176,one
2024-04-23,1.616467,0.680522,1.09705,-0.657434,two
2024-04-24,-0.235558,-0.270103,0.601442,0.509783,three
2024-04-25,0.162072,1.428508,0.943812,-0.416488,four
2024-04-26,0.669794,0.411276,1.486239,1.64489,three


In [36]:
del df['E']
df

Unnamed: 0,A,B,C,D
2024-04-21,0.961694,-0.95708,-0.137731,1.527023
2024-04-22,-1.465913,-0.313452,0.918381,-0.063176
2024-04-23,1.616467,0.680522,1.09705,-0.657434
2024-04-24,-0.235558,-0.270103,0.601442,0.509783
2024-04-25,0.162072,1.428508,0.943812,-0.416488
2024-04-26,0.669794,0.411276,1.486239,1.64489


In [37]:
df.drop(['D','C'], axis = 1) # axis = 1 drops columns by label; axis = 0 (the default) drops rows by index label

Unnamed: 0,A,B
2024-04-21,0.961694,-0.95708
2024-04-22,-1.465913,-0.313452
2024-04-23,1.616467,0.680522
2024-04-24,-0.235558,-0.270103
2024-04-25,0.162072,1.428508
2024-04-26,0.669794,0.411276


In [38]:
df.drop(['20240421'])

Unnamed: 0,A,B,C,D
2024-04-22,-1.465913,-0.313452,0.918381,-0.063176
2024-04-23,1.616467,0.680522,1.09705,-0.657434
2024-04-24,-0.235558,-0.270103,0.601442,0.509783
2024-04-25,0.162072,1.428508,0.943812,-0.416488
2024-04-26,0.669794,0.411276,1.486239,1.64489


## Pandas apply function
- apply : 함수를 적용

In [39]:
df

Unnamed: 0,A,B,C,D
2024-04-21,0.961694,-0.95708,-0.137731,1.527023
2024-04-22,-1.465913,-0.313452,0.918381,-0.063176
2024-04-23,1.616467,0.680522,1.09705,-0.657434
2024-04-24,-0.235558,-0.270103,0.601442,0.509783
2024-04-25,0.162072,1.428508,0.943812,-0.416488
2024-04-26,0.669794,0.411276,1.486239,1.64489


In [40]:
df['A'].apply('sum')

1.7085555304522482

In [41]:
df['A'].apply('mean')

0.2847592550753747

In [42]:
df['A'].apply('min'),df['A'].apply('max')

(-1.4659132900846215, 1.6164672770592432)

In [43]:
# 각 컬럼 누적 합
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
2024-04-21,0.961694,-0.95708,-0.137731,1.527023
2024-04-22,-0.50422,-1.270532,0.78065,1.463848
2024-04-23,1.112248,-0.59001,1.8777,0.806413
2024-04-24,0.87669,-0.860112,2.479143,1.316196
2024-04-25,1.038762,0.568396,3.422955,0.899708
2024-04-26,1.708556,0.979672,4.909194,2.544598


In [44]:
# Series.apply calls the function on each element separately, so np.std receives
# one scalar at a time and every result is 0.0 (see the output below).
# For the standard deviation of the whole column, use df['A'].std() instead.
df['A'].apply(np.std)

2024-04-21    0.0
2024-04-22    0.0
2024-04-23    0.0
2024-04-24    0.0
2024-04-25    0.0
2024-04-26    0.0
Freq: D, Name: A, dtype: float64

In [45]:
# Same element-wise behaviour: np.sum of a single scalar is the scalar itself, so the
# column comes back unchanged. For the column total, use df['A'].sum() instead.
df['A'].apply(np.sum)

2024-04-21    0.961694
2024-04-22   -1.465913
2024-04-23    1.616467
2024-04-24   -0.235558
2024-04-25    0.162072
2024-04-26    0.669794
Freq: D, Name: A, dtype: float64

In [46]:
df

Unnamed: 0,A,B,C,D
2024-04-21,0.961694,-0.95708,-0.137731,1.527023
2024-04-22,-1.465913,-0.313452,0.918381,-0.063176
2024-04-23,1.616467,0.680522,1.09705,-0.657434
2024-04-24,-0.235558,-0.270103,0.601442,0.509783
2024-04-25,0.162072,1.428508,0.943812,-0.416488
2024-04-26,0.669794,0.411276,1.486239,1.64489


In [47]:
def plusMinus(num):
    """Label a number by its sign.

    Returns 'plus' when ``num`` is strictly greater than 0, and 'minus'
    for everything else (zero included).
    """
    if num > 0:
        return 'plus'
    return 'minus'

In [48]:
df['A'].apply(plusMinus)

2024-04-21     plus
2024-04-22    minus
2024-04-23     plus
2024-04-24    minus
2024-04-25     plus
2024-04-26     plus
Freq: D, Name: A, dtype: object

In [49]:
df['A'].apply(lambda num: 'plus' if num > 0 else 'minus')

2024-04-21     plus
2024-04-22    minus
2024-04-23     plus
2024-04-24    minus
2024-04-25     plus
2024-04-26     plus
Freq: D, Name: A, dtype: object

## 4. 두 데이터 합치기

#### Pandas에서 데이터 프레임을 병합하는 방법
- pd.concat()
- pd.merge()
- DataFrame.join()

In [50]:
# Column-oriented construction: a dict mapping each column name to its list of values
left = pd.DataFrame(dict(
    key=['K0', 'K4', 'K2', 'K3'],
    A=['A0', 'A1', 'A2', 'A3'],
    B=['B0', 'B1', 'B2', 'B3'],
))
left

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K4,A1,B1
2,K2,A2,B2
3,K3,A3,B3


In [51]:
# Row-oriented construction: a list holding one dict per row.
# Every row uses the same numeric suffix for its key, C and D values.
right = pd.DataFrame(
    [{'key': f'K{i}', 'C': f'C{i}', 'D': f'D{i}'} for i in range(4)]
)
right

Unnamed: 0,key,C,D
0,K0,C0,D0
1,K1,C1,D1
2,K2,C2,D2
3,K3,C3,D3


#### pd.merge()
- 두 데이터 프레임에서 컬럼이나 인덱스를 기준으로 잡고 병합하는 방법
- 기준이 되는 컬럼이나 인덱스를 키값이라고 한다.
- 기준이 되는 키 값은 두 데이터 프레임에 모두 포함되어 있어야 한다.

In [52]:
pd.merge(left, right, how = 'inner', on = 'key') # how = 'inner' is the default; only keys present in both frames are kept

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K2,A2,B2,C2,D2
2,K3,A3,B3,C3,D3


In [53]:
pd.merge(left, right, how = 'left', on = 'key')

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K4,A1,B1,,
2,K2,A2,B2,C2,D2
3,K3,A3,B3,C3,D3


In [54]:
pd.merge(left, right, how = 'right', on = 'key')

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,,,C1,D1
2,K2,A2,B2,C2,D2
3,K3,A3,B3,C3,D3


In [55]:
pd.merge(left, right, how = 'outer', on = 'key')

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K4,A1,B1,,
2,K2,A2,B2,C2,D2
3,K3,A3,B3,C3,D3
4,K1,,,C1,D1


In [56]:
url = 'https://en.wikipedia.org/wiki/FIFA_World_Cup'

In [57]:
# pd.read_html returns a list of DataFrames (one per table parsed from the page),
# not a single DataFrame, so `df` is a list here; index into it to get one table.
df = pd.read_html(url)
df

[                         0                                                  1
 0                      NaN                                                NaN
 1          Organising body                                               FIFA
 2                  Founded                                 1930; 94 years ago
 3                   Region                                      International
 4          Number of teams                          32 (48 from 2026 onwards)
 5     Related competitions  FIFA Women's World Cup FIFA U-20 World Cup FIF...
 6        Current champions                       Argentina (3rd title) (2022)
 7  Most successful team(s)                                  Brazil (5 titles)
 8                  Website                                   Official website,
                                                    0
 0                   Argentina, the current champions
 1                                        Tournaments
 2  1930 1934 1938 1950 1954 1958 1962 1966 197

## Pivot table
- index, columns, values, aggfunc

In [58]:
!pip install openpyxl



In [59]:
df = pd.read_excel('../data/02. sales-funnel.xlsx')
df.head()

Unnamed: 0,Account,Name,Rep,Manager,Product,Quantity,Price,Status
0,714466,Trantow-Barrows,Craig Booker,Debra Henley,CPU,1,30000,presented
1,714466,Trantow-Barrows,Craig Booker,Debra Henley,Software,1,10000,presented
2,714466,Trantow-Barrows,Craig Booker,Debra Henley,Maintenance,2,5000,pending
3,737550,"Fritsch, Russel and Anderson",Craig Booker,Debra Henley,CPU,1,35000,declined
4,146832,Kiehn-Spinka,Daniel Hilton,Debra Henley,CPU,2,65000,won


In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Account   17 non-null     int64 
 1   Name      17 non-null     object
 2   Rep       17 non-null     object
 3   Manager   17 non-null     object
 4   Product   17 non-null     object
 5   Quantity  17 non-null     int64 
 6   Price     17 non-null     int64 
 7   Status    17 non-null     object
dtypes: int64(3), object(5)
memory usage: 1.2+ KB


In [61]:
df

Unnamed: 0,Account,Name,Rep,Manager,Product,Quantity,Price,Status
0,714466,Trantow-Barrows,Craig Booker,Debra Henley,CPU,1,30000,presented
1,714466,Trantow-Barrows,Craig Booker,Debra Henley,Software,1,10000,presented
2,714466,Trantow-Barrows,Craig Booker,Debra Henley,Maintenance,2,5000,pending
3,737550,"Fritsch, Russel and Anderson",Craig Booker,Debra Henley,CPU,1,35000,declined
4,146832,Kiehn-Spinka,Daniel Hilton,Debra Henley,CPU,2,65000,won
5,218895,Kulas Inc,Daniel Hilton,Debra Henley,CPU,2,40000,pending
6,218895,Kulas Inc,Daniel Hilton,Debra Henley,Software,1,10000,presented
7,412290,Jerde-Hilpert,John Smith,Debra Henley,Maintenance,2,5000,pending
8,740150,Barton LLC,John Smith,Debra Henley,CPU,1,35000,declined
9,141962,Herman LLC,Cedric Moss,Fred Anderson,CPU,2,65000,won


#### index 설정

In [87]:
# Group by customer name and sum every remaining column.
# NOTE(review): sum() is applied to the text columns too, which concatenates their
# strings (e.g. 'Wendy YuleWendy Yule' in the output). Passing numeric_only=True
# should restrict the result to numeric columns - confirm on the installed pandas.
df_name = df.groupby('Name').sum()
df_name

Unnamed: 0_level_0,Account,Rep,Manager,Product,Quantity,Price,Status
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Barton LLC,740150,John Smith,Debra Henley,CPU,1,35000,declined
"Fritsch, Russel and Anderson",737550,Craig Booker,Debra Henley,CPU,1,35000,declined
Herman LLC,141962,Cedric Moss,Fred Anderson,CPU,2,65000,won
Jerde-Hilpert,412290,John Smith,Debra Henley,Maintenance,2,5000,pending
"Kassulke, Ondricka and Metz",307599,Wendy Yule,Fred Anderson,Maintenance,3,7000,won
Keeling LLC,688981,Wendy Yule,Fred Anderson,CPU,5,100000,won
Kiehn-Spinka,146832,Daniel Hilton,Debra Henley,CPU,2,65000,won
Koepp Ltd,1459666,Wendy YuleWendy Yule,Fred AndersonFred Anderson,CPUMonitor,4,70000,declinedpresented
Kulas Inc,437790,Daniel HiltonDaniel Hilton,Debra HenleyDebra Henley,CPUSoftware,3,50000,pendingpresented
Purdy-Kunde,163416,Cedric Moss,Fred Anderson,CPU,1,30000,presented


In [80]:
# NOTE(review): with no `values`, the default aggregation is attempted on every
# remaining column, including the text ones, which appears to be what raises the
# TypeError shown in this cell's output. Restricting to numeric columns, e.g.
# values=['Account', 'Price', 'Quantity'], should avoid it - confirm on the
# installed pandas version.
df.pivot_table(index=['Name'])

TypeError: Could not convert John Smith to numeric

In [90]:
# Sum per (Rep, Manager) pair. As the output shows, text columns are concatenated
# rather than dropped. NOTE(review): numeric_only=True should limit this to
# numeric columns - confirm on the installed pandas.
df.groupby(['Rep','Manager']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Account,Name,Product,Quantity,Price,Status
Rep,Manager,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Cedric Moss,Fred Anderson,784066,Herman LLCPurdy-KundeStokes LLCStokes LLC,CPUCPUMaintenanceSoftware,5,110000,wonpresentedpendingpresented
Craig Booker,Debra Henley,2880948,Trantow-BarrowsTrantow-BarrowsTrantow-BarrowsF...,CPUSoftwareMaintenanceCPU,5,80000,presentedpresentedpendingdeclined
Daniel Hilton,Debra Henley,584622,Kiehn-SpinkaKulas IncKulas Inc,CPUCPUSoftware,5,115000,wonpendingpresented
John Smith,Debra Henley,1152440,Jerde-HilpertBarton LLC,MaintenanceCPU,3,40000,pendingdeclined
Wendy Yule,Fred Anderson,2456246,"Kassulke, Ondricka and MetzKeeling LLCKoepp Lt...",MaintenanceCPUCPUMonitor,12,177000,wonwondeclinedpresented


In [81]:
# Pivot on a multi-level index (Rep, Manager).
# NOTE(review): no `values` is given, so the default aggregation is also attempted
# on the text columns, which appears to cause the TypeError in this cell's output.
# Passing numeric columns explicitly, e.g. values=['Price', 'Quantity'], should
# avoid it - confirm on the installed pandas version.
df.pivot_table(index = ['Rep', 'Manager'])

TypeError: Could not convert Herman LLCPurdy-KundeStokes LLCStokes LLC to numeric

#### values 설정

In [64]:
df.head()

Unnamed: 0,Account,Name,Rep,Manager,Product,Quantity,Price,Status
0,714466,Trantow-Barrows,Craig Booker,Debra Henley,CPU,1,30000,presented
1,714466,Trantow-Barrows,Craig Booker,Debra Henley,Software,1,10000,presented
2,714466,Trantow-Barrows,Craig Booker,Debra Henley,Maintenance,2,5000,pending
3,737550,"Fritsch, Russel and Anderson",Craig Booker,Debra Henley,CPU,1,35000,declined
4,146832,Kiehn-Spinka,Daniel Hilton,Debra Henley,CPU,2,65000,won


In [65]:
df.pivot_table(index = ['Manager', 'Rep'], values = 'Price')

Unnamed: 0_level_0,Unnamed: 1_level_0,Price
Manager,Rep,Unnamed: 2_level_1
Debra Henley,Craig Booker,20000.0
Debra Henley,Daniel Hilton,38333.333333
Debra Henley,John Smith,20000.0
Fred Anderson,Cedric Moss,27500.0
Fred Anderson,Wendy Yule,44250.0


In [66]:
# Total Price for each (Manager, Rep) pair.
# The aggregation is named by string: recent pandas deprecates passing np.sum here
# (FutureWarning) and recommends 'sum' to keep the same result.
df.pivot_table(index = ['Manager', 'Rep'], values = 'Price', aggfunc = 'sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,Price
Manager,Rep,Unnamed: 2_level_1
Debra Henley,Craig Booker,80000
Debra Henley,Daniel Hilton,115000
Debra Henley,John Smith,40000
Fred Anderson,Cedric Moss,110000
Fred Anderson,Wendy Yule,177000


In [67]:
# Total Price and number of rows for each (Manager, Rep) pair.
# 'sum' replaces np.sum, which recent pandas deprecates as an aggfunc (FutureWarning);
# len is kept so the second result column is still labelled 'len'.
df.pivot_table(index = ['Manager', 'Rep'], values = 'Price', aggfunc = ['sum', len])

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,len
Unnamed: 0_level_1,Unnamed: 1_level_1,Price,Price
Manager,Rep,Unnamed: 2_level_2,Unnamed: 3_level_2
Debra Henley,Craig Booker,80000,4
Debra Henley,Daniel Hilton,115000,3
Debra Henley,John Smith,40000,2
Fred Anderson,Cedric Moss,110000,4
Fred Anderson,Wendy Yule,177000,4


#### columns 설정

In [68]:
# Total Price per (Manager, Rep) row, spread across one column per Product.
# Combinations with no sales are left as NaN.
# 'sum' replaces np.sum, which recent pandas deprecates as an aggfunc (FutureWarning).
df.pivot_table(index = ['Manager', 'Rep'], values = 'Price', columns = 'Product', aggfunc = 'sum')

Unnamed: 0_level_0,Product,CPU,Maintenance,Monitor,Software
Manager,Rep,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Debra Henley,Craig Booker,65000.0,5000.0,,10000.0
Debra Henley,Daniel Hilton,105000.0,,,10000.0
Debra Henley,John Smith,35000.0,5000.0,,
Fred Anderson,Cedric Moss,95000.0,5000.0,,10000.0
Fred Anderson,Wendy Yule,165000.0,7000.0,5000.0,


In [69]:
# Same pivot as the previous cell, but with empty combinations filled in.
# 'sum' replaces np.sum, which recent pandas deprecates as an aggfunc (FutureWarning).
df.pivot_table(index = ['Manager', 'Rep'],
               values = 'Price',
               columns = 'Product',
               aggfunc = 'sum',
               fill_value = 0,              # replace missing (NaN) cells in the result with 0
               )

Unnamed: 0_level_0,Product,CPU,Maintenance,Monitor,Software
Manager,Rep,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Debra Henley,Craig Booker,65000,5000,0,10000
Debra Henley,Daniel Hilton,105000,0,0,10000
Debra Henley,John Smith,35000,5000,0,0
Fred Anderson,Cedric Moss,95000,5000,0,10000
Fred Anderson,Wendy Yule,165000,7000,5000,0


In [70]:
# Total Price and Quantity for each (Manager, Rep, Product) combination.
# 'sum' replaces np.sum, which recent pandas deprecates as an aggfunc (FutureWarning).
df.pivot_table(index = ['Manager', 'Rep', 'Product'], values = ['Price', 'Quantity'], aggfunc = 'sum', fill_value=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Price,Quantity
Manager,Rep,Product,Unnamed: 3_level_1,Unnamed: 4_level_1
Debra Henley,Craig Booker,CPU,65000,2
Debra Henley,Craig Booker,Maintenance,5000,2
Debra Henley,Craig Booker,Software,10000,1
Debra Henley,Daniel Hilton,CPU,105000,4
Debra Henley,Daniel Hilton,Software,10000,1
Debra Henley,John Smith,CPU,35000,1
Debra Henley,John Smith,Maintenance,5000,2
Fred Anderson,Cedric Moss,CPU,95000,3
Fred Anderson,Cedric Moss,Maintenance,5000,1
Fred Anderson,Cedric Moss,Software,10000,1


In [71]:
# Sum and mean of Price and Quantity for each (Manager, Rep, Product) combination.
# String names replace np.sum / np.mean, which recent pandas deprecates as
# aggfunc values (FutureWarning); the result columns are still labelled 'sum' and 'mean'.
df.pivot_table(index = ['Manager', 'Rep', 'Product'],
               values = ['Price', 'Quantity'],
               aggfunc = ['sum', 'mean'],
               fill_value = 0,
               margins = True)   # margins=True adds overall totals (labelled 'All' by default)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sum,sum,mean,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Price,Quantity,Price,Quantity
Manager,Rep,Product,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Debra Henley,Craig Booker,CPU,65000,2,32500.0,1.0
Debra Henley,Craig Booker,Maintenance,5000,2,5000.0,2.0
Debra Henley,Craig Booker,Software,10000,1,10000.0,1.0
Debra Henley,Daniel Hilton,CPU,105000,4,52500.0,2.0
Debra Henley,Daniel Hilton,Software,10000,1,10000.0,1.0
Debra Henley,John Smith,CPU,35000,1,35000.0,1.0
Debra Henley,John Smith,Maintenance,5000,2,5000.0,2.0
Fred Anderson,Cedric Moss,CPU,95000,3,47500.0,1.5
Fred Anderson,Cedric Moss,Maintenance,5000,1,5000.0,1.0
Fred Anderson,Cedric Moss,Software,10000,1,10000.0,1.0


#### pip 명령
- python의 공식 모듈 관리자
- pip list
- pip install module_name
- pip uninstall module_name

In [72]:
#!pip list

#### conda 명령
- conda list
- conda install module_name
- conda uninstall module_name
- conda install -c channel_name module_name

## Google Maps API 설치

In [73]:
# <API key removed> - never paste a real key into the notebook; keep it outside the code (e.g. in an environment variable)

#### windows
- conda install -c conda-forge googlemaps

#### mac(m1)
- pip install googlemaps

In [74]:
import googlemaps

In [75]:
import os

# Read the Google Maps API key from the environment so the secret is never
# committed with the notebook. Set GOOGLE_MAPS_API_KEY before running this cell;
# a missing variable raises KeyError here instead of failing later inside the client.
# NOTE: any key that was ever hard-coded in this file should be treated as exposed and revoked.
gmaps_key = os.environ['GOOGLE_MAPS_API_KEY']
gmaps = googlemaps.Client(key = gmaps_key)

In [76]:
gmaps.geocode('서울영등포경찰서', language ='ko')

[{'address_components': [{'long_name': '608',
    'short_name': '608',
    'types': ['premise']},
   {'long_name': '국회대로',
    'short_name': '국회대로',
    'types': ['political', 'sublocality', 'sublocality_level_4']},
   {'long_name': '영등포구',
    'short_name': '영등포구',
    'types': ['political', 'sublocality', 'sublocality_level_1']},
   {'long_name': '서울특별시',
    'short_name': '서울특별시',
    'types': ['administrative_area_level_1', 'political']},
   {'long_name': '대한민국',
    'short_name': 'KR',
    'types': ['country', 'political']},
   {'long_name': '150-043',
    'short_name': '150-043',
    'types': ['postal_code']}],
  'formatted_address': '대한민국 서울특별시 영등포구 국회대로 608',
  'geometry': {'location': {'lat': 37.5260441, 'lng': 126.9008091},
   'location_type': 'ROOFTOP',
   'viewport': {'northeast': {'lat': 37.5273930802915,
     'lng': 126.9021580802915},
    'southwest': {'lat': 37.5246951197085, 'lng': 126.8994601197085}}},
  'partial_match': True,
  'place_id': 'ChIJ1TimJLaffDURptXOs0Tj6s

---

## Python 반복문

In [77]:
# Simple for-loop example: print each item of the list in order
for n in [1, 2, 3, 4]:
    print(n)

1
2
3
4


In [78]:
# Slightly more involved for-loop example: print the square of each integer from 0 to 9
for n in range(10):
    print(n**2)

0
1
4
9
16
25
36
49
64
81


## 위 코드를 한줄로 : list comprehension

In [79]:
[n**2 for n in range(10)]

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

## pandas에 잘 맞춰진 반복문용 명령 iterrows()
- Pandas 데이터 프레임은 대부분 2차원
- 이럴 때 일반 for문으로 인덱스를 n번씩 지정해 가며 반복하면 가독성이 떨어짐
- Pandas 데이터 프레임으로 반복문을 만들 때 iterrows() 메서드를 사용하면 편함
- 받을때, 인덱스와 내용으로 나누어 받는 것만 주의