# Pandas 학습 

1. 데이터 분석을 위한 모듈
2. excel과 가장 큰 차이점 : Pandas는 대용량 데이터 처리가 가능
3. 데이터 분석 및 데이터 가공에 절대적으로 사용되는 library
4. 주요 학습 내용
> 1. DataFrame - excel의 다수의 컬럼들을 보유한 table과 동일하다 간주
> 2. Series - DataFrame을 구성하는 column으로 간주

**참고**

주피터 노트북 익스텐션을 활용하여 생산성 높이기
>pip install jupyter_contrib_nbextensions && jupyter contrib nbextension install

사용할 extension
>pep8 auto check<br>
>markdown contents

In [None]:
# !pip install jupyter_contrib_nbextensions && jupyter contrib nbextension install

## 기초 익히기
> 이미 존재하는 파일의 내용으로 DataFrame 생성하기 <br>
> 중복 데이터 제거 <br>
> 결측치 처리

## DataFrame  & Series 구조 
> https://pandas.pydata.org/docs/getting_started/index.html <br>
    


In [15]:
# 전처리를 위한 library import

import pandas as pd
import numpy as np

In [5]:
!pip show pandas
# requires & required : 의존관계

Name: pandas
Version: 1.1.3
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: None
Author-email: None
License: BSD
Location: c:\users\park1\anaconda3\lib\site-packages
Requires: numpy, python-dateutil, pytz
Required-by: statsmodels, seaborn


기초 익히기

In [8]:
# Sample frame built row by row; the column labels are supplied separately
df = pd.DataFrame(
    [
        ("Braund, Mr. Owen Harris", 22, "male"),
        ("Allen, Mr. William Henry", 35, "male"),
        ("Bonnell, Miss. Elizabeth", 58, "female"),
    ],
    columns=["Name", "Age", "Sex"],
)

In [11]:
print(df)

                       Name  Age     Sex
0   Braund, Mr. Owen Harris   22    male
1  Allen, Mr. William Henry   35    male
2  Bonnell, Miss. Elizabeth   58  female


In [6]:
# 시리즈 - 정수값으로 생성시 자동으로 int64 (64비트)
# 시리즈 객체

s = pd.Series([1, 2, 3]) 
s

0    1
1    2
2    3
dtype: int64

In [12]:
s2 = pd.Series([4, 5, 6])
print(s2)

0    4
1    5
2    6
dtype: int64


In [7]:
type(s)

pandas.core.series.Series

In [13]:
s.values

array([1, 2, 3], dtype=int64)

In [14]:
s.index

RangeIndex(start=0, stop=3, step=1)

In [None]:

# 결측치 : 데이터값 없음 NaN(Not a Number)
# NaN   : js에서 숫자가 아님을 의미

# python에서는 결측치로 사용

# numpy api 결측치 표현

# 데이터 관리를 위해서는 결측치를 임의로 생성해야 하는 경우 발생
# np.nan

# 결측치를 어떻게 처리할 것인가? (평균값? 중앙값? 최빈값?)
# fillna()

In [25]:
s3 = pd.Series([1, np.nan, 3, 4, 5, np.nan])

In [26]:
print(s3.count()) # NaN값은 uncounting

4


In [30]:
s3.fillna(0)

0    1.0
1    0.0
2    3.0
3    4.0
4    5.0
5    0.0
dtype: float64

In [37]:
s4 = pd.Series([1, np.nan, 3, 4, 5, np.nan])

In [41]:
print(s4.dropna()) # 원본데이터 s4는 보존
print()
print(s4)

0    1.0
2    3.0
3    4.0
4    5.0
dtype: float64

0    1.0
1    NaN
2    3.0
3    4.0
4    5.0
5    NaN
dtype: float64


In [42]:
print(s4.fillna(10))
print()
print(s4)

0     1.0
1    10.0
2     3.0
3     4.0
4     5.0
5    10.0
dtype: float64

0    1.0
1    NaN
2    3.0
3    4.0
4    5.0
5    NaN
dtype: float64


## 날짜와 문자열 데이터 다루기 


- Python의 datetime모듈

>date : 연, 월, 일

>time : 시간, 분, 초, 마이크로초(백만분의 1초)

>datetime : date와 time요소

- Pandas의 Timestamp

>날짜와 시간 모두 포함 : 나노초(10억분의 1초) 단위의 정밀도


- timedelta

>날짜의 덧셈과 뺄셈에 유용



### 특정 날짜를 기준으로 DataFrame생성해 보기 

In [43]:
# Six consecutive daily dates; the first entry is the start date itself
datas = pd.date_range(start='20210628', periods=6, freq='D')
datas

DatetimeIndex(['2021-06-28', '2021-06-29', '2021-06-30', '2021-07-01',
               '2021-07-02', '2021-07-03'],
              dtype='datetime64[ns]', freq='D')

### 날짜를 index로 적용해서 DataFrame객체 생성

In [46]:
type(datas)

pandas.core.indexes.datetimes.DatetimeIndex

In [134]:
# 6 x 4 (6 rows, 4 columns)
               # rows, columns
np.random.randn(6, 4) # randn = floats sampled from the standard normal distribution

array([[ 1.23833879,  1.93455409, -0.9998881 ,  0.35918172],
       [-0.44706921, -2.01094583, -0.43823077,  0.00488347],
       [ 1.84381573,  0.84786162,  0.21303756, -1.10503748],
       [ 1.21747857, -0.64170849,  0.48803055,  0.20799814],
       [-0.25107437,  0.93268116,  0.15193479,  1.79629167],
       [-0.09052941, -0.22839015, -1.46348236,  0.24331608]])

In [136]:
np.random.randint(-5, 12, 5) # randint = 5 random integers, from -5 (inclusive) up to 12 (exclusive)

array([ 4, 10, 11,  9,  8])

In [146]:
# 날짜를 index로 적용해서 DataFrame객체 생성
# 6 * 4 구조로 날짜를 보유한 datas 변수를 index로 구성된 DataFrame 객체 생성하기
# Pandas로 데이터 처리시에 Numpy모듈이 필수적

df = pd.DataFrame(np.random.randn(6, 4), index=datas)
df # print(df) : table 형태로 출력안됨

Unnamed: 0,0,1,2,3
2021-06-28,0.717291,0.298282,0.478086,0.525063
2021-06-29,1.575366,0.885103,-0.922097,0.142877
2021-06-30,0.85612,0.289146,-0.341448,2.529856
2021-07-01,1.364346,0.468959,1.406875,0.167324
2021-07-02,0.039261,-1.180428,-0.27665,0.950726
2021-07-03,0.646354,-0.367488,-0.72449,-0.843805


In [154]:
# column name change

df = pd.DataFrame(np.random.randn(6, 4), index=datas,
                  columns=['R1', 'R2', 'R3', 'R4'])
df

Unnamed: 0,R1,R2,R3,R4
2021-06-28,-0.422388,-1.599009,1.148013,0.040915
2021-06-29,0.070151,-1.161283,0.200713,0.578007
2021-06-30,-0.851603,-1.269586,-0.455162,0.828408
2021-07-01,0.535274,-0.156388,-0.561658,-1.138827
2021-07-02,-0.319558,-0.913292,1.918423,0.168871
2021-07-03,-0.781402,-0.2237,-1.711524,0.950062


In [152]:
df.values

array([[ 0.91191503,  0.06245333, -0.06043434,  0.93740776],
       [ 0.17945243,  0.27918861, -0.36241341, -0.16565541],
       [ 1.36412358, -0.7022721 ,  0.14647554,  0.01725833],
       [-0.2816669 ,  2.16679358,  0.1492278 , -0.79458554],
       [-0.34898274,  0.28362812, -0.06966554, -0.68213486],
       [-0.48049552, -1.51477378,  0.07281452, -2.49955261]])

In [153]:
df.index

DatetimeIndex(['2021-06-28', '2021-06-29', '2021-06-30', '2021-07-01',
               '2021-07-02', '2021-07-03'],
              dtype='datetime64[ns]', freq='D')

In [161]:
df.columns

Index(['R1', 'R2', 'R3', 'R4'], dtype='object')

In [156]:
# ML / DL Model 만드는 과정에서 *.info를 한눈에 볼 줄 알아야 함

df.info() 

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2021-06-28 to 2021-07-03
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   R1      6 non-null      float64
 1   R2      6 non-null      float64
 2   R3      6 non-null      float64
 3   R4      6 non-null      float64
dtypes: float64(4)
memory usage: 400.0 bytes


In [159]:
df.describe()   # 이상데이터 관측 가능

Unnamed: 0,R1,R2,R3,R4
count,6.0,6.0,6.0,6.0
mean,-0.294921,-0.88721,0.089801,0.237906
std,0.526482,0.583612,1.299394,0.762954
min,-0.851603,-1.599009,-1.711524,-1.138827
25%,-0.691649,-1.24251,-0.535034,0.072904
50%,-0.370973,-1.037287,-0.127224,0.373439
75%,-0.027277,-0.396098,0.911188,0.765808
max,0.535274,-0.156388,1.918423,0.950062


In [170]:
df.sort_values(by='R1', ascending=True) # True : 오름차순

Unnamed: 0,R1,R2,R3,R4
2021-07-03,-0.781402,-0.2237,-1.711524,0.950062
2021-06-30,-0.851603,-1.269586,-0.455162,0.828408
2021-06-29,0.070151,-1.161283,0.200713,0.578007
2021-07-02,-0.319558,-0.913292,1.918423,0.168871
2021-06-28,-0.422388,-1.599009,1.148013,0.040915
2021-07-01,0.535274,-0.156388,-0.561658,-1.138827


In [171]:
df.sort_values(by='R4', ascending=False) # False : 내림차순

Unnamed: 0,R1,R2,R3,R4
2021-07-03,-0.781402,-0.2237,-1.711524,0.950062
2021-06-30,-0.851603,-1.269586,-0.455162,0.828408
2021-06-29,0.070151,-1.161283,0.200713,0.578007
2021-07-02,-0.319558,-0.913292,1.918423,0.168871
2021-06-28,-0.422388,-1.599009,1.148013,0.040915
2021-07-01,0.535274,-0.156388,-0.561658,-1.138827


In [186]:
df['R1']

2021-06-28   -0.422388
2021-06-29    0.070151
2021-06-30   -0.851603
2021-07-01    0.535274
2021-07-02   -0.319558
2021-07-03   -0.781402
Freq: D, Name: R1, dtype: float64

In [188]:
df.R1

2021-06-28   -0.422388
2021-06-29    0.070151
2021-06-30   -0.851603
2021-07-01    0.535274
2021-07-02   -0.319558
2021-07-03   -0.781402
Freq: D, Name: R1, dtype: float64

In [180]:
# 다중 column을 출력하려면 list구조로 적용

df[['R1', 'R2']]

Unnamed: 0,R1,R2
2021-06-28,-0.422388,-1.599009
2021-06-29,0.070151,-1.161283
2021-06-30,-0.851603,-1.269586
2021-07-01,0.535274,-0.156388
2021-07-02,-0.319558,-0.913292
2021-07-03,-0.781402,-0.2237


In [190]:
# index로 데이터 검색
print(df.index)

DatetimeIndex(['2021-06-28', '2021-06-29', '2021-06-30', '2021-07-01',
               '2021-07-02', '2021-07-03'],
              dtype='datetime64[ns]', freq='D')


In [191]:
df['2021-06-28' : '2021-06-30'] # data slicing

Unnamed: 0,R1,R2,R3,R4
2021-06-28,-0.422388,-1.599009,1.148013,0.040915
2021-06-29,0.070151,-1.161283,0.200713,0.578007
2021-06-30,-0.851603,-1.269586,-0.455162,0.828408


In [195]:
df[0:3]

Unnamed: 0,R1,R2,R3,R4
2021-06-28,-0.422388,-1.599009,1.148013,0.040915
2021-06-29,0.070151,-1.161283,0.200713,0.578007
2021-06-30,-0.851603,-1.269586,-0.455162,0.828408


In [206]:
# *.iloc[index, column]

df.iloc[0:2, 0:3]

Unnamed: 0,R1,R2,R3
2021-06-28,-0.422388,-1.599009,1.148013
2021-06-29,0.070151,-1.161283,0.200713


### file로 부터 데이터 read해서 date 타입 이해하기

In [210]:
# 01.date_data.csv
# csv file내용을 read하면서 DataFrame 객체로 변환
# DataFrame 객체의 장점은 여러 함수를 통해 데이터 조작 및 정제 가능


df = pd.read_csv('dataset/01.date_data.csv')

In [212]:
# 자동으로 DataFrame 객체로 변환됨
# csv파일 제공 => raw data는 보존하고 raw data 가공
# 문제점 "at", "@"

df

Unnamed: 0,Name,Birth,email
0,이순신,2021-01-01 9:10,happy@gmail.com
1,홍길동,2021-01-08 9:20,1004@NAVER.COM
2,유관순,2021-02-01 10:20,Iron at yahoo.co.kr
3,이이,2021-02-02 11:40,\tlee@gmail.com
4,김구,2021-02-28 15:10,kim@daum.net\t
5,윤봉길,2021-04-10 19:20,yeon@daum.ac.kr
6,강감찬,2021-06-30 21:20,kkc@gmail.com
7,신사임당,2021-07-20 23:30,monther@NAVER.COM
8,을지문덕,2021-08-28 11:48,ygmd@daum.net
9,유재석,2021-09-01 3:12,yjs at gmail.com


In [218]:
# format for the Birth values = '%Y-%m-%d %H:%M' (e.g. '2021-01-01 9:10', no seconds field)
df.columns

Index(['Name', 'Birth', 'email'], dtype='object')

In [220]:
df.values

array([['이순신', '2021-01-01 9:10', 'happy@gmail.com '],
       ['홍길동', '2021-01-08 9:20', '  1004@NAVER.COM'],
       ['유관순', '2021-02-01 10:20', ' Iron at yahoo.co.kr '],
       ['이이', '2021-02-02 11:40', '\tlee@gmail.com'],
       ['김구', '2021-02-28 15:10', 'kim@daum.net\t'],
       ['윤봉길', '2021-04-10 19:20', 'yeon@daum.ac.kr'],
       ['강감찬', '2021-06-30 21:20', 'kkc@gmail.com'],
       ['신사임당', '2021-07-20 23:30', 'monther@NAVER.COM '],
       ['을지문덕', '2021-08-28 11:48', '    ygmd@daum.net'],
       ['유재석', '2021-09-01 3:12', 'yjs at gmail.com']], dtype=object)

In [222]:
df.index

RangeIndex(start=0, stop=10, step=1)

In [234]:
# Convert the Birth strings (e.g. '2021-01-01 9:10') to datetime64.
# The values have a four-digit year and no seconds, so the format is
# '%Y-%m-%d %H:%M'; '%y' (two-digit year) and a trailing ':%S' do not match them.
df['Birth'] = pd.to_datetime(df['Birth'], format='%Y-%m-%d %H:%M')

In [235]:
df['Birth'] - pd.to_datetime('2019-01-01') # pd.to_datetime('YYYY-MM-DD') => 문자열을 날짜 타입으로 변환

0   731 days 09:10:00
1   738 days 09:20:00
2   762 days 10:20:00
3   763 days 11:40:00
4   789 days 15:10:00
5   830 days 19:20:00
6   911 days 21:20:00
7   931 days 23:30:00
8   970 days 11:48:00
9   974 days 03:12:00
Name: Birth, dtype: timedelta64[ns]

In [209]:
!dir

 F 드라이브의 볼륨: 새 볼륨
 볼륨 일련 번호: 4A75-ADFD

 F:\BACKUP\backup\2021playdata\202105_lab\07.pythonlib 디렉터리

2021-06-28  오후 12:41    <DIR>          .
2021-06-28  오후 12:41    <DIR>          ..
2021-06-28  오후 12:31    <DIR>          .ipynb_checkpoints
2021-06-28  오전 09:58               532 0628_학습내용.txt
2021-06-28  오전 10:13    <DIR>          dataset
2021-06-28  오전 10:47               543 pandas_study.txt
2021-06-28  오후 12:41            61,226 step01_PandasBasic_s.ipynb
2021-06-28  오후 12:31            26,947 step02_PandasReview_s.ipynb
               4개 파일              89,248 바이트
               4개 디렉터리  50,296,020,992 바이트 남음


###  경과일 계산

- timedelta64[??] : 날짜 덧셈과 뺄셈에 효과적
  - Y : 연 
  - M : 월 
  - D : 일 
  - m : 분 
  - s : 초

In [237]:
eday = df['Birth'] - pd.to_datetime('2019-01-01')
eday

0   731 days 09:10:00
1   738 days 09:20:00
2   762 days 10:20:00
3   763 days 11:40:00
4   789 days 15:10:00
5   830 days 19:20:00
6   911 days 21:20:00
7   931 days 23:30:00
8   970 days 11:48:00
9   974 days 03:12:00
Name: Birth, dtype: timedelta64[ns]

In [248]:
# Whole elapsed days as floats.
# Series.dt.days reads the day component of each timedelta directly; the
# astype('timedelta64[D]') cast is rejected by pandas 2.x, which only supports
# the 's', 'ms', 'us' and 'ns' timedelta resolutions.
eday.dt.days.astype('float64')

0    731.0
1    738.0
2    762.0
3    763.0
4    789.0
5    830.0
6    911.0
7    931.0
8    970.0
9    974.0
Name: Birth, dtype: float64

In [255]:
# astype(int) and astype('int') produce the same result.
# The day count comes from .dt.days rather than an astype('timedelta64[D]')
# cast, which pandas 2.x no longer accepts.
eday.dt.days.astype('int')

0    731
1    738
2    762
3    763
4    789
5    830
6    911
7    931
8    970
9    974
Name: Birth, dtype: int32

## 인덱서(indexer)

### iloc
- 데이터 프레임의 부분집합 선택을 위한 도구
- iloc는 정수로만 선택
- 여러 개의 정수 위치를 선택하려면 리스트 전달
- 동일한 간격으로 선택하려면 슬라이스 표기

In [256]:
df = pd.read_csv('dataset/01.date_data.csv')
df

Unnamed: 0,Name,Birth,email
0,이순신,2021-01-01 9:10,happy@gmail.com
1,홍길동,2021-01-08 9:20,1004@NAVER.COM
2,유관순,2021-02-01 10:20,Iron at yahoo.co.kr
3,이이,2021-02-02 11:40,\tlee@gmail.com
4,김구,2021-02-28 15:10,kim@daum.net\t
5,윤봉길,2021-04-10 19:20,yeon@daum.ac.kr
6,강감찬,2021-06-30 21:20,kkc@gmail.com
7,신사임당,2021-07-20 23:30,monther@NAVER.COM
8,을지문덕,2021-08-28 11:48,ygmd@daum.net
9,유재석,2021-09-01 3:12,yjs at gmail.com


In [264]:
df.iloc[2] # index

Name                       유관순
Birth         2021-02-01 10:20
email     Iron at yahoo.co.kr 
Name: 2, dtype: object

In [268]:
df.iloc[0, 0]

'이순신'

In [276]:
df.iloc[1:3, 0:]

Unnamed: 0,Name,Birth,email
1,홍길동,2021-01-08 9:20,1004@NAVER.COM
2,유관순,2021-02-01 10:20,Iron at yahoo.co.kr


In [286]:
df.iloc[4, 2]

'kim@daum.net\t'

In [290]:
df.iloc[4:5, :]

Unnamed: 0,Name,Birth,email
4,김구,2021-02-28 15:10,kim@daum.net\t


### loc 속성

- 데이터들을 slicing 하는 기술
- loc[index, columns]
- loc는 레이블로만 선택
- 여러 개의 레이블을 선택하려면 리스트 전달
- 동일한 간격으로 선택하려면 슬라이스 표기 사용
- 마지막 레이블까지 포함

In [331]:
df = pd.read_csv('dataset/01.date_data.csv')
df

Unnamed: 0,Name,Birth,email
0,이순신,2021-01-01 9:10,happy@gmail.com
1,홍길동,2021-01-08 9:20,1004@NAVER.COM
2,유관순,2021-02-01 10:20,Iron at yahoo.co.kr
3,이이,2021-02-02 11:40,\tlee@gmail.com
4,김구,2021-02-28 15:10,kim@daum.net\t
5,윤봉길,2021-04-10 19:20,yeon@daum.ac.kr
6,강감찬,2021-06-30 21:20,kkc@gmail.com
7,신사임당,2021-07-20 23:30,monther@NAVER.COM
8,을지문덕,2021-08-28 11:48,ygmd@daum.net
9,유재석,2021-09-01 3:12,yjs at gmail.com


In [294]:
df.loc[:, ['Name', 'email']]

Unnamed: 0,Name,email
0,이순신,happy@gmail.com
1,홍길동,1004@NAVER.COM
2,유관순,Iron at yahoo.co.kr
3,이이,\tlee@gmail.com
4,김구,kim@daum.net\t
5,윤봉길,yeon@daum.ac.kr
6,강감찬,kkc@gmail.com
7,신사임당,monther@NAVER.COM
8,을지문덕,ygmd@daum.net
9,유재석,yjs at gmail.com


In [301]:
# 데이터 수정

print(df.iloc[0, 0])
print(df.loc[0, 'Name'])

이순신
이순신


In [303]:
df.loc[0, 'Name'] = '김순신'
df.head()

Unnamed: 0,Name,Birth,email
0,김순신,2021-01-01 9:10,happy@gmail.com
1,홍길동,2021-01-08 9:20,1004@NAVER.COM
2,유관순,2021-02-01 10:20,Iron at yahoo.co.kr
3,이이,2021-02-02 11:40,\tlee@gmail.com
4,김구,2021-02-28 15:10,kim@daum.net\t


In [338]:
# 새로운 series == column 추가

df['age'] = [1, 2, 3, 4, 5, 6, 7, 8, np.nan, np.nan]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    10 non-null     object 
 1   Birth   10 non-null     object 
 2   email   10 non-null     object 
 3   age     8 non-null      float64
dtypes: float64(1), object(3)
memory usage: 448.0+ bytes


In [339]:
df

Unnamed: 0,Name,Birth,email,age
0,이순신,2021-01-01 9:10,happy@gmail.com,1.0
1,홍길동,2021-01-08 9:20,1004@NAVER.COM,2.0
2,유관순,2021-02-01 10:20,Iron at yahoo.co.kr,3.0
3,이이,2021-02-02 11:40,\tlee@gmail.com,4.0
4,김구,2021-02-28 15:10,kim@daum.net\t,5.0
5,윤봉길,2021-04-10 19:20,yeon@daum.ac.kr,6.0
6,강감찬,2021-06-30 21:20,kkc@gmail.com,7.0
7,신사임당,2021-07-20 23:30,monther@NAVER.COM,8.0
8,을지문덕,2021-08-28 11:48,ygmd@daum.net,
9,유재석,2021-09-01 3:12,yjs at gmail.com,


In [341]:
del df['Name']
df

Unnamed: 0,Birth,email,age
0,2021-01-01 9:10,happy@gmail.com,1.0
1,2021-01-08 9:20,1004@NAVER.COM,2.0
2,2021-02-01 10:20,Iron at yahoo.co.kr,3.0
3,2021-02-02 11:40,\tlee@gmail.com,4.0
4,2021-02-28 15:10,kim@daum.net\t,5.0
5,2021-04-10 19:20,yeon@daum.ac.kr,6.0
6,2021-06-30 21:20,kkc@gmail.com,7.0
7,2021-07-20 23:30,monther@NAVER.COM,8.0
8,2021-08-28 11:48,ygmd@daum.net,
9,2021-09-01 3:12,yjs at gmail.com,


### 이미 존재하는 파일의 내용을 기반으로 DataFrame 생성하기

In [342]:
df = pd.read_csv('dataset/02.friends.csv')
df

Unnamed: 0,이름,나이,직업,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk


In [343]:
type(df)

pandas.core.frame.DataFrame

In [353]:
# problem : tsv(tab separated value)

df = pd.read_csv('dataset/03.friendsTab.txt')
df

Unnamed: 0,이름\t나이\t직업\thobby
0,신동엽\t20\t연예인\tmusic
1,유재석\t41\t교수\tart
2,김새롬\t18\t학생\tstudy
3,이영자\t45\t상담사\ttalk
4,강호동\t38\t연예인\ttalk


In [354]:
# problem : tsv(tab separated value) delimiter='\t' 문법상에서 구분자를 명시적으로 표현해 주어야 함

df = pd.read_csv('dataset/03.friendsTab.txt', delimiter='\t')
df

Unnamed: 0,이름,나이,직업,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk


In [355]:
# problem : column이 없음

df = pd.read_csv('dataset/04.friendsTabNoHead.txt', delimiter='\t')
df

Unnamed: 0,신동엽,20,연예인,music
0,유재석,41,교수,art
1,김새롬,18,학생,study
2,이영자,45,상담사,talk
3,강호동,38,연예인,talk


In [356]:
df.columns

Index(['신동엽', '20', '연예인', 'music'], dtype='object')

In [357]:
# header 

df = pd.read_csv('dataset/04.friendsTabNoHead.txt', delimiter='\t', header=None)
df

Unnamed: 0,0,1,2,3
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk


In [359]:
# header name 추가

df = pd.read_csv('dataset/04.friendsTabNoHead.txt', delimiter='\t', names=['이름', '나이', '직업', '취미'])
df

Unnamed: 0,이름,나이,직업,취미
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk


In [367]:
# Add a salary column; the following cell sets it to 'no' for students and 'yes' otherwise.
# The frame loaded above has no 'sal' column, so `del df['sal']` raises KeyError
# on a fresh run. errors='ignore' drops the column only when an earlier run left one behind.
df = df.drop(columns='sal', errors='ignore')

df['급여'] = 0
df

Unnamed: 0,이름,나이,직업,취미,급여
0,신동엽,20,연예인,music,0
1,유재석,41,교수,art,0
2,김새롬,18,학생,study,0
3,이영자,45,상담사,talk,0
4,강호동,38,연예인,talk,0


In [377]:
# 3항 연산자
# np.where(bool, 'True', 'False')

df['급여'] = np.where(df['직업'] == '학생', 'no', 'yes')

df

Unnamed: 0,이름,나이,직업,취미,급여
0,신동엽,20,연예인,music,yes
1,유재석,41,교수,art,yes
2,김새롬,18,학생,study,no
3,이영자,45,상담사,talk,yes
4,강호동,38,연예인,talk,yes


### python의 데이터로 DataFrame 객체로 변환

- dict의 key는 DataFrame의 column 이름으로, 각 value는 해당 column(Series)의 값으로 자동 변환



In [384]:
# Five friend records as a list of dicts, assembled from (name, age, job, hobby) tuples
friend_dict_list = [
    dict(zip(('name', 'age', 'job', 'hobby'), record))
    for record in (
        ('신동엽', 20, '연예인', 'music'),
        ('유재석', 41, '교수', 'art'),
        ('김새롬', 18, '학생', 'study'),
        ('이영자', 45, '상담사', 'talk'),
        ('강호동', 38, '연예인', 'talk'),
    )
]
type(friend_dict_list)

list

In [388]:
df = pd.DataFrame(friend_dict_list)

df

Unnamed: 0,name,age,job,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk


In [389]:
type(df)

pandas.core.frame.DataFrame

In [392]:
age = df.loc[3, 'age']

In [393]:
type(age)

numpy.int64

### 중복 데이터 제거 기술

In [394]:
# Friend records for the duplicate-removal demo: the sixth entry repeats the first exactly
friend_dict_list = [
    dict(zip(('name', 'age', 'job', 'hobby'), record))
    for record in (
        ('신동엽', 20, '연예인', 'music'),
        ('유재석', 41, '교수', 'art'),
        ('김새롬', 18, '학생', 'study'),
        ('이영자', 45, '상담사', 'talk'),
        ('강호동', 38, '연예인', 'talk'),
        ('신동엽', 20, '연예인', 'music'),
    )
]

df = pd.DataFrame(data=friend_dict_list)
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk
5,신동엽,20,연예인,music


In [396]:
df.duplicated()

0    False
1    False
2    False
3    False
4    False
5     True
dtype: bool

In [398]:
df = df.drop_duplicates() # default : df.duplicated() 출력시 True 데이터 제거
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk


In [399]:
# step02 - 속성으로 선별해서 삭제

friend_dict_list = [{'name': '신동엽', 'age': 20, 'job': '연예인', 'hobby':'music'},
                     {'name': '유재석', 'age': 41, 'job': '교수', 'hobby':'art'},
                     {'name': '김새롬', 'age': 18, 'job': '학생', 'hobby':'study'},
                     {'name': '이영자', 'age' : 45, 'job': '상담사', 'hobby' : 'talk'},
                     {'name' :  '강호동', 'age' : 38, 'job' : '연예인', 'hobby' : 'talk'},
                    {'name': '신동엽', 'age': 20, 'job': '연예인', 'hobby':'music'} ]

df = pd.DataFrame(friend_dict_list)
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk
5,신동엽,20,연예인,music


In [400]:
df = df.drop_duplicates(keep='last')
df

Unnamed: 0,name,age,job,hobby
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk
5,신동엽,20,연예인,music


In [405]:
# step03 - 동명이인일 경우에 선별해서 삭제

friend_dict_list = [{'name': '신동엽', 'age': 20, 'job': '연예인', 'hobby':'music'},
                     {'name': '유재석', 'age': 41, 'job': '교수', 'hobby':'art'},
                     {'name': '김새롬', 'age': 18, 'job': '학생', 'hobby':'study'},
                     {'name': '이영자', 'age' : 45, 'job': '상담사', 'hobby' : 'talk'},
                     {'name' :  '강호동', 'age' : 38, 'job' : '연예인', 'hobby' : 'talk'},
                    {'name': '신동엽', 'age': 50, 'job': '연예인', 'hobby':'music'} ]

df = pd.DataFrame(friend_dict_list)
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk
5,신동엽,50,연예인,music


In [408]:
df.drop_duplicates('name', keep='first') # 원본데이터는 유지

Unnamed: 0,name,age,job,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk


In [409]:
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk
5,신동엽,50,연예인,music


In [410]:
# step04 - 원본 DataFrame 자체에 수정 사항 바로 반영하는 속성

friend_dict_list = [{'name': '신동엽', 'age': 20, 'job': '연예인', 'hobby':'music'},
                     {'name': '유재석', 'age': 41, 'job': '교수', 'hobby':'art'},
                     {'name': '김새롬', 'age': 18, 'job': '학생', 'hobby':'study'},
                     {'name': '이영자', 'age' : 45, 'job': '상담사', 'hobby' : 'talk'},
                     {'name' :  '강호동', 'age' : 38, 'job' : '연예인', 'hobby' : 'talk'},
                    {'name': '신동엽', 'age': 50, 'job': '연예인', 'hobby':'music'} ]

df = pd.DataFrame(friend_dict_list)
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk
5,신동엽,50,연예인,music


In [414]:
df.drop_duplicates('name', keep='last', inplace=True) # 원본데이터 수정 inplace=True

df

Unnamed: 0,name,age,job,hobby
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk
5,신동엽,50,연예인,music


### 결측치 처리

In [452]:
# Friend records for the missing-value demo: the last entry has no age (None)
friend_dict_list = [
    dict(zip(('name', 'age', 'job', 'hobby'), record))
    for record in (
        ('신동엽', 20, '연예인', 'music'),
        ('유재석', 41, '교수', 'art'),
        ('김새롬', 18, '학생', 'study'),
        ('이영자', 45, '상담사', 'talk'),
        ('강호동', 38, '연예인', 'talk'),
        ('신동엽', None, '연예인', 'music'),
    )
]
df = pd.DataFrame(data=friend_dict_list)
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20.0,연예인,music
1,유재석,41.0,교수,art
2,김새롬,18.0,학생,study
3,이영자,45.0,상담사,talk
4,강호동,38.0,연예인,talk
5,신동엽,,연예인,music


In [436]:
df.isnull()

Unnamed: 0,name,age,job,hobby
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,True,False,False


In [440]:
df['age'].sum().astype('int')

162

In [456]:
# Replace the missing age with the mean of the known ages.
# Assign the result back to the column: fillna(..., inplace=True) on df['age']
# acts on an intermediate Series and is not guaranteed to write through to df.
df['age'] = df['age'].fillna(df['age'].mean())

In [458]:
df['gender'] = ['m', 'm', 'f', 'f', 'm', 'm', ]

In [459]:
#  1   age     5 non-null      float64
# 모든 사람이 나이가 있기 때문에 결측치일 수 없음
# non data를 어떻게 처리할 것인가, 어떤 값으로 대체할 것인가

df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    6 non-null      object 
 1   age     6 non-null      float64
 2   job     6 non-null      object 
 3   hobby   6 non-null      object 
 4   gender  6 non-null      object 
dtypes: float64(1), object(4)
memory usage: 368.0+ bytes


In [417]:
df.iloc[0, 1]

20.0

In [460]:
df.fillna(df.iloc[0, 1])

Unnamed: 0,name,age,job,hobby,gender
0,신동엽,20.0,연예인,music,m
1,유재석,41.0,교수,art,m
2,김새롬,18.0,학생,study,f
3,이영자,45.0,상담사,talk,f
4,강호동,38.0,연예인,talk,m
5,신동엽,32.4,연예인,music,m


In [429]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    6 non-null      object 
 1   age     6 non-null      float64
 2   job     6 non-null      object 
 3   hobby   6 non-null      object 
dtypes: float64(1), object(3)
memory usage: 320.0+ bytes


### 그룹화 하기

In [556]:
# Replace the missing (None) age with the mean of the known ages

friend_dict_list = [{'name': '신동엽', 'age': 20, 'job': '연예인', 'hobby': 'music'},
                    {'name': '유재석', 'age': 41, 'job': '교수', 'hobby': 'art'},
                    {'name': '김새롬', 'age': 18, 'job': '학생', 'hobby': 'study'},
                    {'name': '이영자', 'age': 45, 'job': '상담사', 'hobby': 'talk'},
                    {'name':  '강호동', 'age': 38, 'job': '연예인', 'hobby': 'talk'},
                    {'name': '신동엽', 'age': None, 'job': '연예인', 'hobby': 'music'}
                    ]

df = pd.DataFrame(friend_dict_list)
# Assign the filled column back; fillna(..., inplace=True) on df['age'] acts on
# an intermediate Series and is not guaranteed to write through to df
df['age'] = df['age'].fillna(df['age'].mean())
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20.0,연예인,music
1,유재석,41.0,교수,art
2,김새롬,18.0,학생,study
3,이영자,45.0,상담사,talk
4,강호동,38.0,연예인,talk
5,신동엽,32.4,연예인,music


<hr>

**미션**

1. hobby 종류 확인
2. hobby 종류의 총수 확인
3. 종류별 개수

In [573]:
# mission 1: distinct hobby values (unique() returns a NumPy array)
hobby_kinds = df['hobby'].unique()
print(hobby_kinds)

print('----------------------------------')

# mission 2: number of distinct hobbies = length of that array
print(len(hobby_kinds))

print('----------------------------------')

# mission 3: number of rows per hobby value
print(df['hobby'].value_counts())

['music' 'art' 'study' 'talk']
----------------------------------
4
----------------------------------
music    2
talk     2
study    1
art      1
Name: hobby, dtype: int64


In [511]:
# 값이 짝수개일 때에는 중앙값이 유일하지 않고 두 개가 될 수도 있다. 
# 짝수일 경우 가장 가운데에 있는 두 수의 평균이 중앙값이 된다.

df['age'] = df.groupby('job')['age'].transform('median')
df

Unnamed: 0,name,age,job,hobby
0,신동엽,35.45,연예인,music
1,유재석,41.0,교수,art
2,김새롬,18.0,학생,study
3,이영자,45.0,상담사,talk
4,강호동,35.45,연예인,talk
5,신동엽,35.45,연예인,music
6,가,35.45,연예인,music
7,나,35.45,연예인,art
8,다,35.45,연예인,music
9,라,35.45,연예인,art


In [512]:
df.groupby('job')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000245DBC6D640>

In [513]:
df.value_counts() # 신동엽은 2개의 데이터를 가짐

name  age    job  hobby
신동엽   35.45  연예인  music    2
이영자   45.00  상담사  talk     1
유재석   41.00  교수   art      1
마     35.45  연예인  music    1
라     35.45  연예인  art      1
다     35.45  연예인  music    1
나     35.45  연예인  art      1
김새롬   18.00  학생   study    1
강호동   35.45  연예인  talk     1
가     35.45  연예인  music    1
dtype: int64

In [514]:
df['job'].value_counts() # 신동엽은 2개의 데이터를 가짐

연예인    8
학생     1
상담사    1
교수     1
Name: job, dtype: int64

In [515]:
df['hobby'].value_counts()

music    5
art      3
talk     2
study    1
Name: hobby, dtype: int64

In [521]:
# NOTE(review): the list needs exactly one entry per row of df; it has 11
# entries — confirm df really holds 11 rows when this cell runs.
df['gender'] = ['m', 'm', 'f', 'f', 'm', 'm', 'm', 'm', 'f', 'f', 'm']
# Sum only the numeric 'age' column per gender. Summing the whole frame makes
# pandas 2.x add up the string columns too, and astype('int') then fails.
df.groupby('gender')[['age']].sum().astype('int')

Unnamed: 0_level_0,age
gender,Unnamed: 1_level_1
f,133
m,218


1. job 종류 확인 : unique()
2. job 별 개수 확인 :  value_counts()

In [517]:
df['job'].unique() # SQL select distinct job from emp;

array(['연예인', '교수', '학생', '상담사'], dtype=object)

In [518]:
df.iloc[5, 1] = None # np.nan

In [519]:
df

Unnamed: 0,name,age,job,hobby
0,신동엽,35.45,연예인,music
1,유재석,41.0,교수,art
2,김새롬,18.0,학생,study
3,이영자,45.0,상담사,talk
4,강호동,35.45,연예인,talk
5,신동엽,,연예인,music
6,가,35.45,연예인,music
7,나,35.45,연예인,art
8,다,35.45,연예인,music
9,라,35.45,연예인,art


In [507]:
df.groupby('job')['age'].median()

job
교수     41.0
상담사    45.0
연예인    29.0
학생     18.0
Name: age, dtype: float64

In [504]:
# 'job' == '연예인'인 사람의 'age'를 *.median() 값으로

df['age'] = df.groupby('job')['age'].transform('median')
df

Unnamed: 0,name,age,job,hobby,gender
0,신동엽,29.0,연예인,music,m
1,유재석,41.0,교수,art,m
2,김새롬,18.0,학생,study,f
3,이영자,45.0,상담사,talk,f
4,강호동,29.0,연예인,talk,m
5,신동엽,29.0,연예인,music,m


In [522]:
# 데이터 제공

friend_dict_list = [{'name': '신동엽', 'age': 20, 'job': '연예인', 'hobby': 'music'},
                    {'name': '유재석', 'age': 41, 'job': '교수', 'hobby': 'art'},
                    {'name': '김새롬', 'age': 18, 'job': '학생', 'hobby': 'study'},
                    {'name': '이영자', 'age': 45, 'job': '상담사', 'hobby': 'talk'},
                    {'name': '강호동', 'age': 38, 'job': '연예인', 'hobby': 'talk'},
                    {'name': '신동엽', 'age': None, 'job': '연예인', 'hobby': 'music'},
                    {'name': '고현정', 'age': 44, 'job': '가수', 'hobby': 'music'},
                    {'name': '박민영', 'age': 22, 'job': '학생', 'hobby': 'art'},
                    {'name': '박서준', 'age': 18, 'job': '학생', 'hobby': 'study'},
                    {'name': '박보검', 'age': 45, 'job': '상담사', 'hobby': 'talk'},
                    {'name': '이효리', 'age': 28, 'job': '교수', 'hobby': 'talk'},
                    {'name': '이상순', 'age': 29, 'job': '주부', 'hobby': 'music'}]

df = pd.DataFrame(friend_dict_list)
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20.0,연예인,music
1,유재석,41.0,교수,art
2,김새롬,18.0,학생,study
3,이영자,45.0,상담사,talk
4,강호동,38.0,연예인,talk
5,신동엽,,연예인,music
6,고현정,44.0,가수,music
7,박민영,22.0,학생,art
8,박서준,18.0,학생,study
9,박보검,45.0,상담사,talk


### 조건 기반 필터링

- Pandas의 비교 및 논리 연산자
- 비교 연산자는 기본 연산자를 사용
- AND : &
- OR : |
- NOT : ~ (tilde)
- 논리 연산에 Python 기본 연산자(and, or, not)를 사용할 경우 객체
전체의 참/거짓을 계산하려 하기 때문에 오류 발생
- 각 표현식은 괄호로 묶는 것을 권장

In [535]:

# One boolean mask per condition
f1 = df['name'].eq('김새롬')
f2 = df['job'].eq('학생')

# Element-wise AND: True only where both conditions hold
f1 & f2

0     False
1     False
2      True
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
dtype: bool

In [536]:
df2 = df[f1 & f2]
df2

Unnamed: 0,name,age,job,hobby
2,김새롬,18.0,학생,study


### 두개의 DataFrame 병합하기 - row 기준

In [537]:
# Two small record lists (name/job pairs) used by the concat examples below
l1 = [
    dict(name=who, job=work)
    for who, work in (('이효리', "교수"), ('이상순', "학생"), ('박보검', "개발자"))
]

l2 = [
    dict(name=who, job=work)
    for who, work in (('신동엽', "치과의사"), ('이영자', "농부"), ('정찬우', "연예인"))
]

In [538]:
df1 = pd.DataFrame(l1, columns = ['name', 'job'])
df2 = pd.DataFrame(l2, columns = ['name', 'job'])

print(df1)
print(type(df1))
print(df1.ndim)
print(df1.shape) # table structure

print('-'*20)

print(df2)

  name  job
0  이효리   교수
1  이상순   학생
2  박보검  개발자
<class 'pandas.core.frame.DataFrame'>
2
(3, 2)
--------------------
  name   job
0  신동엽  치과의사
1  이영자    농부
2  정찬우   연예인


In [546]:
# Multi DataFrame Merge

df_all = pd.concat([df1, df2]) # axis default = 0
df_all

Unnamed: 0,name,job
0,이효리,교수
1,이상순,학생
2,박보검,개발자
0,신동엽,치과의사
1,이영자,농부
2,정찬우,연예인


In [549]:
# Multi DataFrame Merge

df_all = pd.concat([df1, df2], axis=0 )
df_all

Unnamed: 0,name,job
0,이효리,교수
1,이상순,학생
2,박보검,개발자
0,신동엽,치과의사
1,이영자,농부
2,정찬우,연예인


In [553]:
# Multi DataFrame Merge

df_all = pd.concat([df1, df2], axis=1)
df_all

Unnamed: 0,name,job,name.1,job.1
0,이효리,교수,신동엽,치과의사
1,이상순,학생,이영자,농부
2,박보검,개발자,정찬우,연예인


In [555]:
# Multi DataFrame Merge
# Existing index ignore => New indexing

df_all = pd.concat([df1, df2], axis=0, ignore_index=True)
df_all

Unnamed: 0,name,job
0,이효리,교수
1,이상순,학생
2,박보검,개발자
3,신동엽,치과의사
4,이영자,농부
5,정찬우,연예인
