### Pandas
- Series and Dataframe
- 분석을 위한 전처리

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action="ignore")

print(np.__version__)
print(pd.__version__)

1.19.2
1.1.3


### Series 클래스
- 넘파이 1차원 배열과 비슷
- series = index + value

In [2]:
# numpy vector
ary = np.array([1,2,3,4,'ruby'], dtype=np.object)
print(ary)
print(ary.dtype)

[1 2 3 4 'ruby']
object


In [3]:
# pandas series
ary = pd.Series([1,2,3,4,'ruby'], dtype=np.object)
print(ary)
print(ary.values)
print(type(ary.values))
print(ary.index)
print(type(ary.index))

0       1
1       2
2       3
3       4
4    ruby
dtype: object
[1 2 3 4 'ruby']
<class 'numpy.ndarray'>
RangeIndex(start=0, stop=5, step=1)
<class 'pandas.core.indexes.range.RangeIndex'>


In [4]:
def seriesInfo(ary) :
    print('index + value : \n', ary)
    print('value : ', ary.values)
    print('value type : ', type(ary.values))
    print('index : ', ary.index)
    print('index type : ', type(ary.index))

- 인덱스의 라벨은 정수, 문자, 날짜, 시간으로 변경 가능

In [5]:
ary = pd.Series([1,2,3,4,5], dtype=np.int32, index=['강남','서초','방배','동작','사당'])

In [6]:
seriesInfo(ary)

index + value : 
 강남    1
서초    2
방배    3
동작    4
사당    5
dtype: int32
value :  [1 2 3 4 5]
value type :  <class 'numpy.ndarray'>
index :  Index(['강남', '서초', '방배', '동작', '사당'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>


In [7]:
# head추가
ary.index.name='구별'
seriesInfo(ary)

index + value : 
 구별
강남    1
서초    2
방배    3
동작    4
사당    5
dtype: int32
value :  [1 2 3 4 5]
value type :  <class 'numpy.ndarray'>
index :  Index(['강남', '서초', '방배', '동작', '사당'], dtype='object', name='구별')
index type :  <class 'pandas.core.indexes.base.Index'>


In [8]:
# 두개 이상의 인덱스 [[]]
print(ary['서초'])
print(ary[['강남', '방배']])

2
구별
강남    1
방배    3
dtype: int32


In [9]:
# index, values 모두 가져올 때
for idx, value in ary.items():
    print('idx : {}, value : {}'.format(idx, value))

idx : 강남, value : 1
idx : 서초, value : 2
idx : 방배, value : 3
idx : 동작, value : 4
idx : 사당, value : 5


In [10]:
# index만 가져올 때
for idx in ary.keys():
    print('idx : {}'.format(idx))

idx : 강남
idx : 서초
idx : 방배
idx : 동작
idx : 사당


In [11]:
# value만 가져올 때
for value in ary.values:
    print('value : {}'.format(value))

value : 1
value : 2
value : 3
value : 4
value : 5


In [12]:
ary = pd.Series(range(10, 21,2))
seriesInfo(ary)

index + value : 
 0    10
1    12
2    14
3    16
4    18
5    20
dtype: int64
value :  [10 12 14 16 18 20]
value type :  <class 'numpy.ndarray'>
index :  RangeIndex(start=0, stop=6, step=1)
index type :  <class 'pandas.core.indexes.range.RangeIndex'>


In [13]:
# dict형태로 series만들기
ary = pd.Series({'c' : 1, 'b' : 5, 'a' : -8, 'k' : 10}, dtype=np.float64)
seriesInfo(ary)

index + value : 
 c     1.0
b     5.0
a    -8.0
k    10.0
dtype: float64
value :  [ 1.  5. -8. 10.]
value type :  <class 'numpy.ndarray'>
index :  Index(['c', 'b', 'a', 'k'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>


In [14]:
# 연산이 되더라도 index는 변함없음
ary*10

c     10.0
b     50.0
a    -80.0
k    100.0
dtype: float64

- fancy indexing & boolean indexing

In [15]:
print('fancy [0,2] indexing : \n{}'.format(ary[[0,2]]))

fancy [0,2] indexing : 
c    1.0
a   -8.0
dtype: float64


In [16]:
# 2의 배수
print('boolean ary % 2 == 0 :\n{}'.format(ary[ary % 2 == 0]))

boolean ary % 2 == 0 :
a    -8.0
k    10.0
dtype: float64


### 시간, 날짜 불러오기

In [17]:
from datetime import date, datetime, timedelta
from dateutil.parser import parse

In [18]:
strDate = datetime(2021,2,25)
print(strDate)
print(strDate + timedelta(days=1))

2021-02-25 00:00:00
2021-02-26 00:00:00


In [19]:
# 평균이 50이고 편차가 5인 정규분포 데이터를 10일간 만들기
fac01 = pd.Series([int(x) for x in np.random.normal(50, 5, (10))], index=[strDate + timedelta(days=day) for day in range(10)])
seriesInfo(fac01)

index + value : 
 2021-02-25    50
2021-02-26    52
2021-02-27    51
2021-02-28    51
2021-03-01    53
2021-03-02    57
2021-03-03    53
2021-03-04    44
2021-03-05    59
2021-03-06    57
dtype: int64
value :  [50 52 51 51 53 57 53 44 59 57]
value type :  <class 'numpy.ndarray'>
index :  DatetimeIndex(['2021-02-25', '2021-02-26', '2021-02-27', '2021-02-28',
               '2021-03-01', '2021-03-02', '2021-03-03', '2021-03-04',
               '2021-03-05', '2021-03-06'],
              dtype='datetime64[ns]', freq=None)
index type :  <class 'pandas.core.indexes.datetimes.DatetimeIndex'>


In [20]:
# 평균이 70이고 편차가 8인 정규분포 데이터를 10일간 만들기
fac02 = pd.Series([int(x) for x in np.random.normal(70, 8, (10))], index=[strDate + timedelta(days=day) for day in range(10)])
seriesInfo(fac02)

index + value : 
 2021-02-25    58
2021-02-26    58
2021-02-27    71
2021-02-28    76
2021-03-01    77
2021-03-02    75
2021-03-03    63
2021-03-04    69
2021-03-05    68
2021-03-06    68
dtype: int64
value :  [58 58 71 76 77 75 63 69 68 68]
value type :  <class 'numpy.ndarray'>
index :  DatetimeIndex(['2021-02-25', '2021-02-26', '2021-02-27', '2021-02-28',
               '2021-03-01', '2021-03-02', '2021-03-03', '2021-03-04',
               '2021-03-05', '2021-03-06'],
              dtype='datetime64[ns]', freq=None)
index type :  <class 'pandas.core.indexes.datetimes.DatetimeIndex'>


In [21]:
fac01 + fac02

2021-02-25    108
2021-02-26    110
2021-02-27    122
2021-02-28    127
2021-03-01    130
2021-03-02    132
2021-03-03    116
2021-03-04    113
2021-03-05    127
2021-03-06    125
dtype: int64

In [22]:
# casting
print(set(fac01.index))
print(list(fac01.index))

{Timestamp('2021-03-04 00:00:00'), Timestamp('2021-03-03 00:00:00'), Timestamp('2021-02-25 00:00:00'), Timestamp('2021-02-28 00:00:00'), Timestamp('2021-03-02 00:00:00'), Timestamp('2021-02-27 00:00:00'), Timestamp('2021-02-26 00:00:00'), Timestamp('2021-03-01 00:00:00'), Timestamp('2021-03-06 00:00:00'), Timestamp('2021-03-05 00:00:00')}
[Timestamp('2021-02-25 00:00:00'), Timestamp('2021-02-26 00:00:00'), Timestamp('2021-02-27 00:00:00'), Timestamp('2021-02-28 00:00:00'), Timestamp('2021-03-01 00:00:00'), Timestamp('2021-03-02 00:00:00'), Timestamp('2021-03-03 00:00:00'), Timestamp('2021-03-04 00:00:00'), Timestamp('2021-03-05 00:00:00'), Timestamp('2021-03-06 00:00:00')]


In [23]:
for idx in fac01.index :
    print(idx)

2021-02-25 00:00:00
2021-02-26 00:00:00
2021-02-27 00:00:00
2021-02-28 00:00:00
2021-03-01 00:00:00
2021-03-02 00:00:00
2021-03-03 00:00:00
2021-03-04 00:00:00
2021-03-05 00:00:00
2021-03-06 00:00:00


- Series indexing

In [24]:
fac01

2021-02-25    50
2021-02-26    52
2021-02-27    51
2021-02-28    51
2021-03-01    53
2021-03-02    57
2021-03-03    53
2021-03-04    44
2021-03-05    59
2021-03-06    57
dtype: int64

In [25]:
fac01[1]

52

In [26]:
fac01[datetime.strptime('2021-02-25', '%Y-%m-%d')]

50

In [27]:
price_series = pd.Series([4000, 3000, 3500, 2000], index=['a', 'b', 'c', 'd'])
seriesInfo(price_series)

index + value : 
 a    4000
b    3000
c    3500
d    2000
dtype: int64
value :  [4000 3000 3500 2000]
value type :  <class 'numpy.ndarray'>
index :  Index(['a', 'b', 'c', 'd'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>


In [28]:
# value값 바꾸기
price_series['a']=5000
print(seriesInfo(price_series))
price_series[0] = 6000
print(seriesInfo(price_series))

index + value : 
 a    5000
b    3000
c    3500
d    2000
dtype: int64
value :  [5000 3000 3500 2000]
value type :  <class 'numpy.ndarray'>
index :  Index(['a', 'b', 'c', 'd'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>
None
index + value : 
 a    6000
b    3000
c    3500
d    2000
dtype: int64
value :  [6000 3000 3500 2000]
value type :  <class 'numpy.ndarray'>
index :  Index(['a', 'b', 'c', 'd'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>
None


In [29]:
# index, value값 추가
price_series['e'] = 1000
seriesInfo(price_series)

index + value : 
 a    6000
b    3000
c    3500
d    2000
e    1000
dtype: int64
value :  [6000 3000 3500 2000 1000]
value type :  <class 'numpy.ndarray'>
index :  Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>


In [30]:
# index, value값 삭제
del price_series['e']
seriesInfo(price_series)

index + value : 
 a    6000
b    3000
c    3500
d    2000
dtype: int64
value :  [6000 3000 3500 2000]
value type :  <class 'numpy.ndarray'>
index :  Index(['a', 'b', 'c', 'd'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>


In [31]:
# null값 추가
price_series['e'] = np.NaN
seriesInfo(price_series)

index + value : 
 a    6000.0
b    3000.0
c    3500.0
d    2000.0
e       NaN
dtype: float64
value :  [6000. 3000. 3500. 2000.   nan]
value type :  <class 'numpy.ndarray'>
index :  Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>


In [32]:
# null o --> True
pd.isnull(price_series)

a    False
b    False
c    False
d    False
e     True
dtype: bool

In [33]:
ser01 = pd.Series([100, 200, 300, 400], index=['a','b','c','d'])
ser02 = pd.Series([500, 600, 700, 800], index=['a','b','e','d'])

In [34]:
# 연산--> +
ser03 = ser01 + ser02
ser03

a     600.0
b     800.0
c       NaN
d    1200.0
e       NaN
dtype: float64

In [35]:
# 연산--> .add
ser04 = ser01.add(ser02, fill_value=0)
ser04

a     600.0
b     800.0
c     300.0
d    1200.0
e     700.0
dtype: float64

In [36]:
# 결측값
zser = ser03.fillna(0)
print(seriesInfo(zser))
zser = ser03.fillna(ser03.mean())
print(seriesInfo(zser))

index + value : 
 a     600.0
b     800.0
c       0.0
d    1200.0
e       0.0
dtype: float64
value :  [ 600.  800.    0. 1200.    0.]
value type :  <class 'numpy.ndarray'>
index :  Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>
None
index + value : 
 a     600.000000
b     800.000000
c     866.666667
d    1200.000000
e     866.666667
dtype: float64
value :  [ 600.          800.          866.66666667 1200.          866.66666667]
value type :  <class 'numpy.ndarray'>
index :  Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>
None


In [37]:
# subset
print(pd.isnull(ser03))
subset = ser03[pd.isnull(ser03)]
subset

a    False
b    False
c     True
d    False
e     True
dtype: bool


c   NaN
e   NaN
dtype: float64

In [38]:
print(pd.notnull(ser03))
subset = ser03[pd.notnull(ser03)]
subset

a     True
b     True
c    False
d     True
e    False
dtype: bool


a     600.0
b     800.0
d    1200.0
dtype: float64

In [39]:
print(seriesInfo(ser04))
# 배열인덱싱
print(ser04[0:2])
# 라벨인덱싱
print(ser04[['a', 'c']])
print(ser04[['c', 'a']])
print(ser04['a':'c'])
print(ser04[[3,1]])

index + value : 
 a     600.0
b     800.0
c     300.0
d    1200.0
e     700.0
dtype: float64
value :  [ 600.  800.  300. 1200.  700.]
value type :  <class 'numpy.ndarray'>
index :  Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>
None
a    600.0
b    800.0
dtype: float64
a    600.0
c    300.0
dtype: float64
c    300.0
a    600.0
dtype: float64
a    600.0
b    800.0
c    300.0
dtype: float64
d    1200.0
b     800.0
dtype: float64


In [40]:
tuple_ser = pd.Series((10, 20, 30, 40))
tuple_ser

0    10
1    20
2    30
3    40
dtype: int64

In [41]:
# set은 순서가 없기때문에 list로 형변환을 시켜 순서를 넣어줘야 함
set_ser = pd.Series(list({10, 20, 30, 40}))
set_ser

0    40
1    10
2    20
3    30
dtype: int64

### DataFrame

In [42]:
data = {'name' : ['ruby', 'dia', '은영', '녕', 'jslim'], 'birth' : [2000,2001,2002,2003,2004]}
userDF = pd.DataFrame(data)
display(userDF)

Unnamed: 0,name,birth
0,ruby,2000
1,dia,2001
2,은영,2002
3,녕,2003
4,jslim,2004


In [43]:
print(userDF.shape)
print(userDF.size)
print(userDF.ndim)
print(userDF.index)
print(userDF.columns)
print(userDF.values)
print(type(userDF.values))

(5, 2)
10
2
RangeIndex(start=0, stop=5, step=1)
Index(['name', 'birth'], dtype='object')
[['ruby' 2000]
 ['dia' 2001]
 ['은영' 2002]
 ['녕' 2003]
 ['jslim' 2004]]
<class 'numpy.ndarray'>


In [44]:
def frameInfo(df) :
    print('shape : {}'.format(df.shape))
    print('size : {}'.format(df.size))
    print('ndim : {}'.format(df.ndim))
    print('index : {}'.format(df.index))
    print('index type : {}'.format(type(df.index)))
    print('columns : {}'.format(df.columns))
    print('columns type : {}'.format(type(df.columns)))

In [45]:
data = {
    "2021" : [9910293, 8384050, 2938485, 1203948],
    "2018" : [8910293, 7384050, 5938485, 3203948],
    "2016" : [7910293, 5384050, 7938485, 6203948],
    "2014" : [5910293, 3384050, 4938485, 4203948],
    "지역" : ['수도권' , '경상권' , '수도권' , '경상권'],
    "증가율" : [0.2343 , 0.0434 , 0.0944 , 0.0034]
}

In [46]:
columns = ["지역","2014","2016","2018","2021","증가율",]
popDF = pd.DataFrame(data, index=["서울","부산","경기","대구"], columns = columns)
popDF

Unnamed: 0,지역,2014,2016,2018,2021,증가율
서울,수도권,5910293,7910293,8910293,9910293,0.2343
부산,경상권,3384050,5384050,7384050,8384050,0.0434
경기,수도권,4938485,7938485,5938485,2938485,0.0944
대구,경상권,4203948,6203948,3203948,1203948,0.0034


In [47]:
frameInfo(popDF)

shape : (4, 6)
size : 24
ndim : 2
index : Index(['서울', '부산', '경기', '대구'], dtype='object')
index type : <class 'pandas.core.indexes.base.Index'>
columns : Index(['지역', '2014', '2016', '2018', '2021', '증가율'], dtype='object')
columns type : <class 'pandas.core.indexes.base.Index'>


In [48]:
popDF.index

Index(['서울', '부산', '경기', '대구'], dtype='object')

In [49]:
popDF.columns

Index(['지역', '2014', '2016', '2018', '2021', '증가율'], dtype='object')

In [50]:
# index의 name 설정
popDF.index.name="city"
popDF.columns.name = "feature"
display(popDF)
display(popDF.T)

feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343
부산,경상권,3384050,5384050,7384050,8384050,0.0434
경기,수도권,4938485,7938485,5938485,2938485,0.0944
대구,경상권,4203948,6203948,3203948,1203948,0.0034


city,서울,부산,경기,대구
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
지역,수도권,경상권,수도권,경상권
2014,5910293,3384050,4938485,4203948
2016,7910293,5384050,7938485,6203948
2018,8910293,7384050,5938485,3203948
2021,9910293,8384050,2938485,1203948
증가율,0.2343,0.0434,0.0944,0.0034


### 다음 조건을 만족하는 임의의 데이터프레임을 만들어보자
- 열의 갯수와 행의 갯수가 각각 5개 이상이여야 한다.
- 열에는 정수, 문자열, 실수, 날짜 데이터가 각각 1개 이상 포함되어야 한다.

In [51]:
# 정수컬럼
random_int = np.random.randint(1, 100, 10)
# 실수컬럼(표준정규분포)
random_gao = np.random.randn(10)
# 실수 컬럼
random_uni = np.random.rand(10)
# 날짜 컬럼
startDay = datetime(2021,2,25)
years = [startDay + timedelta(day) for day in range(0, 10)]
# 문자 컬럼
random_moon = ["apple", "banana", "mango", "berry", "melon", "cherry", "pineapple", "peach", "blueberry", "avocado"]

In [52]:
data = {
    '정수' : random_int,
    '실수1': random_gao,
    '실수2':random_uni,
    '문자': random_moon,
    '날짜': years,
}

testDF = pd.DataFrame(data)
testDF

Unnamed: 0,정수,실수1,실수2,문자,날짜
0,24,1.543061,0.15228,apple,2021-02-25
1,33,-0.484514,0.658957,banana,2021-02-26
2,17,1.274822,0.407525,mango,2021-02-27
3,16,-0.954644,0.663351,berry,2021-02-28
4,74,0.544376,0.680719,melon,2021-03-01
5,95,2.56351,0.828082,cherry,2021-03-02
6,56,-0.575368,0.428636,pineapple,2021-03-03
7,98,0.759926,0.70837,peach,2021-03-04
8,4,-0.388827,0.337552,blueberry,2021-03-05
9,69,0.46206,0.003603,avocado,2021-03-06


In [53]:
# 새로운 컬럼 추가
popDF['2014-2016 증가율'] = ((popDF['2016'] - popDF['2014']) / popDF['2014'] * 100).round(2)
popDF

feature,지역,2014,2016,2018,2021,증가율,2014-2016 증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343,33.84
부산,경상권,3384050,5384050,7384050,8384050,0.0434,59.1
경기,수도권,4938485,7938485,5938485,2938485,0.0944,60.75
대구,경상권,4203948,6203948,3203948,1203948,0.0034,47.57


In [54]:
# 컬럼 삭제
del popDF['2014-2016 증가율']
popDF

feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343
부산,경상권,3384050,5384050,7384050,8384050,0.0434
경기,수도권,4938485,7938485,5938485,2938485,0.0944
대구,경상권,4203948,6203948,3203948,1203948,0.0034


In [55]:
print(type(popDF[['지역', '증가율']]))
print(type(popDF['지역']))
print(type(popDF['지역'].values))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>


- 행(row) indexing
- 슬라이싱만 가능
- 배열 인덱스, 라벨인덱스 가능

In [56]:
popDF

feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343
부산,경상권,3384050,5384050,7384050,8384050,0.0434
경기,수도권,4938485,7938485,5938485,2938485,0.0944
대구,경상권,4203948,6203948,3203948,1203948,0.0034


In [57]:
# 배열인덱싱
display(popDF[:1])
# 라벨인덱싱
display(popDF[: '서울'])

feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343


feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343


In [58]:
# 배열인덱싱
display(popDF[0:3])
# 라벨인덱싱
display(popDF['서울': '경기'])

feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343
부산,경상권,3384050,5384050,7384050,8384050,0.0434
경기,수도권,4938485,7938485,5938485,2938485,0.0944


feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343
부산,경상권,3384050,5384050,7384050,8384050,0.0434
경기,수도권,4938485,7938485,5938485,2938485,0.0944


- 개별 인덱싱 : 특정 행에 대한 특정 컬럼

In [59]:
print(popDF['2021']['서울'])
print(popDF['2021'][:'서울'])
print(type(popDF['2021']['서울']))
print(type(popDF['2021'][:'서울']))

9910293
city
서울    9910293
Name: 2021, dtype: int64
<class 'numpy.int64'>
<class 'pandas.core.series.Series'>


In [60]:
popDF['2021'][['서울', '대구']]

city
서울    9910293
대구    1203948
Name: 2021, dtype: int64

In [61]:
score_data = {
    'kor'  : [80,90,70,30],
    'eng'  : [90,70,60,40],
    'math' : [90,60,90,70] 
}
columns = ['kor','eng','math']
index   = ['김지은', '황인범', '김정수', '최호진']

exec_df = pd.DataFrame(score_data , index=index , columns=columns)
exec_df

Unnamed: 0,kor,eng,math
김지은,80,90,90
황인범,90,70,60
김정수,70,60,90
최호진,30,40,70


In [62]:
# 위 데이터를 보고 모든 학생의 수학 점수를 시리즈로 출력하라
# 모든 학생의 국어와 영어 점수를 데이터 프레임으로  만들어라
# 모든 학생의 각 과목 평균 점수를 새로운 열로 추가하라
# 최호진 학생의 영어 점수를 90점으로 수정하고 평균 점수도 다시 계산하라
# 김지은 학생의 점수를 데이터 프레임으로 만들어라
# 김정수 학생의 점수를 시리즈로 출력하라
# 황인범 학생의 국어점수와 수학점수를 100점으로 수정하고 평균 점수도 다시 계산하라

### pandas 문자함수
- 함수 앞에 str 을 붙여주면 된다.

In [66]:
testDF

Unnamed: 0,정수,실수1,실수2,문자,날짜
0,24,1.543061,0.15228,apple,2021-02-25
1,33,-0.484514,0.658957,banana,2021-02-26
2,17,1.274822,0.407525,mango,2021-02-27
3,16,-0.954644,0.663351,berry,2021-02-28
4,74,0.544376,0.680719,melon,2021-03-01
5,95,2.56351,0.828082,cherry,2021-03-02
6,56,-0.575368,0.428636,pineapple,2021-03-03
7,98,0.759926,0.70837,peach,2021-03-04
8,4,-0.388827,0.337552,blueberry,2021-03-05
9,69,0.46206,0.003603,avocado,2021-03-06


In [70]:
courtDF = pd.read_csv('./data/court_code.txt', sep='\t', encoding='cp949')
courtDF.head()

Unnamed: 0,법정동코드,법정동명,폐지여부
0,1100000000,서울특별시,존재
1,1111000000,서울특별시 종로구,존재
2,1111010100,서울특별시 종로구 청운동,존재
3,1111010200,서울특별시 종로구 신교동,존재
4,1111010300,서울특별시 종로구 궁정동,존재


In [72]:
court_subset_df = courtDF[ courtDF['폐지여부'] == '폐지' ]
court_subset_df

Unnamed: 0,법정동코드,법정동명,폐지여부
89,1111090100,서울특별시 종로구 창신1동,폐지
90,1111090200,서울특별시 종로구 창신2동,폐지
91,1111090300,서울특별시 종로구 창신3동,폐지
92,1111090400,서울특별시 종로구 숭인1동,폐지
93,1111090500,서울특별시 종로구 숭인2동,폐지
...,...,...,...
45962,4972032025,제주도 남제주군 표선면 세화리,폐지
45963,4972032026,제주도 남제주군 표선면 토산리,폐지
46025,5011025305,제주특별자치도 제주시 애월읍 귀일리,폐지
46026,5011025306,제주특별자치도 제주시 애월읍 어도리,폐지


In [73]:
courtDF.head()

Unnamed: 0,법정동코드,법정동명,폐지여부
0,1100000000,서울특별시,존재
1,1111000000,서울특별시 종로구,존재
2,1111010100,서울특별시 종로구 청운동,존재
3,1111010200,서울특별시 종로구 신교동,존재
4,1111010300,서울특별시 종로구 궁정동,존재


In [79]:
court_subset_df = courtDF[ courtDF['법정동명'].str[ : 5] == '서울특별시'] # '서울특별시' 다섯글자 가져오기
court_subset_df

Unnamed: 0,법정동코드,법정동명,폐지여부
0,1100000000,서울특별시,존재
1,1111000000,서울특별시 종로구,존재
2,1111010100,서울특별시 종로구 청운동,존재
3,1111010200,서울특별시 종로구 신교동,존재
4,1111010300,서울특별시 종로구 궁정동,존재
...,...,...,...
1107,1174010600,서울특별시 강동구 둔촌동,존재
1108,1174010700,서울특별시 강동구 암사동,존재
1109,1174010800,서울특별시 강동구 성내동,존재
1110,1174010900,서울특별시 강동구 천호동,존재


- 분할 str.split()

In [88]:
# 공백으로 분리해보기 리스트로 값이 넘어오는데 expand = True 옵션을 사용하면 DF 형식으로 받는다.
court_subset_df = courtDF['법정동명'].str.split(" ", expand = True)
court_subset_df

Unnamed: 0,0,1,2,3,4
0,서울특별시,,,,
1,서울특별시,종로구,,,
2,서울특별시,종로구,청운동,,
3,서울특별시,종로구,신교동,,
4,서울특별시,종로구,궁정동,,
...,...,...,...,...,...
46175,제주특별자치도,서귀포시,표선면,하천리,
46176,제주특별자치도,서귀포시,표선면,성읍리,
46177,제주특별자치도,서귀포시,표선면,가시리,
46178,제주특별자치도,서귀포시,표선면,세화리,


- str.startswith() : 특정 문자로 시작하는 것
- str.endswith() : 특정 문자로 끝나는 것
- str.contains() : 특정 문자를 담고 있는가
- str.replace() : 특정 문자를 대체

In [98]:
# Series의 boolean masking 된 형태기에 DF에 다시 담아주어야 원하는 결과값 출력
court_subset_df = courtDF[ courtDF['법정동명'].str.startswith('서울') ]
court_subset_df = courtDF[ courtDF['법정동명'].str.endswith('시') ]
court_subset_df = courtDF[ courtDF['법정동명'].str.contains('강서') & courtDF['법정동명'].str.contains('부산')  ]
court_subset_df = courtDF['법정동명'].str.replace(' ', '-')
court_subset_df

0                       서울특별시
1                   서울특별시-종로구
2               서울특별시-종로구-청운동
3               서울특별시-종로구-신교동
4               서울특별시-종로구-궁정동
                 ...         
46175    제주특별자치도-서귀포시-표선면-하천리
46176    제주특별자치도-서귀포시-표선면-성읍리
46177    제주특별자치도-서귀포시-표선면-가시리
46178    제주특별자치도-서귀포시-표선면-세화리
46179    제주특별자치도-서귀포시-표선면-토산리
Name: 법정동명, Length: 46180, dtype: object

- str.strip()
- str.lstrip()
- str.rstrip()
- str.lower()
- str.upper()
- str.swapcase()


In [99]:
emptyDF = pd.DataFrame({
    'col01' : ['abcd   ' , ' FFFght '  , 'abCCe    '],
    'col02' : ['   fjHij' , ' ffght '  , 'Ibcce    '],
        
})


In [109]:
print(emptyDF['col01'].str.strip())
print(emptyDF['col01'].str.lstrip())
print(emptyDF['col01'].str.rstrip())
print(emptyDF['col01'].str.lower())
print(emptyDF['col01'].str.upper())
print(emptyDF['col01'].str.swapcase())

0      abcd
1    FFFght
2     abCCe
Name: col01, dtype: object
0      abcd   
1      FFFght 
2    abCCe    
Name: col01, dtype: object
0       abcd
1     FFFght
2      abCCe
Name: col01, dtype: object
0      abcd   
1      fffght 
2    abcce    
Name: col01, dtype: object
0      ABCD   
1      FFFGHT 
2    ABCCE    
Name: col01, dtype: object
0      ABCD   
1      fffGHT 
2    ABccE    
Name: col01, dtype: object
