In [1]:
import pandas as pd, numpy as np
# Float Series with one missing value, labelled by the odd integers 1, 3, 5.
a = pd.Series(
    data=[3, 10, np.nan],
    index=np.arange(1, 6, 2),
)
a

1     3.0
3    10.0
5     NaN
dtype: float64

In [2]:
# `in` on a Series tests the index labels; 1 is one of the labels of `a`.
1 in a

True

In [3]:
10.0 in a  # False: 10.0 is a value of `a`, not an index label

False

In [5]:
# The underlying NumPy array of values, without the index labels.
a.values

array([ 3., 10., nan])

In [6]:
10 in a.values  # True: membership is tested against the values array here

True

In [7]:
# Boolean mask that is True where the value is missing (NaN).
a.isnull()

1    False
3    False
5     True
dtype: bool

In [8]:
# Element-wise comparison; the NaN entry compares as False.
0 < a

1     True
3     True
5    False
dtype: bool

In [9]:
a[0 < a]  # boolean-mask indexing; the result is a Series

1     3.0
3    10.0
dtype: float64

In [10]:
# Series of the values that are greater than 0 or less than 10.
# NOTE(review): every non-NaN number satisfies at least one of the two
# conditions, so this only drops NaN; if a range filter was intended, `&`
# would be needed -- confirm.
a[(0 < a ) | (a < 10)]

1     3.0
3    10.0
dtype: float64

In [12]:
# The scalar is added to every element; NaN stays NaN.
a + 10

1    13.0
3    20.0
5     NaN
dtype: float64

In [13]:
# Element-wise addition with a list of the same length, matched by position.
a + [1,2,3]

1     4.0
3    12.0
5     NaN
dtype: float64

In [14]:
# In the recorded output the one-element list is added to every element.
# NOTE(review): whether a length-1 list broadcasts like this may depend on the
# pandas version -- confirm on the version in use.
a + [11]

1    14.0
3    21.0
5     NaN
dtype: float64

In [15]:
# Series + Series adds the values that share an index label.
a + a

1     6.0
3    20.0
5     NaN
dtype: float64

# 예제

In [16]:
# Fruit counts, labelled in the order the fruits are listed.
fruits = ["apples", "oranges", "cherries", "pears"]
S1 = pd.Series(data=[20, 33, 52, 10], index=fruits)
S1

apples      20
oranges     33
cherries    52
pears       10
dtype: int64

In [17]:
# Second set of counts, labelled with the same fruits in alphabetical order.
S2 = pd.Series(
    data=[17, 13, 31, 32],
    index=sorted(fruits),
)
S2

apples      17
cherries    13
oranges     31
pears       32
dtype: int64

In [19]:
S1 + S2  # values are added label by label, not by position (an advantage of Series)

apples      37
cherries    65
oranges     64
pears       42
dtype: int64

In [20]:
# Grand total over both Series. Uses the vectorized Series.sum() rather than
# iterating element by element with the built-in sum(); the printed value is
# the same for these integer Series.
print("합계: ", S1.sum() + S2.sum())

합계:  208


In [22]:
# Two Series whose labels only partly overlap.
fruits = ["peaches", "oranges", "cherries", "pears"]
fruits2 = ["raspberries", "cherries", "oranges", "pears"]
S1 = pd.Series(data=[20, 33, 52, 10], index=fruits)
S2 = pd.Series(data=[17, 13, 31, 32], index=fruits2)
# Labels present in only one operand come out as NaN in the sum.
S1.add(S2)

cherries       65.0
oranges        64.0
peaches         NaN
pears          42.0
raspberries     NaN
dtype: float64

# 사전을 이용한 Series 생성

In [23]:
# A dict becomes a Series: keys turn into index labels, values into data.
a = dict(Ohio=25000, Texas=71000, Oregon=16000, Utah=5000)
b = pd.Series(data=a)
b

Ohio      25000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [24]:
c = ["Ohio", "California", "Texas", "Oregon"]
# 'California' is not a key of a, so its entry comes out as NaN.
pd.Series(data=a, index=c)

Ohio          25000.0
California        NaN
Texas         71000.0
Oregon        16000.0
dtype: float64

# name

In [32]:
# Give the Series itself a name, then read it back.
b.name = 'population'
b.name

'population'

In [35]:
# Name the index; both the index name and the Series name appear in the display.
b.index.name = 'state'
b

state
Ohio      25000
Texas     71000
Oregon    16000
Utah       5000
Name: population, dtype: int64

In [36]:
# The Index object of b, including its name.
b.index

Index(['Ohio', 'Texas', 'Oregon', 'Utah'], dtype='object', name='state')

In [37]:
# Name of the index.
b.index.name

'state'

# apply

In [38]:
# Call float() on each element; the result has dtype float64.
b.apply(float)

state
Ohio      25000.0
Texas     71000.0
Oregon    16000.0
Utah       5000.0
Name: population, dtype: float64

In [39]:
# Relative frequency of each entry. The total is computed once up front rather
# than inside the lambda, where it would be re-summed for every element.
population_total = b.sum()
b.apply(lambda x: x / population_total)

state
Ohio      0.213675
Texas     0.606838
Oregon    0.136752
Utah      0.042735
Name: population, dtype: float64

# missing data: NaN

In [40]:
# Series built from a dict plus an explicit label list; 'California' is not a
# key of the dict, so its entry is NaN.
a = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
c = ["Ohio", "California", "Texas", "Oregon"]
d = pd.Series(data=a, index=c)
d

Ohio          35000.0
California        NaN
Texas         71000.0
Oregon        16000.0
dtype: float64

In [41]:
pd.isnull(d)  # same as d.isnull()

Ohio          False
California     True
Texas         False
Oregon        False
dtype: bool

In [43]:
d[pd.isnull(d)]  # keep only the NaN rows; same as d[d.isnull()]

California   NaN
dtype: float64

In [45]:
d[d.notnull()]  # keep only the rows that are not null

Ohio      35000.0
Texas     71000.0
Oregon    16000.0
dtype: float64

In [46]:
d.dropna()  # same as pd.Series.dropna(d)

Ohio      35000.0
Texas     71000.0
Oregon    16000.0
dtype: float64

In [48]:
d.fillna(0)  # fill NaN with 0; this returns a new Series and leaves d unchanged

Ohio          35000.0
California        0.0
Texas         71000.0
Oregon        16000.0
dtype: float64

# 추가, 제거

In [49]:
# d still contains the NaN entry at this point.
d

Ohio          35000.0
California        NaN
Texas         71000.0
Oregon        16000.0
dtype: float64

In [51]:
# Overwrite the missing entries of d with 0, modifying d in place.
d.loc[d.isna()] = 0
d

Ohio          35000.0
California        0.0
Texas         71000.0
Oregon        16000.0
dtype: float64

In [52]:
# Series.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# builds the same new Series with the extra entry (d itself is not modified).
pd.concat([d, pd.Series({'Arizona': 10000})])

Ohio          35000.0
California        0.0
Texas         71000.0
Oregon        16000.0
Arizona       10000.0
dtype: float64

In [53]:
# Remove the 'Ohio' entry from d in place, then show what is left.
del d['Ohio']
d

California        0.0
Texas         71000.0
Oregon        16000.0
dtype: float64

# 연습문제1
## 1. 확장자 txt파일 모두 불러오기
## 2. 텍스트를 소문자 변환 후 어휘 단위 분리
## 3. 어휘 앞 그리고 뒤 문장부호 하나 이상 연쇄 제거
## 4. 4개 파일의 the, a, an의 빈도와 합계 빈도를 아래 출력과 같은 Series로 만드시오.

In [55]:
import os, re
import pandas as pd
from collections import Counter
# Read every .txt file under ./01_data, lower-cased. Each file is opened in a
# `with` block so its handle is closed promptly, and the file names are sorted
# so the order no longer depends on the arbitrary order os.listdir() returns.
# NOTE(review): files are read with the platform default encoding, as before --
# confirm the encoding of the data files.
texts = []
for fname in sorted(os.listdir('./01_data')):
    if fname.endswith('.txt'):
        with open('./01_data/' + fname) as fh:
            texts.append(fh.read().lower())
a = ' '.join(texts)

# Whole-word matches of the three articles; the \b anchors keep punctuation
# and longer words (e.g. "another") out of the matches.
b = re.findall(r'\b(the|a|an)\b', a)

# Frequencies ordered from most to least common.
c = pd.Series(dict(Counter(b).most_common()))

# Add the grand-total row. Series.append was removed in pandas 2.0, so the
# extra row is attached with pd.concat instead.
pd.concat([c, pd.Series({'합계': c.sum()})])

the    18
a       8
an      3
합계     29
dtype: int64