# 사전을 이용한 데이터프레임 생성

In [1]:
import pandas as pd
# Column-oriented input: each key becomes a column, each list holds that
# column's values (all lists have the same length).
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
pd.DataFrame(data)  # build a DataFrame from the dict

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [2]:
# Same data, but with an explicit column order and string row labels.
pd.DataFrame(
    data=data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['pop', 'year', 'state'],
)

Unnamed: 0,pop,year,state
one,1.5,2000,Ohio
two,1.7,2001,Ohio
three,3.6,2002,Ohio
four,2.4,2001,Nevada
five,2.9,2002,Nevada


# 컬럼 다루기

In [3]:
# 'debt' is not a key of `data`, so that column is created with missing values.
a = pd.DataFrame(
    data=data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['pop', 'year', 'state', 'debt'],
)
a

Unnamed: 0,pop,year,state,debt
one,1.5,2000,Ohio,
two,1.7,2001,Ohio,
three,3.6,2002,Ohio,
four,2.4,2001,Nevada,
five,2.9,2002,Nevada,


In [4]:
a['state']  # select one column as a Series; same as a.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [8]:
# Broadcast a single scalar to every row of the existing 'debt' column.
a['debt'] = 16.5
a

Unnamed: 0,pop,year,state,debt
one,1.5,2000,Ohio,16.5
two,1.7,2001,Ohio,16.5
three,3.6,2002,Ohio,16.5
four,2.4,2001,Nevada,16.5
five,2.9,2002,Nevada,16.5


In [10]:
# Assigning a Series aligns its values on the index labels, so rows without a
# matching label ('three', 'five') end up as NaN.
a['debt'] = pd.Series({'four': -1.2, 'two': -3, 'one': -2})
a

Unnamed: 0,pop,year,state,debt
one,1.5,2000,Ohio,-2.0
two,1.7,2001,Ohio,-3.0
three,3.6,2002,Ohio,
four,2.4,2001,Nevada,-1.2
five,2.9,2002,Nevada,


In [11]:
# Assign a plain sequence positionally: range(5) supplies one value per row
# (the frame has five rows), replacing the previous 'debt' values.
a['debt'] = range(5)
a

Unnamed: 0,pop,year,state,debt
one,1.5,2000,Ohio,0
two,1.7,2001,Ohio,1
three,3.6,2002,Ohio,2
four,2.4,2001,Nevada,3
five,2.9,2002,Nevada,4


In [12]:
# New columns must be created with bracket syntax. Attribute-style assignment
# (a.eastern = a.state) does not add a column: it only sets a plain Python
# attribute on the object, and pandas warns about it.
a['eastern'] = a.state
a

Unnamed: 0,pop,year,state,debt,eastern
one,1.5,2000,Ohio,0,Ohio
two,1.7,2001,Ohio,1,Ohio
three,3.6,2002,Ohio,2,Ohio
four,2.4,2001,Nevada,3,Nevada
five,2.9,2002,Nevada,4,Nevada


In [13]:
# Remove the 'eastern' column in place.
del a['eastern']
a

Unnamed: 0,pop,year,state,debt
one,1.5,2000,Ohio,0
two,1.7,2001,Ohio,1
three,3.6,2002,Ohio,2
four,2.4,2001,Nevada,3
five,2.9,2002,Nevada,4


# 연습문제1

In [15]:
# Exercise 1: two state columns indexed by year; the None entry shows up as a
# missing value in the 'Nevada' column.
a = pd.DataFrame(
    data={
        'Nevada': [2.4, 2.9, None],
        'Ohio': [3.2, 2.1, 1.9],
    },
    index=[2001, 2003, 2002],
)
a

Unnamed: 0,Nevada,Ohio
2001,2.4,3.2
2003,2.9,2.1
2002,,1.9


# 연습문제 2

In [16]:
# Exercise 2: add a new 'Utah' column; the list supplies one value per row,
# in the frame's current row order.
a['Utah'] = [3.3,3.1, 3.2]
a

Unnamed: 0,Nevada,Ohio,Utah
2001,2.4,3.2,3.3
2003,2.9,2.1,3.1
2002,,1.9,3.2


# 인덱스 순서 변경

In [17]:
# Return a copy of `a` whose rows follow the given label order.
a.reindex([2003, 2001, 2002])

Unnamed: 0,Nevada,Ohio,Utah
2003,2.9,2.1,3.1
2001,2.4,3.2,3.3
2002,,1.9,3.2


In [18]:
# Keep only the listed row labels, in the given order.
a.reindex([2003, 2002])

Unnamed: 0,Nevada,Ohio,Utah
2003,2.9,2.1,3.1
2002,,1.9,3.2


# 행렬 전환

In [19]:
a

Unnamed: 0,Nevada,Ohio,Utah
2001,2.4,3.2,3.3
2003,2.9,2.1,3.1
2002,,1.9,3.2


In [20]:
a.T

Unnamed: 0,2001,2003,2002
Nevada,2.4,2.9,
Ohio,3.2,2.1,1.9
Utah,3.3,3.1,3.2


# index, columns, values

In [21]:
type(a)

pandas.core.frame.DataFrame

In [22]:
a.index

Int64Index([2001, 2003, 2002], dtype='int64')

In [23]:
type(a.index)

pandas.core.indexes.numeric.Int64Index

In [24]:
a.columns

Index(['Nevada', 'Ohio', 'Utah'], dtype='object')

In [25]:
type(a.columns)

pandas.core.indexes.base.Index

In [26]:
a.values

array([[2.4, 3.2, 3.3],
       [2.9, 2.1, 3.1],
       [nan, 1.9, 3.2]])

# name

In [27]:
# Name the row index; the name appears as the header of the index column when
# the frame is displayed.
a.index.name = 'year'
a

Unnamed: 0_level_0,Nevada,Ohio,Utah
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001,2.4,3.2,3.3
2003,2.9,2.1,3.1
2002,,1.9,3.2


In [28]:
a.index.name

'year'

In [29]:
# Name the column axis as well.
a.columns.name = 'state'

In [30]:
a

state,Nevada,Ohio,Utah
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001,2.4,3.2,3.3
2003,2.9,2.1,3.1
2002,,1.9,3.2


In [31]:
a.columns.name

'state'

# 연습문제 3
## 1. 8개의 파일 중 확장자 txt 파일 4개 모두 불러오기
## 2. 텍스트를 소문자 변환 후 어휘 단위 분리
## 3. 어휘 앞 그리고 뒤의 문장부호 하나 이상 연쇄 제거
## 4. 4개 파일의 어휘별 빈도 DataFrame 생성 (Term-Document Matrix: 행 - 어휘, 컬럼 - 파일명)
## 5. 컬럼 이름을 Files로, 행 이름을 Words로 할당


In [34]:
import os, re
import pandas as pd
from collections import Counter
def _word_counts(path):
    """Count the words of one text file.

    The text is lowercased and split on whitespace; every run of non-word
    characters is then stripped from the start and the end of each token.

    Args:
        path: Path of the text file to read.

    Returns:
        A Counter mapping each cleaned word to its number of occurrences.
        Tokens made up only of punctuation are dropped instead of being
        counted under the empty string.
    """
    # NOTE(review): the file is decoded with the platform default encoding,
    # as before; pass encoding=... if the corpus encoding is known.
    with open(path) as fh:  # the handle is closed as soon as the text is read
        tokens = fh.read().lower().split()
    # Raw string so that \W reaches the regex engine as a character class.
    cleaned = (re.sub(r'^\W+|\W+$', '', tok) for tok in tokens)
    return Counter(word for word in cleaned if word)


# One Counter per .txt file in ./01_data. The listing is sorted so that the
# column order does not depend on the order os.listdir happens to return.
Freq = {name: _word_counts(os.path.join('./01_data', name))
        for name in sorted(os.listdir('./01_data'))
        if name.endswith('.txt')}
# Term-document matrix: one row per word, one column per file; a word that
# does not occur in a file is NaN in that file's column.
TDM = pd.DataFrame(Freq)
TDM.columns.name = 'Files'
TDM.index.name = 'Words'

In [35]:
TDM

Files,01.txt,02.txt,03.txt,04.txt
Words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
for,1.0,,,2.0
the,4.0,3.0,7.0,4.0
past,1.0,,,
25,1.0,,,
years,2.0,,,
...,...,...,...,...
sea,,,,1.0
put,,,,1.0
accurate,,,,1.0
navigator's,,,,1.0
