# 파일을 데이터프레임으로 불러오기

In [1]:
import pandas as pd
TDM = pd.read_csv('TDM.txt',sep='\t', index_col=0)
TDM.head()

Unnamed: 0,01.txt,02.txt,03.txt,04.txt
14th,0,0,0,1
15,0,0,0,1
1656,0,0,0,1
1761,0,0,0,1
17th,0,0,0,1


# Series: 정렬

In [3]:
a = TDM['01.txt']
a

14th     0
15       0
1656     0
1761     0
17th     0
        ..
years    2
york     0
you      4
young    0
your     5
Name: 01.txt, Length: 244, dtype: int64

In [4]:
a.sort_index().head()

14th    0
15      0
1656    0
1761    0
17th    0
Name: 01.txt, dtype: int64

In [5]:
a.sort_index(ascending=False).head()

your     5
young    0
you      4
york     0
years    2
Name: 01.txt, dtype: int64

In [6]:
a.sort_values().head()

14th             0
often            0
old-fashioned    0
one              0
open             0
Name: 01.txt, dtype: int64

In [7]:
a.sort_values(ascending=False).head()

your    5
you     4
the     4
to      4
have    3
Name: 01.txt, dtype: int64

# DataFrame: 정렬

In [8]:
TDM.iloc[[0,1,2,-3,-2,-1]]

Unnamed: 0,01.txt,02.txt,03.txt,04.txt
14th,0,0,0,1
15,0,0,0,1
1656,0,0,0,1
you,4,0,0,0
young,0,1,0,0
your,5,0,0,0


In [9]:
TDM.sort_index().head(4)

Unnamed: 0,01.txt,02.txt,03.txt,04.txt
14th,0,0,0,1
15,0,0,0,1
1656,0,0,0,1
1761,0,0,0,1


In [10]:
TDM.sort_index(ascending=False).head(4)

Unnamed: 0,01.txt,02.txt,03.txt,04.txt
your,5,0,0,0
young,0,1,0,0
you,4,0,0,0
york,0,0,1,0


In [11]:
TDM.sort_index(axis=0,ascending=False).head(4)

Unnamed: 0,01.txt,02.txt,03.txt,04.txt
your,5,0,0,0
young,0,1,0,0
you,4,0,0,0
york,0,0,1,0


In [12]:
TDM.sort_index(axis=1,ascending=False).head(4)

Unnamed: 0,04.txt,03.txt,02.txt,01.txt
14th,1,0,0,0
15,1,0,0,0
1656,1,0,0,0
1761,1,0,0,0


In [13]:
TDM.sort_values(by='03.txt', ascending=False).head()

Unnamed: 0,01.txt,02.txt,03.txt,04.txt
to,4,5,11,5
the,4,3,7,4
is,1,3,5,0
it,1,0,4,1
they,0,1,4,0


In [14]:
TDM.sort_values(by=['03.txt','04.txt'],
               ascending=[False,True]).head(6)

Unnamed: 0,01.txt,02.txt,03.txt,04.txt
to,4,5,11,5
the,4,3,7,4
is,1,3,5,0
they,0,1,4,0
it,1,0,4,1
of,2,1,4,1


# 연산

In [15]:
a = TDM.loc[['a','an','the','of','to']]
a

Unnamed: 0,01.txt,02.txt,03.txt,04.txt
a,1,2,2,3
an,0,2,1,0
the,4,3,7,4
of,2,1,4,1
to,4,5,11,5


In [16]:
b = a.iloc[::-2,::-2]
b

Unnamed: 0,04.txt,02.txt
to,5,5
the,4,3
a,3,2


In [17]:
b+a

Unnamed: 0,01.txt,02.txt,03.txt,04.txt
a,,4.0,,6.0
an,,,,
of,,,,
the,,6.0,,8.0
to,,10.0,,10.0


In [18]:
b.add(a)

Unnamed: 0,01.txt,02.txt,03.txt,04.txt
a,,4.0,,6.0
an,,,,
of,,,,
the,,6.0,,8.0
to,,10.0,,10.0


In [19]:
b.add(a,fill_value=0) # sub, mul, div, power
# 없는 경우에 default 0

Unnamed: 0,01.txt,02.txt,03.txt,04.txt
a,1.0,4.0,2.0,6.0
an,0.0,2.0,1.0,0.0
of,2.0,1.0,4.0,1.0
the,4.0,6.0,7.0,8.0
to,4.0,10.0,11.0,10.0


# apply

In [20]:
a

Unnamed: 0,01.txt,02.txt,03.txt,04.txt
a,1,2,2,3
an,0,2,1,0
the,4,3,7,4
of,2,1,4,1
to,4,5,11,5


In [21]:
a.apply(sum)

01.txt    11
02.txt    13
03.txt    25
04.txt    13
dtype: int64

In [22]:
a.apply(sum,axis=1)

a       8
an      3
the    18
of      8
to     25
dtype: int64

In [23]:
a.apply(lambda x : max(x) - min(x))

01.txt     4
02.txt     4
03.txt    10
04.txt     5
dtype: int64

# applymap

In [24]:
a.apply(sum)

01.txt    11
02.txt    13
03.txt    25
04.txt    13
dtype: int64

In [25]:
a.applymap(float)

Unnamed: 0,01.txt,02.txt,03.txt,04.txt
a,1.0,2.0,2.0,3.0
an,0.0,2.0,1.0,0.0
the,4.0,3.0,7.0,4.0
of,2.0,1.0,4.0,1.0
to,4.0,5.0,11.0,5.0


# 기술 통계: 수치 해석

In [26]:
TDM.head()

Unnamed: 0,01.txt,02.txt,03.txt,04.txt
14th,0,0,0,1
15,0,0,0,1
1656,0,0,0,1
1761,0,0,0,1
17th,0,0,0,1


In [27]:
TDM.sum()

01.txt     85
02.txt     87
03.txt    126
04.txt    120
dtype: int64

In [28]:
TDM.sum(axis=1)

14th     1
15       1
1656     1
1761     1
17th     1
        ..
years    2
york     1
you      4
young    1
your     5
Length: 244, dtype: int64

In [29]:
TDM.mean()

01.txt    0.348361
02.txt    0.356557
03.txt    0.516393
04.txt    0.491803
dtype: float64

In [30]:
TDM.std() # 표준 편차

01.txt    0.778807
02.txt    0.647818
03.txt    1.152800
04.txt    0.833910
dtype: float64

In [31]:
TDM.describe()

Unnamed: 0,01.txt,02.txt,03.txt,04.txt
count,244.0,244.0,244.0,244.0
mean,0.348361,0.356557,0.516393,0.491803
std,0.778807,0.647818,1.1528,0.83391
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,1.0
max,5.0,5.0,11.0,5.0


In [32]:
type(TDM.describe())

pandas.core.frame.DataFrame

In [33]:
TDM.median()

01.txt    0.0
02.txt    0.0
03.txt    0.0
04.txt    0.0
dtype: float64

In [34]:
TDM.describe().loc['75%'].sub(TDM.describe().loc['25%'])

01.txt    0.0
02.txt    1.0
03.txt    1.0
04.txt    1.0
dtype: float64

In [35]:
TDM.idxmax()

01.txt    your
02.txt      to
03.txt      to
04.txt     and
dtype: object

In [36]:
TDM.idxmax(axis=1)

14th     04.txt
15       04.txt
1656     04.txt
1761     04.txt
17th     04.txt
          ...  
years    01.txt
york     03.txt
you      01.txt
young    02.txt
your     01.txt
Length: 244, dtype: object

In [37]:
TDM.idxmin()

01.txt    14th
02.txt    14th
03.txt    14th
04.txt    1979
dtype: object

In [38]:
TDM.idxmin(axis=1)

14th     01.txt
15       01.txt
1656     01.txt
1761     01.txt
17th     01.txt
          ...  
years    02.txt
york     01.txt
you      02.txt
young    01.txt
your     02.txt
Length: 244, dtype: object

# 연습문제1
### 1. 빈도 합계 > 3 어휘만을 추출
### 2. describe 매서드 이요하여 기술통계 데이터프레임 추출
### 3. 기술통계 데이터프레임에 median과 IQR 추가
### 4. 기술통계 데이터프레임을 소수점 2자리까지 출력
### 5. 기술통계 데이터프레임을 describe.txt 파일로 출력

In [40]:
A = TDM[TDM.sum(axis=1) > 3]
B = A.describe()
C = pd.concat([B,pd.DataFrame({'median':A.median(),
                              'IQR':B.loc['75%'].sub(B.loc['25%'])}).T])
C

Unnamed: 0,01.txt,02.txt,03.txt,04.txt
count,21.0,21.0,21.0,21.0
mean,1.904762,1.190476,2.666667,1.52381
std,1.578124,1.327368,2.633122,1.887301
min,0.0,0.0,0.0,0.0
25%,1.0,0.0,1.0,0.0
50%,2.0,1.0,2.0,1.0
75%,3.0,2.0,4.0,3.0
max,5.0,5.0,11.0,5.0
median,2.0,1.0,2.0,1.0
IQR,2.0,2.0,3.0,3.0


In [41]:
C.round(2).to_csv('descrie.txt',sep='\t')

# 기술 통계 : 명목 척도

In [42]:
import numpy as np
a = pd.DataFrame(np.random.choice([np.nan, 'apple', 'banana', 'orange'],
                                 15, replace = True,
                                 p = [0.2, 0.2, 0.3, 0.3]).reshape(5,3),
                index = list('abcde'),
                columns = list('ABC'))
a

Unnamed: 0,A,B,C
a,orange,banana,
b,orange,orange,
c,banana,banana,banana
d,banana,orange,apple
e,banana,banana,apple


# 연습문제 2

In [43]:
a.apply(lambda x : x.value_counts()).fillna(0).astype('i') # 명목척도 기술통계

Unnamed: 0,A,B,C
apple,0,0,2
banana,3,3,1
,0,0,2
orange,2,2,0
