## EnglishStopwords.txt
- stop word list
- 길이 2 또는 3 이하 어휘 제거 => a an in on 등 제거
- the in what 등 기능어/문법어, say, people 등 텍스트 주제어가 될 가능성이 적다고 가정되는 고빈도 어휘 제거

## 연습문제 1
- TDM 만들기
- 텍스트로부터 알파벳 포함 어휘 추출하여 소문자 변환
- EnglishStopwords.txt 목롥에 해당하지 않는 어휘라면 어휘 앞 뒤 문장부호 연쇄 제거하여 빈도 추출
- 컬럼명으로 파일명 중 .txt 제거하여 입력

In [4]:
import os, re
import pandas as pd
from collections import Counter
Stopwords = open('EnglishStopwords.txt').read().splitlines()
TDM = pd.concat(
    [pd.DataFrame(pd.Series(
        Counter(re.sub('^\W+|\W+$','',j)
               for j in open('./InauguralAddress/'+i,encoding='cp949').read().lower().split()
               if re.search('[a-z]',j) and
               re.sub('^\W+|\W+$','',j) not in Stopwords)),
                 columns = [i.replace('.txt','')])
    for i in os.listdir('./InauguralAddress/') if i.endswith('.txt')],
    axis = 1)
TDM = TDM.reindex(sorted(TDM.index)).fillna(0).astype('int')

In [5]:
TDM.head()

Unnamed: 0,1789-Washington,1793-Washington,1797-Adams,1801-Jefferson,1805-Jefferson,1809-Madison,1813-Madison,1817-Monroe,1821-Monroe,1825-Adams,...,1981-Reagan,1985-Reagan,1989-Bush,1993-Clinton,1997-Clinton,2001-Bush,2005-Bush,2009-Obama,2013-Obama,2017-Trump
14th,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15th,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18th,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
19th,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
200th,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 키워드 분석

In [6]:
import statsmodels.api as sm
TDM_resid = sm.stats.Table(TDM).resid_pearson

In [7]:
TDM_resid.sort_values(by = ['1789-Washington'], axis=0, ascending=False)['1789-Washington']

nature          2.260437
providential    2.103545
impressions     2.103545
immutable       2.103545
pecuniary       2.020843
                  ...   
freedom        -1.507175
america        -1.517686
people         -1.753851
peace          -1.810786
world          -1.819684
Name: 1789-Washington, Length: 8935, dtype: float64

In [8]:
TDM_resid.sort_values(by = ['2017-Trump'],axis=0, ascending=False)['2017-Trump']

america         8.168133
dreams          5.064350
american        4.917164
protected       4.913764
jobs            4.307024
                  ...   
freedom        -1.517296
union          -1.530471
constitution   -1.622381
peace          -1.822039
government     -2.072020
Name: 2017-Trump, Length: 8935, dtype: float64

In [11]:
TDM_resid.loc['america'].idxmax(axis=1)

'2017-Trump'

In [12]:
TDM_resid.loc['freedom'].idxmax(axis=1)

'2005-Bush'

In [13]:
TDM_resid.loc['america'].idxmin(axis=1)

'1841-Harrison'

In [14]:
TDM_resid.loc['freedom'].idxmin(axis=1)

'1821-Monroe'

## 연습문제 2
- 알파벳 추출
- EnglishStopwords.txt 목록에 해당하지 않는 연장부호 제거
- 이전 시기 대통령 취임사 전체에서 사용되지 않는 어휘 목록 추출

In [16]:
import os, re

Stopwords = open('EnglishStopwords.txt').read().splitlines()
New = set()
Union = set()

for i in os.listdir('./InauguralAddress/'):
    Words = {re.sub('^\W+|\W+$','',j)
                     for j in open('./InauguralAddress/'+i,encoding='cp949').read().lower().split()
                    if re.search('[a-z]',j) and
                    re.sub('^\W+|\W+$','',j) not in Stopwords}
    if Union == set():
        Union.update(Words)
    else:
        New.update(Words.difference(Union))
        Union.update(Words)
New = sorted(New)

## 조작적 정의 operational / functional definition
- 뜨겁다 / 차갑다 또는 똑똑하다 /멍청하다와 같은
명확한 정의가 없다면 객관적 관찰 불가능
- 조작정 정의
관찰 가능한 절차(process) 를 통해 어떤 현상을 정의
정의가 명확하지 않아 관찰 불가한것을 수치 측정을 통해 관찰 가능

In [17]:
TDM.loc[New].apply(lambda x : x[x != 0].size/ TDM.apply(lambda x : x[x!=0].size))

Unnamed: 0,1789-Washington,1793-Washington,1797-Adams,1801-Jefferson,1805-Jefferson,1809-Madison,1813-Madison,1817-Monroe,1821-Monroe,1825-Adams,...,1981-Reagan,1985-Reagan,1989-Bush,1993-Clinton,1997-Clinton,2001-Bush,2005-Bush,2009-Obama,2013-Obama,2017-Trump
1789-Washington,0.0,0.063736,1.096703,0.907692,1.096703,0.659341,0.696703,1.417582,1.826374,1.461538,...,1.21978,1.279121,1.010989,0.802198,1.006593,0.793407,1.074725,1.358242,1.2,0.756044
1793-Washington,0.0,0.568627,9.784314,8.098039,9.784314,5.882353,6.215686,12.647059,16.294118,13.039216,...,10.882353,11.411765,9.019608,7.156863,8.980392,7.078431,9.588235,12.117647,10.705882,6.745098
1797-Adams,0.0,0.046474,0.799679,0.661859,0.799679,0.480769,0.508013,1.033654,1.331731,1.065705,...,0.889423,0.932692,0.737179,0.584936,0.733974,0.578526,0.783654,0.990385,0.875,0.551282
1801-Jefferson,0.0,0.055769,0.959615,0.794231,0.959615,0.576923,0.609615,1.240385,1.598077,1.278846,...,1.067308,1.119231,0.884615,0.701923,0.880769,0.694231,0.940385,1.188462,1.05,0.661538
1805-Jefferson,0.0,0.048739,0.838655,0.694118,0.838655,0.504202,0.532773,1.084034,1.396639,1.117647,...,0.932773,0.978151,0.773109,0.613445,0.769748,0.606723,0.821849,1.038655,0.917647,0.578151
1809-Madison,0.0,0.074935,1.289406,1.067183,1.289406,0.775194,0.819121,1.666667,2.147287,1.718346,...,1.434109,1.503876,1.18863,0.943152,1.183463,0.932817,1.263566,1.596899,1.410853,0.888889
1813-Madison,0.0,0.075325,1.296104,1.072727,1.296104,0.779221,0.823377,1.675325,2.158442,1.727273,...,1.441558,1.511688,1.194805,0.948052,1.18961,0.937662,1.27013,1.605195,1.418182,0.893506
1817-Monroe,0.0,0.03699,0.63648,0.526786,0.63648,0.382653,0.404337,0.822704,1.059949,0.848214,...,0.707908,0.742347,0.586735,0.465561,0.584184,0.460459,0.623724,0.788265,0.696429,0.438776
1821-Monroe,0.0,0.029713,0.51127,0.423156,0.51127,0.307377,0.324795,0.660861,0.851434,0.681352,...,0.568648,0.596311,0.471311,0.373975,0.469262,0.369877,0.501025,0.633197,0.559426,0.352459
1825-Adams,0.0,0.036478,0.627673,0.519497,0.627673,0.377358,0.398742,0.811321,1.045283,0.836478,...,0.698113,0.732075,0.578616,0.459119,0.576101,0.454088,0.615094,0.777358,0.686792,0.432704


In [18]:
TDM.loc[New].apply(sum)/TDM.apply(sum)

1789-Washington    0.000000
1793-Washington    0.576923
1797-Adams         0.699105
1801-Jefferson     0.735650
1805-Jefferson     0.758537
1809-Madison       0.722222
1813-Madison       0.787554
1817-Monroe        0.724919
1821-Monroe        0.778668
1825-Adams         0.723584
1829-Jackson       0.711497
1833-Jackson       0.656250
1837-VanBuren      0.780534
1841-Harrison      0.733574
1845-Polk          0.729499
1849-Taylor        0.686364
1853-Pierce        0.762200
1857-Buchanan      0.728997
1861-Lincoln       0.765266
1865-Lincoln       0.827160
1869-Grant         0.773810
1873-Grant         0.771670
1877-Hayes         0.694045
1881-Garfield      0.767285
1885-Cleveland     0.696350
1889-Harrison      0.788948
1893-Cleveland     0.775581
1897-McKinley      0.785344
1901-McKinley      0.758581
1905-Roosevelt     0.799401
1909-Taft          0.822212
1913-Wilson        0.813725
1917-Wilson        0.740443
1921-Harding       0.833215
1925-Coolidge      0.797893
1929-Hoover        0