In [38]:
import re
import numpy as np
import pandas as pd

## 把整段文字檔存進doc裡面

In [39]:
doc = 'And Yugoslav authorities are planning the arrest of eleven coal miners and two opposition politicians on suspicion of sabotage, that\'s in connection with strike action against President Slobodan Milosevic. You are listening to BBC news for The World.'

## 切割文字
我選擇把,.'還有空格都當作切割符。這樣我們在處理一些像是that's之類的縮寫就會直接幫我們分成兩個文字。

In [40]:
tok =  re.split('[,.\' ]', doc)
for i in tok:
    if i == '':
        tok.remove(i)
    if len(i) == 1:
        tok.remove(i)
tok

['And',
 'Yugoslav',
 'authorities',
 'are',
 'planning',
 'the',
 'arrest',
 'of',
 'eleven',
 'coal',
 'miners',
 'and',
 'two',
 'opposition',
 'politicians',
 'on',
 'suspicion',
 'of',
 'sabotage',
 'that',
 'in',
 'connection',
 'with',
 'strike',
 'action',
 'against',
 'President',
 'Slobodan',
 'Milosevic',
 'You',
 'are',
 'listening',
 'to',
 'BBC',
 'news',
 'for',
 'The',
 'World']

## 把整段文字轉成小寫

In [41]:
for i in range(len(tok)):
    tok[i] = tok[i].lower()
tok

['and',
 'yugoslav',
 'authorities',
 'are',
 'planning',
 'the',
 'arrest',
 'of',
 'eleven',
 'coal',
 'miners',
 'and',
 'two',
 'opposition',
 'politicians',
 'on',
 'suspicion',
 'of',
 'sabotage',
 'that',
 'in',
 'connection',
 'with',
 'strike',
 'action',
 'against',
 'president',
 'slobodan',
 'milosevic',
 'you',
 'are',
 'listening',
 'to',
 'bbc',
 'news',
 'for',
 'the',
 'world']

## 處理重複的字串

In [42]:
tok = list(set(tok))
tok

['connection',
 'listening',
 'eleven',
 'you',
 'arrest',
 'opposition',
 'world',
 'with',
 'planning',
 'in',
 'slobodan',
 'authorities',
 'action',
 'miners',
 'of',
 'president',
 'are',
 'politicians',
 'against',
 'the',
 'yugoslav',
 'sabotage',
 'bbc',
 'coal',
 'two',
 'news',
 'milosevic',
 'for',
 'that',
 'and',
 'strike',
 'to',
 'suspicion',
 'on']

## Porter's algorithm的實作
我把s(es, ies)以及(ing)都當作結尾刪除，其餘部分因為我看文本沒有，就先沒有處理。

In [43]:
def porteralgm(s):
    if s[-1] == 's':
        s = s[0:-1]
        if s[-1] == 'e':
            s = s[0:-1]
            if s [-1] == 'i':
                s = s[0:-1] + 'y'
    #print(s)
    
    if s[-3::1] == 'ing':
        s = s[0:-3]
        if s[-1] == s[-2]:
            s = s[0:-1]
            
    return s

In [44]:
for i in range(len(tok)):
    tok[i] = porteralgm(tok[i])
tok.sort()
tok

['action',
 'against',
 'and',
 'are',
 'arrest',
 'authority',
 'bbc',
 'coal',
 'connection',
 'eleven',
 'for',
 'in',
 'listen',
 'milosevic',
 'miner',
 'new',
 'of',
 'on',
 'opposition',
 'plan',
 'politician',
 'president',
 'sabotage',
 'slobodan',
 'strike',
 'suspicion',
 'that',
 'the',
 'to',
 'two',
 'with',
 'world',
 'you',
 'yugoslav']

## 建立我的stopword

In [45]:
stopword = ['and', 'are', 'the', 'of', 'on', 'of', 'in', 'that', 'with', 'you', 'to', 'for']
stopword.sort()
stopword

['and',
 'are',
 'for',
 'in',
 'of',
 'of',
 'on',
 'that',
 'the',
 'to',
 'with',
 'you']

## 刪除stopword
因為stopword字串跟tok字串都有排序過，可以在Linear的時間裡面處理完刪除stopword這件事情。

In [46]:
i = 0
j = 0
rmv = []
while i < len(tok) and j < len(stopword):
    #print(i,j)
    if tok[i] == stopword[j]:
        rmv.append(tok[i])
        i += 1
        j += 1
        continue
    if tok[i] < stopword[j]:
        i += 1
    else:
        j += 1

for i in rmv:
    tok.remove(i)
        
rmv

['and', 'are', 'for', 'in', 'of', 'on', 'that', 'the', 'to', 'with', 'you']

## 寫入txt檔之中。

In [47]:
f = open('output.txt', 'w+')
for i in tok:
    f.write(i)
    f.write('\n')
    #print(i)
f.close()

In [48]:
f = open('output.txt', 'r')
contents = f.read()
contents

'action\nagainst\narrest\nauthority\nbbc\ncoal\nconnection\neleven\nlisten\nmilosevic\nminer\nnew\nopposition\nplan\npolitician\npresident\nsabotage\nslobodan\nstrike\nsuspicion\ntwo\nworld\nyugoslav\n'