# Preprocessing  
1. cleaning text
1. split to words
1. regularizing words
1. remove stop words
1. convert words to IDs
1. padding

# cleaning text 

In [2]:
from bs4 import BeautifulSoup

def clean_html(html, strip=False):
    """Return the text of an HTML string with the markup removed.

    Args:
        html: HTML markup to parse with the built-in 'html.parser' backend.
        strip: Forwarded to ``get_text``; when true, surrounding whitespace
            is stripped from the extracted text.

    Returns:
        The extracted text as a string.
    """
    parsed = BeautifulSoup(html, 'html.parser')
    return parsed.get_text(strip=strip)

In [3]:
html = '\n\n\n    this is example. \n  \n'

In [4]:
clean_html(html)

'\n\n\n    this is example. \n  \n'

In [5]:
clean_html(html, strip = True)

'this is example.'

In [6]:
text = "let's write documents with MKDocs"

In [7]:
import re 
def clean_hashtag(text):
    """Delete every hashtag (``#`` followed by ASCII letters) from ``text``.

    Only the ``#`` and the letters directly after it are removed; the
    surrounding whitespace is left untouched.
    """
    hashtag_pattern = re.compile(r'#[a-zA-Z]+')
    return hashtag_pattern.sub('', text)

## hashtag

In [8]:
def clean_hashtag(text):
    """Clean hashtags, treating trailing and in-sentence tags differently.

    A run of space-prefixed hashtags at the very end of ``text`` is dropped
    entirely. A hashtag surrounded by single spaces inside the text is
    replaced by its bare word, with the ``#`` and both spaces removed.
    """
    trailing_tags = re.compile(r'( #[a-zA-Z]+)+$')
    inline_tag = re.compile(r' #(?P<tag>[a-zA-Z]+) ')

    # Trailing tags must go first; otherwise they could not be told apart
    # from in-sentence tags once the text has been rewritten.
    without_trailing = trailing_tags.sub('', text)
    return inline_tag.sub(r'\g<tag>', without_trailing)

In [9]:
clean_hashtag('機械学習やるなら #python がいいよね。 #jupyter #pycon #scipy')

'機械学習やるならpythonがいいよね。'

## split words

In [10]:
from janome.tokenizer import Tokenizer
# Sample sentence reused by the following tokenization cells.
text = '彼女と国立新美術館ヘ行った。'
# Tokenizer created with default options.
t = Tokenizer()
# Print one token per line (see the cell output for the printed format).
for token in t.tokenize(text):
    print(token)

彼女	名詞,代名詞,一般,*,*,*,彼女,カノジョ,カノジョ
と	助詞,格助詞,一般,*,*,*,と,ト,ト
国立	名詞,一般,*,*,*,*,国立,コクリツ,コクリツ
新	接頭詞,名詞接続,*,*,*,*,新,シン,シン
美術館	名詞,一般,*,*,*,*,美術館,ビジュツカン,ビジュツカン
ヘ	助詞,格助詞,一般,*,*,*,ヘ,ヘ,エ
行っ	動詞,自立,*,*,五段・ワ行促音便,連用タ接続,行う,オコナッ,オコナッ
た	助動詞,*,*,*,特殊・タ,基本形,た,タ,タ
。	記号,句点,*,*,*,*,。,。,。


In [11]:
# wakati mode: the tokenizer yields plain surface strings instead of tokens.
t = Tokenizer(wakati = True)
# Newer Janome releases return a generator from tokenize(); wrapping it in
# list() makes the cell display the words on every version.
list(t.tokenize(text))

['彼女', 'と', '国立', '新', '美術館', 'ヘ', '行っ', 'た', '。']

In [12]:
from janome.analyzer import Analyzer
from janome.tokenfilter import POSKeepFilter

# Filter chain that keeps only tokens tagged 名詞 (noun).
token_filters = [POSKeepFilter('名詞')]
a = Analyzer(token_filters=token_filters)
# Print the tokens that survive the filter, one per line.
for token in a.analyze(text):
    print(token)

彼女	名詞,代名詞,一般,*,*,*,彼女,カノジョ,カノジョ
国立	名詞,一般,*,*,*,*,国立,コクリツ,コクリツ
美術館	名詞,一般,*,*,*,*,美術館,ビジュツカン,ビジュツカン


# Add word to dictionary

In [17]:
# with open('data/userdic.csv', 'a') as f:
#     f.write('国立新美術館,1288,1288,100,名詞,固有名詞,一般,*,*,*,国立新美術館,コクリツシンビジュツカン,コクリツシンビジュツカン')

In [18]:
from janome.tokenizer import Tokenizer
# Tokenizer that additionally loads the user dictionary CSV (read as UTF-8).
t = Tokenizer(udic = 'data/userdic.csv', udic_enc = 'utf8')

In [19]:
for token in t.tokenize(text):
    print(token)

彼女	名詞,代名詞,一般,*,*,*,彼女,カノジョ,カノジョ
と	助詞,並立助詞,*,*,*,*,と,ト,ト
国立新美術館	名詞,固有名詞,一般,*,*,*,国立新美術館,コクリツシンビジュツカン,コクリツシンビジュツカン
ヘ	助詞,格助詞,一般,*,*,*,ヘ,ヘ,エ
行っ	動詞,自立,*,*,五段・ワ行促音便,連用タ接続,行う,オコナッ,オコナッ
た	助動詞,*,*,*,特殊・タ,基本形,た,タ,タ
。	記号,句点,*,*,*,*,。,。,。


# Regularization

## unification of character types
upper-case to lower-case.

In [23]:
text = 'President Obama is speaking at the White House.'
text.lower()

'president obama is speaking at the white house.'

In [24]:
text.upper()

'PRESIDENT OBAMA IS SPEAKING AT THE WHITE HOUSE.'

In [25]:
text.title()

'President Obama Is Speaking At The White House.'

## Replace digit-numbers  
If we assume that digit-numbers are not useful for our task (for example, classifying news into 'sports' or 'politics'), we should replace them with another symbol.  

In [26]:
import re 
def normalize_number(text):
    """Collapse every run of consecutive digits in ``text`` to a single '0'."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('0', text)

In [27]:
text = '2万0689.24ドル'
normalize_number(text)

'0万0.0ドル'

If you want to keep the number of digits unchanged, you can replace each digit individually, as below.

In [28]:
def normalize_number(text):
    """Replace each digit in ``text`` with '0', keeping the digit count."""
    single_digit = re.compile(r'\d')
    return single_digit.sub('0', text)

In [29]:
normalize_number(text)

'0万0000.00ドル'

## Unification of words with the same meaning using a dictionary  
Sentences frequently contain words that are spelled differently but have the same meaning.  
In such situations, we can use a custom dictionary to convert them to a single spelling.  

ex)
* ソニー, Sony, sony 
* パナソニック, Panasonic, Pana, 松下電器

# Remove stop-words  
Stop-words are words that are not useful for NLP tasks, for example 「は」 and 「の」.  
They occur frequently and have bad effects on the computational cost and quality of preprocessing.   

methods:
1. using dictionary
1. using frequency

## Remove using dictionary

In [30]:
# import requests
# url = 'http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt'
# content_text = requests.get(url).content.decode('utf-8')
# with open('data/Japanese.txt', 'a') as f:
#     f.write(content_text)

In [33]:
# Load the stop-word list: one entry per line, surrounding whitespace removed,
# collected into a set for fast membership tests.
with open('data/Japanese.txt', 'r', encoding = 'utf-8') as f:
    stopwords = {line.strip() for line in f}

In [34]:
def remove_stopwords(words, stopwords):
    """Return the items of ``words`` that are not in ``stopwords``.

    Args:
        words: Iterable of words to filter.
        stopwords: Container of words to drop (anything supporting ``in``).

    Returns:
        A new list with the remaining words in their original order.
    """
    kept = []
    for word in words:
        if word in stopwords:
            continue
        kept.append(word)
    return kept

In [35]:
from janome.tokenizer import Tokenizer
# wakati mode: the tokenizer yields plain surface strings instead of tokens.
t = Tokenizer(wakati = True)
text = 'りんごをいくつか買う'
# Newer Janome releases return a generator from tokenize(); materialize it so
# `words` can be displayed here and reused by the following cells.
words = list(t.tokenize(text))
words

['りんご', 'を', 'いくつ', 'か', '買う']

In [36]:
remove_stopwords(words, stopwords)

['りんご', 'を', 'か', '買う']

## Remove using frequency

First of all, we should build the stop-word list from text.

In [37]:
import requests
url = 'https://s3-ap-northeast-1.amazonaws.com/dev.tech-sketch.jp/chakki/public/ja.text8.zip'
# Stream the archive to disk in chunks instead of loading it into memory.
# The response is used as a context manager so the connection is released
# when the download ends, whether it succeeds or fails.
with requests.get(url, stream=True) as dl_file:
    # Fail loudly on HTTP errors rather than saving an error page as the zip.
    dl_file.raise_for_status()
    with open('data/ja_text.zip', 'wb') as f:
        for chunk in dl_file.iter_content(chunk_size = 1024):
            if chunk:
                # No per-chunk flush: closing the file flushes the buffer.
                f.write(chunk)

In [41]:
# Extract the zip file
import zipfile
import os
target_directory = 'data'
# Only open the archive when the corpus has not been extracted yet, and
# close it as soon as extraction is finished.
if 'ja.text8' not in os.listdir(target_directory):
    with zipfile.ZipFile('data/ja_text.zip') as zfile:
        zfile.extractall(target_directory)
else:
    print('already exists')

already exists


In [42]:
# Read the whole corpus into memory, then split it into whitespace-separated
# tokens once the file has been closed.
with open('data/ja.text8', 'r', encoding = 'utf-8') as f:
    text = f.read()
words = text.split()

In [44]:
from collections import Counter 
Counter(['cat', 'dog', 'cat'])

Counter({'cat': 2, 'dog': 1})

In [46]:
fdist = Counter(words)

In [47]:
fdist.most_common(n = 10)

[('の', 828585),
 ('、', 785716),
 ('。', 532921),
 ('に', 527014),
 ('は', 488009),
 ('を', 423115),
 ('た', 421908),
 ('が', 353221),
 ('で', 350821),
 ('て', 259995)]

# ID conversion  
Allocate an ID number to each word and replace the word with its ID.

In [52]:
# Special tokens: PAD is used for padding, UNK stands in for unknown words.
PAD = '<PAD>'
UNK = '<UNK>'

# The two special tokens take ids 0 and 1.
vocab = {}
vocab[PAD] = 0
vocab[UNK] = 1

# Corpus words follow in descending frequency order; each new word receives
# the current vocabulary size as its id.
for word, _ in fdist.most_common():
    vocab[word] = len(vocab)

In [54]:
words = ['私', 'は', '元気']
word_ids = [vocab.get(w, vocab[UNK]) for w in words]
word_ids

[1151, 6, 7901]

In [62]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
sequences = [[1,2], [3,4,5], [6,7,8,9]]
pad_sequences(sequences)

array([[0, 0, 1, 2],
       [0, 3, 4, 5],
       [6, 7, 8, 9]], dtype=int32)

In [64]:
pad_sequences(sequences, padding = 'post') # pad at the end (post-padding)

array([[1, 2, 0, 0],
       [3, 4, 5, 0],
       [6, 7, 8, 9]], dtype=int32)

In [65]:
pad_sequences(sequences, maxlen = 3)

array([[0, 1, 2],
       [3, 4, 5],
       [7, 8, 9]], dtype=int32)

In [68]:
pad_sequences(sequences, maxlen = 3, truncating = 'post')

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]], dtype=int32)

In [69]:
pad_sequences(sequences, value = 99)

array([[99, 99,  1,  2],
       [99,  3,  4,  5],
       [ 6,  7,  8,  9]], dtype=int32)

# Practice preprocessing

* Split word
* Cleaning HTML-tag
* Regularization of digit-numbers
* Word conversion to original form
* Character conversion to lower case