# Stop Words

Stop words are common words in a language that are often useful to filter out because they carry little meaning on their own, for example 'and' or 'I' in English, or 'が' or 'を' in Japanese.

In [1]:
import pandas as pd
import spacy

In [2]:
# Load the spaCy pipelines used throughout this notebook:
#   enlp - English pipeline ('en_core_web_trf')
#   jnlp - Japanese pipeline ('ja_core_news_lg')
enlp = spacy.load('en_core_web_trf')
jnlp = spacy.load('ja_core_news_lg')

## Listing out Default Stop Words of a Language

In [3]:
# Find default stop words in English.
# `Defaults.stop_words` is a set, so the order in which the words are
# printed is arbitrary.
print(f'Number of stop words: {len(enlp.Defaults.stop_words)}')
print(enlp.Defaults.stop_words)

Number of stop words: 326
{'thereafter', 'whole', 'did', 'nine', 'twelve', 'for', 'ever', 'using', 'only', 'two', 'onto', 'herein', 'used', 'you', 'whenever', '’ve', 'on', 'itself', 'through', 'though', 'ca', 'hereby', 'neither', 'mostly', 'thence', 'upon', 'namely', 'behind', 'thereby', '‘d', 'can', 'become', 'who', 'himself', 'former', 'among', 'nothing', 'this', '‘ll', 'he', 'part', 'eleven', 'made', 'another', 'ten', 'very', 'whose', 'otherwise', 'somehow', 'several', 'perhaps', 'your', 'that', 'there', 'indeed', 'once', 'nobody', 'she', 'via', 'every', 'everyone', 'afterwards', 'here', 'same', 'bottom', 'side', 'must', 'top', 'as', 'doing', 'myself', 'own', 'many', 'to', 'n‘t', 'other', 'fifty', 'front', 'show', 'take', 'whom', 'off', 'themselves', 'whether', 'few', 'unless', '‘m', 'regarding', 'ours', 'give', 'back', 'per', 'herself', 'everywhere', 'go', 'are', 'seem', 'eight', 'all', 'they', 'down', 'say', 'see', 'n’t', 'of', 'well', 'however', 'with', 'where', 'am', 'whereas', 

In [4]:
# Find default stop words in Japanese.
# `Defaults.stop_words` is a set, so the order in which the words are
# printed is arbitrary.
print(f'Number of stop words: {len(jnlp.Defaults.stop_words)}')
print(jnlp.Defaults.stop_words)

Number of stop words: 154
{'うち', 'まま', 'きっかけ', 'ため', 'なお', 'なし', 'ず', 'ぶり', 'は', 'で', 'かなり', 'ほど', 'られる', 'いう', 'すべて', 'ほか', 'および', 'る', 'それぞれ', 'て', 'しかし', 'なく', 'しか', 'いつ', 'ご', 'が', 'こう', 'かつ', 'す', 'よく', 'な', 'べき', 'なる', 'だ', 'いい', 'さ', 'ただし', 'せる', 'いずれ', 'しよう', 'お', 'き', 'か', 'でき', 'を', 'つい', 'に', 'かつて', 'い', 'たい', 'あまり', 'の', 'など', 'あ', 'から', 'ちゃん', 'そこ', 'とっ', 'ね', 'つつ', 'これ', 'なかっ', 'なり', 'まで', 'られ', 'ら', 'え', 'どう', 'み', 'いっ', 'れ', 'へ', 'おい', 'その', 'また', 'なっ', 'もと', 'らしい', 'より', 'そして', 'あるいは', 'よれ', 'よっ', 'ます', 'ぬ', 'も', 'たり', 'つ', 'する', 'あっ', 'とき', 'かけ', 'た', 'ここ', 'くる', 'し', 'だっ', 'ところ', 'だけ', 'にて', 'ほぼ', 'たら', 'しまっ', 'やっ', 'あり', 'ほとんど', 'ま', 'そう', 'こ', 'おり', 'おら', 'と', 'とも', 'いる', 'よ', 'それ', 'すぐ', '一', 'しまう', 'のみ', 'もの', 'ながら', 'ば', 'なら', 'こと', 'ひと', 'はじめ', 'れる', 'あれ', 'ん', 'なけれ', 'ある', 'できる', 'たち', 'おけ', 'のち', 'いく', 'ない', 'や', 'さん', 'つけ', 'さらに', 'もう', 'ごと', 'くん', 'です', 'よう', 'いわ', 'この', 'せい', 'せ', 'よる', 'ち', 'もっ'}


## Checking if a Word is a Stop Word

In [5]:
enlp.vocab['he'].is_stop

True

In [6]:
enlp.vocab['hello'].is_stop

False

In [7]:
jnlp.vocab['こ'].is_stop

True

In [8]:
jnlp.vocab['こんにちは'].is_stop

False

In [9]:
def stopWordsInfo(doc: spacy.tokens.Doc) -> pd.DataFrame:
    """Tabulate every token of ``doc`` together with its stop-word flag.

    Parameters
    ----------
    doc : spacy.tokens.Doc
        Processed document whose tokens are inspected.

    Returns
    -------
    pd.DataFrame
        One row per token, in document order, with the columns
        ``'Text'`` (``token.text``) and ``'Is Stop'`` (``token.is_stop``).
    """
    column_names = ['Text', 'Is Stop']

    # One [text, flag] row per token
    rows = [[tok.text, tok.is_stop] for tok in doc]

    return pd.DataFrame(data=rows, columns=column_names)

In [10]:
# Run the English pipeline on a sample sentence; the bare name displays the Doc
edoc1 = enlp('It is great to have you here')
edoc1

It is great to have you here

In [11]:
# Run the Japanese pipeline on a sample sentence; the bare name displays the Doc
jdoc1 = jnlp('今日はお天気がいいですね！')
jdoc1

今日はお天気がいいですね！

In [12]:
stopWordsInfo(edoc1)

Unnamed: 0,Text,Is Stop
0,It,True
1,is,True
2,great,False
3,to,True
4,have,True
5,you,True
6,here,True


In [13]:
stopWordsInfo(jdoc1)

Unnamed: 0,Text,Is Stop
0,今日,False
1,は,True
2,お,True
3,天気,False
4,が,True
5,いい,True
6,です,True
7,ね,True
8,！,False


## Adding a Stop Word

There may be times when you wish to add a new stop word to the default set. Perhaps you decide that `'btw'` (common shorthand for "by the way") should be considered a stop word.

In [14]:
# Check that before adding 'btw' to the stop words set, it isn't a stop word
enlp.vocab['btw'].is_stop

False

In [15]:
# Step 1: add the word to the English default stop-word set. Use lowercase!
enlp.Defaults.stop_words.add('btw')

# Step 2: set the stop-word flag on the word's lexeme in this pipeline's vocab
enlp.vocab['btw'].is_stop = True

In [16]:
len(enlp.Defaults.stop_words)

327

In [17]:
enlp.vocab['btw'].is_stop

True

<font color='red'>When adding stop words, always use lowercase. spaCy lowercases a token's text before checking it against the stop-word set, so an entry that is not lowercase will never match.</font>

In [18]:
# Check that before adding '名前' to the stop words set, it isn't a stop word
j_word = '名前'
jnlp.vocab[j_word].is_stop

False

In [19]:
# Step 1: add the Japanese word held in `j_word` to the Japanese default stop-word set
jnlp.Defaults.stop_words.add(j_word)

# Step 2: set the stop-word flag on the word's lexeme in this pipeline's vocab
jnlp.vocab[j_word].is_stop = True

In [20]:
len(jnlp.Defaults.stop_words)

155

In [21]:
stopWordsInfo(jnlp('私の名前はspaCyです。'))

Unnamed: 0,Text,Is Stop
0,私,False
1,の,True
2,名前,True
3,は,True
4,spaCy,False
5,です,True
6,。,False


## Removing a Stop Word

Later, you may decide that `'btw'` should no longer be considered a stop word.

In [22]:
# Remove the word from the set of stop words.
# `discard` (unlike `remove`) does not raise KeyError when the word is
# already absent, so this cell can safely be re-run.
enlp.Defaults.stop_words.discard('btw')

# Remove the stop_word tag from the lexeme
enlp.vocab['btw'].is_stop = False

In [23]:
len(enlp.Defaults.stop_words)

326

In [24]:
enlp.vocab['btw'].is_stop

False

In [25]:
# Remove the Japanese word '名前' that was set as a stop word
# from the set of stop words.
# `discard` (unlike `remove`) does not raise KeyError when the word is
# already absent, so this cell can safely be re-run.
print(j_word)
jnlp.Defaults.stop_words.discard(j_word)

# Remove the stop_word tag from the lexeme
jnlp.vocab[j_word].is_stop = False

名前


In [26]:
len(jnlp.Defaults.stop_words)

154

In [27]:
stopWordsInfo(jnlp('私の名前はspaCyです。'))

Unnamed: 0,Text,Is Stop
0,私,False
1,の,True
2,名前,False
3,は,True
4,spaCy,False
5,です,True
6,。,False
