# Custom Sources - Example: Remove Stopwords

**Goal:** `open alice from gutenberg. Remove stopwords from 100 most common words`

In [1]:
import nltk

In [2]:
alice = nltk.corpus.gutenberg.words("carroll-alice.txt")

In [3]:
# Normalize words: Make sure all words are alpha (no punctuation)
# Then lowercase all words 
alice = [word.lower() for word in alice if word.isalpha()]
alice[:8]

['alice', 's', 'adventures', 'in', 'wonderland', 'by', 'lewis', 'carroll']

In [4]:
# Frequency distribution of the normalized words
alice_fd = nltk.FreqDist(alice)
alice_fd

FreqDist({'the': 1642, 'and': 872, 'to': 729, 'a': 632, 'it': 595, 'she': 553, 'i': 543, 'of': 514, 'said': 462, 'you': 411, ...})

In [5]:
# Grab 100 most common words
alice_100 = alice_fd.most_common(100)
alice_100[:6]
# It gives the most common words and their count/ frequency

[('the', 1642),
 ('and', 872),
 ('to', 729),
 ('a', 632),
 ('it', 595),
 ('she', 553)]

In [6]:
# Select out the most common words and not their count
alice_common = [word[0] for word in alice_100]

In [7]:
# First 12 most common words
alice_common[:12]

['the', 'and', 'to', 'a', 'it', 'she', 'i', 'of', 'said', 'you', 'alice', 'in']

**Now we will load up the stop words and remove them from the alice_common list**

In [8]:
descriptive = list(set(alice_common) - set(nltk.corpus.stopwords.words("english")))

In [9]:
descriptive

['began',
 'turtle',
 'like',
 'see',
 'would',
 'alice',
 'know',
 'much',
 'king',
 'well',
 'way',
 'gryphon',
 'say',
 'mock',
 'went',
 'think',
 'queen',
 'go',
 'could',
 'said',
 'rabbit',
 'time',
 'one',
 'little',
 'head',
 'first',
 'quite',
 'thought',
 'hatter']

`These are the descriptive words among the 100 most common words in alice in wonderland after removing the stopwords`

In [39]:
len(alice_common), len(descriptive)

(100, 29)

### 100 most common words reduced to only 29 descriptive words