# What words are most used in poems?
We will investigate the Project Gutenberg corpus of poems to find out. It is assumed that the "quick-experiments" notebook has been run to obtain the corpus. Since the corpus consists largely of older texts, we will expand the set of stop words to account for archaic terms.

In [1]:
import gzip, json
import re
from collections import Counter
import nltk

In [2]:
# Load the gzipped NDJSON corpus: every non-blank line of the file is parsed
# as one JSON document and collected into `all_lines`.
# The `with` block closes the file handle even if a line fails to parse;
# blank lines are skipped because `json.loads` would raise on them.
with gzip.open("gutenberg-poetry-v001.ndjson.gz") as corpus_file:
    all_lines = [json.loads(raw) for raw in corpus_file if raw.strip()]

In [3]:
# Tokens excluded from the frequency count: NLTK's English stop-word list,
# extended by hand with older word forms, contractions and a few stray
# punctuation tokens.
stop_words = set(nltk.corpus.stopwords.words("english")).union(
    {
        "thy", "thou", "yet", "thee", "thus", "like", "shall", "upon", "would", "let",
        "us", "me", "come", "could", "though", "must", "may", ".", "'tis", "go", "hath",
        "ye", "came", "/", "o'", "--", "i'll", "i'm", "'twas", "there's", "th'", "'til",
        "o'er", "till", "thine", "mine", "unto", "doth", "since", "an'",
    }
)

In [4]:
# Normalise each record's "s" text: lower-case it and delete commas and
# double quotes in a single translate pass.
_drop_chars = str.maketrans("", "", ',"')
line_text = [record["s"].lower().translate(_drop_chars) for record in all_lines]

# Flatten the cleaned lines into one list of whitespace-separated tokens.
split_line = [token for text in line_text for token in text.split()]

In [5]:
# Tokens that survive stop-word filtering; order and duplicates are kept so
# they can be counted afterwards.
unsplit_line = [token for token in split_line if token not in stop_words]

In [6]:
# The 100 most frequent non-stop-word tokens as (token, count) pairs,
# most frequent first.
counts = Counter(unsplit_line).most_common(n=100)
counts

[('one', 54569),
 ('love', 39450),
 ('heart', 29736),
 ('see', 28616),
 ('still', 28527),
 ('man', 27023),
 ('little', 26744),
 ('old', 26182),
 ('day', 25648),
 ('never', 24271),
 ('eyes', 24136),
 ('life', 23709),
 ('long', 23444),
 ('said', 22054),
 ('great', 21736),
 ('men', 20648),
 ('made', 20100),
 ('know', 19896),
 ('god', 19842),
 ('sweet', 19719),
 ('night', 19543),
 ('whose', 19309),
 ('light', 19124),
 ('many', 19047),
 ('every', 18891),
 ('time', 18501),
 ('good', 18360),
 ('far', 18136),
 ('first', 18052),
 ('fair', 17470),
 ('hand', 17304),
 ('well', 17301),
 ('make', 16645),
 ('ever', 16511),
 ('earth', 16351),
 ('might', 16220),
 ('world', 16063),
 ('soul', 15809),
 ('round', 14777),
 ('last', 14712),
 ('saw', 14584),
 ('say', 13879),
 ('away', 13745),
 ('high', 13615),
 ('face', 13613),
 ('death', 13533),
 ('sun', 13270),
 ('back', 13258),
 ('king', 13245),
 ('thought', 13087),
 ('way', 13002),
 ('take', 12986),
 ('give', 12752),
 ('two', 12629),
 ('heaven', 12624),
 

In [7]:
# For comparison: the 300 most frequent tokens before any stop-word filtering.
Counter(split_line).most_common(n=300)

[('the', 1306085),
 ('and', 832878),
 ('of', 533157),
 ('to', 443925),
 ('a', 363104),
 ('in', 343001),
 ('i', 248851),
 ('that', 235010),
 ('with', 221273),
 ('his', 210137),
 ('he', 172201),
 ('for', 158052),
 ('my', 152525),
 ('is', 148548),
 ('as', 141122),
 ('on', 129292),
 ('from', 122053),
 ('but', 122000),
 ('her', 120705),
 ('all', 119248),
 ('it', 110637),
 ('was', 105615),
 ('not', 102952),
 ('by', 95086),
 ('they', 89053),
 ('you', 85911),
 ('their', 83347),
 ('so', 83187),
 ('at', 82083),
 ('be', 81988),
 ('me', 79195),
 ('when', 78874),
 ('this', 76244),
 ('or', 76089),
 ('thy', 70912),
 ('she', 70322),
 ('we', 68001),
 ('are', 67333),
 ('who', 64125),
 ('no', 62910),
 ('him', 61963),
 ('thou', 60940),
 ('have', 59198),
 ('then', 58925),
 ('one', 54569),
 ('what', 52898),
 ('which', 52300),
 ('your', 51685),
 ('now', 51492),
 ('like', 51362),
 ('our', 51285),
 ('there', 50559),
 ('had', 49779),
 ('will', 48382),
 ('if', 47723),
 ('where', 47694),
 ('were', 46071),
 ('shal

In [8]:
# Print just the words (without their counts) of the first 50 entries of
# `counts`, one per line.
print("\n".join(word for word, _ in counts[:50]))

one
love
heart
see
still
man
little
old
day
never
eyes
life
long
said
great
men
made
know
god
sweet
night
whose
light
many
every
time
good
far
first
fair
hand
well
make
ever
earth
might
world
soul
round
last
saw
say
away
high
face
death
sun
back
king
thought
