In [1]:
import requests
# download the text
result = requests.get('http://www.gutenberg.org/files/36/36-0.txt')

# This line removes the header and footer
text = result.text[840:].split("*** END")[0]

# This line removes the (weird) non ascii characters
text = text.encode('ascii',errors='ignore').decode('utf-8')

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prajw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#1. 
tokens_low = [tk.lower() for tk in text.split() ]
print(f"{len(set(tokens_low))} unique lowercase tokens")

10046 unique lowercase tokens


In [4]:
# 2.
def chartokenizer(word):
    # extracts all the characters in a given word
    return [c for c in word]

tokens = [txt for txt in text.split()]

chars = []
for tk in tokens :
    chars += chartokenizer(tk)
    
lowercase_chars = [c.lower() for c in chars]
print('Total number of characters : ', len(chars))

print("Number of unique_chars : ", len(set(chars)))

print("Number of unique lowercase characters : ", len(set(lowercase_chars)))

Total number of characters :  274677
Number of unique_chars :  70
Number of unique lowercase characters :  44


In [5]:
# 3.
from nltk.tokenize import WordPunctTokenizer
from collections import Counter
# instantiate the tokenizer and tokenize in one line
tokens = WordPunctTokenizer().tokenize(text)

In [6]:
tokens_count = Counter(tokens)
print("20 most common tokens:\n",tokens_count.most_common(20))
print(f"\n- tokens_count['the'] + tokens_count['The'] = {tokens_count['the'] + tokens_count['The'] } which is less than 5000 ")
total_token_count = len(tokens)
print(total_token_count)

20 most common tokens:
 [('the', 4399), (',', 4134), ('.', 3141), ('and', 2358), ('of', 2284), ('a', 1529), ('I', 1264), ('to', 1157), ('in', 920), ('was', 850), ('that', 739), ('had', 565), ('it', 483), ('with', 435), ('my', 411), ('as', 402), ('at', 369), ('were', 368), ('on', 360), ('The', 346)]

- tokens_count['the'] + tokens_count['The'] = 4745 which is less than 5000 
68580


In [7]:
top_10_count = 0

# token_count is a tuple: (token, count)
for token_count in tokens_count.most_common(10):
    top_10_count += token_count[1]
print(top_10_count)

print(f"Top 10 most common words make up {top_10_count/total_token_count*100 : .2f}% of total words")

22036
Top 10 most common words make up  32.13% of total words


In [8]:
# 4.
from nltk.tokenize import WordPunctTokenizer

from nltk.corpus import stopwords

stopwords = stopwords.words("english")

In [9]:
tokens = [tk.lower() for tk in tokens if tk.lower() not in stopwords]
Counter(tokens).most_common(20)

[(',', 4134),
 ('.', 3141),
 ('-', 321),
 (';', 243),
 ('!', 202),
 ('one', 196),
 ('upon', 171),
 ('martians', 167),
 ('said', 166),
 ('people', 158),
 ('came', 150),
 ('saw', 131),
 ('towards', 129),
 ('black', 122),
 ('?', 121),
 ('time', 120),
 ('man', 119),
 ('could', 116),
 ('little', 112),
 ('road', 104)]

In [10]:
# 5.
import spacy
nlp = spacy.load("en_core_web_sm")

doc = nlp(text)
lemmas = [token.lemma_ for token in doc if not (token.is_stop or token.is_punct)]
lemmas[:10]

[' ',
 'Wells',
 '\r\n\r\n\r\n\r\n\r\n   ',
 'shall',
 'dwell',
 'world',
 'inhabit',
 '\r\n    ',
 'Lords',
 'World']

In [11]:
capitalized = [tk for tk in lemmas if tk.capitalize() == tk]
Counter(capitalized).most_common(20)

[('\r\n', 4376),
 ('\r\n\r\n', 853),
 ('Martians', 163),
 ('London', 56),
 ('Woking', 50),
 ('Martian', 48),
 ('Mars', 42),
 ('Heat', 36),
 ('Ray', 34),
 ('\r\n\r\n\r\n\r\n\r\n', 31),
 ('\r\n\r\n\r\n', 30),
 ('Hill', 29),
 ('\r\n ', 25),
 ('Weybridge', 24),
 ('Horsell', 23),
 ('Street', 23),
 ('Ogilvy', 20),
 ('Sunday', 20),
 ('Thames', 20),
 ('God', 19)]

In [12]:
# 6.
persons = [ent.text for ent in doc.ents if ent.label_ in ['PERSON']]
gpes = [ent.text for ent in doc.ents if ent.label_ in ['GPE']]
locs = [ent.text for ent in doc.ents if ent.label_ in ['LOC']]

In [13]:
Counter(persons).most_common(20)

[('Londonward', 10),
 ('Ottershaw', 7),
 ('Chertsey', 7),
 ('Surrey', 7),
 ('Stent', 6),
 ('Heat-Ray', 6),
 ('Walton', 6),
 ('Weybridge', 5),
 ('Waterloo', 4),
 ('Ripley', 4),
 ('gaunt', 4),
 ('Putney Hill', 4),
 ('bush', 3),
 ('Kew', 3),
 ('Ditton', 3),
 ('George', 3),
 ('Elphinstone', 3),
 ('Miss Elphinstone', 3),
 ('Sheen', 3),
 ('Henderson', 2)]

In [14]:
Counter(gpes).most_common(20)

[('London', 56),
 ('Woking', 23),
 ('Richmond', 11),
 ('Henderson', 10),
 ('Byfleet', 9),
 ('LONDON', 6),
 ('Shepperton', 6),
 ('Halliford', 6),
 ('England', 4),
 ('Pyrford', 4),
 ('Sunbury', 4),
 ('Maybury', 3),
 ('Titan', 3),
 ('Strand', 3),
 ('Kilburn', 3),
 ('Ripley', 3),
 ('Edgware', 3),
 ('Stanmore', 3),
 ('Ulla', 3),
 ('Cardigan', 2)]

In [15]:
Counter(locs).most_common(20)

[('Mars', 42),
 ('Regents Park', 5),
 ('Venus', 4),
 ('EARTH', 2),
 ('Send', 2),
 ('New Barnet', 2),
 ('Sparks', 1),
 ('Smiths', 1),
 ('Earth', 1),
 ('the South-Eastern', 1),
 ('West Surrey', 1),
 ('Regent Street', 1),
 ('the Thames Valley', 1),
 ('Richmond Hill', 1),
 ('Richmond Park', 1),
 ('East Barnet', 1),
 ('Asia', 1),
 ('the Black Smoke', 1),
 ('Kensington Gardens', 1),
 ('Hyde Park', 1)]

In [16]:
# 7.
lemmas = [token.lemma_.lower() for token in doc if not (token.is_stop or token.is_punct)]

from nltk.util import ngrams
bigram_generator = ngrams(lemmas, n = 2)
bigrams = [tk for tk in bigram_generator]
Counter(bigrams).most_common(20)

[(('come', '\r\n'), 41),
 (('heat', 'ray'), 37),
 (('house', '\r\n'), 37),
 (('see', '\r\n'), 36),
 (('\r\n', 'martians'), 33),
 (('\r\n', 'man'), 33),
 (('martians', '\r\n'), 32),
 (('man', '\r\n'), 30),
 (('\r\n', 'see'), 30),
 (('go', '\r\n'), 29),
 (('\r\n', 'people'), 28),
 (('\r\n', 'come'), 26),
 (('\r\n', 'house'), 25),
 (('red', 'weed'), 25),
 (('say', '\r\n\r\n'), 24),
 (('time', '\r\n'), 24),
 (('\r\n', 'road'), 23),
 (('people', '\r\n'), 23),
 (('thing', '\r\n'), 22),
 (('\r\n', '\r\n\r\n'), 22)]

In [17]:
trigram_generator = ngrams(lemmas, n = 3)
trigrams = [tk for tk in trigram_generator]
Counter(trigrams).most_common(20)

[(('ulla', 'ulla', 'ulla'), 11),
 (('\r\n', 'heat', 'ray'), 8),
 (('heat', 'ray', '\r\n'), 8),
 (('black', 'smoke', '\r\n'), 7),
 (('red', 'weed', '\r\n'), 7),
 (('handling', 'machine', '\r\n'), 6),
 (('\r\n', 'sand', 'pit'), 5),
 (('\r\n', 'handling', 'machine'), 5),
 (('not', 'know', '\r\n'), 4),
 (('far', 'away', '\r\n'), 4),
 (('\r\n\r\n', 'come', 'say'), 4),
 (('see', '\r\n', 'martians'), 4),
 (('people', 'come', '\r\n'), 4),
 (('heat', 'ray', '\r\n\r\n'), 4),
 (('\r\n', 'fighting', 'machine'), 4),
 (('st.', 'georges', 'hill'), 4),
 (('st.', 'johns', 'wood'), 4),
 (('\r\n', 'strange', 'thing'), 3),
 (('hear', '\r\n', 'people'), 3),
 (('smoke', 'rise', '\r\n'), 3)]

In [18]:
# 8.
sentences = text.split('.')
[s for s in sentences if "ulla" in s.lower()]

[' It was a sobbing alternation of\r\ntwo notes, Ulla, ulla, ulla, ulla, keeping on perpetually',
 '\r\n\r\nUlla, ulla, ulla, ulla, wailed that superhuman notegreat waves of\r\nsound sweeping down the broad, sunlit roadway, between the tall\r\nbuildings on each side',
 '\r\n\r\nUlla, ulla, ulla, ulla, cried the voice, coming, as it seemed to me,\r\nfrom the district about Regents Park',
 '\r\n\r\nI awoke to find that dismal howling still in my ears, Ulla, ulla,\r\nulla, ulla',
 ' That perpetual sound of Ulla,\r\nulla, ulla, ulla, confused my mind',
 ' As the yelping died away down the silent road, the\r\nwailing sound of Ulla, ulla, ulla, ulla, reasserted itself',
 '\r\n\r\nAs I crossed the bridge, the sound of Ulla, ulla, ulla, ulla, ceased']

In [19]:
# 9.
import numpy as np

punctuation_signs = ['!', '(', ')', ',', '-', '.',':', ';', '?','_']

# Splitting the text over '.', since we are considering the complete sentence
sentences = text.split('.')

# for each sentence add its length
sentence_length = []
for s in sentences:
  tokens = [tk for tk in WordPunctTokenizer().tokenize(s) if tk not in punctuation_signs]
  sentence_length.append(len(tokens))

# average sentence length
np.mean(sentence_length)

19.180807117890055

In [None]:
# 10.
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
sentences = [sent.text.replace('\r\n', ' ').strip() for sent in doc.sents]

sdoc = nlp(sentence)
[token for token in sdoc if token.pos_ == 'ADJ']