In [1]:
import sys, subprocess

def install(pkg):
    subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

try:
    import requests
except ImportError:
    install("requests")
    import requests

try:
    import nltk
except ImportError:
    install("nltk")
    import nltk

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("averaged_perceptron_tagger")
nltk.download("maxent_ne_chunker")
nltk.download("words")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [2]:
gutenberg_url = "https://www.gutenberg.org/cache/epub/77428/pg77428.txt"
response = requests.get(gutenberg_url, timeout=30)

if response.status_code != 200:
    raise RuntimeError(f"Failed to download: {response.status_code}")

raw_text = response.text


In [3]:
start_marker = "* START OF"
end_marker = "* END OF"
start_idx = raw_text.find(start_marker)
end_idx = raw_text.find(end_marker)
if start_idx != -1 and end_idx != -1:
    clean_text = raw_text[start_idx:end_idx]
else:
    clean_text = raw_text

# Save locally
with open("novel_raw.txt", "w", encoding="utf-8") as f:
    f.write(clean_text)

In [6]:
import re
import nltk
nltk.download('punkt_tab') # Download the missing resource

text_lower = clean_text.lower()
text_no_punct = re.sub(r"[^a-z0-9\s]", " ", text_lower)
text_clean = re.sub(r"\s+", " ", text_no_punct).strip()

# Tokenization
from nltk.tokenize import sent_tokenize, word_tokenize
sentences = sent_tokenize(clean_text)
words = word_tokenize(text_clean)

print(f"Sentences: {len(sentences)}")
print(f"Words: {len(words)}")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Sentences: 6854
Words: 80268


In [7]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
words_no_sw = [w for w in words if w not in stop_words and w.isalpha()]


In [8]:
from nltk.probability import FreqDist
fdist = FreqDist(words_no_sw)
print("\nTop 20 words (no stopwords):")
for word, freq in fdist.most_common(20):
    print(f"{word}: {freq}")


Top 20 words (no stopwords):
one: 278
conquest: 240
would: 216
tuan: 203
garon: 169
man: 153
upon: 145
like: 140
know: 140
black: 135
could: 129
muda: 129
lhassa: 128
white: 127
eyes: 127
seemed: 124
night: 120
said: 118
yes: 117
room: 116


In [9]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stems = [stemmer.stem(w) for w in words_no_sw]
fdist_stems = FreqDist(stems)
print("\nTop 20 stems:")
for stem, freq in fdist_stems.most_common(20):
    print(f"{stem}: {freq}")



Top 20 stems:
one: 278
conquest: 241
would: 216
tuan: 203
garon: 169
know: 164
man: 154
like: 151
seem: 149
upon: 145
black: 140
eye: 140
go: 137
smile: 137
white: 129
could: 129
muda: 129
lhassa: 128
night: 127
face: 127


In [13]:
import nltk
nltk.download('averaged_perceptron_tagger_eng') # Download the missing resource

sample_tokens = words_no_sw[:500]
pos_tags = nltk.pos_tag(sample_tokens)
print("\nSample POS tags (first 25):")
for token, tag in pos_tags[:25]:
    print(f"{token}: {tag}")

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.



Sample POS tags (first 25):
start: NN
project: NN
gutenberg: NN
ebook: VB
black: JJ
parrot: NN
black: JJ
parrot: NN
tale: NN
golden: JJ
chersonese: JJ
harry: NN
hervey: NN
author: NN
caravans: NNS
night: NN
etc: VBP
perceive: JJ
grace: NN
romance: NN
man: NN
exalted: VBD
animals: NNS
james: NNS
branch: VBP


In [18]:
sample_sents = sentences[:10]
tokenized_sents = [word_tokenize(s) for s in sample_sents]
pos_tagged_sents = [nltk.pos_tag(ts) for ts in tokenized_sents]
ner_chunks = [nltk.ne_chunk(tags) for tags in pos_tagged_sents]

print("\nNamed entities (sample):")
for tree in ner_chunks:
    for subtree in tree:
        if hasattr(subtree, "label") and subtree.label() in ("PERSON", "ORGANIZATION", "GPE"):
            entity = " ".join(token for token, _ in subtree.leaves())
            print(f"{subtree.label()}: {entity}")




Named entities (sample):
ORGANIZATION: THE
ORGANIZATION: PROJECT
ORGANIZATION: BLACK
ORGANIZATION: PARROT
ORGANIZATION: THE
ORGANIZATION: BLACK
ORGANIZATION: PARROT _A Tale
ORGANIZATION: Golden Chersonese_
ORGANIZATION: HARRY
ORGANIZATION: ETC
GPE: Romance
ORGANIZATION: JAMES
ORGANIZATION: BRANCH
ORGANIZATION: CABELL
ORGANIZATION: THE
ORGANIZATION: CENTURY
ORGANIZATION: BEDELL
ORGANIZATION: Khmers
PERSON: Manipur
PERSON: Arakan
ORGANIZATION: Lake
PERSON: Tonle Sap
GPE: Angkor
ORGANIZATION: Tevadas
PERSON: Naga
ORGANIZATION: MAN
ORGANIZATION: FROM
ORGANIZATION: BLUE
PERSON: _Cambodia_ V
ORGANIZATION: CONQUEST
ORGANIZATION: DREAM
ORGANIZATION: MALAY
ORGANIZATION: HOUSE
ORGANIZATION: BARABBAS
ORGANIZATION: BLACK
ORGANIZATION: PARROT
ORGANIZATION: MAN
ORGANIZATION: FROM
ORGANIZATION: Pacific
GPE: Nowhere


In [19]:
from nltk.util import ngrams
bigrams = list(ngrams(words_no_sw, 2))
trigrams = list(ngrams(words_no_sw, 3))
fdist_bi = FreqDist(bigrams)
fdist_tri = FreqDist(trigrams)

print("\nTop 10 bigrams:")
for bg, freq in fdist_bi.most_common(10):
    print(f"{' '.join(bg)}: {freq}")

print("\nTop 10 trigrams:")
for tg, freq in fdist_tri.most_common(10):
    print(f"{' '.join(tg)}: {freq}")



Top 10 bigrams:
tuan muda: 129
barth lemy: 100
black parrot: 52
abu hassan: 41
tuan rajah: 40
dr garth: 36
blue slendong: 21
emerald buddha: 20
barabbas town: 19
stephen conquest: 19

Top 10 trigrams:
captain barth lemy: 17
mr da vargas: 15
cap st jacques: 13
le perroquet noir: 10
pi noi bayadere: 8
woman peacock shawl: 5
man blue slendong: 4
man scarred wrists: 4
drew deep breath: 4
black parrot know: 4


In [23]:
text_tokens_original = word_tokenize(clean_text)
text_obj = nltk.Text(text_tokens_original)
print("\nConcordance for 'Guiana':")
try:
    text_obj.concordance("Guiana", width=80, lines=5)
except Exception as e:
    print(f"No concordance output: {e}")



Concordance for 'Guiana':
Displaying 5 of 19 matches:
remember ? _ CONTENTS I THE MAN FROM GUIANA II EPISODE III THE BLUE SLENDONG IV 
 BLACK PARROT CHAPTER I THE MAN FROM GUIANA He had come up from that necklace of
iting . If so , he began near home ; Guiana 's just across from Hades , you know
rchant of Hai Fong , who was sent to Guiana . The prisoners dubbed him the Black
apes -- or evasions , as they say in Guiana . Then , one day , the very man who 
