# Load the book

In [3]:
with open("texts/miracle_in_the_andes.txt", "r", encoding="utf-8") as file:
    book = file.read()

In [7]:
type(book)

str

# How many chapters?

### With string methods

In [9]:
book.count("Chapter")

11

### With regex

In [11]:
import re

In [16]:
pattern = re.compile("Chapter [0-9]+")
findings = re.findall(pattern, book)
print(findings)
len(findings)

['Chapter 1', 'Chapter 2', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Chapter 6', 'Chapter 7', 'Chapter 8', 'Chapter 9', 'Chapter 10']


10

# Which sentences contain the word "love"?

In [22]:
pattern = re.compile("[A-Z]{1}[^.]*[^a-zA-Z]*[L|l]ove[^a-zA-Z]+[^.]*.")
findings = re.findall(pattern, book)
len(findings)

67

# What are the most used words?

In [25]:
pattern = re.compile("[a-z]+")
findings = re.findall(pattern, book.lower())
d = {}
for word in findings:
    if word in d.keys():
        d[word] = d[word] + 1
    else:
        d[word] = 1

In [30]:
d_list =[(value, key) for (key, value) in d.items()]
sorted(d_list, reverse = True)[:10]

[(5346, 'the'),
 (2795, 'and'),
 (2729, 'i'),
 (2400, 'to'),
 (2060, 'of'),
 (1566, 'a'),
 (1430, 'was'),
 (1419, 'in'),
 (1226, 'we'),
 (1169, 'my')]

# Extract the paragraphs where "love" is used

In [38]:
pattern = re.compile("[^\n]*love[^\n]*")
findings = re.findall(pattern, book)
len(findings)

60

# Extract the chapter titles

In [57]:
pattern = re.compile("Chapter [0-9]+\n\n(.*)\n")
findings = [match.group(1) for match in re.finditer(pattern, book)]
findings

['Before',
 'Everything Precious',
 'A Promise',
 'Breathe Once More',
 'Abandoned',
 'Tomb',
 'East',
 'The Opposite of Death',
 'I See a Man',
 'After']

# Function that finds the number of occurrences of any word

In [58]:
def find_word(word, book=book):
    pattern = re.compile(f"[^a-z]+{word}[^a-z]+")
    findings = re.findall(pattern, book.lower())
    if findings:
        return len(findings)
    else:
        return f'The book does not contain the word "{word}".'

# Call the function

In [59]:
find_word("love")

83

In [60]:
find_word("hate")

'The book does not contain the word "hate".'

# The most used non-article words

In [65]:
d_list = sorted(d_list, reverse = True)
d_list[:10]

[(5346, 'the'),
 (2795, 'and'),
 (2729, 'i'),
 (2400, 'to'),
 (2060, 'of'),
 (1566, 'a'),
 (1430, 'was'),
 (1419, 'in'),
 (1226, 'we'),
 (1169, 'my')]

In [72]:
import nltk
nltk.download("stopwords")
english_stopwords = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RubenduPon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [77]:
filtered_words = [(word, count) for count, word in d_list if word not in english_stopwords]
filtered_words[:10]

[('would', 575),
 ('us', 519),
 ('said', 292),
 ('roberto', 284),
 ('could', 252),
 ('one', 249),
 ('snow', 227),
 ('mountain', 183),
 ('time', 182),
 ('like', 165)]

# Sentiment Analysis: What are the most positive and negative chapters?

In [79]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [82]:
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\RubenduPon\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [87]:
pattern = re.compile("Chapter [0-9]+")
chapters = re.split(pattern, book)

In [None]:
chapters = chapters[1:]

In [90]:
for chapter in chapters:
    scores = analyzer.polarity_scores(chapter)
    print(scores)

{'neg': 0.061, 'neu': 0.779, 'pos': 0.16, 'compound': 1.0}
{'neg': 0.12, 'neu': 0.726, 'pos': 0.154, 'compound': 0.9991}
{'neg': 0.145, 'neu': 0.751, 'pos': 0.105, 'compound': -0.9999}
{'neg': 0.141, 'neu': 0.721, 'pos': 0.138, 'compound': -0.9963}
{'neg': 0.118, 'neu': 0.742, 'pos': 0.141, 'compound': 0.9997}
{'neg': 0.124, 'neu': 0.761, 'pos': 0.115, 'compound': -0.9979}
{'neg': 0.136, 'neu': 0.761, 'pos': 0.103, 'compound': -0.9999}
{'neg': 0.12, 'neu': 0.786, 'pos': 0.094, 'compound': -0.9998}
{'neg': 0.097, 'neu': 0.824, 'pos': 0.079, 'compound': -0.9996}
{'neg': 0.086, 'neu': 0.733, 'pos': 0.181, 'compound': 1.0}
