# Load the book

In [1]:
# Read the entire book into memory as one string.
with open("miracle_in_the_andes.txt", mode="r", encoding="utf-8") as book_file:
    book = book_file.read()

In [2]:
# Confirm what read() returned before running string and regex operations on it.
type(book)

str

# How many chapters?

### With string methods (incorrect)

In [3]:
# str.count tallies every occurrence of the substring "Chapter", so any
# mention of the word outside a numbered heading inflates the result.
book.count("Chapter")

11

### With regex

In [4]:
import re

In [5]:
# A chapter heading is the word "Chapter", one space, then one or more digits.
chapter_heading = "Chapter [0-9]+"
pattern = re.compile(chapter_heading)

In [6]:
# Collect every heading match; the number of matches is the chapter count.
chapters = pattern.findall(book)
len(chapters)

10

# Which sentences contain the word "love"?

In [7]:
# A match starts at a capital letter, runs without crossing a period, contains
# "love" bounded on both sides by non-letters, and ends at the next period.
# NOTE: only "." terminates a match, so text ending in "?" or "!" is not
# treated as a sentence boundary.
love_sentence = r"[A-Z]{1}[^.]*[^a-zA-Z]+love[^a-zA-Z]+[^.]*\."
pattern = re.compile(love_sentence)
findings = pattern.findall(book)
len(findings)

67

# What are the most used words?

In [8]:
# Tokenise the lower-cased book into runs of ASCII letters.
# The class must be [a-zA-Z]: the range A-z also covers the ASCII characters
# that sit between "Z" and "a" ([ \ ] ^ _ `), which would otherwise be glued
# onto words or counted as words of their own.
pattern = re.compile("[a-zA-Z]+")
words = re.findall(pattern, book.lower())

In [9]:
# Tally how often each word appears in the book.
occurrences = {}
for token in words:
    occurrences[token] = occurrences.get(token, 0) + 1

# (count, word) tuples sort by count first; ties fall back to the word itself.
occurrence_list = [(count, token) for token, count in occurrences.items()]
sorted(occurrence_list, reverse=True)[:3]

[(5346, 'the'), (2795, 'and'), (2729, 'i')]

# The most used words (excluding stopwords)

In [10]:
# The per-word counts in `occurrences` were already computed in the previous
# section and `words` has not changed since, so reuse them instead of
# recounting the whole book. Only the sorted (count, word) list is rebuilt,
# this time kept in a variable for the filtering step that follows.
occurrence_list = sorted(
    ((count, word) for word, count in occurrences.items()),
    reverse=True,
)

occurrence_list[:3]

[(5346, 'the'), (2795, 'and'), (2729, 'i')]

In [11]:
import nltk
from nltk.corpus import stopwords

# One-time setup: uncomment the next line to fetch the stopword corpus if it
# is not already available locally.
#nltk.download('stopwords')

# Load NLTK's English stopword list; it is used below to drop very common words.
english_stopwords = stopwords.words("english")

In [13]:
# Build a set once: testing membership against the raw stopword list would
# rescan that list for every distinct word in the book.
stopword_set = set(english_stopwords)

# occurrence_list is already sorted by descending count, so the filtered
# list keeps that ordering.
filtered_words = [
    (count, word)
    for count, word in occurrence_list
    if word not in stopword_set
]

filtered_words[:5]

[(575, 'would'), (519, 'us'), (292, 'said'), (284, 'roberto'), (252, 'could')]

# Sentiment analysis: What is the most positive and the most negative chapter?

In [27]:
from nltk.sentiment import SentimentIntensityAnalyzer

#nltk.download('vader_lexicon')

### An example

In [28]:
# Create a single analyzer instance; it is reused for every text scored below.
analyzer = SentimentIntensityAnalyzer()

In [29]:
# Score one short sentence; the result is a dict whose "pos" and "neg"
# entries are compared in the next cell.
scores = analyzer.polarity_scores("I love the trees outside.")

In [30]:
# Compare the positive and negative scores. Equal scores (for example a text
# with no sentiment-bearing words at all) are reported as neutral instead of
# falling through to "negative".
if scores["pos"] > scores["neg"]:
    print("It is a positive text")
elif scores["pos"] < scores["neg"]:
    print("It is a negative text")
else:
    print("It is a neutral text")

It is a positive text


In [31]:
# Score the whole book in one call.
# NOTE(review): the recorded result has 'compound' at 1.0 although 'pos' and
# 'neg' are nearly equal; compound appears to saturate on very long texts, so
# the 'pos'/'neg' values are likely the more informative ones here — confirm.
analyzer.polarity_scores(book)

{'neg': 0.116, 'neu': 0.76, 'pos': 0.125, 'compound': 1.0}

### Chapters sentiment analysis

In [36]:
import re

# Cut the book at every chapter heading. The first piece is whatever precedes
# the first heading, so it is dropped to leave one string per chapter.
pattern = re.compile("Chapter [0-9]+")
chapters = pattern.split(book)[1:]

In [40]:
# Print the sentiment scores of each chapter, numbering chapters from 1.
for chapter_nr, chapter_text in enumerate(chapters, start=1):
    scores = analyzer.polarity_scores(chapter_text)
    print(chapter_nr, scores)

1 {'neg': 0.061, 'neu': 0.779, 'pos': 0.16, 'compound': 1.0}
2 {'neg': 0.12, 'neu': 0.726, 'pos': 0.154, 'compound': 0.9991}
3 {'neg': 0.145, 'neu': 0.751, 'pos': 0.105, 'compound': -0.9999}
4 {'neg': 0.141, 'neu': 0.721, 'pos': 0.138, 'compound': -0.9963}
5 {'neg': 0.118, 'neu': 0.742, 'pos': 0.141, 'compound': 0.9997}
6 {'neg': 0.124, 'neu': 0.761, 'pos': 0.115, 'compound': -0.9979}
7 {'neg': 0.136, 'neu': 0.761, 'pos': 0.103, 'compound': -0.9999}
8 {'neg': 0.12, 'neu': 0.786, 'pos': 0.094, 'compound': -0.9998}
9 {'neg': 0.097, 'neu': 0.824, 'pos': 0.079, 'compound': -0.9996}
10 {'neg': 0.086, 'neu': 0.733, 'pos': 0.181, 'compound': 1.0}
