In [1]:
# Cleaning up
%reset -f

# Importing libraries
from os import makedirs, path
from lzma import decompress
from requests import get
from math import log

### I. `Data Exploration`


#### I.1. Load data


In [2]:
url = 'https://raw.githubusercontent.com/DsMix/Books/main/war_peace_processed.txt.xz'

data_dir = 'data/'

# Create the data folder named by data_dir; exist_ok makes this a no-op
# when the folder is already there, so no separate existence check is needed
makedirs(data_dir, exist_ok=True)

# Local file name: last URL segment with its final extension ('.xz') removed
data_path = path.join(data_dir, url.split('/')[-1].rsplit('.', 1)[0])

# Download file. The timeout keeps the cell from hanging forever, and
# raise_for_status() stops here on an HTTP error instead of handing an
# error page to the decompressor.
response = get(url, timeout=30)
response.raise_for_status()
content = response.content
unzip_file = decompress(content)

# Save file to data folder
with open(data_path, 'wb') as file:
    file.write(unzip_file)

print(f'Data saved to project folder:\t {data_path}')

Data saved to project folder:	 data/war_peace_processed.txt


#### I.2. Data overview


In [3]:
# Read file from data folder.
# The encoding is passed explicitly: without it open() uses the platform's
# locale encoding, so decoding the Cyrillic text would depend on the machine.
# NOTE(review): assumes the downloaded text is UTF-8 — confirm.
with open(data_path, 'r', encoding='utf-8') as file:
  data = file.read()

display(data[:43])

'1\nв\nдва\nраза\nкороче\nи\nв\nпять\nраз\nинтереснее'

#### I.3. Data preprocessing


In [4]:
# Split the text into chapters, then each chapter into its non-empty lines
# (one word per line)

chapters = data.split('[new chapter]')
chapters_words = [
  [token for token in chapter_text.split('\n') if token]
  for chapter_text in chapters
]

chapters_words[2]

['первая']

### II. `Data Analysis`


#### II.1. Word Count


In [5]:
# Flatten the per-chapter word lists into a single list of all words
words_all = [word for chapter in chapters_words for word in chapter]
chapters_words_count = {}  # created empty; this cell never fills it

words_unique = set(words_all) # unique words

chapters_count = len(chapters)
words_count = len(words_all)
words_unique_count = len(words_unique)

print('Chapters:\t', chapters_count)
print('Words all:\t', words_count)
print('Words unique:\t', words_unique_count)

Chapters:	 171
Words all:	 299909
Words unique:	 38210


#### II.2. Frequency of Words in the Text


In [6]:
# Number of occurrences of every word across the whole text
data_freq = {}

for token in words_all:
  data_freq[token] = data_freq.get(token, 0) + 1

# Show most frequent words in the text
sorted(data_freq.items(), key=lambda pair: pair[1], reverse=True)[:5]


[('и', 14592), ('--', 9680), ('в', 6997), ('не', 5922), ('он', 5266)]

#### II.3. Frequency of Words in Each Document (Chapter)


In [7]:
# One {word: occurrences} dict per chapter, in chapter order
chapters_words_freq = []

for chapter in chapters_words:
  counts = {}
  for word in chapter:
    counts[word] = counts.get(word, 0) + 1
  chapters_words_freq.append(counts)

In [8]:
# Check frequency of word in a chapter

def get_word_freq(word, chapter_index):
  """Return the number of occurrences of `word` in one chapter.

  Returns 0 when the word does not occur in the chapter. When
  `chapter_index` is outside [0, chapters_count) a message is printed
  and None is returned.
  """
  in_range = 0 <= chapter_index < chapters_count
  if in_range:
    return chapters_words_freq[chapter_index].get(word, 0)
  print('Chapter index out of range')
  return None

get_word_freq('анна', 4)

7

#### II.4. **`TF`** (term frequency)

<p>shows how often a term occurs in a document (chapter)</p>

This measure is used to `score` the `relevance` of a term in a document (chapter)

$$ tf_{word, chapter} = \frac{n_{word, chapter}}{n_{chapter}} $$

where

- $tf_{word, chapter}$ - term frequency of the word in the chapter
- $n_{word, chapter}$ - number of times the word occurs in the chapter, $n_{word, chapter} \leq n_{chapter}$
- $n_{chapter}$ - number of words in the chapter


In [9]:
# tf[i][word] = occurrences of word in chapter i / total words in chapter i
tf = []
for word_counts, chapter in zip(chapters_words_freq, chapters_words):
  chapter_size = len(chapter)  # computed once per chapter, not once per word
  tf.append({word: count / chapter_size for word, count in word_counts.items()})

In [10]:
# Check the result

def get_tf_word(word='', chapter_index=''):
  """Look up the term frequency of a word in one chapter.

  Args:
    word: Word to look up; asked for via input() when left as ''.
      It is lower-cased and stripped before the lookup.
    chapter_index: Index into `tf`, given as an int or a numeric string;
      asked for via input() when left as ''.

  Returns:
    The tf value on success, otherwise a str describing the problem
    (non-integer index, index out of range, or word absent from the chapter).
  """
  if chapter_index == '':
    chapter_index = input('Enter a chapter index: ')
  if isinstance(chapter_index, str):
    try:
      chapter_index = int(chapter_index)
    except ValueError:
      # Only a failed conversion is handled; anything else should surface
      return 'Chapter index must be an integer'
  if chapter_index < 0:
    # A negative value would otherwise silently index from the end of tf
    return 'Chapter index must be non-negative'
  if chapter_index >= len(tf):
    return 'Chapter index must be less than ' + str(len(tf))
  if word == '':
    word = input('Enter a word: ')
  word = word.lower().strip()
  if word not in tf[chapter_index]:
    return 'Word not found in the chapter[' + str(chapter_index) + ']'
  return tf[chapter_index][word]


round(get_tf_word('гостья', 15), 6)

0.007358

#### II.5. **`DF`** (Document Frequency)

<p>represents how widespread a term is across a collection of documents (chapters)</p>

- `Absolute DF`:<br>
  counts the number of documents (chapters) that contain the term (word).
  $$ df_{\text{term}}^{\text{abs}} = N_{\text{term}} $$

- `Normalized DF`:<br>
  gives the proportion of documents (chapters) in the corpus (book) that contain the term (word).
  $$ df_{\text{term}}^{\text{norm}} = \frac{N_{\text{term}}}{N} $$

Where:

- $df_{\text{term}}^{\text{abs}}$ - absolute document frequency of the term (word)
- $df_{\text{term}}^{\text{norm}}$ - normalized document frequency of the term (word)
- $N_{\text{term}}$ - number of documents (chapters) containing the term (word)
- $N$ - total number of documents (chapters)


In [11]:
# Calculate df_abs and df_norm

# df_abs[word]: number of chapters containing the word.
# Each per-chapter dict holds a word at most once, so a single pass over
# those dicts counts every (word, chapter) pair exactly once.
df_abs = {}
for document in chapters_words_freq:
  for word in document:
    df_abs[word] = df_abs.get(word, 0) + 1

# df_norm[word]: share of all chapters that contain the word
df_norm = {word: count / chapters_count for word, count in df_abs.items()}

In [12]:
# Check the result

def get_df_word(word, N=False):
  """Return the document frequency of a word.

  Args:
    word: Word to look up; lower-cased and stripped before the lookup.
    N: Truthy selects the normalized value from `df_norm`, falsy the
      absolute count from `df_abs`.

  Returns:
    A (label, word, value) tuple, or the str 'Word not found' when the
    word is missing from the selected table.
  """
  word = word.lower().strip()
  # Truthiness is used instead of `== True` / `== False` so that every
  # flag value selects one of the two tables.
  if N:
    label, table = 'df normalized', df_norm
  else:
    label, table = 'df absolute', df_abs
  if word not in table:
    return 'Word not found'
  return label, word, table[word]

print(get_df_word('человек', True))
print(get_df_word('анна'))

('df normalized', 'человек', 0.672514619883041)
('df absolute', 'анна', 32)


#### II.6. **`IDF`** (inverse document frequency)

<p>A measure used to calculate the importance of a term relative to its frequency across multiple documents (chapters).</p>

- `Raw idf`:
  <p>Used to score the importance of a term across numerous documents.</p>
$$idf_{term} = \frac{N}{df_{term}}$$

- `Logarithmic idf`:
  <p>Used to dampen the effects of terms with very high or very low DF.</p>  
$$idf_{term} = \log\left(\frac{N}{df_{term}}\right)$$

- `Smoothed idf`:
  <p>Used to avoid division by zero.</p>

$$idf_{term} = \log\left(\frac{N}{1 + df_{term}}\right)$$

where:

- $idf_{term}$ - inverse document frequency of the term.
- $df_{term}$ - document frequency of the term.
- $N$ - total number of documents (chapters) in the corpus (book).
- $\log$ - natural logarithm: $\log_e$


#### II.7. **`TF-IDF`** (term frequency - inverse document frequency)

<p>Measure of importance of a word to a document in a collection or corpus</p>

$tf-idf = \text{term frequency} \times \text{inverse document frequency}$

$$ tf-idf_{term, chapter} = tf_{term, chapter} \times \log\left(\frac{N}{df_{term}}\right) $$

where

- $tf-idf_{term, chapter}$ - the tf-idf of the term in the document (chapter).
- $tf_{term, chapter}$ - the term frequency of the term in the document (chapter).
- $df_{term}$ - the absolute document frequency of the term.
- $N$ - the total number of documents (chapters) in the corpus (book).
- $\log$ - the natural logarithm: $\log_e$


In [13]:
N = chapters_count

# tf_idf[i][term] = tf of the term in chapter i * ln(N / chapters containing it)
tf_idf = [
  {term: tf[idx][term] * log(N / df_abs[term]) for term in word_counts}
  for idx, word_counts in enumerate(chapters_words_freq)
]


In [14]:
# Check the result

def get_tf_idf_word(word, chapter_index):
  """Return the tf-idf score of a word in one chapter.

  The word is lower-cased and stripped before the lookup. When
  `chapter_index` is outside [0, chapters_count) a message is printed and
  None is returned; when the word is absent from the chapter a str saying
  so is returned instead of a score.
  """
  in_range = 0 <= chapter_index < chapters_count
  if not in_range:
    print('Chapter index out of range')
    return None
  key = word.lower().strip()
  scores = tf_idf[chapter_index]
  if key in scores:
    return scores[key]
  return 'Word not found in the chapter[' + str(chapter_index) + ']'

target_word = 'анна'
target_chapter = 4
round(get_tf_idf_word(target_word, target_chapter), 6)

0.011067

In [15]:
# Get top n tf-idf values for a chapter

n = 3 # number of top values
target_chapter = 3

# (term, score) pairs of the chosen chapter, highest score first
sorted_tf_idf = sorted(
    tf_idf[target_chapter].items(),
    key=lambda item: item[1],
    reverse=True,
)

# Print the top n tf-idf values as: rank, score (6 decimals), term
for rank, (term, score) in enumerate(sorted_tf_idf[:n], start=1):
    print(f"{rank}\t {score:.6f}\t {term}")


1	 0.014655	 павловна
2	 0.009814	 анна
3	 0.006948	 функе
