In [1]:
from faker import Faker
import re
from collections import defaultdict
from typing import Iterator
from remove_non_word_signs import removeNonWordSigns

---

In [4]:
# Create a Faker generator and produce a sample text; max_nb_chars=1000
# is passed to cap the length of the generated text.
# NOTE(review): presumably this is demo input for the functions below — confirm.
fake = Faker()
text = fake.text(max_nb_chars=1000)

### Text to Words

In [8]:
def toWords(text: str) -> list[str]:
    """Return every ``\\w+`` token of the lower-cased ``text``, sorted ascending.

    Duplicates are kept, so the result has one entry per occurrence.
    """
    lowered = text.lower()
    tokens = re.findall(r'\w+', lowered)
    tokens.sort()
    return tokens

## Frequencies

### Simple

In [11]:
def toFrequencyDictionary(words: list[str]) -> dict[str, int]:
    """Count how many times each word occurs in ``words``.

    The result is a ``defaultdict(int)`` keyed by word, with keys in
    first-occurrence order; looking up an unseen word yields 0.
    """
    counts = defaultdict(int)
    for token in words:
        # Unseen tokens start from the defaultdict's implicit 0.
        counts[token] = counts[token] + 1
    return counts

### Accumulate

In [13]:
def reduceWordsFrequencies(acc: dict[str, int], wordsFrequencies: dict[str, int]) -> dict[str, int]:
    """Add the counts from ``wordsFrequencies`` onto ``acc`` and return ``acc``.

    ``acc`` is updated in place and the same object is returned, so the
    function can be used as a fold step. ``acc`` may be a ``defaultdict`` or
    a plain ``dict``: words missing from ``acc`` are treated as having a
    count of 0 instead of raising ``KeyError``.

    Args:
        acc: Accumulated word counts; mutated in place.
        wordsFrequencies: Word counts to add onto ``acc``.

    Returns:
        The same ``acc`` object, with the added counts.
    """
    for word, count in wordsFrequencies.items():
        # .get() avoids depending on acc having a default factory.
        acc[word] = acc.get(word, 0) + count

    return acc

### Sorting

In [15]:
def sortWordsFrequenciesByFrequency(wordsFrequencies: dict[str, int]) -> dict[str, int]:
    """Return a new dict ordered by descending frequency, ties alphabetical.

    The input mapping is not modified.
    """
    # Keys are unique, so sorting the (word, count) pairs orders them by word.
    pairs = sorted(wordsFrequencies.items())
    # The sort is stable even with reverse=True, so words sharing a count
    # keep the alphabetical order established above.
    pairs.sort(key=lambda pair: pair[1], reverse=True)

    return dict(pairs)

## Text to words and frequencies

### Text

In [18]:
def getWordsFrequencies(text: str) -> dict[str, int]:
    """Return the word counts of ``text``.

    The text is passed through ``removeNonWordSigns``, split with
    ``toWords`` and counted with ``toFrequencyDictionary``.
    """
    return toFrequencyDictionary(toWords(removeNonWordSigns(text)))

In [19]:
def getWordsFrequenciesFromText(text: str) -> dict[str, int]:
    """Return the word counts of ``text`` as ordered by
    ``sortWordsFrequenciesByFrequency``.
    """
    return sortWordsFrequenciesByFrequency(getWordsFrequencies(text))

### Lines

In [21]:
def getAndReduceWordsFrequencies(lines: Iterator[str]) -> dict[str, int]:
    """Accumulate word counts over all ``lines`` into a single mapping.

    Each line is counted with ``getWordsFrequencies`` and folded into a
    running ``defaultdict(int)`` via ``reduceWordsFrequencies``. The result
    is not sorted.
    """
    totals = defaultdict(int)

    for currentLine in lines:
        totals = reduceWordsFrequencies(totals, getWordsFrequencies(currentLine))

    return totals

In [22]:
def getWordsFrequenciesFromLines(lines: Iterator[str]) -> dict[str, int]:
    """Return the combined word counts of ``lines`` as ordered by
    ``sortWordsFrequenciesByFrequency``.
    """
    return sortWordsFrequenciesByFrequency(getAndReduceWordsFrequencies(lines))