In [19]:
import nltk
nltk.download('punkt')
from nltk.probability import FreqDist
import math

# Input corpora. Only the sample files are read in this cell; `norm_files_list`
# is defined here so that it is available to later cells.
norm_files_list = [
    'norm_wiki_en.txt',
    'norm_wiki_eo.txt',
    'norm_wiki_et.txt',
    'norm_wiki_ht.txt',
    'norm_wiki_la.txt',
    'norm_wiki_nv.txt',
    'norm_wiki_so.txt',
]
sample_files_list = [f'sample{index}.txt' for index in range(6)]

# One entropy value per sample file, in processing order.
entropy_words_list = []
entropy_chars_list = []

# Start from an empty per-file report: opening in 'w' mode truncates the file.
with open('sample_results.txt', 'w', encoding='utf-8') as report:
    report.write('')

for sample_path in sample_files_list:
    with open(sample_path, 'r', encoding='utf-8') as source:
        text = source.read()

    # Word tokens come from NLTK's tokenizer; every character of the raw text
    # (whitespace and punctuation included) counts as a character token.
    words = nltk.word_tokenize(text)
    chars = list(text)

    # Occurrence counts per distinct token.
    fdist_words = FreqDist(words)
    fdist_chars = FreqDist(chars)

    # Relative frequency of each distinct token.
    word_total = len(words)
    char_total = len(chars)
    prob_words = {word: freq / word_total for word, freq in fdist_words.items()}
    prob_chars = {char: freq / char_total for char, freq in fdist_chars.items()}

    # Shannon entropy in bits: H = -sum(p * log2(p)).
    entropy_words = -sum(p * math.log2(p) for p in prob_words.values())
    entropy_chars = -sum(p * math.log2(p) for p in prob_chars.values())

    # The same two lines are echoed to the notebook and appended to the report.
    word_line = f'{sample_path}, Entropy of words: {entropy_words}'
    char_line = f'{sample_path}, Entropy of characters: {entropy_chars}'
    print(word_line)
    print(char_line)

    with open('sample_results.txt', 'a', encoding='utf-8') as report:
        report.write(word_line + '\n')
        report.write(char_line + '\n')

    entropy_words_list.append(entropy_words)
    entropy_chars_list.append(entropy_chars)

# Aggregate statistics across all sample files.
average_entropy_words = sum(entropy_words_list) / len(entropy_words_list)
average_entropy_chars = sum(entropy_chars_list) / len(entropy_chars_list)
min_entropy_words = min(entropy_words_list)
min_entropy_chars = min(entropy_chars_list)
max_entropy_words = max(entropy_words_list)
max_entropy_chars = max(entropy_chars_list)

# NOTE(review): the summary is appended to 'entropy_results.txt', not to the
# 'sample_results.txt' file truncated above, and that file is never cleared
# here, so re-running the cell adds another copy -- confirm this is intended.
summary_lines = [
    f'\nAverage entropy of words: {average_entropy_words}\n',
    f'Average entropy of characters: {average_entropy_chars}\n',
    f'Min entropy of words: {min_entropy_words}\n',
    f'Min entropy of characters: {min_entropy_chars}\n',
    f'Max entropy of words: {max_entropy_words}\n',
    f'Max entropy of characters: {max_entropy_chars}\n\n',
]
with open('entropy_results.txt', 'a', encoding='utf-8') as summary:
    summary.writelines(summary_lines)





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rtoit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


sample0.txt, Entropy of words: 7.7487413861401295
sample0.txt, Entropy of characters: 4.273001240566633
sample1.txt, Entropy of words: 11.500700199498642
sample1.txt, Entropy of characters: 4.1270061355497205
sample2.txt, Entropy of words: 8.023869815826425
sample2.txt, Entropy of characters: 3.9933118002325836
sample3.txt, Entropy of words: 9.061122986146984
sample3.txt, Entropy of characters: 3.9302978341579875
sample4.txt, Entropy of words: 17.129669111070662
sample4.txt, Entropy of characters: 4.253809567379015
sample5.txt, Entropy of words: 16.509437287775633
sample5.txt, Entropy of characters: 4.441688018481797


In [22]:
def generate_ngrams(text, n):
    """Return every contiguous character n-gram of `text`, in order, as a list.

    Kept (defined once, outside the loop) for any other cell that calls it;
    the entropy computation below counts n-grams lazily instead of building
    this list, which for a whole corpus file is very large.
    """
    return [text[i:i+n] for i in range(len(text) - n + 1)]


def conditional_entropy(text, n):
    """Return the conditional entropy, in bits, of the n-th character given the
    preceding n-1 characters, estimated from the character n-grams of `text`.

    H = sum over n-grams g of  p(g) * log2(count(context(g)) / count(g))

    where context(g) is g without its last character and count(context) is the
    number of n-grams that start with that context. Deriving the context
    counts from the n-gram counts themselves keeps numerator and denominator
    on the same normalisation, so every ratio is >= 1 and the result is never
    negative.

    Returns 0.0 when `text` is shorter than `n` (there are no n-grams).
    """
    # Count lazily: no intermediate list of all n-grams is materialised.
    ngram_counts = FreqDist(text[i:i + n] for i in range(len(text) - n + 1))
    total = ngram_counts.N()
    if total == 0:
        return 0.0

    # Marginalise the n-gram counts over the last character.
    context_counts = {}
    for ngram, count in ngram_counts.items():
        context = ngram[:-1]
        context_counts[context] = context_counts.get(context, 0) + count

    return sum(
        (count / total) * math.log2(context_counts[ngram[:-1]] / count)
        for ngram, count in ngram_counts.items()
    )


# One conditional-entropy value per corpus file, in processing order.
entropy_bigrams_list = []
entropy_trigrams_list = []

for file in norm_files_list:
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()

    # The context distribution is computed from this file's own n-grams. The
    # previous version divided by `prob_chars`, which is whatever character
    # distribution the word/character entropy cell left behind (its last
    # sample file), not the distribution of the file being processed.
    entropy_bigrams = conditional_entropy(text, 2)
    entropy_trigrams = conditional_entropy(text, 3)

    print(f'Conditional entropy of bigrams: {entropy_bigrams}')
    print(f'Conditional entropy of trigrams: {entropy_trigrams}')

    # Append per file (rather than once at the end) so results already
    # computed survive an interrupted run. The file name matches the one the
    # summary below is written to.
    with open('entropy_results.txt', 'a', encoding='utf-8') as f:
        f.write(f'{file}, Conditional entropy of bigrams: {entropy_bigrams}\n')
        f.write(f'{file}, Conditional entropy of trigrams: {entropy_trigrams}\n')

    entropy_bigrams_list.append(entropy_bigrams)
    entropy_trigrams_list.append(entropy_trigrams)

# Aggregate statistics across all corpus files.
average_entropy_bigrams = sum(entropy_bigrams_list) / len(entropy_bigrams_list)
average_entropy_trigrams = sum(entropy_trigrams_list) / len(entropy_trigrams_list)
min_entropy_bigrams = min(entropy_bigrams_list)
min_entropy_trigrams = min(entropy_trigrams_list)
max_entropy_bigrams = max(entropy_bigrams_list)
max_entropy_trigrams = max(entropy_trigrams_list)

with open('entropy_results.txt', 'a', encoding='utf-8') as f:
    f.write(f'\nAverage conditional entropy of bigrams: {average_entropy_bigrams}\n')
    f.write(f'Average conditional entropy of trigrams: {average_entropy_trigrams}\n')
    f.write(f'Min conditional entropy of bigrams: {min_entropy_bigrams}\n')
    f.write(f'Min conditional entropy of trigrams: {min_entropy_trigrams}\n')
    f.write(f'Max conditional entropy of bigrams: {max_entropy_bigrams}\n')
    f.write(f'Max conditional entropy of trigrams: {max_entropy_trigrams}\n\n')

KeyboardInterrupt: 