# Chapter 6 Sample Code
This file computes letter and n-gram frequencies and uses them to generate sequences of letters using a statistical approach.

In [1]:
import pandas as pd
import os
import re
import pickle
import random

### Download the Reuters dataset 

The next few cells parse the Reuters dataset files available here:
https://kdd.ics.uci.edu/databases/reuters21578/reuters21578.html

The files should be unzipped and untarred into the directory listed below with many .sgm files.

The frequency data are then written to .pkl files.

In [None]:
# Directory holding the unpacked Reuters .sgm files; the freq#.pkl files and freq2tbl.csv are written here too.
directory = 'ch6/'

### Parsing the Reuters sgm files into pkl frequency files

In [3]:
def extractBodyTextFromFile(filename):
    '''Extracts text from a filename, looking for <BODY> tags and returning the content between them.

    Args:
        filename: Path of the file to read. It is decoded as UTF-8 and
            undecodable bytes are dropped.

    Returns:
        A list of strings, one per input line that lies inside a
        <BODY>...</BODY> section, in file order. Anything up to and including
        <BODY>, and anything from </BODY> onwards, is cut from the line;
        line endings are left in place.
    '''
    inBody = False
    textblock = []
    with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
        for text in f:
            has_open = '<BODY>' in text
            has_close = '</BODY>' in text
            if has_open and has_close:
                # A complete body on a single line: keep only what sits between the tags.
                inBody = False
                text = re.sub(r'^.*<BODY>', '', text)
                text = re.sub(r'</BODY>.*$', '', text)
                textblock.append(text)
            elif has_open and inBody is False:
                # Body starts here and continues on the following lines.
                text = re.sub(r'^.*<BODY>', '', text)
                textblock.append(text)
                inBody = True
            elif has_close and inBody is True:
                # Body ends here; drop the closing tag and whatever follows it.
                text = re.sub(r'</BODY>.*$', '', text)
                textblock.append(text)
                inBody = False
            elif inBody:
                textblock.append(text)
    return textblock

In [4]:
def preprocess(text_block):
    '''Lowercase text removing newlines, leading and trailing space, and multiple spaces.

    Args:
        text_block: Iterable of strings; they are joined with single spaces.

    Returns:
        One lowercase string in which every run of whitespace (including
        newlines and carriage returns) is a single space, with no leading or
        trailing whitespace.
    '''
    text = ' '.join(text_block)
    # \s already covers \r and \n, so a single raw-string pattern does all the collapsing.
    return re.sub(r'\s+', ' ', text).lower().strip()
    

In [5]:
def computeFrequencies(text, ngram_len, frequencies):
    '''Updates and returns frequencies dictionary with ngrams of ngram_len from text.

    Args:
        text: String to scan.
        ngram_len: Number of characters in each ngram.
        frequencies: Dictionary mapping ngram -> count; it is updated in place.

    Returns:
        The same frequencies dictionary that was passed in.
    '''
    # A text of length L holds L - ngram_len + 1 ngrams; the "+ 1" makes sure
    # the ngram ending on the last character is counted as well.
    for i in range(len(text) - ngram_len + 1):
        key = text[i:i + ngram_len]
        frequencies[key] = frequencies.get(key, 0) + 1
    return frequencies

In [6]:
def purge_nonletters(ngrams):
    ''' Returns updated ngrams, removing those with non-letter non-space characters.

    A key is kept only when it is non-empty and every character is a lowercase
    a-z letter or whitespace. The dictionary is modified in place and the same
    object is returned.
    '''
    allowed = re.compile(r'[a-z\s]+')
    # Collect first: keys cannot be removed while the dictionary is being iterated.
    to_delete = [key for key in ngrams if not allowed.fullmatch(key)]
    for rejected in to_delete:
        del ngrams[rejected]
    return ngrams

In [7]:
def computeFreq(directory, ngram_len):
    '''Primary routine to iterate through all .sgm files in directory and compute ngram frequencies.

    Args:
        directory: Directory that is searched (non-recursively) for files ending in '.sgm'.
        ngram_len: Length of the ngrams to count.

    Returns:
        Dictionary mapping each ngram made only of a-z letters and whitespace
        to its number of occurrences over all files.
    '''
    # os.listdir returns names in arbitrary order; sorting keeps the insertion
    # order of the resulting dictionary the same from run to run.
    sgm_files = sorted(f for f in os.listdir(directory) if f.endswith('.sgm'))
    ngram_freq = {}
    for file in sgm_files:
        extract_text_blocks = extractBodyTextFromFile(os.path.join(directory, file))
        cleaned_block = preprocess(extract_text_blocks)
        ngram_freq = computeFrequencies(cleaned_block, ngram_len, ngram_freq)
        # Purging after every file keeps rejected ngrams from piling up in the dictionary.
        purge_nonletters(ngram_freq)
    return ngram_freq

Computes ngram frequencies for sizes 1 to MAX_NGRAM_SIZE and writes each to freq#.pkl.

In [8]:
MAX_NGRAM_SIZE = 7
# One full pass over the corpus per ngram size; each table is pickled as freq<size>.pkl in `directory`.
for ngrami in range(1, MAX_NGRAM_SIZE + 1):
    print('Processing ngram size', ngrami)
    freqOutput = computeFreq(directory, ngrami)
    pkl_path = os.path.join(directory, f'freq{ngrami}.pkl')
    with open(pkl_path, 'wb') as f:
        pickle.dump(freqOutput, f)
    print('Length of freq dictionary size', ngrami, len(freqOutput))

Processing ngram size 1
Length of freq dictionary size 1 27
Processing ngram size 2
Length of freq dictionary size 2 703
Processing ngram size 3
Length of freq dictionary size 3 9860
Processing ngram size 4
Length of freq dictionary size 4 53912
Processing ngram size 5
Length of freq dictionary size 5 172338
Processing ngram size 6
Length of freq dictionary size 6 420485
Processing ngram size 7
Length of freq dictionary size 7 823342


In [9]:
def checkTrainingSetSize(directory):
    ''' Prints the character count of the extracted body text before and after preprocess() is applied. '''
    raw_chars = 0
    cleaned_chars = 0
    for name in os.listdir(directory):
        if not name.endswith('.sgm'):
            continue
        blocks = extractBodyTextFromFile(os.path.join(directory, name))
        # Same single-space join that preprocess() applies before cleaning.
        raw_chars += len(' '.join(blocks))
        cleaned_chars += len(preprocess(blocks))
    print('Uncleaned', raw_chars, 'Cleaned', cleaned_chars)
checkTrainingSetSize(directory)


Uncleaned 16372459 Cleaned 15620895


Generate frequency dict of the letters within the text files.
Write the file to letter_frequencies.csv

In [10]:
def computeIndividualLetterFrequencies(directory):
    '''Computes letter frequencies in the text files in the given directory.

    Args:
        directory: Directory that is searched for files ending in '.sgm'.

    Returns:
        Dictionary with one entry per letter a-z (in alphabetical order) giving
        how often that letter occurs in the preprocessed body text of all
        files. Characters other than a-z are not counted.
    '''
    sgm_files = [f for f in os.listdir(directory) if f.endswith('.sgm')]
    letter_frequencies = dict.fromkeys('abcdefghijklmnopqrstuvwxyz', 0)
    for file in sgm_files:
        extract_text_blocks = extractBodyTextFromFile(os.path.join(directory, file))
        cleaned_block = preprocess(extract_text_blocks)
        # str.count scans the text once per letter, which avoids a Python-level
        # loop over every character of the corpus.
        for u in letter_frequencies:
            letter_frequencies[u] += cleaned_block.count(u)
    return letter_frequencies

letter_frequencies = computeIndividualLetterFrequencies(directory)
# Most frequent letter first; the table goes to the current working directory.
df = (
    pd.DataFrame(list(letter_frequencies.items()), columns=['letter', 'frequency'])
    .sort_values(by='frequency', ascending=False)
)
df.to_csv('letter_frequencies.csv', index=False)


### Generate the next letters from the ngram frequency data

In [11]:
def create_next_letter_from_current_frequency_table():
    ''' Generates pairwise frequency table for current letter and next letter.

    Reads freq2.pkl from the module-level `directory` and writes freq2tbl.csv
    to the same directory. Columns are the first character of each 2-gram
    (current letter), rows are the second character (next letter), and each
    cell holds the 2-gram's count; pairs that never occur are 0.
    '''
    # NOTE: pickle.load can execute arbitrary code; only load freq2.pkl files produced by this notebook.
    with open(os.path.join(directory, 'freq2.pkl'), 'rb') as f:
        freq2 = pickle.load(f)
    current_letters = sorted({key[0] for key in freq2})
    next_letters = sorted({key[1] for key in freq2})
    # Build the table as integer zeros up front instead of creating an all-NaN
    # object frame and relying on fillna to convert it afterwards.
    freqTable = pd.DataFrame(0, index=next_letters, columns=current_letters)
    for key, count in freq2.items():
        freqTable.at[key[1], key[0]] = count
    freqTable.to_csv(os.path.join(directory, 'freq2tbl.csv'))
create_next_letter_from_current_frequency_table()

The file written out, freq2tbl.csv, was used to create the histogram of the most frequent single entries and the pairs of entries.

In [12]:
def loadNormalizePkl1(filename):
    '''Loads a pickled frequency dictionary and divides every value by the total of all values.'''
    with open(filename, 'rb') as fh:
        freq = pickle.load(fh)
    total = 0.0
    for count in freq.values():
        total += count
    # Values are replaced in place; the key set and its order stay untouched.
    for symbol, count in freq.items():
        freq[symbol] = count / total
    return freq



Writes text using the letter frequencies alone without ngrams.

In [None]:
data = loadNormalizePkl1(os.path.join(directory,'freq1.pkl'))
random.seed(41)
# Names such as `sum` and `iter` are avoided here: binding them at cell level
# would hide the builtins for every cell that runs afterwards.
for line_num in range(1, 11):
    outputstr = ''
    for index in range(70):
        prob = random.uniform(0.0, 1.0)
        cumulative = 0
        # Walk the cumulative distribution and take the first character whose
        # running total reaches the random draw.
        for key in data:
            cumulative += data[key]
            if cumulative >= prob:
                outputstr += key
                break

    print(line_num, outputstr)

1  eelnin ungia wwuetthgsaaeuuoiolifs ttelaroi ic so u dnli ohsaadoiolid
2 t ape ui  preirl y ldueeseihenw eioxtincplastmtcteughct ivlrwataartrbi
3 ectee hcmasauisdeb  eaeacwinsarsdu enti nuee dba io haqid kh  aeeoa lr
4 aelplilvprag el nl einrileadneirearoafr ndstey m iser e hnpatfut hllbi
5 appdsmn  rstthylan   mvlccirefrrchneahbrgbmefw schnygtof mormde neehc 
6     dwo r amave orre mclnditytceo  oms e ap ilr  arleits a s n  mi  de
7  dowsibits   e ehns  ai siutfc notoitieimfdwdefss hmyra aom e ss txc r
8 svirdit ayde tmc hhriedtrnastdp tbrgdm ocildnaatrsts friu tlu fiebgzab
9  ddaefnlcgtr aryugnit snin nothohmltmei onoroelcnowoindte  yiaipdshsle
10  terlea i ii r dse icelnl mn wbeaaouiodm n eeiposnrarsr   rpseaee tage


#### Produce text using the n-gram letter frequencies of different lengths.

In [14]:
def loadNormalizePkln(filename):
    ''' Loads pickled ngram counts and divides each count by the total count of all ngrams sharing its first n-1 characters.'''
    with open(filename, 'rb') as fh:
        ngram_counts = pickle.load(fh)
    # Total count per prefix (the ngram without its last character).
    prefix_totals = {}
    for ngram, count in ngram_counts.items():
        stem = ngram[:-1]
        prefix_totals[stem] = prefix_totals.get(stem, 0) + count
    for ngram in ngram_counts:
        ngram_counts[ngram] = ngram_counts[ngram] / prefix_totals[ngram[:-1]]
    return ngram_counts

In [15]:
def find_prob_string(prob, data, ngram):
    ''' Walks the keys of data that start with ngram, accumulating their values, and returns the last character of the first key whose running total reaches prob.

    Prints a message and returns None when no key reaches prob.
    '''
    running_total = 0.0
    candidates = (key for key in data if key.startswith(ngram))
    for candidate in candidates:
        running_total += data[candidate]
        if running_total >= prob:
            return candidate[-1]
    print('No match found for', ngram, prob)
    return None

In [16]:
def return_random_key(data):
    ''' Picks one key of the data dictionary with random.choice and returns it.'''
    all_keys = list(data.keys())
    return random.choice(all_keys)
    

In [17]:
def createNgramStr(directory, ngram):
    '''Generates lines of random text based on n-gram frequencies stored in directory.

    Args:
        directory: Directory containing freq<ngram>.pkl.
        ngram: Length of the n-grams to sample from.

    Each printed line starts from a randomly chosen n-gram and is extended one
    character at a time, conditioning on the last ngram-1 characters. A line
    is cut short when no stored n-gram continues its current ending.
    '''
    NUM_LINES = 5
    LETTERS_PER_LINE = 70
    data = loadNormalizePkln(os.path.join(directory, 'freq' + str(ngram) + '.pkl'))
    random.seed(422)
    prefix_len = ngram - 1
    for line_num in range(1, NUM_LINES + 1):
        outputstr = return_random_key(data)
        for index in range(LETTERS_PER_LINE):
            prob = random.uniform(0.0, 1.0)
            # With ngram == 1 there is no context; a plain [-0:] slice would
            # return the whole string, so use an empty prefix instead.
            prefix = outputstr[-prefix_len:] if prefix_len > 0 else ''
            nextchar = find_prob_string(prob, data, prefix)
            if nextchar is None:
                # No continuation available; appending None would raise TypeError.
                break
            outputstr += nextchar
        print(line_num, outputstr)

##### Main call to generate ngram-based letter sequences using probabilities

In [19]:
# Length of the n-grams to sample from; freq<ngram_len>.pkl must already exist in `directory`.
ngram_len = 5  # use ngram_len of 2 to MAX_NGRAM_SIZE = 7
createNgramStr(directory, ngram_len)

1 frn in and corp told keeping an from years to market of about there listerd
2 ow fluctane cited under fight pct on the large share into mazda ways to the
3 orb states all year oper shr longer board if of worldwident effort said und
4 logan outstandar years for that they said the fund dives for the right pres
5 ority and its proving exposures said internative to tights income for throu
