# Working With Files

Another useful component of Python is reading files from the filesystem. Sometimes we want to analyze text data. Maybe they are log files, maybe they are HTML dumps (more on that later), or maybe they are something else. In the following example we'll be working with the text of the book Frankenstein, with special thanks to Project Gutenberg for making the text freely available.

In [10]:
# The preferred way to open a text file in Python looks like this:
# The long string is the path to the file; 'r' means "read mode" ("w" would be write mode).
# 'rb' and 'wb' open the file in binary mode (raw bytes instead of text), which you may
# eventually have reason to use, but for now we're going to ignore those options.
# The book contains non-ASCII characters (curly quotes, em dashes), so we name the
# encoding explicitly instead of relying on whatever the platform default happens to be.
with open('book-texts/frankenstein-no-header-footer.txt', 'r', encoding='utf-8') as franken_reader:
    # The type of franken_reader is a <class '_io.TextIOWrapper'>
    print(type(franken_reader))

    # This type has some interesting methods.
    ## .read() will process the whole file and we can put it in a string
    whole_text = franken_reader.read()
    print(whole_text)

    # NOTE: Reading "consumes" the file: every read picks up where the previous one
    # stopped. After the call to read() above the position is at the end of the file,
    # so unless you comment that call out, the reads below return empty strings.
    # (franken_reader.seek(0) would rewind to the beginning.)

    # read(), with a parameter, reads at most that many characters. In text mode the
    # unit is characters, not bytes, despite the variable name.
    first_50_bytes = franken_reader.read(50)
    print(first_50_bytes)

    # readline() can be used to read one line at a time.
    first_line = franken_reader.readline()
    print(first_line)
    
    

<class '_io.TextIOWrapper'>
Frankenstein;



# Good Habits: Line by Line

Especially when working with large files, it's a good idea to read the file line by line if possible. This saves memory, and is usually much faster. Sometimes you need ALL of the data at once (this was the case with our CSV files).

For example, let's say we wanted to know how often each word occurred in this book... We can do that one line at a time:

In [14]:
# A Counter is a dictionary where the default value for every key is 0.
from collections import Counter

word_counts = Counter()

# The book contains curly quotes and em dashes, so the encoding is given explicitly
# rather than left to the platform default.
with open('book-texts/frankenstein-no-header-footer.txt', 'r', encoding='utf-8') as franken_reader:
    # A file object can be looped over directly: each pass hands us the next line
    # (a blank line arrives as "\n", never as the empty string) and the loop stops
    # at the end of the file, so we never need the whole book in memory at once.
    for line in franken_reader:
        # The split function turns a string into a list of words based on
        # where the whitespace characters are. You can split on other characters too!
        # Counter.update() then adds one to the count of every word in that list.
        word_counts.update(line.split())


# Now that we have the word counts... Let's check them out!
for count in word_counts.most_common():
    print(count)

# If you look carefully there are some odd ones that we'd need better processing
# to handle. Here are some examples:
#     ('contumely?', 1)
#     ('at,', 1)
#     ('“Fear', 1)
#     ('“Farewell!', 1)

('the', 3898)
('and', 2903)
('I', 2719)
('of', 2634)
('to', 2072)
('my', 1631)
('a', 1338)
('in', 1071)
('was', 992)
('that', 974)
('had', 679)
('with', 654)
('which', 540)
('but', 538)
('me', 529)
('his', 500)
('not', 479)
('as', 477)
('for', 463)
('he', 446)
('by', 441)
('on', 425)
('you', 400)
('from', 373)
('it', 362)
('have', 356)
('be', 339)
('her', 313)
('this', 298)
('were', 298)
('is', 296)
('at', 289)
('when', 261)
('The', 255)
('your', 237)
('an', 208)
('so', 196)
('could', 187)
('will', 185)
('been', 182)
('would', 177)
('their', 174)
('one', 174)
('all', 172)
('she', 172)
('or', 169)
('they', 166)
('are', 164)
('if', 153)
('should', 152)
('who', 150)
('more', 149)
('me,', 148)
('him', 147)
('no', 146)
('some', 136)
('these', 130)
('now', 130)
('But', 128)
('He', 126)
('into', 124)
('upon', 123)
('before', 122)
('its', 120)
('My', 120)
('only', 119)
('our', 118)
('am', 114)
('we', 114)
('did', 112)
('yet', 109)
('than', 109)
('might', 107)
('me.', 107)
('myself', 105)
('eve

('depart', 4)
('fortnight', 4)
('Heaven', 4)
('encompassed', 4)
('sympathise', 4)
('bitterly', 4)
('impatient', 4)
('becoming', 4)
('rugged', 4)
('well-known', 4)
('usual', 4)
('necessary,', 4)
('equally', 4)
('fortune,', 4)
('tears,', 4)
('informed', 4)
('pursuit.', 4)
('money,', 4)
('“What', 4)
('so;', 4)
('renders', 4)
('suppose,', 4)
('know,', 4)
('permit', 4)
('confide', 4)
('conception', 4)
('preparing', 4)
('attributed', 4)
('attachment', 4)
('ocean', 4)
('besides', 4)
('Remember', 4)
('again.', 4)
('dangers', 4)
('shores', 4)
('accidents', 4)
('Be', 4)
('sake,', 4)
('thus.', 4)
('So', 4)
('leaving', 4)
('situation', 4)
('drawn', 4)
('gigantic', 4)
('guided', 4)
('watched', 4)
('rapid', 4)
('traveller', 4)
('wonder.', 4)
('apparition', 4)
('freed', 4)
('light,', 4)
('someone', 4)
('astonishment', 4)
('exchanged', 4)
('Good', 4)
('fainted.', 4)
('animation', 4)
('speak,', 4)
('removed', 4)
('lighted', 4)
('benevolence', 4)
('tormented', 4)
('depended', 4)
('me.”', 4)
('travel', 4

('woman’s', 2)
('refused,', 2)
('thinking', 2)
('inclinations.', 2)
('fellow!”', 2)
('embarkation.', 2)
('severe,', 2)
('promises', 2)
('expected.', 2)
('communicate', 2)
('sensation,', 2)
('pleasurable', 2)
('fearful,', 2)
('unexplored', 2)
('kill', 2)
('woeful', 2)
('“Ancient', 2)
('mysteries', 2)
('understand.', 2)
('perseverance', 2)
('marvellous,', 2)
('projects,', 2)
('seas,', 2)
('reverse', 2)
('tenderly.', 2)
('Sister,', 2)
('lines', 2)
('safe—and', 2)
('voyage.', 2)
('land,', 2)
('bold', 2)
('breathe', 2)
('letter.', 2)
('navigators', 2)
('record,', 2)
('yours,', 2)
('rashly', 2)
('danger.', 2)
('gone,', 2)
('involuntarily', 2)
('forbear', 2)
('sides,', 2)
('dangerous,', 2)
('hoping', 2)
('atmosphere', 2)
('irregular', 2)
('plains', 2)
('comrades', 2)
('watchful', 2)
('diverted', 2)
('situation.', 2)
('dogs,', 2)
('north,', 2)
('inequalities', 2)
('land;', 2)
('ship.', 2)
('profited', 2)
('hours.', 2)
('talking', 2)
('fact,', 2)
('alive;', 2)
('persuading', 2)
('be,', 2)
('isl

('expanse', 2)
('assemblage', 2)
('bolt', 2)
('tomb', 2)
('illustrious', 2)
('chains', 2)
('hopeless,', 2)
('Switzerland;', 2)
('rocky', 2)
('Scotch', 2)
('on.', 2)
('fears;', 2)
('destroyer.', 2)
('Edinburgh', 2)
('languid', 2)
('beaten', 2)
('persons,', 2)
('squalidness', 2)
('ordered', 2)
('occasioned', 2)
('clothes', 2)
('roared', 2)
('roarings', 2)
('ocean.', 2)
('engaged.', 2)
('blinded', 2)
('situated,', 2)
('persecutor.', 2)
('behold.', 2)
('conclusion', 2)
('remorse.', 2)
('malignant', 2)
('deserts,', 2)
('precarious', 2)
('right,', 2)
('Yes,', 2)
('creating', 2)
('motionless,', 2)
('fishermen', 2)
('silence,', 2)
('presentiment', 2)
('rouse', 2)
('dwelt', 2)
('hateful', 2)
('Are', 2)
('passions,', 2)
('sign', 2)
('swiftness', 2)
('silent,', 2)
('ears.', 2)
('conjured', 2)
('sacrificed', 2)
('endless', 2)
('streamed', 2)
('sinks', 2)
('isle', 2)
('composure;', 2)
('ears', 2)
('reality.', 2)
('utensils', 2)
('film', 2)
('voluntary', 2)
('basket', 2)
('commission', 2)
('shudderi

('Whence,', 1)
('proceed?', 1)
('mystery;', 1)
('acquainted,', 1)
('inquiries.', 1)
('physiology.', 1)
('Unless', 1)
('intolerable.', 1)
('recourse', 1)
('anatomy,', 1)
('sufficient;', 1)
('precautions', 1)
('horrors.', 1)
('superstition', 1)
('churchyard', 1)
('receptacle', 1)
('vaults', 1)
('charnel-houses.', 1)
('delicacy', 1)
('wasted;', 1)
('brain.', 1)
('analysing', 1)
('minutiae', 1)
('causation,', 1)
('exemplified', 1)
('me—a', 1)
('brilliant', 1)
('wondrous,', 1)
('simple,', 1)
('immensity', 1)
('illustrated,', 1)
('madman.', 1)
('affirm', 1)
('miracle', 1)
('stages', 1)
('probable.', 1)
('generation', 1)
('matter.', 1)
('gratifying', 1)
('progressively', 1)
('obliterated,', 1)
('wisest', 1)
('once:', 1)
('buried', 1)
('glimmering', 1)
('express,', 1)
('acquainted;', 1)
('patiently', 1)
('infallible', 1)
('precepts,', 1)
('aspires', 1)
('allow.', 1)
('intricacies', 1)
('fibres,', 1)
('muscles,', 1)
('labour.', 1)
('organization;', 1)
('exalted', 1)
('ability', 1)
('complex', 1

('morrow;', 1)
('Nought', 1)
('mutability!', 1)
('ascent.', 1)
('overlooks', 1)
('dissipated', 1)
('glacier.', 1)
('uneven,', 1)
('descending', 1)
('low,', 1)
('rifts', 1)
('width,', 1)
('crossing', 1)
('Montanvert', 1)
('opposite,', 1)
('league;', 1)
('majesty.', 1)
('recesses.', 1)
('sunlight', 1)
('clouds.', 1)
('“Wandering', 1)
('wander,', 1)
('beds,', 1)
('superhuman', 1)
('crevices', 1)
('caution;', 1)
('faintness', 1)
('gale', 1)
('(sight', 1)
('abhorred!)', 1)
('combat.', 1)
('approached;', 1)
('bespoke', 1)
('ugliness', 1)
('utterance,', 1)
('contempt.', 1)
('“Devil,”', 1)
('“do', 1)
('wreaked', 1)
('head?', 1)
('vile', 1)
('insect!', 1)
('stay,', 1)
('dust!', 1)
('oh!', 1)
('diabolically', 1)
('murdered!”', 1)
('reception,”', 1)
('“All', 1)
('wretched;', 1)
('hated,', 1)
('things!', 1)
('detest', 1)
('dissoluble', 1)
('conditions,', 1)
('peace;', 1)
('refuse,', 1)
('maw', 1)
('friends.”', 1)
('“Abhorred', 1)
('Fiend', 1)
('art!', 1)
('Wretched', 1)
('negligently', 1)
('bestow

('siroc', 1)
('Morning', 1)
('Chamounix;', 1)
('sensations—they', 1)
('mountain’s', 1)
('haggard', 1)
('ban—as', 1)
('sympathies—as', 1)
('companionship', 1)
('adoration;', 1)
('dedicate', 1)
('disappointed', 1)
('devoting', 1)
('disquisition.', 1)
('philosopher,', 1)
('material', 1)
('purpose;', 1)
('shrank', 1)
('absolute', 1)
('restored;', 1)
('proportionably.', 1)
('eradicating', 1)
('fits,', 1)
('blackness', 1)
('overcast', 1)
('sunshine.', 1)
('rippling', 1)
('listless.', 1)
('composure,', 1)
('salutations', 1)
('readier', 1)
('aside,', 1)
('founded,', 1)
('avow', 1)
('Reserve', 1)
('useless,', 1)
('treble', 1)
('exordium,', 1)
('continued—', 1)
('tie', 1)
('declining', 1)
('infancy;', 1)
('tastes,', 1)
('suited', 1)
('assistants', 1)
('considering', 1)
('sincerely.', 1)
('excited,', 1)
('does,', 1)
('prospects', 1)
('union.”', 1)
('experienced.', 1)
('everyday', 1)
('infirmities.', 1)
('younger;', 1)
('competent', 1)
('uneasiness.', 1)
('Interpret', 1)
('candour', 1)
('sincerity

('madness.', 1)
('reverted', 1)
('delirium.', 1)
('“Man,”', 1)
('wisdom!', 1)
('Cease;', 1)
('say.”', 1)
('fury;', 1)
('calculating', 1)
('periods', 1)
('portion.', 1)
('ever;', 1)
('adversity,', 1)
('wont', 1)
('sandy', 1)
('unsettled,', 1)
('uncertain', 1)
('pursue.', 1)
('entrance', 1)
('cemetery', 1)
('reposed.', 1)
('graves.', 1)
('uninterested', 1)
('observer.', 1)
('mourner.', 1)
('lived;', 1)
('drag', 1)
('weary', 1)
('existence.', 1)
('quivering', 1)
('sacred', 1)
('swear;', 1)
('Night,', 1)
('preside', 1)
('herbage', 1)
('agony;', 1)
('torments', 1)
('adjuration', 1)
('approved', 1)
('devotion,', 1)
('furies', 1)
('concluded,', 1)
('choked', 1)
('utterance.', 1)
('stillness', 1)
('laugh.', 1)
('heavily;', 1)
('re-echoed', 1)
('laughter.', 1)
('Surely', 1)
('ear,', 1)
('whisper,', 1)
('satisfied.”', 1)
('Guided', 1)
('clue,', 1)
('vainly.', 1)
('Black', 1)
('Sea.', 1)
('escaped,', 1)
('how.', 1)
('Amidst', 1)
('Tartary', 1)
('Russia,', 1)
('evaded', 1)
('track.', 1)
('peasants

In [26]:
# It turns out, python has a lot of great stuff built in, including a solution to this problem...
import string
from collections import Counter  # repeated here so this cell runs on its own

word_counts = Counter()

# explanation of maketrans: https://www.geeksforgeeks.org/python-maketrans-translate-functions/
# The translation tables never change, so we build them once, before the loop.
# Table 1: replace the hyphen and the em dash with a space ("well-known" becomes two words)
dashes_to_spaces = str.maketrans('-—', '  ')
# Table 2: remove anything in string.punctuation and the two curly double quotes.
# The curly apostrophe (’) is in neither table, so words like "god’s" keep it.
strip_punctuation = str.maketrans('', '', string.punctuation + '“' + '”')

# The book contains curly quotes and em dashes, so the encoding is given explicitly
# rather than left to the platform default.
with open('book-texts/frankenstein-no-header-footer.txt', 'r', encoding='utf-8') as franken_reader:
    # Looping over the file yields one line at a time until the end of the file.
    for line in franken_reader:
        # Let's lowercase everything so we don't count A and a separately.
        line = line.lower()

        # Dashes must become spaces BEFORE punctuation is stripped; otherwise the
        # hyphen (which is part of string.punctuation) would be deleted and the two
        # words it joins would be glued together.
        line = line.translate(dashes_to_spaces)
        line = line.translate(strip_punctuation)

        # The split function turns a string into a list of words based on
        # where the whitespace characters are. You can split on other characters too!
        # Counter.update() then adds one to the count of every word in that list.
        word_counts.update(line.split())


# Now that we have the word counts... Let's check them out!
for count in word_counts.most_common():
    print(count)

('the', 4194)
('and', 2975)
('i', 2846)
('of', 2642)
('to', 2094)
('my', 1776)
('a', 1391)
('in', 1128)
('was', 1021)
('that', 1015)
('me', 864)
('but', 687)
('had', 686)
('with', 667)
('he', 608)
('you', 572)
('which', 558)
('it', 546)
('his', 535)
('as', 528)
('not', 510)
('for', 498)
('by', 460)
('on', 460)
('this', 402)
('from', 385)
('her', 373)
('have', 365)
('be', 360)
('when', 328)
('at', 317)
('were', 308)
('is', 307)
('she', 255)
('your', 252)
('him', 221)
('an', 211)
('so', 210)
('they', 209)
('one', 206)
('all', 200)
('could', 197)
('will', 194)
('if', 193)
('been', 190)
('their', 186)
('would', 184)
('or', 177)
('are', 175)
('we', 173)
('who', 172)
('no', 170)
('more', 165)
('these', 154)
('now', 154)
('should', 153)
('yet', 152)
('some', 147)
('before', 146)
('myself', 136)
('what', 132)
('man', 132)
('am', 126)
('upon', 126)
('our', 126)
('them', 126)
('into', 124)
('its', 123)
('only', 123)
('did', 119)
('do', 115)
('life', 114)
('father', 113)
('than', 110)
('every', 1

('reward', 5)
('help', 5)
('french', 5)
('female', 5)
('exile', 5)
('image', 5)
('soothed', 5)
('driven', 5)
('chair', 5)
('everlasting', 5)
('anger', 5)
('blast', 5)
('insurmountable', 5)
('agreed', 5)
('islands', 5)
('lakes', 5)
('scotland', 5)
('oxford', 5)
('feverish', 5)
('beach', 5)
('deserts', 5)
('wedding', 5)
('adversary', 5)
('hardships', 5)
('1', 4)
('2', 4)
('3', 4)
('4', 4)
('mrs', 4)
('saville', 4)
('disaster', 4)
('yesterday', 4)
('confidence', 4)
('advancing', 4)
('try', 4)
('pole', 4)
('region', 4)
('wafted', 4)
('heavenly', 4)
('induce', 4)
('expedition', 4)
('voyages', 4)
('surround', 4)
('poets', 4)
('six', 4)
('theory', 4)
('advantage', 4)
('mate', 4)
('second', 4)
('dignity', 4)
('luxury', 4)
('depressed', 4)
('theirs', 4)
('failing', 4)
('wrapped', 4)
('seated', 4)
('veins', 4)
('march', 4)
('encompassed', 4)
('sympathise', 4)
('reply', 4)
('romantic', 4)
('impatient', 4)
('becoming', 4)
('eight', 4)
('rugged', 4)
('cultivation', 4)
('equally', 4)
('moderate', 4)

('completed', 2)
('unjust', 2)
('ascribed', 2)
('justified', 2)
('exception', 2)
('greece', 2)
('reproach', 2)
('supreme', 2)
('unwholesome', 2)
('trade', 2)
('startled', 2)
('amounted', 2)
('panes', 2)
('glimmer', 2)
('dull', 2)
('skin', 2)
('watery', 2)
('complexion', 2)
('exceeded', 2)
('moderation', 2)
('worms', 2)
('folds', 2)
('chattered', 2)
('shutters', 2)
('inarticulate', 2)
('announce', 2)
('mummy', 2)
('unfinished', 2)
('joints', 2)
('sleepless', 2)
('impelled', 2)
('hurry', 2)
('lonely', 2)
('doth', 2)
('turns', 2)
('continuing', 2)
('sprung', 2)
('incredulous', 2)
('dislike', 2)
('guessed', 2)
('pace', 2)
('entreating', 2)
('stand', 2)
('waiting', 2)
('fearfully', 2)
('empty', 2)
('clapped', 2)
('sensitiveness', 2)
('unrestrained', 2)
('heartless', 2)
('god’s', 2)
('recover', 2)
('concealing', 2)
('disorder', 2)
('attentive', 2)
('shaded', 2)
('contributed', 2)
('attacked', 2)
('repay', 2)
('agitates', 2)
('handwriting', 2)
('reassure', 2)
('persuasions', 2)
('encountering

('mexico', 1)
('peru', 1)
('moralizing', 1)
('inquiring', 1)
('blossom', 1)
('expanding', 1)
('engrossed', 1)
('withered', 1)
('mines', 1)
('artist', 1)
('leaf', 1)
('energy', 1)
('incipient', 1)
('november', 1)
('accomplishment', 1)
('infuse', 1)
('pattered', 1)
('dismally', 1)
('candle', 1)
('convulsive', 1)
('delineate', 1)
('arteries', 1)
('flowing', 1)
('pearly', 1)
('whiteness', 1)
('luxuriances', 1)
('dun', 1)
('shrivelled', 1)
('straight', 1)
('changeable', 1)
('infusing', 1)
('traversing', 1)
('lassitude', 1)
('kiss', 1)
('hue', 1)
('shroud', 1)
('crawling', 1)
('flannel', 1)
('dew', 1)
('forehead', 1)
('convulsed', 1)
('curtain', 1)
('jaws', 1)
('muttered', 1)
('detain', 1)
('downstairs', 1)
('courtyard', 1)
('catching', 1)
('dante', 1)
('wretchedly', 1)
('palpitation', 1)
('artery', 1)
('dismal', 1)
('aching', 1)
('church', 1)
('clock', 1)
('indicated', 1)
('sixth', 1)
('porter', 1)
('issued', 1)
('pacing', 1)
('drenched', 1)
('comfortless', 1)
('palpitated', 1)
('daring', 1

('anticipation', 1)
('discontent', 1)
('ennui', 1)
('elasticity', 1)
('spectacle', 1)
('pitiable', 1)
('identify', 1)
('animating', 1)
('successive', 1)
('hampden', 1)
('patriot', 1)
('debasing', 1)
('monuments', 1)
('remembrancers', 1)
('iron', 1)
('scale', 1)
('cave', 1)
('cabinets', 1)
('curiosities', 1)
('collections', 1)
('derby', 1)
('patches', 1)
('acquaintances', 1)
('capacities', 1)
('inferiors', 1)
('traveller’s', 1)
('includes', 1)
('stretch', 1)
('begins', 1)
('finds', 1)
('engages', 1)
('forsakes', 1)
('appointment', 1)
('dæmon’s', 1)
('superscription', 1)
('ascertain', 1)
('expedite', 1)
('remissness', 1)
('murdering', 1)
('fancied', 1)
('antiquity', 1)
('pleasing', 1)
('regularity', 1)
('arthur’s', 1)
('bernard’s', 1)
('pentland', 1)
('termination', 1)
('coupar', 1)
('andrew’s', 1)
('tay', 1)
('rendezvous', 1)
('congenial', 1)
('dissuade', 1)
('parted', 1)
('remotest', 1)
('orkneys', 1)
('soil', 1)
('pasture', 1)
('cows', 1)
('oatmeal', 1)
('gaunt', 1)
('scraggy', 1)
('t

# Counting with Context...

What if we wanted to know how many paragraph breaks there are in the book, and how many of those begin with a character speaking?

In [27]:
paragraphs = 0
starts_with_quote = 0
last_was_linebreak = True # I'm assuming the start of the book counts as a linebreak

# The encoding is given explicitly: we compare against the curly quote character
# below, and that comparison only works if the file is decoded correctly.
with open('book-texts/frankenstein-no-header-footer.txt', 'r', encoding='utf-8') as franken_reader:
    # Looping over the file yields one line at a time until the end of the file.
    for line in franken_reader:

        # A line that directly follows a blank line (or the start of the book)
        # and opens with a curly quote is a paragraph that begins with speech.
        if last_was_linebreak and line.startswith('“'):
            starts_with_quote += 1

        # A blank line is exactly "\n". Every blank line counts as one paragraph
        # break, so several blank lines in a row are each counted separately.
        last_was_linebreak = line == '\n'
        if last_was_linebreak:
            paragraphs += 1

print(paragraphs, starts_with_quote)

936 310
