# File I/O

# Pathname manipulation

In [52]:
# Most pathname functions will NOT expand '~'
# use expanduser

import os

ep = os.path.expanduser('~/foo/bar/zap.txt')
ep

'/Users/lstead/foo/bar/zap.txt'

In [53]:
# get pieces of paths

[os.path.split(ep), os.path.splitext(ep)]

[('/Users/lstead/foo/bar', 'zap.txt'), ('/Users/lstead/foo/bar/zap', '.txt')]

In [54]:
# put paths together

os.path.join('/Users', 'lstead', 'foo/', 'bar', 'zap.txt')

'/Users/lstead/foo/bar/zap.txt'

In [55]:
# a leading '/' on 'foo' makes it an absolute path, so join discards every component to its left
os.path.join('/Users', 'lstead', '/foo', 'bar', 'zap.txt')

'/foo/bar/zap.txt'

# Getting file status

In [56]:
# os.path.exists and os.access report file status without throwing errors
# os.stat throws an error if the path doesn't exist. 
import os

# roughly what the shell's `touch` does: make sure the file exists
# (mode 'w' creates it, or truncates it if it is already there) ...
path = '/tmp/touch'
with open(path, 'w'):
    pass
# ... then set its access and modified times to the current time
os.utime(path, None)


def ac(p):
    """Return the access status of path ``p`` as a list of four booleans.

    The entries are, in order: exists, readable, writeable, executable.
    A path that does not exist simply yields ``False`` for every entry.
    """
    modes = (os.F_OK, os.R_OK, os.W_OK, os.X_OK)
    return [os.access(p, mode) for mode in modes]

ac(path)


[True, True, True, False]

In [57]:
# last accessed time, last modified time
[os.path.getatime(path), os.path.getmtime(path)]

[1474660206.0, 1474660206.0]

In [58]:
# file exists predicate

os.path.exists(path)

True

In [59]:
[os.path.isfile(path), os.path.isdir(path)]

[True, False]

In [60]:
[os.path.isfile('/tmp'), os.path.isdir('/tmp')]

[False, True]

In [61]:
os.stat(path)

os.stat_result(st_mode=33206, st_ino=23774452, st_dev=16777219, st_nlink=1, st_uid=501, st_gid=0, st_size=0, st_atime=1474660206, st_mtime=1474660206, st_ctime=1474660206)

In [62]:
# removes a file, but throws error if it doesn't exist

os.remove(path)
ac(path)

[False, False, False, False]

In [63]:
# file is gone

os.path.exists(path)

False

In [64]:
# stat gets upset and throws an error if the file doesn't exist

os.stat(path)

FileNotFoundError: [Errno 2] No such file or directory: '/tmp/touch'

In [65]:
# Returns list of files and dirs in a directory
# can use isfile and isdir to figure out which is which

os.listdir( os.path.join(os.path.expanduser('~'), 'anaconda'))

['.DS_Store',
 'bin',
 'conda-meta',
 'envs',
 'etc',
 'include',
 'lib',
 'Navigator.app',
 'pkgs',
 'python.app',
 'share',
 'ssl']

# 'walk' and get all the files and dirs under a start dir

In [66]:
g = os.walk('~/anaconda/ssl')

In [67]:
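# didn't work - walk doesn't expand '~', so the directory isn't found
# by default walk silently ignores that error and just yields nothing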

next(g)

StopIteration: 

In [68]:
# returns a generator...

e = os.path.expanduser('~/anaconda/ssl')
print(e)
g = os.walk(e)
g

/Users/lstead/anaconda/ssl


<generator object walk at 0x10786bf68>

In [69]:
# nicer than os.listdir() in that files and dirs are in separate lists
# returns (dirpath, dirs in dirpath, files in dir)

next(g)

('/Users/lstead/anaconda/ssl',
 ['misc'],
 ['cacert.pem', 'cert.pem', 'openssl.cnf'])

In [70]:
# descend into 'misc' directory

next(g)

('/Users/lstead/anaconda/ssl/misc',
 [],
 ['c_hash', 'c_info', 'c_issuer', 'c_name', 'CA.pl', 'CA.sh', 'tsget'])

In [71]:
# finished

next(g)

StopIteration: 

# open function
- used to open files for reading and writing

# Writing files 
- no automatic newlines

In [72]:
# the manual way: open the file, write to the file object, close it
# this is error prone - the close() call is easy to forget, and if a
# write raises an error the close() is never reached
# file descriptors that are never closed can cause a server to crash
# 'w' is the 'open mode' - tells 'open' to open the file for writing

path = '/tmp/four.txt'
fd = open(path, 'w')
# writelines() does not add newlines either, so append one to each item
fd.writelines(e + '\n' for e in ['one', 'two', 'three', 'four'])
fd.close()

# with 
- 'with' is a statement that works with a 'context manager' (here, the file object returned by open)
- binds return value from open to 'fd'
- 'with' will automatically close the file when the 'with' block is exited, even if by error
- note ':' and indenting defines a statement block over which 'fd' will be bound

In [73]:
with open(path, 'w') as fd:
    for e in ['one', 'two', 'three', 'four']:
        fd.write(e + '\n')

In [74]:
# could do one write with join (note: join puts no newline after the last item)

with open(path, 'w') as fd:
    fd.write('\n'.join(['one', 'two', 'three', 'four']))

In [75]:
# or write out the string with newlines

with open(path, 'w') as fd:
    fd.write("one\ntwo\nthree\nfour\n")

In [76]:
# can append(open mode 'a') to an existing file

path = '/tmp/four.txt'
# mode 'a' positions every write at the end of the file (creating the
# file if needed) instead of truncating it the way 'w' does
with open(path, 'a') as f:
    # 'line' rather than 'l': a lone lowercase L is easily misread as 1
    f.writelines(line + '\n' for line in ['five', 'six'])

# Reading files - eager
- read the entire file immediately

In [77]:
# eager read - read the entire file into one string
# 'r' tells 'open' to open the file for reading

with open(path, 'r') as fd:    
    print( fd.read())

one
two
three
four
five
six



In [78]:
# eager read - get a list of all the lines 

with open(path,'r') as fd:
    print(fd.readlines())

['one\n', 'two\n', 'three\n', 'four\n', 'five\n', 'six\n']


# Reading files - lazy
- suppose you are looking for a substring in a huge unsorted file of text lines
    - lazy read probably wins
    - don't have to read in entire file before you can start search
    - don't have to allocate memory to hold the whole file
    - once you find the substring, you don't have to read the rest of the file

In [79]:
# read one line at a time 

with open(path, 'r') as fd:
    # readline() hands back '' once the end of the file is reached,
    # so keep going until that sentinel shows up
    x = fd.readline()
    while x != '':
        print(x)
        x = fd.readline()

one

two

three

four

five

six



In [80]:
# note double spacing
# each line in the file has a newline, plus print is adding one
# can turn off the print newline with keyword arg 'end'

with open(path, 'r') as fd:
    # readline() hands back '' once the end of the file is reached
    x = fd.readline()
    while x != '':
        # each line already ends in '\n', so stop print from adding another
        print(x, end='')
        x = fd.readline()

one
two
three
four
five
six


In [81]:
fd = open('/tmp/four.txt')
fd

<_io.TextIOWrapper name='/tmp/four.txt' mode='r' encoding='UTF-8'>

In [82]:
# a file object (named 'fd' here) is an iterator over the file's lines

[fd, iter(fd), fd is iter(fd)]

[<_io.TextIOWrapper name='/tmp/four.txt' mode='r' encoding='UTF-8'>,
 <_io.TextIOWrapper name='/tmp/four.txt' mode='r' encoding='UTF-8'>,
 True]

In [83]:
next(fd)

'one\n'

In [84]:
# don't have to finish iterator...

next(fd)

'two\n'

In [85]:
# note with readline and readlines each line has a trailing '\n', 
# which you usually don't want
# use strip() to remove
# can this cause a problem?

'one\n'.strip()

'one'

In [86]:
# read N chars at a time

with open(path, 'r') as f:
    # read(3) returns at most 3 characters; '' means end of file
    s = f.read(3)
    while s != '':
        print(s)
        s = f.read(3)
        

one

tw
o
t
hre
e
f
our

fi
ve

six




In [87]:
# ... or can finish iterator later on

[next(fd), next(fd), next(fd), next(fd)]

['three\n', 'four\n', 'five\n', 'six\n']

In [88]:
# exhausted - no more lines; to read again, reopen the file or rewind with fd.seek(0)

next(fd)

StopIteration: 

# Python has utilities for reading and writing:
- compressed files - 'gzip', 'bz2' 
- file archives - 'zipfile', 'tarfile' (HDF5 files need a third-party package such as 'h5py')