## RCS Python File Operations 2

# Binary Files

In [2]:
with open("myfile.bin", "wb") as f:
    f.write(b'\x30\x31\x09\x32\x20\x52\x43\x53\x0A\x51\xFE\x00\xFF') # notice b prefix!!


In [3]:
with open("myfile.bin", "r") as f:
    lines=f.readlines()
    f.seek(0)
    text=f.read()
print(lines)
print(lines[0])
print(text)

['01\t2 RCS\n', 'Qþ\x00ÿ']
01	2 RCS

01	2 RCS
Qþ ÿ


ASCII Codes
https://en.wikipedia.org/wiki/ASCII

In [4]:
with open("myfile.bin", "rb") as f:
    blines=f.readlines()

In [5]:
print(blines)

[b'01\t2 RCS\n', b'Q\xfe\x00\xff']


In [None]:
blines

## For more complicated binary writing and reading of binaries 
### pickle standard library is recommended

https://docs.python.org/3/library/pickle.html

In [7]:
import pickle

In [8]:
with open("myfile.bin", "wb") as f:
    myint = 42
    mystring = "Hello, RCS!"
    mylist = ["sun", "moon", "earth"]
    mydict = { "name": "Val", "job": "Teacher" }   
    pickle.dump(myint, f)
    pickle.dump(mystring, f)
    pickle.dump(mylist, f)
    pickle.dump(mydict, f)


In [9]:
with open("myfile.bin", "r", encoding=None) as f:
    mf=f.read()
mf

'€\x03K*.€\x03X\x0b\x00\x00\x00Hello, RCS!q\x00.€\x03]q\x00(X\x03\x00\x00\x00sunq\x01X\x04\x00\x00\x00moonq\x02X\x05\x00\x00\x00earthq\x03e.€\x03}q\x00(X\x04\x00\x00\x00nameq\x01X\x03\x00\x00\x00Valq\x02X\x03\x00\x00\x00jobq\x03X\x07\x00\x00\x00Teacherq\x04u.'

### Not very helpful is it? Better to use pickle to retrieve the data and "unpickle" it

In [10]:
with open("myfile.bin", "rb") as f:
    myint = pickle.load(f)
    mystring = pickle.load(f)
    mylist = pickle.load(f)
    mydict = pickle.load(f)  


In [11]:
myint,mystring

(42, 'Hello, RCS!')

In [12]:
mylist

['sun', 'moon', 'earth']

In [13]:
mydict

{'name': 'Val', 'job': 'Teacher'}

In [15]:
# Recipe for opening a pickled file with unknown number of variables
with open('myfile.bin', "rb") as f:
    mylist = []
    while True:
        try:
            mylist.append(pickle.load(f))
        except EOFError:
            print("End of file reached!")
            break
    print("Going to close file now")

End of file reached!
Going to close file now


In [16]:
len(mylist)

4

In [17]:
mylist[3]

{'name': 'Val', 'job': 'Teacher'}

In [19]:
# write a for loop printing all data types in mylist
for item in mylist:
    print(type(item))

<class 'int'>
<class 'str'>
<class 'list'>
<class 'dict'>


In [18]:
print(myint,mystring,mylist,mydict)

42 Hello, RCS! [42, 'Hello, RCS!', ['sun', 'moon', 'earth'], {'name': 'Val', 'job': 'Teacher'}] {'name': 'Val', 'job': 'Teacher'}


In [20]:
import os
print(os.getcwd()) # cwd - current working directory

C:\Users\val-p1\Github\RCS_Data_Analysis_Python_2019_July


In [None]:
mycwd = os.getcwd()
mycwd

In [None]:
%pwd

In [None]:
!dir

In [21]:
os.getlogin()

'val-p1'

In [22]:
os.getcwd()

'C:\\Users\\val-p1\\Github\\RCS_Data_Analysis_Python_2019_July'

In [23]:
os.rename('numbers.txt', 'bignumbers.txt')

In [24]:
myfiles = os.listdir()
myfiles[:5]

['.git',
 '.ipynb_checkpoints',
 'All_Any.ipynb',
 'biglist.txt',
 'bignumbers.txt']

In [None]:
myfiles

In [25]:
myfiles[4]

'bignumbers.txt'

In [26]:
'.txt' in myfiles[4]

True

In [27]:
'.txt' in myfiles[2]

False

In [None]:
# How do we select only text files

In [28]:
# We can create a new list of only text files (with extension .txt)
mytextfiles = [file for file in myfiles if '.txt' in file]
mytextfiles

['biglist.txt',
 'bignumbers.txt',
 'list.txt',
 'numbers16_48_38.txt',
 'numbers16_48_9.txt',
 'somefile.txt']

In [29]:
mytxtlist = []
for item in myfiles:
    if '.txt' in item:
        mytxtlist.append(item)
        # we could do more stuff here not just make a list
mytxtlist

['biglist.txt',
 'bignumbers.txt',
 'list.txt',
 'numbers16_48_38.txt',
 'numbers16_48_9.txt',
 'somefile.txt']

In [None]:
os

In [30]:
result = []
for file in mytextfiles:
    with open(file) as f:
        txt = f.read() # careful here, we might not want to read the full file
        result.append(len(txt))
        

In [31]:
result

[11888883, 275, 78, 158, 158, 80]

In [32]:
filesizes = []
for file in mytextfiles:
    filesizes.append((file, os.path.getsize(file)))
filesizes

[('biglist.txt', 12888882),
 ('bignumbers.txt', 285),
 ('list.txt', 107),
 ('numbers16_48_38.txt', 167),
 ('numbers16_48_9.txt', 167),
 ('somefile.txt', 84)]

In [33]:
# list comprehension will be shorter
filesizes = [(file, os.path.getsize(file)) for file in mytextfiles]

In [34]:
filesizes

[('biglist.txt', 12888882),
 ('bignumbers.txt', 285),
 ('list.txt', 107),
 ('numbers16_48_38.txt', 167),
 ('numbers16_48_9.txt', 167),
 ('somefile.txt', 84)]

In [35]:
filedict = {}
for file in mytextfiles:
    filedict[file] = os.path.getsize(file)
filedict

{'biglist.txt': 12888882,
 'bignumbers.txt': 285,
 'list.txt': 107,
 'numbers16_48_38.txt': 167,
 'numbers16_48_9.txt': 167,
 'somefile.txt': 84}

In [36]:
print('mytextfiles is a ', type(mytextfiles))
# one line dictionary comprehension
fdict = {f:os.path.getsize(f) for f in mytextfiles}
fdict

mytextfiles is a  <class 'list'>


{'biglist.txt': 12888882,
 'bignumbers.txt': 285,
 'list.txt': 107,
 'numbers16_48_38.txt': 167,
 'numbers16_48_9.txt': 167,
 'somefile.txt': 84}

In [None]:
result

In [37]:
os.listdir('C:\\')

['$Recycle.Bin',
 'bin',
 'Documents and Settings',
 'DRIVERS',
 'hiberfil.sys',
 'Intel',
 'MSOCache',
 'pagefile.sys',
 'PerfLogs',
 'Program Files',
 'Program Files (x86)',
 'ProgramData',
 'Recovery',
 'swapfile.sys',
 'System Volume Information',
 'TVicPortPersonal',
 'Users',
 'Windows',
 'xampp']

In [38]:
os.chdir("c:\\")
os.getcwd()


'c:\\'

In [39]:
os.chdir(mycwd)
os.getcwd()

NameError: name 'mycwd' is not defined

In [41]:
os.path.expanduser('~')

'C:\\Users\\val-p1'

In [43]:
os.chdir(os.path.expanduser('~')+"\\Github\\RCS_Data_Analysis_Python_2019_July")
os.getcwd()

'C:\\Users\\val-p1\\Github\\RCS_Data_Analysis_Python_2019_July'

	The os.path.join() function constructs a pathname out of one or more partial pathnames
   **Don’t fuss with slashes; always use os.path.join() and let Python do the right thing.**

In [45]:
# OS neutral path join should work on all OSes
os.path.join(os.getcwd(), "README.md")

'C:\\Users\\val-p1\\Github\\RCS_Data_Analysis_Python_2019_July\\README.md'

In [None]:
os.path.join("C:\\Users\\vsd\\Documents\\Github\\RCS_Python","README.md")

In [46]:
print(os.path.join(os.path.expanduser('~'), 'Github', 'RCS_Data_Analysis_Python_2019_July', 'README.md')) 

C:\Users\val-p1\Github\RCS_Data_Analysis_Python_2019_July\README.md


In [47]:
print(os.path.join(os.getcwd(),'Documents', 'Github', 'RCS_Python', 'README.md')) 

C:\Users\val-p1\Github\RCS_Data_Analysis_Python_2019_July\Documents\Github\RCS_Python\README.md


In [48]:
newpath=os.path.join(os.path.expanduser('~'), 'Github')
newpath

'C:\\Users\\val-p1\\Github'

In [None]:
print(newpath) ## Aha pretty print!!

In [None]:
mypath=os.getcwd()
mypath

In [None]:
mysplit = os.path.split(mypath)
mysplit

In [None]:
mydir, myfname = os.path.split(mypath)
print(mydir,":",myfname)

The glob module finds all the pathnames matching a specified pattern according to the rules used by the Unix shell, although results are returned in arbitrary order.

In [None]:
glob.

In [49]:
# we can get a list of all files in current directory matching certaing wildcards
from glob import glob as gl
ifiles=gl('Python*.ipynb')
ifiles

['Python Classes.ipynb',
 'Python Cleaning Up Text Files.ipynb',
 'Python Data Structures Exercises.ipynb',
 'Python Dictionaries.ipynb',
 'Python Errors.ipynb',
 'Python File IO.ipynb',
 'Python File Operations 2 Binary Files and Pickle in class 21.05.2019.ipynb',
 'Python Flow Control.ipynb',
 'Python Functions.ipynb',
 'Python Introduction.ipynb',
 'Python Learning Resources.ipynb',
 'Python List Comprehension.ipynb',
 'Python Lists.ipynb',
 'Python Modules and Imports.ipynb',
 'Python Sets.ipynb',
 'Python Strings.ipynb',
 'Python Tuples.ipynb',
 'Python Variables and Data Types.ipynb',
 'Python_List_Exercise_1.ipynb']

In [None]:
ipyth=glob.glob('*Python*.ipynb')
ipyth

In [None]:
ifile2=glob.glob('*File*.*,*.md')
ifile2

In [50]:
# Find Matching Files recursively
from pathlib import Path

In [53]:
# We find all matching text files with path and split the path from the filename
for filename in Path('').glob('**/*.txt'):
    print(os.path.split(filename))

('', 'biglist.txt')
('', 'bignumbers.txt')
('', 'list.txt')
('', 'numbers16_48_38.txt')
('', 'numbers16_48_9.txt')
('', 'somefile.txt')
('data', 'cleaned.txt')
('data', 'Veidenbaums.txt')


### New in version 3.4.

For example, consider a directory containing the following files: 1.gif, 2.txt, card.gif and a subdirectory sub which contains only the file 3.txt. glob() will produce the following results. Notice how any leading components of the path are preserved.

In [None]:
glob.glob('./*V*.*')

In [None]:
! mkdir Text # ! is Jupyter command for running OS commands b

# File Operations directly from Python

https://docs.python.org/3/library/subprocess.html#module-subprocess

subprocess.run(args, *, stdin=None, input=None, stdout=None, stderr=None, shell=False, cwd=None, timeout=None, check=False, encoding=None, errors=None)

In [None]:
%pwd

In [54]:
import sys
sys.path

['C:\\Users\\val-p1\\Github\\RCS_Data_Analysis_Python_2019_July',
 'C:\\ProgramData\\Anaconda3\\python37.zip',
 'C:\\ProgramData\\Anaconda3\\DLLs',
 'C:\\ProgramData\\Anaconda3\\lib',
 'C:\\ProgramData\\Anaconda3',
 '',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\win32',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\win32\\lib',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\Pythonwin',
 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\IPython\\extensions',
 'C:\\Users\\val-p1\\.ipython']

In [55]:
import subprocess
print(subprocess.run("calculator", shell=True, stdout=subprocess.PIPE))

CompletedProcess(args='calculator', returncode=1, stdout=b'')


In [56]:
import subprocess
print(subprocess.run("dir", shell=True, stdout=subprocess.PIPE)) ## Without pipe we will get no output

CompletedProcess(args='dir', returncode=0, stdout=b' Volume in drive C is Windows\r\n Volume Serial Number is 5AA0-2A07\r\n\r\n Directory of C:\\Users\\val-p1\\Github\\RCS_Data_Analysis_Python_2019_July\r\n\r\n07/18/2019  06:00 PM    <DIR>          .\r\n07/18/2019  06:00 PM    <DIR>          ..\r\n07/18/2019  05:24 PM    <DIR>          .ipynb_checkpoints\r\n07/13/2019  10:17 AM             7,614 All_Any.ipynb\r\n07/18/2019  05:07 PM        12,888,882 biglist.txt\r\n07/18/2019  04:52 PM               285 bignumbers.txt\r\n07/18/2019  04:03 PM    <DIR>          data\r\n07/13/2019  10:17 AM         7,435,248 Data_Analysis_Python_Introduction.pdf\r\n07/13/2019  10:17 AM             1,713 Git_Workflow.md\r\n07/16/2019  04:04 PM    <DIR>          img\r\n07/13/2019  10:17 AM               217 Jupyter with Python.md\r\n07/13/2019  10:17 AM             1,084 LICENSE\r\n07/18/2019  05:05 PM               107 list.txt\r\n07/18/2019  05:29 PM               120 myfile.bin\r\n07/18/2019  04:48 PM   

In [57]:
print(subprocess.run("C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe"))

CompletedProcess(args='C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe', returncode=0)


In [None]:
print(subprocess.run("chrome.exe"))

In [None]:
subprocess.run(["mkdir", "testdir"], shell=True, stdout=subprocess.PIPE)

In [None]:
import sys


In [None]:
! dir

In [None]:
%%writefile ./Text/Test.txt
Just a simple text file
Nothing special

In [None]:
! dir Text

In [None]:
glob.glob('**/*.txt', recursive=True) #We should also get subdirectory name

In [None]:
glob.glob('./*.md')

In [None]:
glob.glob('./?.md') # requires a single char only so we wont get a match for longer file names

In [None]:
meta = os.stat('README.md')
print(type(meta),meta) # os.stat returns a class containing file meta information

In [None]:
import time
time.localtime(meta.st_mtime) #mtime last modified tiem

In [None]:
## Homework 2 for file operations
# Process 'resources\\cleaned.txt',
# Generate a dictionary of words and their frequency - "Un" : 76
# Save this dictionary in a text file, each word and frequency in a new line
# Bonus for pickling the dictionary