# File processing in python 
## Somewhat aligned to the PCPP1 exam
##### TODO: add table of contents
##### This notebook is best read from top to bottom

In [1]:
import csv 
import xml, xml.etree.ElementTree as et
import sqlite3
import json
import logging
import os
import configparser
from io import StringIO

In [2]:
# Helper to print every public (non-underscore) attribute of an object with its value
def get_object_attributes(obj):
    """Print each public attribute of ``obj`` together with its current value.

    One line is printed for every name returned by ``dir(obj)`` that does not
    start with an underscore, in the form ``<type of obj>: <name>: <value>``.

    Args:
        obj: Any object to inspect.

    Returns:
        None. The listing is written to standard output.
    """
    # A plain loop rather than a list comprehension: the comprehension was only
    # used for its printing side effect and built a throwaway list of None.
    for name in dir(obj):
        if not name.startswith('_'):
            print(f'{type(obj)}: {name}: {getattr(obj, name)}')

## CSV module exploration
#### Exploring module functionality

1. [Reader](#CSV-Reader)
2. [DictReader](#CSV-Dict-Reader)

- [Pep for CSV file API](https://peps.python.org/pep-0305/)
- [List of Kwargs](https://docs.python.org/3/library/csv.html#csv-fmt-params) to pass in

### CSV Reader

- `escapechar` removes any special meaning from the character that follows it: the escape character itself is dropped from the field and the next character (even the delimiter) is kept as literal text.
    - `escapechar = 'r' ~ 'mothe,222-555-101' 'mothe-in-law','222-555-104'`
    - `escapechar = '' ~ 'mother', '222-555-101' 'mother-in-law', '222-555-104'`
- [`lineterminator`](https://docs.python.org/3/library/csv.html#csv.Dialect.lineterminator) is ignored by the reader, which always recognises `'\r'` or `'\n'` as end-of-line; it only affects the writer

In [14]:
csv_file = 'Downloads/contacts.csv'
# The csv docs ask for files to be opened with newline='' so the csv module
# handles line endings itself (matters for newlines inside quoted fields).
with open(csv_file, newline='') as f:
    # `doublequote` is a boolean flag. The earlier value, csv.QUOTE_NONE, is a
    # constant meant for the `quoting` parameter; being truthy it only ever
    # meant True here, so spell that out.
    reader = csv.reader(f, delimiter=',', doublequote=True)

    for row in reader:
        print(row)
# Inspect the dialect settings in effect and the reader's own public attributes.
print('\nATTRIBUTES:')
get_object_attributes(reader.dialect)
get_object_attributes(reader)

['Name', 'Phone']
['mother', '222-555-101']
['father', '222-555-102']
['wife', '222-555-103']
['mother-in-law', '222-555-104']

ATTRIBUTES:
<class '_csv.Dialect'>: delimiter: ,
<class '_csv.Dialect'>: doublequote: True
<class '_csv.Dialect'>: escapechar: None
<class '_csv.Dialect'>: lineterminator: 

<class '_csv.Dialect'>: quotechar: "
<class '_csv.Dialect'>: quoting: 0
<class '_csv.Dialect'>: skipinitialspace: False
<class '_csv.Dialect'>: strict: False
<class '_csv.reader'>: dialect: <_csv.Dialect object at 0x105ddf960>
<class '_csv.reader'>: line_num: 5


### CSV Dict Reader

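- Reads a CSV file and returns each row as a `dict` keyed by field name.
- If `fieldnames` is omitted (`None`), the first row of the file is used as the field names.
- Field names are positional: the first name is applied to the first column, the second name to the second column, and so on.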

In [17]:
csv_file = 'Downloads/contacts.csv'
with open(csv_file) as f:
    # Custom field names replace the ones in the file's own header row.
    reader = csv.DictReader(f, fieldnames=['asdfasd', 'dfgsdfgd', 'tristan'])
    get_object_attributes(reader)
    print()
    for row in reader:
        # Print everything except the row read from line 1 of the file.
        if reader.line_num != 1:
            print(row)

<class 'csv.DictReader'>: dialect: excel
<class 'csv.DictReader'>: fieldnames: ['asdfasd', 'dfgsdfgd', 'tristan']
<class 'csv.DictReader'>: line_num: 0
<class 'csv.DictReader'>: reader: <_csv.reader object at 0x1065c0510>
<class 'csv.DictReader'>: restkey: None
<class 'csv.DictReader'>: restval: None

{'asdfasd': 'mother', 'dfgsdfgd': '222-555-101', 'tristan': None}
{'asdfasd': 'father', 'dfgsdfgd': '222-555-102', 'tristan': None}
{'asdfasd': 'wife', 'dfgsdfgd': '222-555-103', 'tristan': None}
{'asdfasd': 'mother-in-law', 'dfgsdfgd': '222-555-104', 'tristan': None}


### CSV Writer

- [Docs](https://docs.python.org/3/library/csv.html#writer-objects)
- Takes an iterable and writes to a file.
- `writerows` will take nested iterables [['a','b'],['x','y']]
- `writerow` takes a single iterable and writes it as one row, e.g. `['a','b']`. It also accepts nested iterables, but they are still written on a single line, each inner iterable becoming one field.

In [None]:
# Sample inputs for the writer examples below, one per kind of iterable.

# Raw CSV text: a header line followed by three records.
dummy_string = (
    'first_name,last_name,age\n'
    'tristan, crudge, 32\n'
    'roman, gravitas,78\n'
    'youthful, penguin, 67'
)

# A list of dicts sharing the same keys.
dummy_iterdict = [
    {'first_name': 'Gertrude', 'last_name': 'Bertrude', 'age': 34},
    {'first_name': 'Thertrude', 'last_name': 'Blertrude', 'age': 48},
]

# A list of lists; the second record has None as its last value.
dummy_iterlist = [
    ['jeremy', 'waterhouse-manfried-the-fourth', '1'],
    ['golfy', 'mcWagon', None],
]

# A tuple of tuples.
dummy_itertuple = (
    ('Dr', 'Cabbage', 54),
    ('Dr', 'Babbage', 234),
)

In [None]:
# Write rows
# newline='' stops the text layer from translating the writer's own '\r\n'
# line terminator, which otherwise produces blank lines between rows on Windows.
with open('sample.csv', 'w', newline='') as csv_out:
    writer = csv.writer(csv_out)
    # Each line of the raw text is split on commas into a list of fields.
    writer.writerows([row.split(',') for row in dummy_string.splitlines()])
    # For the dicts only the values are written, in each dict's key order.
    writer.writerows([d.values() for d in dummy_iterdict])
get_object_attributes(writer)

In [None]:
# Write row
# Append to the existing file. One open is enough for both loops, and
# newline='' keeps the writer's line terminator from being translated
# (avoids blank lines between rows on Windows).
with open('sample.csv', 'a', newline='') as csv_out:
    writer = csv.writer(csv_out)
    # writerow writes exactly one record per call, so each collection is looped.
    for i in dummy_iterlist:
        writer.writerow(i)
    for i in dummy_itertuple:
        writer.writerow(i)

### CSV DictWriter

- Takes a dict and writes to a file
- `fieldnames` is required for `DictWriter`
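- `writeheader` writes the header row from `fieldnames`. It can be omitted if the column headers are already the first item in the iterable (a dict mapping each field name to itself); otherwise no header row is written, because the header is not inferred from the data.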
- `writerows` can take a list of dicts
- `writerow` can only take a dict

In [None]:
# newline='' keeps the csv writer in charge of line endings (without it,
# rows are separated by blank lines on Windows).
with open('sample_b.csv', 'w', newline='') as out_csv:
    field_names = ['first_name', 'last_name', 'age']
    writer = csv.DictWriter(out_csv, fieldnames=field_names)
    # Header row first, taken from field_names, then one row per dict.
    writer.writeheader()
    writer.writerows(dummy_iterdict)
get_object_attributes(writer)

In [None]:
# Demonstrates that DictWriter.writerow needs a single dict, not a list of dicts.
# The error is caught and shown so the notebook still runs top to bottom.
with open('sample_c.csv', 'w', newline='') as out_csv:
    field_names = ['first_name', 'last_name', 'age']
    writer = csv.DictWriter(out_csv, fieldnames=field_names)
    writer.writeheader()
    try:
        # Fails on purpose: an iterable is passed where a dict is expected.
        writer.writerow(dummy_iterdict)
    except AttributeError as err:
        # writerow calls .keys() on its argument, which a list does not have.
        print(f'writerow failed as expected: {err!r}')

In [None]:
# Print every public (non-underscore) name of the csv module with its value
get_object_attributes(csv)

## XML Processing

In [1]:
# Same 'Downloads/' folder (capital D) as the CSV examples; the lowercase
# spelling only resolves on case-insensitive filesystems.
xml_file = 'Downloads/forecast.xml'

In [12]:
import xml.etree.ElementTree as et

In [14]:
# Parse the XML file into an ElementTree and display its root element
et.parse(xml_file).getroot()

<xml.etree.ElementTree.ElementTree at 0x10d2ec890>

In [13]:
# List every name defined in the ElementTree module, private ones included
dir(et)


['Comment',
 'Element',
 'ElementPath',
 'ElementTree',
 'HTML_EMPTY',
 'PI',
 'ParseError',
 'ProcessingInstruction',
 'QName',
 'SubElement',
 'TreeBuilder',
 'VERSION',
 'XML',
 'XMLID',
 'XMLParser',
 'XMLPullParser',
 '_Element_Py',
 '_ListDataStream',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_escape_attrib',
 '_escape_attrib_html',
 '_escape_cdata',
 '_get_writer',
 '_namespace_map',
 '_namespaces',
 '_raise_serialization_error',
 '_sentinel',
 '_serialize',
 '_serialize_html',
 '_serialize_text',
 '_serialize_xml',
 'collections',
 'contextlib',
 'dump',
 'fromstring',
 'fromstringlist',
 'io',
 'iselement',
 'iterparse',
 'parse',
 're',
 'register_namespace',
 'sys',
 'tostring',
 'tostringlist',