In [3]:
import pandas as pd

# Working with Files

In [4]:
# data is often loaded from databases, CSV files, APIs, and large data stores

In [None]:
# loading data with Pandas

# CSV files

In [6]:
# comma separated value files
# structured tabular data

# read_csv takes the file path as a string and returns a DataFrame object
df = pd.read_csv('purchases.csv')
# the first header field in the file is empty, so pandas labels that
#   column 'Unnamed: 0' in the printed frame
print(df)

  Unnamed: 0 country  ad_views  items_purchased
0     George      US        16                2
1       John     CAN        42                1
2     Thomas     CAN        32                0
3      James      US        13                8
4     Andrew     CAN        63                0
5     Martin      US        19                5
6    William      US        65                7
7    Zachary      US        23                3
8    Millard     CAN        16                0
9   Franklin      US        77                5


In [8]:
# to output data to a file
# index=False leaves the auto-generated 0..N row labels out of the file, so
#   reading it back with read_csv does not add an extra 'Unnamed: N' column
df.to_csv('my_data.csv', index=False)

# creates a new file (or overwrites an existing one) and adds our data to it

# JSON

In [9]:
# JavaScript Object Notation
# JSON and XML allow for more customizable and flexible data storage
# semi-structured files
# additional complexity as a result

In [10]:
# a collection of key-value pairs, like dictionaries

In [11]:
# read_json takes the path to a JSON file and returns a DataFrame
df = pd.read_json('purchases.json')

In [12]:
# a bare expression on the last line of a cell is displayed as the cell's output
df

Unnamed: 0.1,Unnamed: 0,ad_views,country,items_purchased
0,George,16,US,2
1,John,42,CAN,1
2,Thomas,32,CAN,0
3,James,13,US,8
4,Andrew,63,CAN,0
5,Martin,19,US,5
6,William,65,US,7
7,Zachary,23,US,3
8,Millard,16,CAN,0
9,Franklin,77,US,5


In [13]:
# JSON output: passing a path writes the frame to that file
df.to_json('my_data.json')

In [14]:
# to simply create a JSON string, call to_json() without a path;
#   the serialized frame is then returned instead of being written to a file
serialized_purchases = df.to_json()

# this allows us to send this data over the web

# XML

In [17]:
# extensible markup language
# hierarchical semi-structured data format

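# pandas versions before 1.3 have no read_xml equivalent (newer ones provide pd.read_xml)
# here we parse the file by hand with the standard-library xml module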

In [19]:
import xml.etree.ElementTree as ET


# load and parse the XML file into an element tree
tree = ET.parse('purchases.xml')

# Find the root of the tree. This is the node of the tree where
#   we start the iteration; looping over it visits its child elements
root = tree.getroot()

# Custom helper: walk the tree two levels deep and collect the text
#   of every inner element into a two-dimensional list
def xml_to_list(root):
    """Return the text of each child's sub-elements as a list of lists.

    Every direct child of ``root`` becomes one row, and every element
    directly under that child becomes one cell holding the element's
    ``.text`` (``None`` when the element carries no text).
    """
    return [[cell.text for cell in record] for record in root]

# Feed our two-dimensional list into a pandas DataFrame.
# No column names are passed, so the columns are labelled 0, 1, 2, ...
df = pd.DataFrame(xml_to_list(root))
print(df)

          0    1   2  3
0    George   US  16  2
1      John  CAN  42  1
2    Thomas  CAN  32  0
3     James   US  13  8
4    Andrew  CAN  63  0
5    Martin   US  19  5
6   William   US  65  7
7   Zachary   US  23  3
8   Millard  CAN  16  0
9  Franklin   US  77  5


In [20]:
# the xml library reads in XML files and converts them to an element tree,
# which we then manually process into a list

# when generating data files, use CSV and NOT xml

# Python open()

In [27]:
# open() falls back to a platform-dependent encoding when none is given,
#   so name it explicitly to read the file the same way on every machine
with open('poem.txt', encoding='utf-8') as poem_file:
    text = poem_file.readlines()
    print("This file is {} lines long".format(len(text)))
    for line in text:
        # each line still ends with its own newline and print() adds another,
        #   which is why the output is double-spaced
        print(line)

This file is 19 lines long
Beautiful is better than ugly.

Explicit is better than implicit.

Simple is better than complex.

Complex is better than complicated.

Flat is better than nested.

Sparse is better than dense.

Readability counts.

Special cases aren't special enough to break the rules.

Although practicality beats purity.

Errors should never pass silently.

Unless explicitly silenced.

In the face of ambiguity, refuse the temptation to guess.

There should be one-- and preferably only one --obvious way to do it.

Although that way may not be obvious at first unless you're Dutch.

Now is better than never.

Although never is often better than *right* now.

If the implementation is hard to explain, it's a bad idea.

If the implementation is easy to explain, it may be a good idea.

Namespaces are one honking great idea -- let's do more of those!


In [28]:
# open() falls back to a platform-dependent encoding when none is given,
#   so name it explicitly to read the file the same way on every machine
with open('purchases.csv', encoding='utf-8') as csv_file:
    lines = csv_file.readlines()
    for line in lines:
        # each line still ends with its own newline and print() adds another,
        #   which is why the output is double-spaced
        print(line)

,country,ad_views,items_purchased

George,US,16,2

John,CAN,42,1

Thomas,CAN,32,0

James,US,13,8

Andrew,CAN,63,0

Martin,US,19,5

William,US,65,7

Zachary,US,23,3

Millard,CAN,16,0

Franklin,US,77,5


In [29]:
# readlines() creates a list of strings
# each element is a line of text from the input file

# using 'with' automatically closes the file

# A word about encoding

In [None]:
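# all strings in Python 3 are Unicode strings
# UTF-8 is the default encoding for Python source code and str.encode()/bytes.decode();
#    open() uses the platform's locale encoding unless you pass encoding=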

# if you cannot use utf-8, you will need to guess the encoding and decode based
#    on educated guesses

# English-language versions of Windows use the cp1252 encoding