# Quick review: writing and reading files

### Quick review of writing to a CSV file

In [11]:
import csv

import pandas as pd
from faker import Faker

In [2]:
# newline='' is what the csv module asks for on files handed to csv.writer: the writer
# emits its own '\r\n' terminators, and without this setting platforms that translate
# '\n' on write would turn them into '\r\r\n'.
# NOTE(review): the handle stays open until the explicit output.close() cell further down,
# and the path is an absolute local one — consider a configurable data directory.
output = open('/home/parallels/local/data-engineering-practice/data1.csv', 'w', newline='')

In [3]:
# One shared Faker generator; no seed is set here, so generated values differ between runs.
fake = Faker()

In [4]:
# Column names for the CSV header row, in the order the row values are written.
headers = [
    "name",
    "age",
    "street",
    "city",
    "state",
    "zipcode",
    "phone",
    "email",
]

In [5]:
# Wrap the open file handle in a CSV writer and emit the header row first.
myWriter = csv.writer(output)
# As the cell's last expression, writerow's return value is what gets displayed below.
myWriter.writerow(headers)

48

In [6]:
# Emit 100 synthetic person records, one CSV row each; field order matches `headers`.
for _ in range(100):
    record = [
        fake.name(),
        fake.random_int(min=0, max=120),
        fake.street_address(),
        fake.city(),
        fake.state(),
        fake.zipcode(),
        fake.phone_number(),
        fake.email(),
    ]
    myWriter.writerow(record)

In [7]:
# Close the handle opened earlier so buffered rows are flushed to disk before the file is read back.
output.close()

### Reading CSV files using the `with` context manager

In [10]:
# Read the CSV back; the context manager closes the file even if iteration fails.
# newline='' lets the csv module do its own line-ending handling, which is required
# for quoted fields containing embedded newlines to be parsed correctly.
with open('/home/parallels/local/data-engineering-practice/data1.csv', 'r', newline='') as f:
    # DictReader takes the first row as field names, so each row is keyed by header.
    myReader = csv.DictReader(f)

    for row in myReader:
        print(row['name'])

Katherine Davis
Shawn Jones
Michael Bryant
Michelle Jackson
James Nunez
Courtney Bailey
Gary Mcmillan
Betty Webb
Jamie Payne
Bryan Robles
Rebecca Rice
Marissa Lewis
Laurie Owens
Kevin Warner
Michelle Quinn
Duane Gray
Jeffrey May
Andrew Valencia
Alejandro Thompson
Daniel Harris
Dale Hall
Steven Warner
Jordan Silva
Anna Mays
Shannon Edwards
Marilyn Johnson
Wendy Rodriguez
Alexandria Parsons
Anthony Garcia
Charles Jones
Amanda Rosario
Linda May
Sue Dunn
Sydney Anderson
Wendy Small
Shawn Dunn
Jacob Greene
Juan Thompson
Brittany Osborne
David Smith
Kevin Johnson
Philip Mckay
Michael Underwood
Katie Weeks
Monica Hood
Jacob Williams
Heather Castro
Cindy Jackson
Amy Moore
Thomas Mason
Jessica Wall
Heather Ayers
Brian Stewart
Raymond Fisher
Karen Kelley
Arthur Anderson
Carol Greer
William Robertson
Jennifer Mahoney
Christopher Tapia
Linda Klein
Karen Pollard
Jade Gibbs
Jeffrey Moore
Elizabeth Mendez
Mary Sparks
Teresa Brooks
Ashley Jackson
Christopher Johnson PhD
Tracey Clark
Dr. Robert Boyd
Ri

### JSON Files

In [14]:
from faker import Faker
import json

In [16]:
faker = Faker()

with open('data2.json', 'w') as f:
    # Ten fake people collected under a single top-level "records" key.
    data = {
        'records': [
            {'name': faker.name(), 'age': faker.random_int(min=0, max=120)}
            for _ in range(10)
        ]
    }

    # Serialise the whole document in one call.
    json.dump(data, f)

In [41]:
# Load the whole document back, then pick one field of the second record.
with open('data2.json', 'r') as f:
    data = json.load(f)

second_record = data['records'][1]
print(second_record['name'])

Gabriel Petersen


In [19]:
import pandas.io.json as pd_json

In [28]:
# Walk through the three representations: raw JSON text -> dict -> DataFrame.
# Uses the stdlib json parser and the public pd.json_normalize entry point instead of
# the pandas.io.json aliases, which are deprecated / no longer exported by newer pandas.
with open('data2.json', 'r') as f:
    raw_json = f.read()
print(type(raw_json))

data = json.loads(raw_json)  # JSON string -> dictionary
print(type(data))

# record_path='records' selects the list of dicts under that key; each dict becomes a row.
df = pd.json_normalize(data, record_path='records')  # dictionary -> DataFrame
print(type(df))
print(df.head())

<class 'str'>
<class 'dict'>
<class 'pandas.core.frame.DataFrame'>
                 name  age
0        Olivia Boyle  112
1    Gabriel Petersen   23
2   William Henderson  100
3      Robert Johnson   20
4  Angelica Zimmerman   88


  df = pd_json.json_normalize(data, record_path='records') # dictionary to dataframe


In [51]:
# read_json maps the single top-level "records" key to one column whose cells are whole dicts
# (see the output below), so the name/age fields are not yet separate columns.
df = pd.read_json('data2.json')
df

Unnamed: 0,records
0,"{'name': 'Olivia Boyle', 'age': 112}"
1,"{'name': 'Gabriel Petersen', 'age': 23}"
2,"{'name': 'William Henderson', 'age': 100}"
3,"{'name': 'Robert Johnson', 'age': 20}"
4,"{'name': 'Angelica Zimmerman', 'age': 88}"
5,"{'name': 'Joshua Bailey', 'age': 105}"
6,"{'name': 'Russell Mann', 'age': 16}"
7,"{'name': 'James Klein', 'age': 16}"
8,"{'name': 'Maria Hood', 'age': 88}"
9,"{'name': 'Latasha Moon', 'age': 30}"


In [52]:
# Each element of this column is a full record dict (dtype object), not scalar values.
df['records']

0         {'name': 'Olivia Boyle', 'age': 112}
1      {'name': 'Gabriel Petersen', 'age': 23}
2    {'name': 'William Henderson', 'age': 100}
3        {'name': 'Robert Johnson', 'age': 20}
4    {'name': 'Angelica Zimmerman', 'age': 88}
5        {'name': 'Joshua Bailey', 'age': 105}
6          {'name': 'Russell Mann', 'age': 16}
7           {'name': 'James Klein', 'age': 16}
8            {'name': 'Maria Hood', 'age': 88}
9          {'name': 'Latasha Moon', 'age': 30}
Name: records, dtype: object

Reading the file with `read_json` alone produces the structure above: a single `records` column whose values are dictionaries. This is why normalization is needed to get one column per field.

https://pandas.pydata.org/docs/reference/api/pandas.json_normalize.html

In [58]:
# Rebuild the flat frame straight from the file rather than from the previous value of `df`:
# the old form (df = json_normalize(df['records'])) consumed its own output, so running the
# cell a second time raised KeyError because the normalized frame has no 'records' column.
records = pd.read_json('data2.json')['records']
df = pd.json_normalize(records.tolist())  # list of dicts -> one column per key

In [60]:
# Default orient for a DataFrame: {column -> {index label -> value}}.
df.head(3).to_json()

'{"name":{"0":"Olivia Boyle","1":"Gabriel Petersen","2":"William Henderson"},"age":{"0":112,"1":23,"2":100}}'

In [61]:
# orient='records' serialises each row as its own JSON object inside a list.
df.head(3).to_json(orient='records')

'[{"name":"Olivia Boyle","age":112},{"name":"Gabriel Petersen","age":23},{"name":"William Henderson","age":100}]'