# Data processing workflow

## Sacramento Real Estate Transactions Dataset

In [1]:
import csv

In [16]:
# Read every CSV row into memory as a plain dict keyed by the header names.
# newline="" is how the csv module documents opening a file for a reader, so
# that line breaks embedded in quoted fields are handled by the parser itself.
with open("Sacramentorealestatetransactions.csv", newline="") as csvfile:
    reader = csv.DictReader(csvfile)
    data = [dict(row) for row in reader]

In [13]:
data[0]

{'street': '3526 HIGH ST',
 'city': 'SACRAMENTO',
 'zip': '95838',
 'state': 'CA',
 'beds': '2',
 'baths': '1',
 'sq__ft': '836',
 'type': 'Residential',
 'sale_date': 'Wed May 21 00:00:00 EDT 2008',
 'price': '59222',
 'latitude': '38.631913',
 'longitude': '-121.434879'}

Tasks
- turn 'zip' into `int`
- turn 'beds' into `int`
- turn 'price' into `int`
- turn 'latitude' into `float`
- turn 'longitude' into `float`
- capitalize 'city'
- rename 'sq__ft' to 'sq_ft'
- turn 'sale_date' into date format 'YYYY-MM-DD'

In [17]:
len(data)

985

Start this exercise with only one row

In [20]:
sample = data[0]

In [21]:
sample

{'street': '3526 HIGH ST',
 'city': 'SACRAMENTO',
 'zip': '95838',
 'state': 'CA',
 'beds': '2',
 'baths': '1',
 'sq__ft': '836',
 'type': 'Residential',
 'sale_date': 'Wed May 21 00:00:00 EDT 2008',
 'price': '59222',
 'latitude': '38.631913',
 'longitude': '-121.434879'}

## 1. Convert `str` to `int`

In [24]:
int(sample["zip"])

95838

In [25]:
int(sample["beds"])

2

In [26]:
int(sample["price"])

59222

In [29]:
type(int(sample["price"]))

int

## 2. Convert `str` to `float`

In [27]:
float(sample["latitude"])

38.631913

In [28]:
float(sample["longitude"])

-121.434879

In [30]:
type(float(sample["longitude"]))

float

## 3. Capitalize 'city'

In [33]:
# title() capitalizes every word, so a multi-word name such as "ELK GROVE"
# becomes "Elk Grove"; capitalize() would lowercase everything after the very
# first character and yield "Elk grove".
sample["city"].title()

'Sacramento'

## 4. Rename key 'sq__ft' to 'sq_ft'
- by removing duplicate underscores from keys

In [35]:
sample.keys()

dict_keys(['street', 'city', 'zip', 'state', 'beds', 'baths', 'sq__ft', 'type', 'sale_date', 'price', 'latitude', 'longitude'])

In [36]:
type(sample.keys())

dict_keys

In [37]:
list(sample.keys())

['street',
 'city',
 'zip',
 'state',
 'beds',
 'baths',
 'sq__ft',
 'type',
 'sale_date',
 'price',
 'latitude',
 'longitude']

In [43]:
# Collect the cleaned key names: collapse "__" to "_" where it occurs and
# carry every other key over unchanged.
new_keys = []
for key in list(sample.keys()):
    new_keys.append(key.replace("__", "_") if "__" in key else key)

In [44]:
new_keys

['street',
 'city',
 'zip',
 'state',
 'beds',
 'baths',
 'sq_ft',
 'type',
 'sale_date',
 'price',
 'latitude',
 'longitude']

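Simplify: `str.replace` returns the string unchanged when there is nothing to replace, so the `if` check is unnecessary.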

In [45]:
"street".replace("__", "_")

'street'

In [46]:
# Same key clean-up without the membership test: replace() leaves a key
# untouched when it contains no "__", so it can be applied to every key.
super_new_keys = []
for key in list(sample.keys()):
    super_new_keys.append(key.replace("__", "_"))

In [47]:
super_new_keys

['street',
 'city',
 'zip',
 'state',
 'beds',
 'baths',
 'sq_ft',
 'type',
 'sale_date',
 'price',
 'latitude',
 'longitude']

In [48]:
new_keys == super_new_keys

True

Put it into a list comprehension

In [49]:
[key.replace("__", "_") for key in list(sample.keys())]

['street',
 'city',
 'zip',
 'state',
 'beds',
 'baths',
 'sq_ft',
 'type',
 'sale_date',
 'price',
 'latitude',
 'longitude']

Iterate directly over the `dict_keys` object (no `list()` conversion needed)

In [50]:
[key.replace("__", "_") for key in sample.keys()]

['street',
 'city',
 'zip',
 'state',
 'beds',
 'baths',
 'sq_ft',
 'type',
 'sale_date',
 'price',
 'latitude',
 'longitude']

## 5. Parse date (YYYY-MM-DD) from 'sale_date'

using regex

In [51]:
sample["sale_date"]

'Wed May 21 00:00:00 EDT 2008'

In [52]:
import re

### Parse Year

In [55]:
# Four digits anchored to the end of the string, i.e. the trailing year.
# Written as a raw string so that "\d" is passed to the regex engine verbatim;
# in a plain literal "\d" is an invalid escape sequence that Python warns about.
year_pattern = r"\d{4}$"

In [54]:
re.findall(year_pattern, sample["sale_date"])

['2008']

In [58]:
re.findall(year_pattern, sample["sale_date"])[0]

'2008'

### Parse Month

In [None]:
# Map each three-letter English month abbreviation to its 1-based month number.
# All twelve months are listed so that any abbreviation can be looked up,
# including "May" from the sample row.
month_to_digit = {
    "Jan": 1,
    "Feb": 2,
    "Mar": 3,
    "Apr": 4,
    "May": 5,
    "Jun": 6,
    "Jul": 7,
    "Aug": 8,
    "Sep": 9,
    "Oct": 10,
    "Nov": 11,
    "Dec": 12,
}

In [60]:
# Month abbreviations in calendar order, one quarter per line;
# a month's list position plus one is its month number.
months = [
    "Jan", "Feb", "Mar",
    "Apr", "May", "Jun",
    "Jul", "Aug", "Sep",
    "Oct", "Nov", "Dec",
]

In [66]:
months.index("Jun") + 1

6

In [67]:
# First attempt at a month regex: match one run of three letters, lazily skip
# ahead, then capture the next run of three letters.
# The cells shown below use month_pattern_1 instead of this pattern.
month_pattern = "[a-zA-Z]{3}.*?([a-zA-Z]{3})"

In [70]:
# Three word characters, whitespace, then a captured group of three word
# characters followed by whitespace; the captured group is the month.
# Written as a raw string so that "\w" and "\s" are passed to the regex engine
# verbatim; in a plain literal they are invalid escape sequences that Python
# warns about.
month_pattern_1 = r"\w{3}\s(\w{3})\s"

In [71]:
re.findall(month_pattern_1, sample["sale_date"])[0]

'May'

---
without regex

In [80]:
sample["sale_date"][:7][-3:]

'May'

In [82]:
sample["sale_date"][4:7]

'May'

---