In [22]:
# Dependency imports
import re
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to subsequent matplotlib plots
sns.set()

%matplotlib inline

# Notebook customizations
from IPython.display import display
# Show every column and never truncate cell contents when a frame is displayed.
pd.options.display.max_columns = None
# None means "no limit"; the former -1 sentinel was deprecated in pandas 1.0
# in favour of None.
pd.options.display.max_colwidth = None

### Properties data prep

In [23]:
# Load the scraped property records (pipe-delimited, latin-1 encoded).
# zip_code is read as text rather than being parsed as a number.
properties_df = pd.read_csv(
    "./housingWebScraper/housingWebScraper/output/Property-lastrun.csv",
    sep="|",
    dtype={'zip_code': str},
    encoding="latin1",
)
properties_df

Unnamed: 0,address,area_sqft,basement_type,built_year,city,exterior_type,heating_type,lot_size,no_of_baths,no_of_bedrooms,no_of_parking_spaces,no_of_stories,parking_type,property_type,property_url,state,tax_amount,tax_year,zip_code
0,160 162 Old Peckslip Rd,1352,,1985.0,Holmes,,,0.74 acres,3.0,3.0,,,,Multi-Family,https://www.trulia.com/p/ny/holmes/160-162-old-peckslip-rd-holmes-ny-12531--2349467107,NY,"$7,718.79",,12531
1,10 Cliff Ct,1184,,1997.0,Holmes,,,1.4 acres,2.5,3.0,,,,Single-Family Home,https://www.trulia.com/p/ny/holmes/10-cliff-ct-holmes-ny-12531--2305202587,NY,"$8,320.40",2017,12531
2,26 Donovan Ln,1890,,1992.0,Holmes,,,2.53 acres,2.5,3.0,,,,Single-Family Home,https://www.trulia.com/p/ny/holmes/26-donovan-ln-holmes-ny-12531--2349447123,NY,"$9,260.24",2017,12531
3,3130 Grand Concourse #7R,103883,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7r-bronx-ny-10458--2171936520,NY,,2017,10468
4,3130 Grand Concourse #7S,103883,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7s-bronx-ny-10458--2333495270,NY,,2017,10458
5,3130 Grand Concourse #7P,103883,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7p-bronx-ny-10458--2173798637,NY,,2017,10458
6,3130 Grand Concourse #7N,103883,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7n-bronx-ny-10458--2345496753,NY,,2017,10458
7,3184 Grand Concourse #4E,80811,,1965.0,Bronx,,,0.38 acres,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3184-grand-concourse-4e-bronx-ny-10458--2345502809,NY,,2017,10458
8,4 Wheel Dr,,,,Craryville,,,4.0 acres,,,,,,Lot/Land,https://www.trulia.com/p/ny/craryville/4-wheel-dr-craryville-ny-12521--2204807965,NY,"$1,509.69",2017,12521
9,Winding Ln,,,,Craryville,,,"7,405 sqft",,,,,,Lot/Land,https://www.trulia.com/p/ny/craryville/winding-ln-craryville-ny-12521--2158446526,NY,$31.29,2017,12521


In [24]:
# Frequency of each distinct tax_year value (exposes mis-parsed entries)
properties_df['tax_year'].value_counts()

2017                     93439
3 Beds Price             12   
4 Beds Price             10   
3 Beds                   4    
6 Beds                   3    
1 day on Trulia          3    
4 Beds                   3    
6 Beds Price             1    
1 day on Trulia Price    1    
5 Beds Price             1    
7 Beds Price             1    
9 Beds                   1    
8 Beds Price             1    
5 Beds                   1    
Name: tax_year, dtype: int64

#### The crawler needs to be updated to handle these improperly parsed records.
#### This is not really the crawler's fault: these records come from a different page design that the crawler does not support yet. Until that support is added, we need to ensure such pages are ignored in the first place.

In [25]:
# Inspect the records that were scraped without an address
properties_df[properties_df['address'].isna()]

Unnamed: 0,address,area_sqft,basement_type,built_year,city,exterior_type,heating_type,lot_size,no_of_baths,no_of_bedrooms,no_of_parking_spaces,no_of_stories,parking_type,property_type,property_url,state,tax_amount,tax_year,zip_code
4585,,1674,,1955.0,,,,1458 sqft,4.0,6.0,,,,,https://www.trulia.com/p/ny/bronx/2827-valentine-ave-bronx-ny-10458--68856,,"1,674 sqft",6 Beds Price,
8573,,1825,,1928.0,,,,,1.0,3.0,,,,,https://www.trulia.com/p/ny/new-rochelle/100-pelham-rd-1-b-new-rochelle-ny-10805--1001820853?rd=1,,$148/sqft,3 Beds Price,
8950,,2276,,1924.0,,,,8420 sqft,2.0,3.0,,,,,https://www.trulia.com/p/ny/new-rochelle/38-leland-ave-new-rochelle-ny-10805--2009254748,,"2,276 sqft",3 Beds Price,
9365,,,,,,,,,,,,,,,https://www.trulia.com/p/ny/new-rochelle/address-not-disclosed-new-rochelle-ny-10805--2009254071?rd=1,,,,
9661,,,,,,,,,,,,,,,https://www.trulia.com/c/ny/new-rochelle/harbor-house-3-davenport-ave-new-rochelle-ny-10805--2123143985?rd=1,,,,
11535,,3937,,1955.0,,,,0.40 acres,5.0,6.0,,,,,https://www.trulia.com/p/ny/bronxville/8-oakledge-rd-bronxville-ny-10708--2009226050,,"3,937 sqft",6 Beds,
11614,,,,,,,,,,,,,,,https://www.trulia.com/p/ny/yonkers/128-winnebago-rd-yonkers-ny-10710--2009230033,,,,
14123,,,,,,,,,,,,,,,https://www.trulia.com/p/ny/bronxville/230-pondfield-rd-bronxville-ny-10708--1103311026,,,,
15395,,,,,,,,,,,,,,,https://www.trulia.com/p/ny/yonkers/address-not-disclosed-yonkers-ny-10704--2009207400?rd=1,,,,
17320,,,,,,,,,,,,,,,https://www.trulia.com/p/ny/new-rochelle/17-horton-ave-new-rochelle-ny-10801--2009238629,,,,


#### Remove anomalous records

In [26]:
# Keep only the records that have an address and renumber the index from 0.
# The non-inplace reset_index returns a brand-new frame, so later column
# assignments on properties_df do not trigger SettingWithCopyWarning (which
# happens when a filtered slice is kept and mutated in place).
properties_df = (
    properties_df[properties_df['address'].notnull()]
    .reset_index(drop=True)
)
properties_df

Unnamed: 0,address,area_sqft,basement_type,built_year,city,exterior_type,heating_type,lot_size,no_of_baths,no_of_bedrooms,no_of_parking_spaces,no_of_stories,parking_type,property_type,property_url,state,tax_amount,tax_year,zip_code
0,160 162 Old Peckslip Rd,1352,,1985.0,Holmes,,,0.74 acres,3.0,3.0,,,,Multi-Family,https://www.trulia.com/p/ny/holmes/160-162-old-peckslip-rd-holmes-ny-12531--2349467107,NY,"$7,718.79",,12531
1,10 Cliff Ct,1184,,1997.0,Holmes,,,1.4 acres,2.5,3.0,,,,Single-Family Home,https://www.trulia.com/p/ny/holmes/10-cliff-ct-holmes-ny-12531--2305202587,NY,"$8,320.40",2017,12531
2,26 Donovan Ln,1890,,1992.0,Holmes,,,2.53 acres,2.5,3.0,,,,Single-Family Home,https://www.trulia.com/p/ny/holmes/26-donovan-ln-holmes-ny-12531--2349447123,NY,"$9,260.24",2017,12531
3,3130 Grand Concourse #7R,103883,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7r-bronx-ny-10458--2171936520,NY,,2017,10468
4,3130 Grand Concourse #7S,103883,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7s-bronx-ny-10458--2333495270,NY,,2017,10458
5,3130 Grand Concourse #7P,103883,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7p-bronx-ny-10458--2173798637,NY,,2017,10458
6,3130 Grand Concourse #7N,103883,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7n-bronx-ny-10458--2345496753,NY,,2017,10458
7,3184 Grand Concourse #4E,80811,,1965.0,Bronx,,,0.38 acres,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3184-grand-concourse-4e-bronx-ny-10458--2345502809,NY,,2017,10458
8,4 Wheel Dr,,,,Craryville,,,4.0 acres,,,,,,Lot/Land,https://www.trulia.com/p/ny/craryville/4-wheel-dr-craryville-ny-12521--2204807965,NY,"$1,509.69",2017,12521
9,Winding Ln,,,,Craryville,,,"7,405 sqft",,,,,,Lot/Land,https://www.trulia.com/p/ny/craryville/winding-ln-craryville-ny-12521--2158446526,NY,$31.29,2017,12521


In [27]:
# Number of records per property type
properties_df['property_type'].value_counts()

Single-Family Home           60522
Multi-Family                 25539
Lot/Land                     6095 
Townhouse                    1421 
Unknown                      932  
Coop                         479  
Farm/Ranch                   355  
Income/Investment            181  
Condo                        172  
Mobile/Manufactured          148  
Apartment/Condo/Townhouse    2    
Apartment                    2    
Name: property_type, dtype: int64

In [28]:
# Number of records per state
properties_df['state'].value_counts()

NY    95845
Name: state, dtype: int64

#### Convert certain columns, such as area and currency, from strings to numbers, and make them uniform

In [29]:
# Strip thousands separators / currency symbols and convert to floats.
# `regex` is passed explicitly: the default of Series.str.replace changed from
# True to False in pandas 2.0, which would turn r'\$|,' into a literal pattern
# that never matches and make the float conversion fail.
properties_df['area_sqft'] = (
    properties_df['area_sqft'].str.replace(',', '', regex=False).astype(float)
)
properties_df['tax_amount'] = (
    properties_df['tax_amount'].str.replace(r'\$|,', '', regex=True).astype(float)
)
properties_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,address,area_sqft,basement_type,built_year,city,exterior_type,heating_type,lot_size,no_of_baths,no_of_bedrooms,no_of_parking_spaces,no_of_stories,parking_type,property_type,property_url,state,tax_amount,tax_year,zip_code
0,160 162 Old Peckslip Rd,1352.0,,1985.0,Holmes,,,0.74 acres,3.0,3.0,,,,Multi-Family,https://www.trulia.com/p/ny/holmes/160-162-old-peckslip-rd-holmes-ny-12531--2349467107,NY,7718.79,,12531
1,10 Cliff Ct,1184.0,,1997.0,Holmes,,,1.4 acres,2.5,3.0,,,,Single-Family Home,https://www.trulia.com/p/ny/holmes/10-cliff-ct-holmes-ny-12531--2305202587,NY,8320.40,2017,12531
2,26 Donovan Ln,1890.0,,1992.0,Holmes,,,2.53 acres,2.5,3.0,,,,Single-Family Home,https://www.trulia.com/p/ny/holmes/26-donovan-ln-holmes-ny-12531--2349447123,NY,9260.24,2017,12531
3,3130 Grand Concourse #7R,103883.0,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7r-bronx-ny-10458--2171936520,NY,,2017,10468
4,3130 Grand Concourse #7S,103883.0,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7s-bronx-ny-10458--2333495270,NY,,2017,10458
5,3130 Grand Concourse #7P,103883.0,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7p-bronx-ny-10458--2173798637,NY,,2017,10458
6,3130 Grand Concourse #7N,103883.0,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7n-bronx-ny-10458--2345496753,NY,,2017,10458
7,3184 Grand Concourse #4E,80811.0,,1965.0,Bronx,,,0.38 acres,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3184-grand-concourse-4e-bronx-ny-10458--2345502809,NY,,2017,10458
8,4 Wheel Dr,,,,Craryville,,,4.0 acres,,,,,,Lot/Land,https://www.trulia.com/p/ny/craryville/4-wheel-dr-craryville-ny-12521--2204807965,NY,1509.69,2017,12521
9,Winding Ln,,,,Craryville,,,"7,405 sqft",,,,,,Lot/Land,https://www.trulia.com/p/ny/craryville/winding-ln-craryville-ny-12521--2158446526,NY,31.29,2017,12521


In [30]:
def make_area_uniform(row):
    '''
        Converts the lot size of one record to square feet.

        Reads row['lot_size'], which is expected to look like
        "<number> <unit>" (e.g. "0.74 acres" or "7,405 sqft"), and overwrites
        it with the equivalent number of square feet as a float.

        Values that are missing or already numeric are left untouched, so the
        function can safely be applied more than once to the same data.
        Values that cannot be interpreted (unknown unit, malformed number,
        unexpected text) are reported with print() and left unchanged.

        Parameters:
            row: mapping-like object with a 'lot_size' entry, e.g. the row
                 passed by DataFrame.apply(..., axis=1).

        Returns:
            The same row object, with 'lot_size' replaced when a conversion
            was possible.
    '''

    # Multipliers that turn each supported unit into square feet
    sqft_per_unit = {'acre': 43560, 'acres': 43560, 'sqft': 1}

    lot_size = row['lot_size']

    # Nothing to do for missing values or values that were already converted
    if isinstance(lot_size, (int, float, np.integer, np.floating)) or pd.isna(lot_size):
        return row

    # Unit matching is case-insensitive so that e.g. "Acres" is also accepted
    match = re.match(r"\s*(?P<area>[0-9,\.]+)\s+(?P<unit>[a-z]+)\s*$", str(lot_size), re.IGNORECASE)
    if match is None:
        print("Unhandled value in make_area_uniform " + str(lot_size))
        return row

    unit = match.group('unit').lower()
    if unit not in sqft_per_unit:
        print("Unhandled unit for area - " + unit + ". Handle it in make_area_uniform function")
        return row

    try:
        area_in_float = float(match.group('area').replace(',', ''))
    except ValueError:
        # The character class also accepts strings such as "1.2.3" or ",";
        # report them instead of aborting the whole DataFrame.apply run
        print("Unhandled value in make_area_uniform " + str(lot_size))
        return row

    row['lot_size'] = area_in_float * sqft_per_unit[unit]
    return row

# Normalise the lot size of every record, one row at a time
properties_df = properties_df.apply(make_area_uniform, axis=1)
properties_df

Unnamed: 0,address,area_sqft,basement_type,built_year,city,exterior_type,heating_type,lot_size,no_of_baths,no_of_bedrooms,no_of_parking_spaces,no_of_stories,parking_type,property_type,property_url,state,tax_amount,tax_year,zip_code
0,160 162 Old Peckslip Rd,1352.0,,1985.0,Holmes,,,32234.4,3.0,3.0,,,,Multi-Family,https://www.trulia.com/p/ny/holmes/160-162-old-peckslip-rd-holmes-ny-12531--2349467107,NY,7718.79,,12531
1,10 Cliff Ct,1184.0,,1997.0,Holmes,,,60984.0,2.5,3.0,,,,Single-Family Home,https://www.trulia.com/p/ny/holmes/10-cliff-ct-holmes-ny-12531--2305202587,NY,8320.40,2017,12531
2,26 Donovan Ln,1890.0,,1992.0,Holmes,,,110206.8,2.5,3.0,,,,Single-Family Home,https://www.trulia.com/p/ny/holmes/26-donovan-ln-holmes-ny-12531--2349447123,NY,9260.24,2017,12531
3,3130 Grand Concourse #7R,103883.0,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7r-bronx-ny-10458--2171936520,NY,,2017,10468
4,3130 Grand Concourse #7S,103883.0,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7s-bronx-ny-10458--2333495270,NY,,2017,10458
5,3130 Grand Concourse #7P,103883.0,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7p-bronx-ny-10458--2173798637,NY,,2017,10458
6,3130 Grand Concourse #7N,103883.0,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7n-bronx-ny-10458--2345496753,NY,,2017,10458
7,3184 Grand Concourse #4E,80811.0,,1965.0,Bronx,,,16552.8,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3184-grand-concourse-4e-bronx-ny-10458--2345502809,NY,,2017,10458
8,4 Wheel Dr,,,,Craryville,,,174240.0,,,,,,Lot/Land,https://www.trulia.com/p/ny/craryville/4-wheel-dr-craryville-ny-12521--2204807965,NY,1509.69,2017,12521
9,Winding Ln,,,,Craryville,,,7405.0,,,,,,Lot/Land,https://www.trulia.com/p/ny/craryville/winding-ln-craryville-ny-12521--2158446526,NY,31.29,2017,12521


In [31]:
# Print each column's dtype and non-null count
properties_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95848 entries, 0 to 95847
Data columns (total 19 columns):
address                 95848 non-null object
area_sqft               61567 non-null float64
basement_type           0 non-null float64
built_year              60198 non-null float64
city                    95845 non-null object
exterior_type           0 non-null float64
heating_type            0 non-null float64
lot_size                88123 non-null float64
no_of_baths             53205 non-null float64
no_of_bedrooms          47393 non-null float64
no_of_parking_spaces    0 non-null float64
no_of_stories           0 non-null float64
parking_type            0 non-null float64
property_type           95848 non-null object
property_url            95848 non-null object
state                   95845 non-null object
tax_amount              29468 non-null float64
tax_year                93439 non-null object
zip_code                95845 non-null object
dtypes: float64(12), object(7)

In [32]:
# Convert the columns to their proper data types.
# Missing years are filled with 0 so that both year columns can be cast to
# integers (0 therefore means "unknown year" downstream).
properties_df['built_year'] = properties_df['built_year'].fillna(value=0)
properties_df['tax_year'] = properties_df['tax_year'].fillna(value=0)
# The builtin float/int are used instead of np.float/np.int: those aliases
# pointed at the builtins and were removed in NumPy 1.24.
properties_df = properties_df.astype(dtype={
    'area_sqft': float,
    'built_year': int,
    'lot_size': float,
    'no_of_baths': float,
    'no_of_bedrooms': float,
    'tax_amount': float,
    'tax_year': int,
})
properties_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95848 entries, 0 to 95847
Data columns (total 19 columns):
address                 95848 non-null object
area_sqft               61567 non-null float64
basement_type           0 non-null float64
built_year              95848 non-null int64
city                    95845 non-null object
exterior_type           0 non-null float64
heating_type            0 non-null float64
lot_size                88123 non-null float64
no_of_baths             53205 non-null float64
no_of_bedrooms          47393 non-null float64
no_of_parking_spaces    0 non-null float64
no_of_stories           0 non-null float64
parking_type            0 non-null float64
property_type           95848 non-null object
property_url            95848 non-null object
state                   95845 non-null object
tax_amount              29468 non-null float64
tax_year                95848 non-null int64
zip_code                95845 non-null object
dtypes: float64(11), int64(2), object(6)

### Transactions data prep

In [33]:
# Load the scraped transaction records (pipe-delimited, latin-1 encoded),
# parsing the two date columns into datetimes while reading.
transactions_df = pd.read_csv(
    "./housingWebScraper/housingWebScraper/output/Transaction-lastrun.csv",
    sep="|",
    encoding="latin1",
    parse_dates=['recording_date', 'contract_date'],
)
transactions_df

Unnamed: 0,contract_date,county_transfer_tax,document_type,price,property_url,recording_date,total_transfer_tax,transaction_type
0,1995-11-17,,Deed,"$146,796",https://www.trulia.com/p/ny/holmes/160-162-old-peckslip-rd-holmes-ny-12531--2349467107,1995-12-12,,Purchase/Resale Arm's Length Residential Transaction
1,2001-06-25,,Deed,"$224,000",https://www.trulia.com/p/ny/holmes/10-cliff-ct-holmes-ny-12531--2305202587,2001-07-30,,Purchase/Resale Arm's Length Residential Transaction
2,2013-09-19,,Deed,"$70,000",https://www.trulia.com/p/ny/craryville/4-wheel-dr-craryville-ny-12521--2204807965,2013-09-19,,Insured Non-Residential Grant Deed
3,2006-12-28,,Deed,"$42,500",https://www.trulia.com/p/ny/craryville/4-wheel-dr-craryville-ny-12521--2204807965,2007-01-03,,Insured Non-Residential Grant Deed
4,2018-11-08,,Deed,"$325,000",https://www.trulia.com/p/ny/craryville/10-wang-dr-craryville-ny-12521--2011547166,2018-11-27,,Purchase/Resale Arm's Length Residential Transaction
5,2016-08-12,,Deed,"$122,000",https://www.trulia.com/p/ny/craryville/10-wang-dr-craryville-ny-12521--2011547166,2016-08-15,,Purchase/Resale Arm's Length Residential Transaction
6,2004-07-21,,Deed,"$130,000",https://www.trulia.com/p/ny/craryville/10-wang-dr-craryville-ny-12521--2011547166,2004-07-21,,Purchase/Resale Arm's Length Residential Transaction
7,1998-08-27,,Deed,"$385,000",https://www.trulia.com/p/ny/craryville/126-taghkanic-churchtown-rd-craryville-ny-12521--2349607986,1998-08-28,,Non-Arm's Length Transaction
8,2009-04-17,,Deed,"$437,500",https://www.trulia.com/p/ny/craryville/110-taghkanic-churchtown-rd-craryville-ny-12521--2011547172,2009-04-17,,Purchase/Resale Arm's Length Residential Transaction
9,2004-03-08,,Deed,"$363,750",https://www.trulia.com/p/ny/craryville/110-taghkanic-churchtown-rd-craryville-ny-12521--2011547172,2004-03-09,,Insured Non-Residential Grant Deed


In [34]:
# Number of transactions per transaction type
transactions_df['transaction_type'].value_counts()

Purchase/Resale Arm's Length Residential Transaction    59427
Insured Non-Residential Grant Deed                      6750 
REO and Trustee Deed                                    1085 
Non-Arm's Length Transaction                            797  
New Residential Construction Transaction                626  
Name: transaction_type, dtype: int64

In [35]:
# Number of transactions per document type
transactions_df['document_type'].value_counts()

Bargain and Sale Deed                                                                                                                                                                                                   41715
Deed                                                                                                                                                                                                                    17104
Other                                                                                                                                                                                                                   3131 
Executor's Deed                                                                                                                                                                                                         2860 
REO Resale                                                                                                      

#### Let's format the currency fields

In [36]:
# Strip "$" and thousands separators from every currency column and convert
# the result to floats. `regex=True` is explicit because the default of
# Series.str.replace changed to False in pandas 2.0, which would treat
# r'\$|,' as a literal string and break the float conversion.
for currency_col in ('price', 'county_transfer_tax', 'total_transfer_tax'):
    transactions_df[currency_col] = (
        transactions_df[currency_col].str.replace(r'\$|,', '', regex=True).astype(float)
    )
transactions_df

Unnamed: 0,contract_date,county_transfer_tax,document_type,price,property_url,recording_date,total_transfer_tax,transaction_type
0,1995-11-17,,Deed,146796.0,https://www.trulia.com/p/ny/holmes/160-162-old-peckslip-rd-holmes-ny-12531--2349467107,1995-12-12,,Purchase/Resale Arm's Length Residential Transaction
1,2001-06-25,,Deed,224000.0,https://www.trulia.com/p/ny/holmes/10-cliff-ct-holmes-ny-12531--2305202587,2001-07-30,,Purchase/Resale Arm's Length Residential Transaction
2,2013-09-19,,Deed,70000.0,https://www.trulia.com/p/ny/craryville/4-wheel-dr-craryville-ny-12521--2204807965,2013-09-19,,Insured Non-Residential Grant Deed
3,2006-12-28,,Deed,42500.0,https://www.trulia.com/p/ny/craryville/4-wheel-dr-craryville-ny-12521--2204807965,2007-01-03,,Insured Non-Residential Grant Deed
4,2018-11-08,,Deed,325000.0,https://www.trulia.com/p/ny/craryville/10-wang-dr-craryville-ny-12521--2011547166,2018-11-27,,Purchase/Resale Arm's Length Residential Transaction
5,2016-08-12,,Deed,122000.0,https://www.trulia.com/p/ny/craryville/10-wang-dr-craryville-ny-12521--2011547166,2016-08-15,,Purchase/Resale Arm's Length Residential Transaction
6,2004-07-21,,Deed,130000.0,https://www.trulia.com/p/ny/craryville/10-wang-dr-craryville-ny-12521--2011547166,2004-07-21,,Purchase/Resale Arm's Length Residential Transaction
7,1998-08-27,,Deed,385000.0,https://www.trulia.com/p/ny/craryville/126-taghkanic-churchtown-rd-craryville-ny-12521--2349607986,1998-08-28,,Non-Arm's Length Transaction
8,2009-04-17,,Deed,437500.0,https://www.trulia.com/p/ny/craryville/110-taghkanic-churchtown-rd-craryville-ny-12521--2011547172,2009-04-17,,Purchase/Resale Arm's Length Residential Transaction
9,2004-03-08,,Deed,363750.0,https://www.trulia.com/p/ny/craryville/110-taghkanic-churchtown-rd-craryville-ny-12521--2011547172,2004-03-09,,Insured Non-Residential Grant Deed


In [37]:
# Print each column's dtype and non-null count
transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72228 entries, 0 to 72227
Data columns (total 8 columns):
contract_date          68608 non-null datetime64[ns]
county_transfer_tax    2143 non-null float64
document_type          68418 non-null object
price                  72228 non-null float64
property_url           72228 non-null object
recording_date         72228 non-null datetime64[ns]
total_transfer_tax     59646 non-null float64
transaction_type       68685 non-null object
dtypes: datetime64[ns](2), float64(3), object(3)
memory usage: 4.4+ MB


### Writing to intermediate files to use for EDA

In [38]:
# Write the cleaned property records as an unquoted, pipe-delimited file
# without the index column.
# NOTE(review): with QUOTE_NONE and no escapechar the csv writer raises if a
# field contains the delimiter or quote character - confirm the data has none.
properties_df.to_csv(
    "./output/engineered_trulia_properties.csv",
    sep="|",
    index=False,
    quoting=csv.QUOTE_NONE,
)

In [39]:
# Write the cleaned transaction records as an unquoted, pipe-delimited file
# without the index column.
transactions_df.to_csv(
    "./output/engineered_trulia_transactions.csv",
    sep="|",
    index=False,
    quoting=csv.QUOTE_NONE,
)