In [27]:
# Dependency imports
import re
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

%matplotlib inline

# Notebook customizations
from IPython.display import display
# Never truncate the column list when a frame is displayed.
pd.options.display.max_columns = None
# Never truncate cell contents (URLs are long). None is the "no limit" value on pandas >= 1.0;
# the legacy -1 sentinel is rejected by pandas 2.x, while pandas < 1.0 only accepts integers,
# hence the fallback.
try:
    pd.options.display.max_colwidth = None
except ValueError:
    pd.options.display.max_colwidth = -1
import warnings
warnings.filterwarnings('ignore')

### Properties data prep

In [28]:
# Load the scraped property listings (pipe-delimited, latin1-encoded).
properties_df = pd.read_csv(
    "./housingWebScraper/housingWebScraper/output/Property-lastrun.csv",
    sep="|",
    dtype={'zip_code': str},  # read zip_code as text instead of letting pandas infer a numeric type
    encoding="latin1",
)
properties_df.head()

Unnamed: 0,address,area_sqft,basement_type,built_year,city,exterior_type,heating_type,lot_size,no_of_baths,no_of_bedrooms,no_of_parking_spaces,no_of_stories,parking_type,property_type,property_url,state,tax_amount,tax_year,zip_code
0,160 162 Old Peckslip Rd,1352,,1985.0,Holmes,,,0.74 acres,3.0,3.0,,,,Multi-Family,https://www.trulia.com/p/ny/holmes/160-162-old-peckslip-rd-holmes-ny-12531--2349467107,NY,"$7,718.79",,12531
1,10 Cliff Ct,1184,,1997.0,Holmes,,,1.4 acres,2.5,3.0,,,,Single-Family Home,https://www.trulia.com/p/ny/holmes/10-cliff-ct-holmes-ny-12531--2305202587,NY,"$8,320.40",2017.0,12531
2,26 Donovan Ln,1890,,1992.0,Holmes,,,2.53 acres,2.5,3.0,,,,Single-Family Home,https://www.trulia.com/p/ny/holmes/26-donovan-ln-holmes-ny-12531--2349447123,NY,"$9,260.24",2017.0,12531
3,3130 Grand Concourse #7R,103883,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7r-bronx-ny-10458--2171936520,NY,,2017.0,10468
4,3130 Grand Concourse #7S,103883,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7s-bronx-ny-10458--2333495270,NY,,2017.0,10458


In [29]:
# Print the column dtypes and memory footprint of the freshly loaded frame.
properties_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1888154 entries, 0 to 1888153
Data columns (total 19 columns):
address                 object
area_sqft               object
basement_type           float64
built_year              float64
city                    object
exterior_type           float64
heating_type            float64
lot_size                object
no_of_baths             float64
no_of_bedrooms          float64
no_of_parking_spaces    float64
no_of_stories           float64
parking_type            float64
property_type           object
property_url            object
state                   object
tax_amount              object
tax_year                object
zip_code                object
dtypes: float64(9), object(10)
memory usage: 273.7+ MB


In [30]:
# tax_year was loaded as an object column, so list its distinct values to spot non-year entries.
properties_df.tax_year.value_counts()

2017                        1348565
2018                        472942 
2016                        1992   
4 Beds Price                323    
3 Beds Price                315    
2 Beds Price                91     
5 Beds Price                86     
3 Beds                      77     
4 Beds                      61     
6 Beds Price                41     
2 Beds                      28     
5 Beds                      26     
1 Bed Price                 23     
7 Beds Price                11     
6 Beds                      8      
1 day on Trulia Price       8      
6 days on Trulia Price      6      
1 Bed                       5      
8 Beds Price                5      
2 days on Trulia Price      4      
1 day on Trulia             4      
<1 day on Trulia Price      4      
2015                        4      
7 Beds                      3      
Built in 1900               3      
9 Beds Price                3      
Built in 1920 Price         3      
10 Beds Price               

#### The crawler needs to be updated to account for these improperly parsed records.
#### This is not strictly the crawler's fault: these records come from a different page design that the crawler does not support yet. Until that support is added, we need to ensure such pages are ignored in the first place.

In [31]:
# Show the rows that have no address.
properties_df.loc[properties_df['address'].isnull()]

Unnamed: 0,address,area_sqft,basement_type,built_year,city,exterior_type,heating_type,lot_size,no_of_baths,no_of_bedrooms,no_of_parking_spaces,no_of_stories,parking_type,property_type,property_url,state,tax_amount,tax_year,zip_code
4585,,1674,,1955.0,,,,1458 sqft,4.0,6.0,,,,,https://www.trulia.com/p/ny/bronx/2827-valentine-ave-bronx-ny-10458--68856,,"1,674 sqft",6 Beds Price,
8573,,1825,,1928.0,,,,,1.0,3.0,,,,,https://www.trulia.com/p/ny/new-rochelle/100-pelham-rd-1-b-new-rochelle-ny-10805--1001820853?rd=1,,$148/sqft,3 Beds Price,
8950,,2276,,1924.0,,,,8420 sqft,2.0,3.0,,,,,https://www.trulia.com/p/ny/new-rochelle/38-leland-ave-new-rochelle-ny-10805--2009254748,,"2,276 sqft",3 Beds Price,
9365,,,,,,,,,,,,,,,https://www.trulia.com/p/ny/new-rochelle/address-not-disclosed-new-rochelle-ny-10805--2009254071?rd=1,,,,
9661,,,,,,,,,,,,,,,https://www.trulia.com/c/ny/new-rochelle/harbor-house-3-davenport-ave-new-rochelle-ny-10805--2123143985?rd=1,,,,
11535,,3937,,1955.0,,,,0.40 acres,5.0,6.0,,,,,https://www.trulia.com/p/ny/bronxville/8-oakledge-rd-bronxville-ny-10708--2009226050,,"3,937 sqft",6 Beds,
11614,,,,,,,,,,,,,,,https://www.trulia.com/p/ny/yonkers/128-winnebago-rd-yonkers-ny-10710--2009230033,,,,
14123,,,,,,,,,,,,,,,https://www.trulia.com/p/ny/bronxville/230-pondfield-rd-bronxville-ny-10708--1103311026,,,,
15395,,,,,,,,,,,,,,,https://www.trulia.com/p/ny/yonkers/address-not-disclosed-yonkers-ny-10704--2009207400?rd=1,,,,
17320,,,,,,,,,,,,,,,https://www.trulia.com/p/ny/new-rochelle/17-horton-ave-new-rochelle-ny-10801--2009238629,,,,


#### Remove anomalous records

In [32]:
# Keep only the rows that have an address and renumber the index from 0.
properties_df = properties_df.loc[properties_df['address'].notnull()].reset_index(drop=True)
properties_df.shape

(1886404, 19)

In [33]:
# Number of records per property type.
properties_df.property_type.value_counts()

Single-Family Home           1226086
Multi-Family                 208676 
Lot/Land                     172201 
Condo                        96681  
Mobile/Manufactured          48407  
Farm/Ranch                   35092  
Unknown                      33412  
Coop                         24659  
Income/Investment            24654  
Townhouse                    16445  
Apartment/Condo/Townhouse    46     
Apartment                    45     
Name: property_type, dtype: int64

In [34]:
# Number of records per state.
properties_df.state.value_counts()

NY    1372078
CT    402815 
NJ    111457 
Name: state, dtype: int64

#### Convert certain columns like area and currency from strings to numbers, and make them uniform

In [35]:
# Strip thousands separators (and the "$" sign for tax_amount) and convert the text columns to float.
# `regex` is passed explicitly: the default of Series.str.replace changed from True to False in
# pandas 2.0, which would make the r'\$|,' pattern a literal string that never matches and leave
# values such as "$7,718.79" unparseable by astype(float).
properties_df['area_sqft'] = properties_df['area_sqft'].str.replace(',', '', regex=False).astype(float)
properties_df['tax_amount'] = properties_df['tax_amount'].str.replace(r'\$|,', '', regex=True).astype(float)
properties_df.head()

Unnamed: 0,address,area_sqft,basement_type,built_year,city,exterior_type,heating_type,lot_size,no_of_baths,no_of_bedrooms,no_of_parking_spaces,no_of_stories,parking_type,property_type,property_url,state,tax_amount,tax_year,zip_code
0,160 162 Old Peckslip Rd,1352.0,,1985.0,Holmes,,,0.74 acres,3.0,3.0,,,,Multi-Family,https://www.trulia.com/p/ny/holmes/160-162-old-peckslip-rd-holmes-ny-12531--2349467107,NY,7718.79,,12531
1,10 Cliff Ct,1184.0,,1997.0,Holmes,,,1.4 acres,2.5,3.0,,,,Single-Family Home,https://www.trulia.com/p/ny/holmes/10-cliff-ct-holmes-ny-12531--2305202587,NY,8320.4,2017.0,12531
2,26 Donovan Ln,1890.0,,1992.0,Holmes,,,2.53 acres,2.5,3.0,,,,Single-Family Home,https://www.trulia.com/p/ny/holmes/26-donovan-ln-holmes-ny-12531--2349447123,NY,9260.24,2017.0,12531
3,3130 Grand Concourse #7R,103883.0,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7r-bronx-ny-10458--2171936520,NY,,2017.0,10468
4,3130 Grand Concourse #7S,103883.0,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7s-bronx-ny-10458--2333495270,NY,,2017.0,10458


In [36]:
# Raw lot_size values (as strings) that could not be converted to sqft.
# Filled as a side effect of make_area_uniform; the caller uses it to drop those rows.
unhandled_area_values = []
def make_area_uniform(row):
    '''
        Converts the lot_size of a single record to square feet.

        row: a mapping/Series with a 'lot_size' entry such as "0.74 acres" or "1,458 sqft".
        Returns the same row object; 'lot_size' is replaced by a float (sqft) when the value
        could be parsed and is left untouched otherwise.

        Values that cannot be converted (unknown layout such as "100x208", malformed number,
        unknown unit) are reported on stdout and their string form is appended to the
        module-level unhandled_area_values list. Missing values and values that are already
        numeric are passed through silently, so re-running the conversion is harmless.
    '''

    raw_value = row['lot_size']

    # Already numeric (includes NaN): nothing to parse.
    if isinstance(raw_value, (int, float, np.number)):
        return row

    match = re.match(r"\s*(?P<area>[0-9,\.]+)\s+(?P<unit>[a-z]+)\s*$", str(raw_value), re.IGNORECASE)
    if match is None:
        if pd.notna(raw_value):
            unhandled_area_values.append(str(raw_value))
            print("Unhandled value in make_area_uniform " + str(raw_value))
        return row

    unit = match.group('unit').lower()
    try:
        area_in_float = float(match.group('area').replace(',', ''))
    except ValueError:
        # The character class also admits strings like "1.2.3" or ".", which float() rejects.
        unhandled_area_values.append(str(raw_value))
        print("Unhandled value in make_area_uniform " + str(raw_value))
        return row

    if unit == 'acre' or unit == 'acres':
        row['lot_size'] = area_in_float * 43560  # 1 acre = 43,560 sqft
    elif unit == 'sqft':
        row['lot_size'] = area_in_float
    else:
        # Record the value so it is dropped with the other unhandled ones instead of
        # staying behind as text and breaking the later numeric cast of lot_size.
        unhandled_area_values.append(str(raw_value))
        print("Unhandled unit for area - " + unit + ". Handle it in make_area_uniform function")

    return row

# Convert lot_size row by row; unparseable raw values are collected in unhandled_area_values.
properties_df = properties_df.apply(make_area_uniform, axis=1)

# Drop the rows whose lot_size is one of the collected unhandled values and renumber the index.
properties_df = properties_df.loc[~properties_df['lot_size'].isin(unhandled_area_values)].reset_index(drop=True)
properties_df

Unhandled value in make_area_uniform 100x208
Unhandled value in make_area_uniform 89x130
Unhandled value in make_area_uniform 108x135
Unhandled value in make_area_uniform 74x140
Unhandled value in make_area_uniform 50x138
Unhandled value in make_area_uniform 69x119
Unhandled value in make_area_uniform 85x112
Unhandled value in make_area_uniform 130x159
Unhandled value in make_area_uniform 151x133
Unhandled value in make_area_uniform 66x135
Unhandled value in make_area_uniform 70x163
Unhandled value in make_area_uniform 70x125
Unhandled value in make_area_uniform 78x200
Unhandled value in make_area_uniform 65x151
Unhandled value in make_area_uniform 65x331
Unhandled value in make_area_uniform 85x200
Unhandled value in make_area_uniform 100x436
Unhandled value in make_area_uniform 40x100
Unhandled value in make_area_uniform 65x176
Unhandled value in make_area_uniform 100x120


Unnamed: 0,address,area_sqft,basement_type,built_year,city,exterior_type,heating_type,lot_size,no_of_baths,no_of_bedrooms,no_of_parking_spaces,no_of_stories,parking_type,property_type,property_url,state,tax_amount,tax_year,zip_code
0,160 162 Old Peckslip Rd,1352.0,,1985.0,Holmes,,,32234.4,3.0,3.0,,,,Multi-Family,https://www.trulia.com/p/ny/holmes/160-162-old-peckslip-rd-holmes-ny-12531--2349467107,NY,7718.79,,12531
1,10 Cliff Ct,1184.0,,1997.0,Holmes,,,60984,2.5,3.0,,,,Single-Family Home,https://www.trulia.com/p/ny/holmes/10-cliff-ct-holmes-ny-12531--2305202587,NY,8320.40,2017,12531
2,26 Donovan Ln,1890.0,,1992.0,Holmes,,,110207,2.5,3.0,,,,Single-Family Home,https://www.trulia.com/p/ny/holmes/26-donovan-ln-holmes-ny-12531--2349447123,NY,9260.24,2017,12531
3,3130 Grand Concourse #7R,103883.0,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7r-bronx-ny-10458--2171936520,NY,,2017,10468
4,3130 Grand Concourse #7S,103883.0,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7s-bronx-ny-10458--2333495270,NY,,2017,10458
5,3130 Grand Concourse #7P,103883.0,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7p-bronx-ny-10458--2173798637,NY,,2017,10458
6,3130 Grand Concourse #7N,103883.0,,1955.0,Bronx,,,,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3130-grand-concourse-7n-bronx-ny-10458--2345496753,NY,,2017,10458
7,3184 Grand Concourse #4E,80811.0,,1965.0,Bronx,,,16552.8,,,,,,Coop,https://www.trulia.com/p/ny/bronx/3184-grand-concourse-4e-bronx-ny-10458--2345502809,NY,,2017,10458
8,4 Wheel Dr,,,,Craryville,,,174240,,,,,,Lot/Land,https://www.trulia.com/p/ny/craryville/4-wheel-dr-craryville-ny-12521--2204807965,NY,1509.69,2017,12521
9,Winding Ln,,,,Craryville,,,7405,,,,,,Lot/Land,https://www.trulia.com/p/ny/craryville/winding-ln-craryville-ny-12521--2158446526,NY,31.29,2017,12521


In [37]:
# Re-check the dtypes after the string-to-number conversions above.
properties_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1886384 entries, 0 to 1886383
Data columns (total 19 columns):
address                 object
area_sqft               float64
basement_type           float64
built_year              float64
city                    object
exterior_type           float64
heating_type            float64
lot_size                object
no_of_baths             float64
no_of_bedrooms          float64
no_of_parking_spaces    float64
no_of_stories           float64
parking_type            float64
property_type           object
property_url            object
state                   object
tax_amount              float64
tax_year                object
zip_code                object
dtypes: float64(11), object(8)
memory usage: 273.4+ MB


In [38]:
# Convert the columns to their proper data types
properties_df['built_year'] = properties_df['built_year'].fillna(value=0)
properties_df['tax_year'] = properties_df['tax_year'].fillna(value=0)
properties_df = properties_df.astype(dtype={'area_sqft': np.float, 'built_year': np.int, 'lot_size': np.float, \
                                            'no_of_baths': np.float, 'no_of_bedrooms': np.float, \
                                            'tax_amount': np.float, 'tax_year': np.int})
properties_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1886384 entries, 0 to 1886383
Data columns (total 19 columns):
address                 object
area_sqft               float64
basement_type           float64
built_year              int64
city                    object
exterior_type           float64
heating_type            float64
lot_size                float64
no_of_baths             float64
no_of_bedrooms          float64
no_of_parking_spaces    float64
no_of_stories           float64
parking_type            float64
property_type           object
property_url            object
state                   object
tax_amount              float64
tax_year                int64
zip_code                object
dtypes: float64(11), int64(2), object(6)
memory usage: 273.4+ MB


### Transactions data prep

In [39]:
# Load the scraped transactions (pipe-delimited, latin1-encoded) and parse both date columns.
transactions_df = pd.read_csv(
    "./housingWebScraper/housingWebScraper/output/Transaction-lastrun.csv",
    sep="|",
    encoding="latin1",
    parse_dates=['recording_date', 'contract_date'],
)
transactions_df.head()

Unnamed: 0,contract_date,county_transfer_tax,document_type,price,property_url,recording_date,total_transfer_tax,transaction_type
0,1995-11-17,,Deed,"$146,796",https://www.trulia.com/p/ny/holmes/160-162-old-peckslip-rd-holmes-ny-12531--2349467107,1995-12-12,,Purchase/Resale Arm's Length Residential Transaction
1,2001-06-25,,Deed,"$224,000",https://www.trulia.com/p/ny/holmes/10-cliff-ct-holmes-ny-12531--2305202587,2001-07-30,,Purchase/Resale Arm's Length Residential Transaction
2,2013-09-19,,Deed,"$70,000",https://www.trulia.com/p/ny/craryville/4-wheel-dr-craryville-ny-12521--2204807965,2013-09-19,,Insured Non-Residential Grant Deed
3,2006-12-28,,Deed,"$42,500",https://www.trulia.com/p/ny/craryville/4-wheel-dr-craryville-ny-12521--2204807965,2007-01-03,,Insured Non-Residential Grant Deed
4,2018-11-08,,Deed,"$325,000",https://www.trulia.com/p/ny/craryville/10-wang-dr-craryville-ny-12521--2011547166,2018-11-27,,Purchase/Resale Arm's Length Residential Transaction


In [40]:
# Print the column dtypes and memory footprint of the freshly loaded transactions frame.
transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1789637 entries, 0 to 1789636
Data columns (total 8 columns):
contract_date          datetime64[ns]
county_transfer_tax    object
document_type          object
price                  object
property_url           object
recording_date         datetime64[ns]
total_transfer_tax     object
transaction_type       object
dtypes: datetime64[ns](2), object(6)
memory usage: 109.2+ MB


In [41]:
# Number of records per transaction type.
transactions_df.transaction_type.value_counts()

Purchase/Resale Arm's Length Residential Transaction    1446574
Insured Non-Residential Grant Deed                      170121 
Non-Arm's Length Transaction                            85160  
REO and Trustee Deed                                    30123  
New Residential Construction Transaction                8607   
Name: transaction_type, dtype: int64

In [42]:
# Number of records per document type.
transactions_df.document_type.value_counts()

Deed                                                                                                                                                                                                                    777606
Warranty Deed                                                                                                                                                                                                           498337
Bargain and Sale Deed                                                                                                                                                                                                   256455
REO Resale                                                                                                                                                                                                              53084 
Executor's Deed                                                                                             

#### Let's format the currency fields

In [43]:
# Strip "$" and thousands separators from the currency columns and convert them to float.
# `regex=True` is explicit because the default of Series.str.replace became False in pandas 2.0,
# which would turn the r'\$|,' pattern into a literal that never matches.
for currency_column in ('price', 'county_transfer_tax', 'total_transfer_tax'):
    transactions_df[currency_column] = (
        transactions_df[currency_column].str.replace(r'\$|,', '', regex=True).astype(float)
    )
transactions_df

Unnamed: 0,contract_date,county_transfer_tax,document_type,price,property_url,recording_date,total_transfer_tax,transaction_type
0,1995-11-17,,Deed,146796.0,https://www.trulia.com/p/ny/holmes/160-162-old-peckslip-rd-holmes-ny-12531--2349467107,1995-12-12,,Purchase/Resale Arm's Length Residential Transaction
1,2001-06-25,,Deed,224000.0,https://www.trulia.com/p/ny/holmes/10-cliff-ct-holmes-ny-12531--2305202587,2001-07-30,,Purchase/Resale Arm's Length Residential Transaction
2,2013-09-19,,Deed,70000.0,https://www.trulia.com/p/ny/craryville/4-wheel-dr-craryville-ny-12521--2204807965,2013-09-19,,Insured Non-Residential Grant Deed
3,2006-12-28,,Deed,42500.0,https://www.trulia.com/p/ny/craryville/4-wheel-dr-craryville-ny-12521--2204807965,2007-01-03,,Insured Non-Residential Grant Deed
4,2018-11-08,,Deed,325000.0,https://www.trulia.com/p/ny/craryville/10-wang-dr-craryville-ny-12521--2011547166,2018-11-27,,Purchase/Resale Arm's Length Residential Transaction
5,2016-08-12,,Deed,122000.0,https://www.trulia.com/p/ny/craryville/10-wang-dr-craryville-ny-12521--2011547166,2016-08-15,,Purchase/Resale Arm's Length Residential Transaction
6,2004-07-21,,Deed,130000.0,https://www.trulia.com/p/ny/craryville/10-wang-dr-craryville-ny-12521--2011547166,2004-07-21,,Purchase/Resale Arm's Length Residential Transaction
7,1998-08-27,,Deed,385000.0,https://www.trulia.com/p/ny/craryville/126-taghkanic-churchtown-rd-craryville-ny-12521--2349607986,1998-08-28,,Non-Arm's Length Transaction
8,2009-04-17,,Deed,437500.0,https://www.trulia.com/p/ny/craryville/110-taghkanic-churchtown-rd-craryville-ny-12521--2011547172,2009-04-17,,Purchase/Resale Arm's Length Residential Transaction
9,2004-03-08,,Deed,363750.0,https://www.trulia.com/p/ny/craryville/110-taghkanic-churchtown-rd-craryville-ny-12521--2011547172,2004-03-09,,Insured Non-Residential Grant Deed


In [44]:
# Re-check the dtypes after the currency conversion above.
transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1789637 entries, 0 to 1789636
Data columns (total 8 columns):
contract_date          datetime64[ns]
county_transfer_tax    float64
document_type          object
price                  float64
property_url           object
recording_date         datetime64[ns]
total_transfer_tax     float64
transaction_type       object
dtypes: datetime64[ns](2), float64(3), object(3)
memory usage: 109.2+ MB


### Writing to intermediate files to use for EDA

In [45]:
# Write the cleaned properties frame as a pipe-delimited CSV without the index column.
# NOTE(review): with quoting=csv.QUOTE_NONE and no escapechar, the csv writer raises an error when a
# field contains the "|" delimiter — confirm no text column can contain one.
properties_df.to_csv("./output/engineered_trulia_properties.csv", sep="|", index=False, quoting=csv.QUOTE_NONE)

In [46]:
# Write the cleaned transactions frame as a pipe-delimited CSV without the index column.
# NOTE(review): with quoting=csv.QUOTE_NONE and no escapechar, the csv writer raises an error when a
# field contains the "|" delimiter — confirm no text column can contain one.
transactions_df.to_csv("./output/engineered_trulia_transactions.csv", sep="|", index=False, quoting=csv.QUOTE_NONE)