In [1]:
from pymongo import MongoClient
import os, re
from sklearn import linear_model
import numpy as np

MONGO_USER = os.environ['MONGO_USER']
MONGO_PASS = os.environ['MONGO_PASS']
MONGO_URL = os.environ['MONGO_URL']

### Connecting to the database

In [2]:
uri = "mongodb://%s:%s@%s" % (
    MONGO_USER, MONGO_PASS, MONGO_URL)

client = MongoClient(uri)
db = client['HonoluluProperty']

### Finding the houses sold in 2017

In [3]:
cursor = db['hnl_county_data'].find({'SalesInformation.0.saleDate': { '$regex': '.*201[0-8]' }})

houses2017 = list(cursor)

houses2017[0]

{'OwnerAndParcelInformation': {'OwnerName': 'WEBER,TEKLA S TR  Fee Owner',
  'ParcelNumber': '430040310000',
  'dataCurrentAsOf': 'January 30, 2018',
  'landArea(Acres)': '0.2513',
  'landArea(ApproximateSqFt)': '10,948',
  'legalInformationLOT11410,948SFMAP4LCAPP615TOG/R/O/WESMT': '',
  'locationAddress': '1407 AALAPAPA DR',
  'parcelMap': '',
  'platMap': '',
  'projectName': '',
  'propertyClass': 'RESIDENTIAL A'},
 'PermitInformation': [{'date': ' 03/16/1998 ',
   'permitAmount': ' $ 6,000 ',
   'permitNumber': '415064',
   'reason': ' '},
  {'date': ' 03/05/1993 ',
   'permitAmount': ' $ 125,000 ',
   'permitNumber': '332191',
   'reason': ' '},
  {'date': ' 01/11/1993 ',
   'permitAmount': ' $ 15,000 ',
   'permitNumber': '329726',
   'reason': ' '},
  {'date': ' 10/30/1991 ',
   'permitAmount': ' $ 18,000 ',
   'permitNumber': '309529',
   'reason': ' '}],
 'SalesInformation': [{'book/Page': ' ',
   'cert#': '333026 ',
   'dateOfRecording': '10/11/2010 ',
   'instrument#': ' ',


In [12]:
def assessmentInfo(house, saleYear):
    if not 'assessmentInformation' in house:
        return np.nan
    for info in house['assessmentInformation']:
        if int(info['assessmentYear'].strip()[-4:]) < int(saleYear[1:]):
            return info['totalPropertyAssessedValue']
        
def normalizePrice(priceString):
    return int(priceString[2:].strip().replace(',', ''))

def saleInfoExtraction(house):
    for sale in house['SalesInformation']:
        if sale['saleAmount']:
            return sale['saleDate'].strip()[-4:], normalizePrice(sale['saleAmount'])
        
    return np.nan, np.nan

def featureExtraction(house):
    tmk = house['tmk']
    if 'OwnerAndParcelInformation' in house:
        propertyClass = house['OwnerAndParcelInformation']['propertyClass']
        landArea = house['OwnerAndParcelInformation']['landArea(ApproximateSqFt)'] if 'landArea(ApproximateSqFt)' in house['OwnerAndParcelInformation'] else 0
        # assessmentInfo(house, saleInfoExtraction(house)[0]),
    else:
        propertyClass = 0
        landArea = 0
    
    if 'assessmentInformation' in house:
        numAssessments = len(house['assessmentInformation'])    
    else:
        numAssessments = 0
        
    if 'landInformation' in house and len(house['landInformation']) > 0:
        footage = house['landInformation'][0]['squareFootage']
        rebuilds = len(house['landInformation'])
    else:
        footage = 0
        rebuilds = 0
    
    if 'residentialImprovementInformation' in house and len(house['residentialImprovementInformation']) > 0:
        occupancy = house['residentialImprovementInformation'][0]['occupancy']
        houseArea = house['residentialImprovementInformation'][0]['squareFeet']
        yearBuilt = house['residentialImprovementInformation'][0]['yearBuilt']
        bedrooms = house['residentialImprovementInformation'][0]['bedrooms']
        baths = house['residentialImprovementInformation'][0]['fullBaths']
        halfBaths = house['residentialImprovementInformation'][0]['halfBaths']
    else:
        occupancy = 0
        houseArea = 0
        yearBuilt = 0
        bedrooms = 0
        baths = 0
        halfBaths = 0
    
    numPermits = len(house['PermitInformation']) if 'PermitInformation' in house else 0
    numSales = len(house['SalesInformation']) if 'SalesInformation' in house else 0
    
    return [tmk,
            propertyClass,
            landArea,
            numAssessments,
            footage,
            rebuilds,
            occupancy,
            houseArea,
            yearBuilt,
            bedrooms,
            baths,
            halfBaths,
            numPermits,
            numSales]

In [13]:
labels = [saleInfoExtraction(house) for house in houses2017]
features = [featureExtraction(house) for house in houses2017]
print(labels[:10], features[:10])

ValueError: invalid literal for int() with base 10: '550,000'

In [None]:
client.close()