Import necessary libraries:

In [1]:
import pandas as pd
import numpy as np
import re
import string
from difflib import get_close_matches, SequenceMatcher
#!pip install python-Levenshtein
from fuzzywuzzy import fuzz

from sqlalchemy import create_engine



Import data:

In [2]:
houses = pd.read_csv('D:/Data/JacksonvilleHouses.csv')
print(houses.shape)
houses.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


(252718, 91)


Unnamed: 0,RE,section,township,range,tile,mailing_address_1,mailing_address_2,mail_city,mail_state,mail_zipcode,...,sub_Unfin Open Porch,sub_Unfinished Garage,sub_Unfinished Storage,street_num,direction,street_name,street_type,unit,city,zipcode
0,0000060030R,1,2S,23E,3401.0,2503 SUMMERFIELD LN,,JACKSONVILLE,FL,32234,...,0.0,0.0,0.0,2503.0,,SUMMERFIELD,LN,,JACKSONVILLE,32234
1,0000070010R,1,2S,23E,3401.0,C/O TAX DEPARTMENT,9540 SAN JOSE BLVD,JACKSONVILLE,FL,32257,...,0.0,0.0,0.0,2610.0,N,US 301,HWY,,JACKSONVILLE,32234
2,0000090100R,12,2S,23E,3412.0,2495 U S 301 HWY N,,BALDWIN,FL,32234,...,1.0,0.0,0.0,2495.0,N,US 301,HWY,,JACKSONVILLE,32234
3,0000110000R,12,2S,23E,3412.0,2425 US HWY 301 N,,BALDWIN,FL,32234,...,0.0,0.0,0.0,2435.0,N,US 301,HWY,,JACKSONVILLE,32234
4,0000120010R,12,2S,23E,3412.0,2204 US HIGHWAY 301 N,,JACKSONVILLE,FL,32234,...,0.0,0.0,0.0,2204.0,N,US 301,HWY,,JACKSONVILLE,32234


Remove spaces in column names:

In [3]:
new_cols = houses.columns.to_list()
new_cols = [x.replace(' ','_') for x in new_cols]
houses.rename(columns=dict(zip(houses.columns.to_list(),new_cols)), inplace=True)

Convert parcel IDs to same format as shapefiles for future visualizations:

In [4]:
houses['RE'] = houses['RE'].str.replace('R','')
houses['RE'] = [x[:6] + ' ' + x[-4:] for x in houses['RE']]
houses['RE_b'] = houses['RE'] + ' ' + houses['building'].map(str)

Combine section, township and range columns into a single column:

In [5]:
houses['section'] = houses['section'].astype('str')
houses['sec_township_range'] = houses['section'] + ' ' + houses['township'].map(str) + ' ' + \
        houses['range'].map(str)
houses.drop(columns=['section', 'township', 'range'], inplace=True)

Remove unneeded columns:

In [6]:
houses.drop(columns=['tile','style'], inplace=True)

Standardize addresses of most common home owners and add investment group column:

In [7]:
engine = create_engine('sqlite:///D:\\Data\\DuvalProperties.sqlite')
con = engine.connect()
invs1 = con.execute(
    """
    SELECT DISTINCT(mailing_address_1), 'PO BOX 4090' AS fixed, '' AS mail_2, 'SCOTTSDALE' AS city, 
    'AZ' AS state, 'Progress Residential' AS inv_group
    FROM Parcel
    WHERE mailing_address_1 LIKE 'P%O%4090%' AND property_use = '0100'
    """
    )
invs1 = pd.DataFrame(invs1.fetchall(),
                               columns=['mailing_address_1', 'fixed', 'mail_2', 'city', 'state', 
                                        'inv_group'])

invs2 = con.execute(
    """
    SELECT DISTINCT(mailing_address_1), '30601 AGOURA RD 200' AS fixed, '' AS mail_2, 
    'AGOURA HILLS' AS city, 'CA' AS state, 'American Homes 4 Rent' AS inv_group
    FROM Parcel
    WHERE TRIM(mailing_address_1) LIKE '30%AG%R%D%' AND property_use = '0100'
    """
    )
invs2 = pd.DataFrame(invs2.fetchall(),
                               columns=['mailing_address_1', 'fixed', 'mail_2', 'city', 'state', 
                                        'inv_group'])

invs3 = con.execute(
    """
    SELECT DISTINCT(mailing_address_1), '1717 MAIN ST 2000' AS fixed, '' AS mail_2, 'DALLAS' AS city, 
    'TX' AS state, 'Invitation Homes' AS inv_group
    FROM Parcel
    WHERE (TRIM(mailing_address_1) LIKE '1717%MAIN%2000%' OR TRIM(mailing_address_2) 
    LIKE '1717%MAIN%2000%') AND property_use = '0100'
    """
    )
invs3 = pd.DataFrame(invs3.fetchall(),
                               columns=['mailing_address_1', 'fixed', 'mail_2', 'city', 'state', 
                                        'inv_group'])

invs4 = con.execute(
    """
    SELECT DISTINCT(mailing_address_1), '3721 DUPONT STATION CT S' AS fixed, '' AS mail_2, 
    'JACKSONVILLE' AS city, 'FL' AS state, 'Heritage Florida Property Holdings' AS inv_group
    FROM Parcel
    LEFT JOIN Owner USING (RE)
    WHERE TRIM(mailing_address_1) LIKE '3721%DUPONT%ST%CT%' AND mailing_address_1 NOT LIKE '%1' 
        AND owner NOT LIKE 'RIBER%' AND owner NOT LIKE 'EIKNER%' AND property_use = '0100'
    """
    )
invs4 = pd.DataFrame(invs4.fetchall(),
                               columns=['mailing_address_1', 'fixed', 'mail_2', 'city', 'state', 
                                        'inv_group'])

invs5 = con.execute(
    """
    SELECT DISTINCT(mailing_address_1), '113361 ATLANTIC BLVD' AS fixed, '' AS mail_2, 
    'JACKSONVILLE' AS city, 'FL' AS state, 'Heritage Florida Property Holdings' AS inv_group
    FROM Parcel
    WHERE (TRIM(mailing_address_1) LIKE '13%61%AT%C%B%L%V%' OR TRIM(mailing_address_2) 
    LIKE '13%61%AT%C%B%L%V%') AND property_use = '0100'
    """
    )
invs5 = pd.DataFrame(invs5.fetchall(),
                               columns=['mailing_address_1', 'fixed', 'mail_2', 'city', 'state', 
                                        'inv_group'])

invs6 = con.execute(
    """
    SELECT DISTINCT(mailing_address_1), '5001 PLZ ON THE LAKE 200' AS fixed, '' AS mail_2, 
    'AUSTIN' AS city, 'TX' AS state, 'Amherst Residential' AS inv_group
    FROM Parcel
    LEFT JOIN Owner USING (RE)
    WHERE TRIM(mailing_address_1) LIKE '5001%PL%Z%ON%%LAKE%' AND property_use = '0100'
    """
    )
invs6 = pd.DataFrame(invs6.fetchall(),
                               columns=['mailing_address_1', 'fixed', 'mail_2', 'city', 'state', 
                                        'inv_group'])

invs7 = con.execute(
    """
    SELECT DISTINCT(mailing_address_1), '7563 PHILIPS HWY 208' AS fixed, '' AS mail_2, 
    'JACKSONVILLE' AS city, 'FL' AS state, 'JWB Real Estate Capital' AS inv_group
    FROM Parcel
    WHERE TRIM(mailing_address_1) LIKE '7563%PHILIPS%' AND property_use = '0100'
    """
    )
invs7 = pd.DataFrame(invs7.fetchall(),
                               columns=['mailing_address_1', 'fixed', 'mail_2', 'city', 'state', 
                                        'inv_group'])

invs8 = con.execute(
    """
    SELECT DISTINCT(mailing_address_1), '1508 BROOKHOLLOW DR' AS fixed, '' AS mail_2, 'SANTA ANA' AS city,
    'CA' AS state, 'Tricon American Homes' AS inv_group
    FROM Parcel
    WHERE (TRIM(mailing_address_1) LIKE '%TRICON%' OR TRIM(mailing_address_1) LIKE '1508%BROOK%HOLLOW')
        AND property_use = '0100'
    """
    )
invs8 = pd.DataFrame(invs8.fetchall(),
                               columns=['mailing_address_1', 'fixed', 'mail_2', 'city', 'state', 
                                        'inv_group'])

invs9 = con.execute(
    """
    SELECT mailing_address_1, '6836 MORRISON BLVD 320' AS fixed,'' AS mail_2,'CHARLOTTE' AS city,
    'NC' AS state, 'Morrocroft Neighborhood Stabilization Fund' AS inv_group
    FROM Parcel
    WHERE TRIM(mailing_address_1) LIKE '6836%MOR%N%B%L%V%' AND property_use = '0100'
    """
    )
invs9 = pd.DataFrame(invs9.fetchall(),
                               columns=['mailing_address_1', 'fixed', 'mail_2', 'city', 'state', 
                                        'inv_group'])

invs10 = con.execute(
    """
    SELECT DISTINCT(mailing_address_1), '1850 PARKWAY PL 900' AS fixed, '' AS mail_2, 'MARIETTA' AS city, 
    'GA' AS state, 'First Key Homes' AS inv_group
    FROM Parcel
    WHERE (TRIM(mailing_address_1) LIKE '1850%P%Y%PL%' OR TRIM(mailing_address_2) LIKE '1850%P%Y%PL%')
    AND property_use = '0100'
    """
    )
invs10 = pd.DataFrame(invs10.fetchall(),
                               columns=['mailing_address_1', 'fixed', 'mail_2', 'city', 'state', 
                                        'inv_group'])

invs11 = con.execute(
    """
    SELECT DISTINCT(mailing_address_1), 'PO BOX 1459' AS fixed, '' AS mail_2, 'LA JOLLA' AS city, 
    'CA' AS state, 'Reven Housing' AS inv_group
    FROM Parcel
    WHERE (TRIM(mailing_address_1) LIKE 'P%O%B%X%1459%' OR TRIM(mailing_address_2) LIKE 'P%O%B%X%1459%')
    AND property_use = '0100'
    """
    )
invs11 = pd.DataFrame(invs11.fetchall(),
                               columns=['mailing_address_1', 'fixed', 'mail_2', 'city', 'state', 
                                        'inv_group'])

invs12 = con.execute(
    """
    SELECT DISTINCT(mailing_address_1), '3500 S DUPONT HWY' AS fixed, '' AS mail_2, 'NEW YORK' AS city, 
    'NY' AS state, 'Lafayette Real Estate' AS inv_group
    FROM Parcel
    WHERE (TRIM(mailing_address_1) LIKE '3500%DUPONT%H%W%Y' OR TRIM(mailing_address_2) 
    LIKE '3500%DUPONT%H%W%Y') AND property_use = '0100'
    """
    )
invs12 = pd.DataFrame(invs12.fetchall(),
                               columns=['mailing_address_1', 'fixed', 'mail_2', 'city', 'state', 
                                        'inv_group'])

invs13 = con.execute(
    """
    SELECT DISTINCT(mailing_address_1),'411 NE 2ND AVE' AS fixed,'' AS mail_2, 'HALLANDALE BEACH' AS city,
    'FL' AS state, 'Avalon Group USA' AS inv_group
    FROM Parcel
    WHERE (TRIM(mailing_address_1) LIKE '411%ND%AVE%' OR TRIM(mailing_address_2) LIKE '411%ND%AVE%') 
    AND mailing_address_1 NOT LIKE '%GLENDALE%'
    AND property_use = '0100'
    """
    )
invs13 = pd.DataFrame(invs13.fetchall(),
                               columns=['mailing_address_1', 'fixed', 'mail_2', 'city', 'state', 
                                        'inv_group'])

invs14 = con.execute(
    """
    SELECT DISTINCT(mailing_address_1), '3505 KOGER BLVD 400' AS fixed, '' AS mail_2, 'DULUTH' AS city, 
    'GA' AS state, 'Havenbrook Homes' AS inv_group
    FROM Parcel
    WHERE mailing_address_1 LIKE 'C/O HAVENBROOK%' AND property_use = '0100'
    """
    )
invs14 = pd.DataFrame(invs14.fetchall(),
                               columns=['mailing_address_1', 'fixed', 'mail_2', 'city', 'state', 
                                        'inv_group'])

invs15 = con.execute(
    """
    SELECT DISTINCT(mailing_address_1), '7821 DEERCREEK CLUB RD 200' AS fixed,'' AS mail_2, 
    'JACKSONVILLE' AS city, 'FL' AS state, 'Watson Realty Corp.' AS inv_group
    FROM Parcel
    WHERE (TRIM(mailing_address_1) LIKE '7821%DEER%CLUB%RD%' OR TRIM(mailing_address_2) LIKE '7821%DEER%CLUB%RD%')
    AND property_use = '0100'
    """
    )
invs15 = pd.DataFrame(invs15.fetchall(),
                               columns=['mailing_address_1', 'fixed', 'mail_2', 'city', 'state', 
                                        'inv_group'])

invs16 = con.execute(
    """
    SELECT DISTINCT(mailing_address_1), '2 BATES TRL' AS fixed, '' AS mail_2, 'EAST GREENWHICH' AS city, 
    'RI' AS state, 'China Equity Inc.' AS inv_group
    FROM Parcel
    WHERE (TRIM(mailing_address_1) LIKE '221%HOGAN%ST%405%' OR TRIM(mailing_address_2) LIKE '221%HOGAN%ST%405%')
    AND property_use = '0100'
    """
    )
invs16 = pd.DataFrame(invs16.fetchall(),
                               columns=['mailing_address_1', 'fixed', 'mail_2', 'city', 'state', 
                                        'inv_group'])

con.close()

invs = pd.DataFrame(columns=['mailing_address_1', 'fixed', 'mail_2', 'city', 'state', 
                                        'inv_group'])
inv_list = [invs1, invs2, invs3, invs4, invs5, invs6, invs7, invs8, invs9, invs10, invs11, invs12, invs13,
           invs14, invs15, invs16]
for i in inv_list:
    invs = invs.append(i)
invs = invs.reset_index(drop=True)
invs

Unnamed: 0,mailing_address_1,fixed,mail_2,city,state,inv_group
0,P O BOX 4090,PO BOX 4090,,SCOTTSDALE,AZ,Progress Residential
1,P O BOX 4090,PO BOX 4090,,SCOTTSDALE,AZ,Progress Residential
2,PO BOX 4090,PO BOX 4090,,SCOTTSDALE,AZ,Progress Residential
3,P O Box 4090,PO BOX 4090,,SCOTTSDALE,AZ,Progress Residential
4,P O OX 4090,PO BOX 4090,,SCOTTSDALE,AZ,Progress Residential
...,...,...,...,...,...,...
395,C/O WM A WATSON JR TRUSTEE,7821 DEERCREEK CLUB RD 200,,JACKSONVILLE,FL,Watson Realty Corp.
396,7821 DEERCREEK CLUB RD,7821 DEERCREEK CLUB RD 200,,JACKSONVILLE,FL,Watson Realty Corp.
397,7821 DEERCREEK CLUB RD #200,7821 DEERCREEK CLUB RD 200,,JACKSONVILLE,FL,Watson Realty Corp.
398,7821 DEERCREEK CLUB RD,7821 DEERCREEK CLUB RD 200,,JACKSONVILLE,FL,Watson Realty Corp.


In [8]:
houses['inv_group'] = None

for address in invs['mailing_address_1']:
    lst = invs.loc[invs['mailing_address_1']==address].iloc[0].to_list()
    houses.loc[houses['mailing_address_1']==address,'mailing_address_2'] = lst[2]
    houses.loc[houses['mailing_address_1']==address,'mail_city'] = lst[3]
    houses.loc[houses['mailing_address_1']==address,'mail_state'] = lst[4]
    houses.loc[houses['mailing_address_1']==address,'inv_group'] = lst[5]
    houses.loc[houses['mailing_address_1']==address,'mailing_address_1'] = lst[1]

houses['inv_group'].value_counts()

Progress Residential                          1242
American Homes 4 Rent                         1238
Invitation Homes                              1054
Heritage Florida Property Holdings             935
Amherst Residential                            439
JWB Real Estate Capital                        342
Tricon American Homes                          309
Morrocroft Neighborhood Stabilization Fund     284
First Key Homes                                252
Reven Housing                                  238
Lafayette Real Estate                          218
Avalon Group USA                               197
Watson Realty Corp.                            139
Havenbrook Homes                               134
China Equity Inc.                              133
Name: inv_group, dtype: int64

Define functions to use in string cleaning:

In [2]:
def remove_punctuation(lst):
    """Removes punctuation from text strings.

    Parameters
    ----------
    lst : list
        List of strings

    Returns
    -------
    cleaned: list
        List of strings without punctuation
    """
    cleaned = []
    for thing in lst:
        for punctuation in string.punctuation:
            thing = thing.replace(punctuation,'')
        cleaned.append(thing)
    return cleaned

In [3]:
def remove_spacing(lst):
    """Removes consecutive spaces.

    Parameters
    ----------
    lst : list
        List of strings

    Returns
    -------
    cleaned: list
        List of strings without consecutive spaces
    """
    cleaned = []
    for thing in lst:
        thing = " ".join(thing.split())
        cleaned.append(thing)
    return cleaned

In [4]:
def replace_special(lst, repl_dictionary):
    """Replaces dictionary keys with dictionary values in strings.

    Parameters
    ----------
    lst : list
        List of strings
        
    repl_dictionary : dictionary
        Dictionary of replacements

    Returns
    -------
    cleaned: list
        List of strings with specified items replaced by specified values
    """
    cleaned = []        
    for thing in lst:
        for k in list(repl_dictionary.keys()):
            thing = thing.replace(k, repl_dictionary[k])
        cleaned.append(thing)
    return cleaned

In [5]:
def clean_strings(lst, repl_dictionary={}):
    """Capitalizes, removes beginning and ending spaces, removes punctuation, removes consecutive
    spaces and replaces specified substrings with specified values.

    Parameters
    ----------
    lst : list
        List of strings
        
    repl_dictionary : dictionary
        Dictionary of replacements

    Returns
    -------
    cleaned: list
        List of cleaned strings
    """
    lst = [x.upper().strip() for x in lst]
    lst = remove_punctuation(lst)
    lst = remove_spacing(lst)
    lst = replace_special(lst, repl_dictionary)
    return lst

Clean text columns:

In [13]:
houses['owner'] = clean_strings(houses['owner'])

In [14]:
### Remove 'care of' lines in mailing address columns:

special = ['C/O 8052 MONCRIEF-DINSMORE RD','C/O 2811 AUBREY AVE','C/O ALAN CAIN USMTM TABUK UNIT 94114',
 'C/O 4006 STARRATT RD','C/O 4305 ROTH DR','C/O OFC WEALTH 7950 JONES BRANCH DR STE 700N']

houses.loc[houses['mailing_address_1'].isin(special)==True,'mailing_address_1'] = \
houses.loc[houses['mailing_address_1'].isin(special)==True,'mailing_address_1'].str.replace('C/O ','')

# houses.loc[houses['mailing_address_1'].str.startswith('C/O')==True,'care_of'] = \
# houses.loc[houses['mailing_address_1'].str.startswith('C/O')==True,'mailing_address_1'].str.replace('C/O ',\
#     '').str.replace('TAX DEPARTMENT','').str.replace('DEPT','')

houses.loc[houses['mailing_address_1'].str.startswith('C/O')==True,'mailing_address_1'] = \
houses.loc[houses['mailing_address_1'].str.startswith('C/O')==True,'mailing_address_2']

houses.loc[houses['mailing_address_1']==houses['mailing_address_2'],'mailing_address_2'] = ''

# houses.loc[houses['mailing_address_2'].str.startswith('C/O')==True,'care_of'] = \
# houses.loc[houses['mailing_address_2'].str.startswith('C/O')==True,'mailing_address_2'].str.replace('C/O ',\
#     '').str.replace('TAX DEPARTMENT','').str.replace('DEPT','')

houses.loc[houses['mailing_address_2'].str.startswith('C/O')==True,'mailing_address_2'] = ''

In [15]:
address_dictionary = \
    {' U S ':' US ', 'P O ':'PO ', 'FLORIDA':'FL', 'STATE ROAD':'SR', 'STATE RD':'SR',
    ' FORT ':' FT ', ' SAINT':' ST', ' TRAIL':' TRL', ' PLACE':' PL', 'CIRCLE':'CIR', ' COURT':' CT', 
    ' BEACH':' BCH',
    ' LANE':' LN', 'STREET':'ST', 'AVENUE':'AVE', ' AV ':' AVE ', 'DRIVE':'DR', ' ROAD':' RD', 
    'HIGHWAY':'HWY',
    'BOULEVARD':'BLVD', ' CREEK':' CK', 'PLAZA':'PLZ',
    'NORTHEAST':'NE', 'NORTHWEST':'NW', 'SOUTHEAST':'SE', 'SOUTHWEST':'SW',
    'NORTH':'N', 'SOUTH':'S', 'EAST':'E', 'WEST':'W', 
    'FIRST':'1ST', 'SECOND':'2ND', 'THIRD':'3RD', 
    'FOURTH':'4TH', 'FIFTH':'5TH', 'SIXTH':'6TH', 'SEVENTH':'7TH', 'EIGHTH':'8TH', 'NINTH':'9TH', 
    'TENTH':'10TH', 'ELEVENTH':'11TH', 'TWELFTH':'12TH', 'THIRTEENTH':'13TH', 'FOURTEENTH':'14TH', 
    'FIFTEENTH':'15TH', 'SIXTEENTH':'16TH', 'SEVENTEENTH':'17TH', 'EIGHTEENTH':'18TH', 'NINETEENTH':'19TH',
    'UNIT':'', 'SUITE':'', ' STE ':' ', ' SET ':'', 'BUILDING':'', 'BLDNG':'', 'APARTMENT':'',
    ' APT ':''}

houses['mailing_address_1'] = clean_strings(houses['mailing_address_1'], address_dictionary)

In [16]:
houses['mailing_address_2'] = houses['mailing_address_2'].fillna('')
houses['mailing_address_2'] = clean_strings(houses['mailing_address_2'], address_dictionary)
houses['mailing_address_2'] = ['' if x.startswith('PO ') else x for x in houses['mailing_address_2']]

### Combine mailing address columns
houses['mailing_address'] = houses['mailing_address_1'] + ' ' + houses['mailing_address_2']
houses['mailing_address'] = remove_spacing(houses['mailing_address'])

houses.drop(columns=['mailing_address_1', 'mailing_address_2'], inplace=True)

The mailing address city column is especially messy and thus needs extra steps to clean:

In [17]:
houses['mail_city'] = houses['mail_city'].fillna('NONE').astype('str')
### Remove numbers
houses['mail_city'] = [re.sub(r'[0-9]+', '', x) for x in houses['mail_city']]

### Fix certain misspellings and abbreviations
first_replacements = \
    {' ON ':' ONTARIO ', 'ONTARIO THE':'ON THE', ' GU ':' GUAM ', 'ISREAL':'ISRAEL', ' IL ':' ISRAEL ',
     'HONG KONG':'CHINA', ' CA ':' CANADA ', 'BCH':'BEACH', 'TOKYO':'JAPAN', 'LONDON':'UNITED KINGDOM',
     'TRINIDAD':'TRINIDAD AND TOBEGO', 'ZURICH':'SWITZERLAND', 'ONTARIO':'CANADA', 'TIKVA':'ISRAEL',
     'HAAYIN':'ISRAEL', 'TEL AVIV':'ISRAEL', 'KOREA':'SOUTH KOREA', 'MAIMI':'MIAMI'
    }

houses['mail_city'] = clean_strings(houses['mail_city'], first_replacements)

### Remove directions and standardize certain words
dirs = ['N', 'S', 'E', 'W', 'NORTH', 'SOUTH', 'EAST', 'WEST']
repl = {'IL':'ISRAEL', 'ST':'SAINT', 'FT':'FORT', 'MT':'MOUNT'}
cities = []
for city in houses['mail_city']:
    c = city.split()
    if c[0] in dirs:
        c = c[1:]
    cities.append(" ".join(c).strip())
cities2 = []
for city in cities:
    c = city.split()
    if c[0] in list(repl.keys()):
        c[0] = repl[c[0]]
    cities2.append(" ".join(c).strip())
houses['mail_city'] = cities2

### Fix two specific abbreviations
ny_index = houses[houses['mail_city'] == 'NEW YORK'].index.to_list()
for i in ny_index:
    houses['mail_city'][i] = 'NEW YORK'
jax_index = houses[houses['mail_city'] == 'JAX'].index.to_list()
for i in jax_index:
    houses['mail_city'][i] = 'JACKSONVILLE'

### If a foreign country is listed in the city column, change the value to 'FOREIGN COUNTRY'
country_list = [
    'SPAIN', 'ISRAEL', 'ETHIOPIA', 'CANADA', 'JAPAN', 'LEBANON','AUSTRALIA', 'COLUMBIA', 'BRAZIL', 'CHINA',
    'KOREA', 'TURKEY', 'BAHAMAS', 'VIETNAM', 'PERU', 'NEW ZEALAND', 'FRANCE', 'BERMUDA', 'SWITZERLAND',
    'TRINIDAD AND TOBEGO', 'ECUADOR', 'UNITED KINGDOM', 'CZECH REPUBLIC', 'JAMAICA']

for country in country_list:
    houses['mail_city'] = \
    ['FOREIGN COUNTRY' if x.__contains__(country) == True else x for x in houses['mail_city']]

### Fix misspellings of common city names
city_list = [
    'JACKSONVILLE', 'JACKSONVILLE BEACH', 'SAINT AUGUSTINE', 'SAINT AUGUSTINE BEACH', 'ATLANTIC BEACH',
    'ALEXANDRIA', 'ATLANTA', 'AMELIA ISLAND', 'BALDWIN', 'BALDWIN PARK', 'BATON ROUGE', 'BOCA RATON',
    'CINCINNATI', 'PALATKA', 'FERNANDINA', 'FERNANDINA BEACH', 'FLEMING ISLAND', 'FLAGLER BEACH', 
    'FRONT ROYAL', 'GRAND PRAIRIE', 'HIGHSTOWN', 'HOBOKEN', 'JAMAICA', 'JAMAICA ESTATES', 'JAMAICA PLAIN',
    'KEY BISCAYNE', 'LAS VEGAS', 'MANHATTAN', 'MIAMI', 'MIAMI BEACH', 'MIAMI GARDENS', 'MIAMI LAKES',
    'MIAMI SHORES', 'MIAMI SPRINGS', 'MIAMISBURG', 'MARIETTA', 'MIDDLEBURG', 'NEPTUNE BEACH', 
    'NEW ORLEANS', 'NEW SMYRNA', 'NEW SMYRNA BEACH', 'OCALA', 'PONTE VEDRA BEACH', 'PONTE VEDRA',
    'SAINT PETERSBURG', 'SAINT PETERSBURG BEACH', 'SARATOGA', 'TALLAHASSEE', 'WARNER ROBBINS', 'YULEE',
    'INDIANAPOLIS', 'LOS ANGELES', 'FORT LAUDERDALE', 'FORT COLLINS', 'LONG BEACH', 'SCOTTSDALE']


houses['mail_city'] = \
    [get_close_matches(x, city_list, n=1, cutoff=0.85)[0] if len(get_close_matches(x, city_list, \
    n=1, cutoff=0.85)) > 0 else x for x in houses['mail_city']]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  houses['mail_city'][i] = 'NEW YORK'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  houses['mail_city'][i] = 'JACKSONVILLE'


In [18]:
houses['mail_state'] = houses['mail_state'].fillna('FOREIGN COUNTRY').str.replace('FG','GA')

In [19]:
houses['mail_zipcode'] = houses['mail_zipcode'].fillna('00000').astype('str')
houses['mail_zipcode'] = [x[:5] for x in houses['mail_zipcode']]

In [20]:
houses['subdivision_name'] = houses['subdivision_name'].fillna('SECTION LAND')

In [21]:
houses['neighborhood'] = houses['neighborhood'].astype('str')

In [22]:
houses['street_num'] = houses['street_num'].fillna(0).astype('int64').astype('str')
houses['direction'] = houses['direction'].fillna('')
houses['street_name'] = houses['street_name'].fillna('NOT LISTED')
houses['street_type'] = houses['street_type'].fillna('')
houses['unit'] = houses['unit'].fillna('0')

site_address = houses['street_num'] + ' ' + houses['direction'].map(str) + ' ' + \
    houses['street_name'].map(str) + ' ' + houses['street_type'].map(str) + ' ' + houses['unit'].map(str)
site_address = [x[:-2] if x[-2:] == ' 0' else x for x in site_address]
site_address = [re.sub("\s+"," ", x) for x in site_address]
houses['site_address'] = site_address
houses.drop(columns=['street_num', 'direction', 'street_name', 'street_type', 'unit'], inplace=True)

In [23]:
houses['city'] = remove_spacing(houses['city'].fillna('JACKSONVILLE'))
houses['city'] = houses['city'].str.replace('JAX BEACH', 'JACKSONVILLE BEACH')

In [24]:
houses['zipcode'] = houses['zipcode'].fillna('00000').astype('str')
houses['zipcode'] = [x[:5] for x in houses['zipcode']]

Clean columns with other data types:

In [25]:
houses['just_value'] = houses['just_value'].astype('int64')
houses = houses[houses['just_value'] > 0]

In [26]:
houses['perc_school_deduction'] = houses['perc_school_deduction'].fillna(0)
houses['perc_school_deduction'] = houses['perc_school_deduction']/100

In [27]:
houses['perc_county_deduction'] = houses['perc_county_deduction'].fillna(0)
houses['perc_county_deduction'] = houses['perc_county_deduction']/100

In [28]:
houses['tax_district'] = houses['tax_district'].astype('category')

In [29]:
houses['lot_sf'] = houses['lot_sf'].astype('int64')

In [30]:
houses['type_descr'] = houses['type_descr'].astype('category')

In [31]:
houses['class'] = houses['class'].astype('category')

In [32]:
houses['quality'] = houses['quality'].astype('category')

In [33]:
houses['building_value'] = houses['building_value'].astype('int64')

In [34]:
houses['actual_yr_built'] = houses['actual_yr_built'].astype('int64')
houses['age'] = houses['age'].astype('int64')
houses['effec_yr_built'] = houses['effec_yr_built'].astype('int64')
houses['effec_age'] = houses['effec_age'].astype('int64')
houses['heated_sf'] = houses['heated_sf'].astype('int64')

In [35]:
subs = houses[houses.columns[houses.columns.str.startswith('sub_')==True].to_list()]
for col in subs:
    houses[col] = houses[col].fillna(0)

In [36]:
houses['land_value'] = houses['land_value'].fillna(0)
houses[houses.columns[houses.columns.str.startswith('land_')==True].to_list()].describe()

Unnamed: 0,land_value,land_use_count,land_COMM/RES/OFF,land_RES_CANAL_LD_3-7_UNITS_PER_AC,land_RES_GOLF_LD_3-7_UNITS_PER_AC,land_RES_LD_3-7_UNITS_PER_AC,land_RES_MARSH_LD_3-7_UNITS_PER_AC,land_RES_MD_8-19_UNITS_PER_AC,land_RES_NATURAL_LD_3-7_UNITS_PER_AC,land_RES_POND_LD_3-7_UNITS_PER_AC,land_RES_RIVER_LD_3-7_UNITS_PER_AC,land_RES_RURAL_2_OR_LESS_UNITS_PER_AC
count,252717.0,252717.0,252717.0,252717.0,252717.0,252717.0,252717.0,252717.0,252717.0,252717.0,252717.0,252717.0
mean,62946.12,1.020477,0.004531,0.01252,0.008262,0.707645,0.004618,0.064064,0.057064,0.084644,0.013228,0.021906
std,109305.3,0.164626,0.067158,0.11119,0.090521,0.454845,0.067798,0.244867,0.231965,0.278352,0.114251,0.146377
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,25725.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,40000.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,64027.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,7084800.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [37]:
houses['feature_count'] = houses['feature_count'].fillna(0)
houses['avg_ft_grade'] = houses['avg_ft_grade'].fillna(0)
houses['features_value'] = houses['features_value'].fillna(0)

In [38]:
houses.columns

Index(['RE', 'mail_city', 'mail_state', 'mail_zipcode', 'subdivision_name',
       'neighborhood', 'perc_capped', 'just_value', 'perc_school_deduction',
       'perc_county_deduction', 'tax_district', 'lot_sf', 'building',
       'type_descr', 'class', 'quality', 'actual_yr_built', 'age',
       'effec_yr_built', 'effec_age', 'building_value', 'heated_sf',
       'ft_Carport_Al', 'ft_Cov_Patio', 'ft_Deck_Wd', 'ft_Firep_Gas',
       'ft_Firep_Ms', 'ft_Firep_Prf', 'ft_Pool', 'ft_Scr_Porch',
       'ft_Screen_En', 'ft_Shed_wood', 'bldng_count', 'land_value',
       'feature_count', 'avg_ft_grade', 'features_value', 'sub_ft_count',
       'gross_sf', 'effec_sf', 'parcel_bldng_sf', 'owner', 'baths', 'bedrooms',
       'stories', 'rooms', 'd_Canal', 'd_Golf_Course', 'd_Lake',
       'd_Local_Riverside_Avondale_Historic_District',
       'd_Local_Springfield_Historic_District', 'd_Miscellaneous_Waterway',
       'd_National_Register_Ortega_Historic_District', 'd_Ocean_Front',
       'd_St_Joh

Create adjusted just value column:

In [39]:
houses['bldng_proportion'] = round(houses['gross_sf']/houses['parcel_bldng_sf'],5)
houses['adj_just_value'] = round(houses['bldng_proportion']*houses['land_value']+houses['features_value']+\
    houses['building_value'],0).astype('int64')
houses[['adj_just_value','bldng_proportion']].head()

Unnamed: 0,adj_just_value,bldng_proportion
0,228683,1.0
1,75449,1.0
2,193391,1.0
3,95301,1.0
4,121447,1.0


In [40]:
houses.tail()

Unnamed: 0,RE,mail_city,mail_state,mail_zipcode,subdivision_name,neighborhood,perc_capped,just_value,perc_school_deduction,perc_county_deduction,...,sub_Unfinished_Storage,city,zipcode,RE_b,sec_township_range,inv_group,mailing_address,site_address,bldng_proportion,adj_just_value
252713,181770 0100,JACKSONVILLE,FL,32224,SECTION LAND,314408.27,1.0,586022,-0.1252,-0.1679,...,1.0,JACKSONVILLE,32224,181770 0100 1,38 3S 29E,,5055 DIXIE LANDING DR,5055 DIXIE LANDING DR,1.0,586022
252714,181771 0200,JACKSONVILLE,FL,32224,SECTION LAND,314408.27,1.0,3004603,-0.1757,-0.184,...,0.0,JACKSONVILLE,32224,181771 0200 1,38 3S 29E,,5147 DIXIE LANDING DR,5147 DIXIE LANDING DR,1.0,3004603
252715,181773 0000,JACKSONVILLE,FL,32224,SECTION LAND,314408.27,1.0,2117642,-0.7056,-0.7174,...,0.0,JACKSONVILLE,32224,181773 0000 1,38 3S 29E,,5187 DIXIE LANDING DR,5187 DIXIE LANDING DR,0.79542,1686952
252716,181774 0000,JACKSONVILLE,FL,32224,SECTION LAND,314408.27,1.0,1814844,-0.3788,-0.3926,...,1.0,JACKSONVILLE,32224,181774 0000 1,38 3S 29E,,5161 DIXIE LANDING DR,5161 DIXIE LANDING DR,1.0,1814844
252717,181776 0000,JACKSONVILLE,FL,32250,SECTION LAND,314408.26,1.0,684370,-0.2523,-0.2889,...,0.0,JACKSONVILLE,32250,181776 0000 1,42 3S 29E,,4351 PORT ARTHUR CT,4351 PORT ARTHUR CT,1.0,684370


In [41]:
houses['mailing_address'].value_counts()[:10]

PO BOX 4090                 1242
30601 AGOURA RD 200         1238
1717 MAIN ST 2000           1054
3721 DUPONT STATION CT S     758
5001 PLZ ON THE LAKE 200     439
1508 BROOKHOLLOW DR          408
7563 PHILIPS HWY 208         342
6836 MORRISON BLVD 320       284
1850 PARKWAY PL 900          252
PO BOX 1459                  238
Name: mailing_address, dtype: int64

In [42]:
houses['owner_type'] = None

In [43]:
investors = pd.DataFrame(houses['mailing_address'].value_counts())
investors = investors[investors['mailing_address'] > 3].index.to_list()
houses.loc[houses['mailing_address'].isin(investors)==True,'owner_type'] = 'INVESTOR OWNED'
houses.loc[houses['mail_city']=='FOREIGN COUNTRY','owner_type'] = 'FOREIGN BUYER'
houses.loc[houses['mail_state']=='FOREIGN COUNTRY','owner_type'] = 'FOREIGN BUYER'
houses.loc[houses['mailing_address']==houses['site_address'],'owner_type'] = 'PRIMARY RESIDENCE'
houses['owner_type'].value_counts()

PRIMARY RESIDENCE    145078
INVESTOR OWNED        23872
FOREIGN BUYER           488
Name: owner_type, dtype: int64

In [44]:
ratio_list = []
houses['ratio'] = None
counter = 0
for m, s in zip(houses.loc[houses['owner_type'].isnull()==True,'mailing_address'],\
               houses.loc[houses['owner_type'].isnull()==True,'site_address']):
    counter +=1
    if counter%10000==0:
        print(counter)
#     houses.loc[(houses['owner_type'].isnull()==True)&(houses['site_address']==s)\
#                 &(houses['mailing_address']==m),'ratio'] = \
    ratio_list.append(fuzz.ratio(m,s))
houses.loc[houses['owner_type'].isnull()==True,'ratio'] = ratio_list

10000
20000
30000
40000
50000
60000
70000
80000


In [45]:
houses.loc[houses['ratio']>=70,'owner_type'] = 'PRIMARY RESIDENCE'

In [46]:
houses['owner_type'] = houses['owner_type'].fillna('SECONDARY RESIDENCE').astype('category')

In [47]:
houses.drop(columns=['ratio'],inplace=True)

In [48]:
houses.to_csv('D:/Data/HousingInventory_EDA.csv', index=False)

In [37]:
sales = pd.read_csv('D:/Data/HomeSales.csv')
sales.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,RE,sale_id,seller,sale_date,price,building,actual_yr_built,heated_sf,age_at_sale,neighborhood,zipcode,address
0,0,0,0,017964 0000,1,,1985-09-20,40500,1,1962,1075,23,412602.26,32210.0,6529 HUGH CT
1,1,1,1,112600 0000,6,,2006-11-30,185000,1,1963,1644,43,214701.26,32277.0,5560 MAXINE DR
2,2,2,2,167759 7350,4,REGIONS MORTGAGE,2002-07-10,274000,1,1999,3186,3,314411.28,32256.0,8714 E HAMPTON LANDING DR
3,3,3,3,140237 0000,2,,1988-07-18,77500,1,1961,1924,27,215801.23,32216.0,1340 GROVE PARK BLVD
4,4,4,4,010427 1020,6,POTEET MATTHEW D STEPHANIE J,2006-03-03,179000,1,1986,1533,20,412602.32,32210.0,7213 MIMOSA GROVE TRL


In [39]:
sales['zipcode'] = sales['zipcode'].astype('str')
sales['zipcode'] = [x[:5] for x in sales['zipcode']]

In [40]:
sales['sale_date'] = pd.to_datetime(sales['sale_date'])

In [41]:
sales.seller.value_counts()[:20]

DR HORTON                               923
JWB REAL ESTATE CAPITAL                 792
SECRETARY OF VA                         461
LAB INVESTMENTS                         403
KB HOME                                 397
OPENDOOR PROPERTY                       385
LENNAR HOMES                            361
SEDA CONSTRUCTION                       331
LIGHTHOUSE DEVELOPMENT                  329
MANKUS PROPERTIES                       305
RICHMOND AMERICAN HOMES                 300
ATLANTIC BUILDERS                       238
DUVAL HOME BUYERS                       235
ADAMS HOMES OF NORTHWEST FLORIDA        229
HERITAGE FLORIDA PROPERTY HOLDINGS      204
ATLANTIC ENTERPRISES OF JACKSONVILLE    196
BENJIE SPERLING                         186
CORNER LOT PROPERTIES                   177
MERCEDES HOMES                          173
VIGO MARK                               172
Name: seller, dtype: int64

In [42]:
seller_dict = {'LLC':'','INC':'','CORP':'','LP':'','F V A':'F VA','K B HOME':'KB HOME',
               'D R HORTON':'DR HORTON', 'H U D':'HUD'}
sales['seller'] = clean_strings(sales['seller'].fillna(''),seller_dict)


def replace_seller(df, col, repl_dict):
    """Standardizes names of sellers based on repl_dict.

    Parameters
    ----------
    df : DataFrame
        Pandas DataFrame
        
    col: Series
        Pandas Series
        
    repl_dict: dict
        Dictionary

    Returns
    -------
    df[col]: Series
        Pandas Series
    """
    for k, v in repl_dict.items():
        df[col] = [v if k in x else x for x in df[col]]
    return df[col]

repl_dict = {
    'DR HORT':'DR HORTON', 'KB HOME':'KB HOME', 'BCEL':'JWB REAL ESTATE CAPITAL',
    'HOOSE H':'JWB REAL ESTATE CAPITAL', 'JWB':'JWB REAL ESTATE CAPITAL', 
    'MANKUS':'MANKUS PROPERTIES', 'SEDA ':'SEDA CONSTRUCTION', 'OPENDOOR':'OPENDOOR PROPERTY',
    'RICHMOND AMERICAN':'RICHMOND AMERICAN HOMES', 'ADAMS HOMES':'ADAMS HOMES OF NORTHWEST FLORIDA',
    'TITUS ':'TITUS GROUP', 'BEAZER':'BEAZER HOMES', 'BELLACOOP':'BELLACOOP FUND', 
    'BRIGGS RICH':'RICHARD BRIGGS', 'CENDANT ':'CENDANT MOBILITY FINANCIAL', 'COPPENBARGER':'COPPENBARGER HOMES',
    'FIRST COAST REAL':'FIRST COAST REAL ESTATE', 'LAB INV':'LAB INVESTMENTS', 
    'LIGHTHOUSE DEV':'LIGHTHOUSE DEVELOPMENT', 'SPERLING ':'BENJIE SPERLING',
    'SUNCOAST ':'SUNCOAST PROPERTY INVESTORS', 'WATSON WILLIAM A':'WATSON REALTY CORP',
    '315 REAL ESTATE':'HERITAGE FLORIDA PROPERTY HOLDINGS', 'FIN HOME':'HERITAGE FLORIDA PROPERTY HOLDINGS',
    'FLYNN H':'HERITAGE FLORIDA PROPERTY HOLDINGS', 'PIONEER PRO':'HERITAGE FLORIDA PROPERTY HOLDINGS',
    'ARK INVESTMENT':'PRIME AREA PROPERTIES'
    }

sales['seller'] = replace_seller(sales, 'seller', repl_dict)

sales['seller'] = [None if x == '' else x for x in sales['seller']]

sales['seller'].value_counts()[:20]

DR HORTON                               923
JWB REAL ESTATE CAPITAL                 792
SECRETARY OF VA                         461
LAB INVESTMENTS                         403
KB HOME                                 397
OPENDOOR PROPERTY                       385
LENNAR HOMES                            361
SEDA CONSTRUCTION                       331
LIGHTHOUSE DEVELOPMENT                  329
MANKUS PROPERTIES                       305
RICHMOND AMERICAN HOMES                 300
ATLANTIC BUILDERS                       238
DUVAL HOME BUYERS                       235
ADAMS HOMES OF NORTHWEST FLORIDA        229
HERITAGE FLORIDA PROPERTY HOLDINGS      204
ATLANTIC ENTERPRISES OF JACKSONVILLE    196
BENJIE SPERLING                         186
CORNER LOT PROPERTIES                   177
MERCEDES HOMES                          173
VIGO MARK                               172
Name: seller, dtype: int64

In [43]:
sales['address'] = remove_spacing(sales['address'].fillna(''))
sales['address'] = [None if x == '' else x for x in sales['address']]

In [44]:
sales.tail()

Unnamed: 0,RE,sale_id,seller,sale_date,price,building,actual_yr_built,heated_sf,age_at_sale,neighborhood,zipcode,address
340292,061525 0000,6,TURNER APRIL L,2020-12-16,310000,1,1937,1656,83,502502.21,32205,3598 TRASK ST
340293,049131 0000,8,STEVENS FAMILY TRUST,2020-12-10,78100,1,1949,1892,71,511800.22,32254,3527 ACACIA ST
340294,062509 0000,11,HUTCHENS SEAMUS J,2020-12-07,240000,1,1946,1032,74,502502.21,32205,4620 ATTLEBORO ST
340295,156442 0400,14,CHICOLA STEPHEN,2020-12-07,419900,1,2005,2128,15,316729.2,32257,5340 LOSCO RD
340296,106939 1020,6,WHITE NOAL D,2020-12-21,285000,1,2003,2897,17,610201.34,32218,12704 DAYLIGHT TRL


In [45]:
sales.to_csv('D:/Data/HomeSales.csv', index=False)