## Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('Data/Real_Estate_Sales_2001-2021_GL.csv', low_memory=False)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1054159 entries, 0 to 1054158
Data columns (total 14 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Serial Number     1054159 non-null  int64  
 1   List Year         1054159 non-null  int64  
 2   Date Recorded     1054157 non-null  object 
 3   Town              1054159 non-null  object 
 4   Address           1054108 non-null  object 
 5   Assessed Value    1054159 non-null  float64
 6   Sale Amount       1054159 non-null  float64
 7   Sales Ratio       1054159 non-null  float64
 8   Property Type     671713 non-null   object 
 9   Residential Type  660275 non-null   object 
 10  Non Use Code      302242 non-null   object 
 11  Assessor Remarks  161472 non-null   object 
 12  OPM remarks       11564 non-null    object 
 13  Location          254643 non-null   object 
dtypes: float64(3), int64(2), object(9)
memory usage: 112.6+ MB


In [3]:
data.duplicated().sum() # no duplicate rows!

np.int64(0)

In [4]:
d1 = data.drop(columns=['Serial Number', 'Assessor Remarks', 'OPM remarks', 'Non Use Code']) # remove some unnecessary columns
d1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1054159 entries, 0 to 1054158
Data columns (total 10 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   List Year         1054159 non-null  int64  
 1   Date Recorded     1054157 non-null  object 
 2   Town              1054159 non-null  object 
 3   Address           1054108 non-null  object 
 4   Assessed Value    1054159 non-null  float64
 5   Sale Amount       1054159 non-null  float64
 6   Sales Ratio       1054159 non-null  float64
 7   Property Type     671713 non-null   object 
 8   Residential Type  660275 non-null   object 
 9   Location          254643 non-null   object 
dtypes: float64(3), int64(1), object(6)
memory usage: 80.4+ MB


## Property Types

In [5]:
print(d1['Property Type'].value_counts())
print()
print(d1['Residential Type'].value_counts())
print()
print(d1[['Residential Type', 'Property Type']])

Property Type
Single Family     401612
Residential       112099
Condo             105420
Two Family         26408
Three Family       12586
Vacant Land         5746
Commercial          4208
Four Family         2150
Apartments           943
Industrial           533
Public Utility         8
Name: count, dtype: int64

Residential Type
Single Family    480566
Condo            128789
Two Family        32615
Three Family      15542
Four Family        2763
Name: count, dtype: int64

        Residential Type  Property Type
0                    NaN     Commercial
1          Single Family    Residential
2          Single Family    Residential
3                  Condo    Residential
4          Single Family    Residential
...                  ...            ...
1054154    Single Family  Single Family
1054155    Single Family  Single Family
1054156    Single Family  Single Family
1054157    Single Family  Single Family
1054158              NaN            NaN

[1054159 rows x 2 columns]


In [6]:
# Consolidate property type information
d1['Residential Type'] = d1['Residential Type'].fillna('')
d1['Property Type'] = d1['Property Type'].fillna('')
d1['Type'] = (d1['Property Type'] + ' ' + d1['Residential Type'])
d1['Type'] = d1['Type'].str.strip()
print(d1['Type'].unique())
print()
d1['Type'] = d1['Type'].replace(r'(\w+ Family) \1', r'Residential \1', regex=True).replace('Condo Condo', 'Condo')
print(d1['Type'].unique())
print()
d1 = d1.drop(columns=['Residential Type', 'Property Type'])
print(d1.info())

['Commercial' 'Residential Single Family' 'Residential Condo'
 'Residential Two Family' 'Vacant Land' '' 'Apartments'
 'Residential Three Family' 'Industrial' 'Residential Four Family'
 'Public Utility' 'Condo Condo' 'Two Family Two Family'
 'Three Family Three Family' 'Single Family Single Family'
 'Four Family Four Family']

['Commercial' 'Residential Single Family' 'Residential Condo'
 'Residential Two Family' 'Vacant Land' '' 'Apartments'
 'Residential Three Family' 'Industrial' 'Residential Four Family'
 'Public Utility' 'Condo']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1054159 entries, 0 to 1054158
Data columns (total 9 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   List Year       1054159 non-null  int64  
 1   Date Recorded   1054157 non-null  object 
 2   Town            1054159 non-null  object 
 3   Address         1054108 non-null  object 
 4   Assessed Value  1054159 non-null  float64
 5   Sale Amount 

## Locations

In [7]:
print(d1['Location'].info())
print()
print(d1['Location'].unique())
print()

<class 'pandas.core.series.Series'>
RangeIndex: 1054159 entries, 0 to 1054158
Series name: Location
Non-Null Count   Dtype 
--------------   ----- 
254643 non-null  object
dtypes: object(1)
memory usage: 8.0+ MB
None

[nan 'POINT (-72.846365959 41.781677018)'
 'POINT (-73.21257902 41.666959986)' ... 'POINT (-72.80758 41.68147)'
 'POINT (-72.96622 41.32883)' 'POINT (-72.07006 41.53315)']



In [17]:
location_df = d1[d1['Location'].notna()].copy()
location_df['Latitude'] = location_df['Location'].str.extract(r'POINT \(([^ ]+) ([^ ]+)\)')[1]
location_df['Longitude'] = location_df['Location'].str.extract(r'POINT \(([^ ]+) ([^ ]+)\)')[0]
location_df = location_df.drop(columns=['Location'])
location_df['Latitude'] = location_df['Latitude'].astype(float)
location_df['Longitude'] = location_df['Longitude'].astype(float)
print(location_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 254643 entries, 2 to 1054152
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   List Year       254643 non-null  int64  
 1   Date Recorded   254643 non-null  object 
 2   Town            254643 non-null  object 
 3   Address         254638 non-null  object 
 4   Assessed Value  254643 non-null  float64
 5   Sale Amount     254643 non-null  float64
 6   Sales Ratio     254643 non-null  float64
 7   Type            254643 non-null  object 
 8   Latitude        254643 non-null  float64
 9   Longitude       254643 non-null  float64
dtypes: float64(5), int64(1), object(4)
memory usage: 21.4+ MB
None


In [19]:
import folium

# Create a map centered around a specific latitude and longitude
map_center = [41.7658, -72.6734]  # Example: Coordinates for Hartford, Connecticut
mymap = folium.Map(location=map_center, zoom_start=12)

lats = location_df['Latitude'].tolist()[:100]
longs = location_df['Longitude'].tolist()[:100]
prices = location_df['Assessed Value'].tolist()[:100]

for lat, long, price in zip(lats,longs,prices):
    # print(lat, lon
    folium.Marker([lat, long], popup=(lat, long, price)).add_to(mymap)

mymap