# Data Science Project - Analysis of Real Estate Data in the US

# Introduction

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
re_data = pd.read_csv('real_estate_db.csv', encoding = 'ISO-8859–1')

In [3]:
re_data.shape

(39030, 80)

In [4]:
re_data.columns

Index(['UID', 'BLOCKID', 'SUMLEVEL', 'COUNTYID', 'STATEID', 'state',
       'state_ab', 'city', 'place', 'type', 'primary', 'zip_code', 'area_code',
       'lat', 'lng', 'ALand', 'AWater', 'pop', 'male_pop', 'female_pop',
       'rent_mean', 'rent_median', 'rent_stdev', 'rent_sample_weight',
       'rent_samples', 'rent_gt_10', 'rent_gt_15', 'rent_gt_20', 'rent_gt_25',
       'rent_gt_30', 'rent_gt_35', 'rent_gt_40', 'rent_gt_50',
       'universe_samples', 'used_samples', 'hi_mean', 'hi_median', 'hi_stdev',
       'hi_sample_weight', 'hi_samples', 'family_mean', 'family_median',
       'family_stdev', 'family_sample_weight', 'family_samples',
       'hc_mortgage_mean', 'hc_mortgage_median', 'hc_mortgage_stdev',
       'hc_mortgage_sample_weight', 'hc_mortgage_samples', 'hc_mean',
       'hc_median', 'hc_stdev', 'hc_samples', 'hc_sample_weight',
       'home_equity_second_mortgage', 'second_mortgage', 'home_equity', 'debt',
       'second_mortgage_cdf', 'home_equity_cdf', 'debt_cdf', '

# Part 1. Cleaning

## The dataset contains 79 columns, so we will take only the most interesting ones that we find useful for our data exploration.

In [5]:
re_data = re_data[['state','city','zip_code','lat','lng','pop','male_pop','female_pop','rent_mean','rent_median',
              'hc_mean', 'hc_mortgage_mean','hc_mortgage_median', 'hi_mean','hi_median','family_mean',
              'home_equity_second_mortgage', 'second_mortgage', 'home_equity',
              'debt', 'hs_degree', 'male_age_mean', 'female_age_mean', 'pct_own','married'
             ]]

In [6]:
re_data.isna().sum().nlargest(20)

hc_mean                        890
hc_mortgage_mean               841
hc_mortgage_median             841
home_equity_second_mortgage    677
second_mortgage                677
home_equity                    677
debt                           677
rent_mean                      462
rent_median                    462
family_mean                    434
hi_mean                        390
hi_median                      390
pct_own                        390
female_age_mean                302
hs_degree                      275
married                        275
male_age_mean                  273
state                            0
city                             0
zip_code                         0
dtype: int64

In [7]:
re_data.dropna(inplace=True)

## We will drop Puerto Rico State, as it's economy is different from other states

In [8]:
re_data = re_data[re_data.state != 'Puerto Rico']

## Drop places with mean rent < 500 USD. Not realistic or really small village areas

In [9]:
re_data = re_data[re_data.rent_mean > 500]

In [10]:
re_data.rent_mean.nsmallest()

32076    500.01272
30717    500.02145
17906    500.13603
32069    500.38186
11155    500.54241
Name: rent_mean, dtype: float64

## We found not correct values where rent_mean is 3962.34229 due to bad record

In [11]:
re_data.groupby(['state'])[ 'rent_mean'].max().sort_values(ascending=False).nlargest(15)

state
New York                3962.34229
Michigan                3962.34229
Hawaii                  3962.34229
California              3962.34229
Florida                 3962.34229
Texas                   3962.34229
Tennessee               3962.34229
Virginia                3829.43383
District of Columbia    3661.71463
Connecticut             3600.28736
Missouri                3509.17617
Arizona                 3432.87803
New Jersey              3408.15112
Maryland                3341.37301
Pennsylvania            3324.65000
Name: rent_mean, dtype: float64

## Excluding records where rent_mean is 3962.34229 

In [12]:
re_data = re_data[re_data.rent_mean <= 3962]

# Part 2. Findings

## 2.1 Top 10 cities where people have the highest income left, but percentage of ownership is the lowest.

## Calculating income left.

In [13]:
re_data['income_left'] = re_data['family_mean'] - re_data['rent_mean']*12

In [14]:
inc_rent = re_data[['city', 'lat', 'lng', 'family_mean','rent_mean','pct_own','hc_mortgage_mean','income_left']].groupby('city').mean()

In [15]:
inc_rent.sort_values(['pct_own', 'income_left'], ascending = [True, False])

Unnamed: 0_level_0,lat,lng,family_mean,rent_mean,pct_own,hc_mortgage_mean,income_left
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Fort Hood,31.115478,-97.843359,54229.84425,1187.26747,0.00824,449.50000,39982.63461
Fort Stewart,31.876190,-81.599531,45136.66552,1181.23053,0.01240,2249.50000,30961.89916
Fort Dix,40.014556,-74.627018,109363.35805,2025.72648,0.01393,1787.00000,85054.64029
Ft Meade,39.118584,-76.735280,70433.74812,1942.58976,0.01465,1147.00000,47122.67100
Yermo,35.402696,-116.644250,62476.28492,1313.20643,0.01495,649.50000,46717.80776
...,...,...,...,...,...,...,...
Tinton Falls,40.297418,-74.101307,156338.64690,2412.09616,0.98475,3031.87157,127393.49298
Boxford,42.683108,-71.018330,153956.80076,1364.30787,0.98575,3015.26631,137585.10632
Marriottsville,39.318977,-76.967660,157966.86087,2954.72892,0.98658,2883.50466,122510.11383
East Islip,40.721850,-73.181128,139672.49545,1589.53932,0.98832,3049.86980,120598.02361


In [16]:
re_data[['city', 'lat', 'lng', 'family_mean','rent_mean','pct_own','hc_mortgage_mean','income_left']].groupby('city').mean().nsmallest(2,'rent_mean')

Unnamed: 0_level_0,lat,lng,family_mean,rent_mean,pct_own,hc_mortgage_mean,income_left
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Revillo,44.873619,-96.552102,81301.30396,500.38186,0.89217,1269.38985,75296.72164
Flemingsburg,38.463259,-83.727673,71008.31228,500.74452,0.77952,1235.29781,64999.37804


In [17]:
import plotly.express as px

In [18]:
px.set_mapbox_access_token('pk.eyJ1IjoiYWxleHZ6bmsiLCJhIjoiY2sza3ZqNmJ6MGUwbzNkbjN3NXM3b2FuZCJ9.aDeGkoLyAC0eiXB0mq4a3g')
carshare = px.data.carshare()
fig = px.scatter_mapbox(mapdf, lat="lat", lon="lng",size='income_left', color="pct_own",
                  color_continuous_scale=px.colors.cyclical.IceFire, size_max=10, zoom=2)
fig.show()

NameError: name 'mapdf' is not defined