In [None]:
# Import modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import plotly.graph_objs as go 
from plotly.offline import init_notebook_mode,iplot

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

init_notebook_mode(connected=True)

In [None]:
# Load data
df = pd.read_csv('data/data_house.csv')
spending_df = pd.read_csv('data/data_spending.csv')
gdp_df = pd.read_csv('data/data_gdp.csv')
population_df = pd.read_csv('data/data_population.csv')


In [None]:
population_df.head()

In [31]:
# Reading the data
df.info()
df.shape
df.tail()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50574 entries, 5 to 85508
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Price        50574 non-null  object 
 1   Address      50574 non-null  object 
 2   Bedrooms     50574 non-null  object 
 3   Bathrooms    50574 non-null  object 
 4   Size         50574 non-null  object 
 5   Sale Status  50574 non-null  object 
 6   URL          50574 non-null  object 
 7   Raw Price    50574 non-null  float64
dtypes: float64(1), object(7)
memory usage: 3.5+ MB


Unnamed: 0,Price,Address,Bedrooms,Bathrooms,Size,Sale Status,URL,Raw Price
85502,"$68,000,000","0 Del Valle Rd, Livermore, CA 94550",4 bds,2 ba,"2,500 sqft",House for sale,https://www.zillow.com/homedetails/0-Del-Valle...,68000000.0
85503,"$75,000,000","1060 Brooklawn Dr, Los Angeles, CA 90077",13 bds,17 ba,"15,011 sqft",House for sale,https://www.zillow.com/homedetails/1060-Brookl...,75000000.0
85505,"$90,000,000","432 Park Ave #82, New York, NY 10022",6 bds,8 ba,"8,054 sqft",Condo for sale,https://www.zillow.com/homedetails/432-Park-Av...,90000000.0
85507,"$99,000,000","908 Bel Air Rd, Los Angeles, CA 90077",9 bds,20 ba,"34,000 sqft",House for sale,https://www.zillow.com/homedetails/908-Bel-Air...,99000000.0
85508,"$110,000,000","30 Beverly Park Ter, Beverly Hills, CA 90210",8 bds,12 ba,-- sqft,House for sale,https://www.zillow.com/homedetails/30-Beverly-...,110000000.0


In [None]:
# drop NaN, Price, URL, and assign to df1
def metric_deletion(x):
    x.dropna(axis='rows',inplace=True)
    x = x[x.Bedrooms != '-- bds']
    x = x[x.Bathrooms != '-- ba']
    x = x[x.Size != '-- sqft']
    x.drop(['URL', 'Price'], axis=1,inplace=True)
    return x
df1 = metric_deletion(df)
df1.head()

In [None]:
# Converting Bathrooms into float
df1.Bathrooms = df1.Bathrooms.str.replace(' ba','').astype('float')
# Converting Bedrooms into float
df1.Bedrooms = df1.Bedrooms.str.replace(' bds','').astype('float')

In [None]:
# Converting Size to float
def filt_size(s):
    s= s.replace(',','')
    s =s.replace(' sqft','')
    return float(s)
df1.Size = df1.Size.apply(filt_size)

In [None]:
# Spliting Address into Street, City, State, ZipCode, and drop the Address
df1.Address = df1.Address.astype('str')
df1['Street']= df1.Address.apply(lambda x: x.split(', ')[0])
df1['City']= df1.Address.apply(lambda x: x.split(', ')[1])
df1['State']= df1.Address.apply(lambda x: (x.split(', ')[-1]).split(' ')[0])
df1['ZipCode']= df1.Address.apply(lambda x: (x.split(', ')[-1]).split(' ')[1])
df2 = df1.drop(['Address'],axis=1)


In [None]:
#reset the index
df2.reset_index(inplace=True,drop=True)

In [None]:
# Found two rows of abnormal values, so found exact address on google and replace with the right values
df2.loc[28709:28711, 'State']= 'AZ'
df2.loc[28709:28711, 'ZipCode']= '85260'

In [None]:
# Converting the columns as strings for further cleaning
df2[['Street','City','State','ZipCode']].astype('str')

In [None]:
# Finding the weird ZipCode, it is in Canada
df2.loc[df2.ZipCode == 'N9V']

In [None]:
# Dropping the row
df2.drop(df2.iloc[38379].name,inplace=True)

In [None]:
# Now the ZipCode can be converted to Integer
df2.ZipCode = df2.ZipCode.astype('int')


In [None]:
#convert Sale Status into house Types
house_status = list(df2['Sale Status'].unique())
house_type = ['House','Townhouse','Multifamily', 'Condo', 'Others', 'Apartment']
df2['Sale Status'] = df2['Sale Status'].map(dict(zip(house_status,house_type)))
df2.rename(columns={"Sale Status": "Type"}, inplace=True)


In [None]:
# Street Column might not be useful
df3 = df2.drop(['Street'], axis=1)

In [None]:
# Change Raw Price column name to Price, create perSqft column
df3.rename(columns={"Raw Price":"Price"}, inplace=True)
df3['perSqFt'] = df3.Price / df3.Size

In [None]:
# Due to previously dropping rows, reset index again
df3 = df3.reset_index(drop=True)


In [None]:
# Assign Regions
west = ['CA', 'NV', 'AK', 'WA' , 'OR', 'ID', 'MT', 'WY', 'UT', 'CO', 'AZ', 'NM', 'HI']
midwest = ['ND', 'WI','SD', 'NE', 'KS', 'MN', 'IA', 'MO', 'WI', 'IL', 'IN', 'OH','MI']
north = ['PA', 'NY', 'NH', 'MA', 'CT', 'ME', 'DC', 'NJ', 'RI']
south = ['TX', 'OK', 'AR', 'LA', 'MS', 'AL', 'TN', 'KY', 'WV', 'VA', 'MD', 'DE', 'NC', 'SC', 'GA', 'FL']
full_state_list = west + midwest + north + south

# Creating function to assign regions
def find_region(state):
    if state in west:
        state = 'West'
    elif state in north:
        state = 'North'
    elif state in south:
        state = 'South'
    elif state in midwest:
        state = 'MidWest'
    return state

# Create Region column
df3['Region'] = df3.State.apply(find_region)

In [None]:
df3.head()

In [None]:
#created a function that finds states with not enough data to produce analysis
def little_data_states(tab):
    little_info_state = []
    for x in full_state_list:
        if len(tab[tab['State'] == x]) < 30:
            little_info_state.append(x)
    return little_info_state

states_exempted= little_data_states(df3)
        

In [None]:
#created a new dataframe that would filter out these states from the df3 dataframe
df4 = df3[~df3['State'].isin(states_exempted)]
df4.head()

In [None]:
pd.set_option('display.float_format', '{:.2f}'.format)
pd.pivot_table(df4, values=['perSqFt'], index=['State'])


In [None]:
'''
NOT WORKING CORRECTLY


data = dict(type = 'choropleth',colorscale='Portland',locations=df3['State'],locationmode='USA-states', z=df3['Price'],text=df3['Price'], colorbar={'title':'perSqFt'})
choromap = go.Figure(data = [data],layout = dict(geo = {'scope':'usa'}))

iplot(choromap,validate=False)
'''

In [None]:
# for GeoPandas later use
# df2 = df2[df2['ZipCode'].between(10000,99999,inclusive='both')]