# Topic: Correlation between Economics and Real Estate
<ul>
<li>College: Mt San Antonio College</li>
<li>Course: CISD41 Introduction to Data Science</li>
<li>by: Alec Phong and Jack Chen</li>
<br><br>

## Overview
<ol>
<li>Questions</li>
<li>Importing Data</li>
<li>Functions and Lambda</li>
<li>Cleaning Data</li>
<li>Organizing Data</li>
<li>Data Visualization</li>
<li>Pivot tables</li>
<li>Descriptive Statistics</li>
<ul><li>Mean, median, standard deviation, precentiles, boxplots</li></ul>
<li>Quantitative Data Exploratory</li>
<ul><li>Correlation, Coefficients, P-values</li></ul>
<li>Testing Hypothesis, ANOVA</li>
<ul><li>Chisquare, ANOVA, Normal-test, Z-test, Pearson Correlation</li></ul>
<li>Summary and Conclusion</li>
<li>References</li>
</ol>


### Questions

Importing and Reading libraries and data

In [100]:
# Import modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import plotly.graph_objs as go 
from plotly.offline import init_notebook_mode,iplot

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

init_notebook_mode(connected=True)

In [101]:
# Load data
df = pd.read_csv('data/data_house.csv')
spending_df = pd.read_csv('data/data_spending.csv')
gdp_df = pd.read_csv('data/data_gdp.csv')
population_df = pd.read_csv('data/data_population.csv')
income_df = pd.read_csv('data/income_spending.csv')



In [102]:
# Reading the data
df.info()
df.shape
df.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85509 entries, 0 to 85508
Data columns (total 8 columns):
Price          85509 non-null object
Address        85509 non-null object
Bedrooms       64999 non-null object
Bathrooms      69439 non-null object
Size           73698 non-null object
Sale Status    69960 non-null object
URL            85509 non-null object
Raw Price      85509 non-null float64
dtypes: float64(1), object(7)
memory usage: 5.2+ MB


Unnamed: 0,Price,Address,Bedrooms,Bathrooms,Size,Sale Status,URL,Raw Price
85504,"$79,000,000","2 Park Pl, New York, NY 10007",,1 ba,"9,680 sqft",Condo for sale,https://www.zillow.com/homedetails/2-Park-Pl-N...,79000000.0
85505,"$90,000,000","432 Park Ave #82, New York, NY 10022",6 bds,8 ba,"8,054 sqft",Condo for sale,https://www.zillow.com/homedetails/432-Park-Av...,90000000.0
85506,"$95,000,000","1441 Angelo Dr, Los Angeles, CA 90210",,,,Lot / Land for sale,https://www.zillow.com/homedetails/1441-Angelo...,95000000.0
85507,"$99,000,000","908 Bel Air Rd, Los Angeles, CA 90077",9 bds,20 ba,"34,000 sqft",House for sale,https://www.zillow.com/homedetails/908-Bel-Air...,99000000.0
85508,"$110,000,000","30 Beverly Park Ter, Beverly Hills, CA 90210",8 bds,12 ba,-- sqft,House for sale,https://www.zillow.com/homedetails/30-Beverly-...,110000000.0


### Functions and Lambda

In [103]:
# drop NaN, Price, URL, and assign to df1
def metric_deletion(x):
    x.dropna(axis='rows',inplace=True)
    x = x[x.Bedrooms != '-- bds']
    x = x[x.Bathrooms != '-- ba']
    x = x[x.Size != '-- sqft']
    x.drop(['URL', 'Price'], axis=1,inplace=True)
    return x
df1 = metric_deletion(df)
df1.head()

Unnamed: 0,Address,Bedrooms,Bathrooms,Size,Sale Status,Raw Price
5,"3515 W Thompson Rd, Indianapolis, IN 46217",2 bds,1 ba,814 sqft,House for sale,1.0
53,"3713 Hillside Ave, Indianapolis, IN 46218",2 bds,1 ba,"1,728 sqft",House for sale,775.0
65,"1337 W Livingston St APT 1, Allentown, PA 18102",3 bds,1 ba,"1,000 sqft",House for sale,1050.0
70,"1788 Westwood Dr, Troy, MI 48083",3 bds,2 ba,"1,418 sqft",House for sale,1600.0
72,"390 Rosado Springs St, Henderson, NV 89014",2 bds,2 ba,"1,060 sqft",Townhouse for sale,1700.0


In [104]:
# Converting Bathrooms into float
df1.Bathrooms = df1.Bathrooms.str.replace(' ba','').astype('float')
# Converting Bedrooms into float
df1.Bedrooms = df1.Bedrooms.str.replace(' bds','').astype('float')

In [105]:
# Converting Size to float
def filt_size(s):
    s= s.replace(',','')
    s =s.replace(' sqft','')
    return float(s)
df1.Size = df1.Size.apply(filt_size)

In [106]:
# Spliting Address into Street, City, State, ZipCode, and drop the Address
df1.Address = df1.Address.astype('str')
df1['Street']= df1.Address.apply(lambda x: x.split(', ')[0])
df1['City']= df1.Address.apply(lambda x: x.split(', ')[1])
df1['State']= df1.Address.apply(lambda x: (x.split(', ')[-1]).split(' ')[0])
df1['ZipCode']= df1.Address.apply(lambda x: (x.split(', ')[-1]).split(' ')[1])
df2 = df1.drop(['Address'],axis=1)


### Cleaning Data

In [107]:
#reset the index
df2.reset_index(inplace=True,drop=True)

In [108]:
# Found two rows of abnormal values, so found exact address on google and replace with the right values
df2.loc[28709:28711, 'State']= 'AZ'
df2.loc[28709:28711, 'ZipCode']= '85260'

In [109]:
# Converting the columns as strings for further cleaning
df2[['Street','City','State','ZipCode']].astype('str')

Unnamed: 0,Street,City,State,ZipCode
0,3515 W Thompson Rd,Indianapolis,IN,46217
1,3713 Hillside Ave,Indianapolis,IN,46218
2,1337 W Livingston St APT 1,Allentown,PA,18102
3,1788 Westwood Dr,Troy,MI,48083
4,390 Rosado Springs St,Henderson,NV,89014
5,11209 Grenada Dr,Sterling Heights,MI,48312
6,65710 Adventure Ct #305A,Bend,OR,97701
7,3970 Garland St,Detroit,MI,48214
8,2127 Merrick St,Detroit,MI,48208
9,14383 Hubbell St,Detroit,MI,48227


In [110]:
# Finding the weird ZipCode, it is in Canada
df2.loc[df2.ZipCode == 'N9V']

Unnamed: 0,Bedrooms,Bathrooms,Size,Sale Status,Raw Price,Street,City,State,ZipCode
38379,4.0,4.0,2800.0,House for sale,865000.0,349 Benson Ct,Amherstburg,ON,N9V


In [111]:
# Dropping the row
df2.drop(df2.iloc[38379].name,inplace=True)

In [112]:
# Now the ZipCode can be converted to Integer
df2.ZipCode = df2.ZipCode.astype('int')


In [113]:
#convert Sale Status into house Types
house_status = list(df2['Sale Status'].unique())
house_type = ['House','Townhouse','Multifamily', 'Condo', 'Others', 'Apartment']
df2['Sale Status'] = df2['Sale Status'].map(dict(zip(house_status,house_type)))
df2.rename(columns={"Sale Status": "Type"}, inplace=True)


In [114]:
# Street Column might not be useful
df3 = df2.drop(['Street'], axis=1)

In [115]:
# Change Raw Price column name to Price, create perSqft column
df3.rename(columns={"Raw Price":"Price"}, inplace=True)
df3['perSqFt'] = df3.Price / df3.Size

In [116]:
# Due to previously dropping rows, reset index again

df3 = df3.reset_index(drop=True)
len(df3.State.unique())


48

In [117]:
# Assign Regions, also need to drop Wyoming and West Virgina since the dataset does not contain 
# listings for that certain State in the initial DataFrame 
west = ['CA', 'NV', 'AK', 'WA' , 'OR', 'ID', 'MT', 'UT', 'CO', 'AZ', 'NM', 'HI']
midwest = ['ND', 'WI','SD', 'NE', 'KS', 'MN', 'IA', 'MO', 'IL', 'IN', 'OH','MI']
north = ['PA', 'NY', 'NH', 'MA', 'CT', 'ME', 'DC', 'NJ', 'RI']
south = ['TX', 'OK', 'AR', 'LA', 'MS', 'AL', 'TN', 'KY', 'VA', 'MD', 'DE', 'NC', 'SC', 'GA', 'FL']
full_state_list = west + midwest + north + south

# Creating function to assign regions
def find_region(state):
    if state in west:
        state = 'West'
    elif state in north:
        state = 'North'
    elif state in south:
        state = 'South'
    elif state in midwest:
        state = 'MidWest'
    return state

# Create Region column
df3['Region'] = df3.State.apply(find_region)

In [118]:
#Display the list of full_state_list in order to compare with the DataFrame state column in order to sort missing , 
full_state_list.sort()
full_state_list
#Converted numpy.Array into a list in order to compare with the list above.
comparator = df3['State'].unique().tolist()
comparator.sort()
comparator
compare_dict = dict(zip(full_state_list,comparator))
compare_dict
# Check our list of states and dataframe states are in sync
# df3 is fully cleaned, now we are making a "state_df"

{'AK': 'AK',
 'AL': 'AL',
 'AR': 'AR',
 'AZ': 'AZ',
 'CA': 'CA',
 'CO': 'CO',
 'CT': 'CT',
 'DC': 'DC',
 'DE': 'DE',
 'FL': 'FL',
 'GA': 'GA',
 'HI': 'HI',
 'IA': 'IA',
 'ID': 'ID',
 'IL': 'IL',
 'IN': 'IN',
 'KS': 'KS',
 'KY': 'KY',
 'LA': 'LA',
 'MA': 'MA',
 'MD': 'MD',
 'ME': 'ME',
 'MI': 'MI',
 'MN': 'MN',
 'MO': 'MO',
 'MS': 'MS',
 'MT': 'MT',
 'NC': 'NC',
 'ND': 'ND',
 'NE': 'NE',
 'NH': 'NH',
 'NJ': 'NJ',
 'NM': 'NM',
 'NV': 'NV',
 'NY': 'NY',
 'OH': 'OH',
 'OK': 'OK',
 'OR': 'OR',
 'PA': 'PA',
 'RI': 'RI',
 'SC': 'SC',
 'SD': 'SD',
 'TN': 'TN',
 'TX': 'TX',
 'UT': 'UT',
 'VA': 'VA',
 'WA': 'WA',
 'WI': 'WI'}

In [119]:
# Read gdp_df
gdp_df.head()

Unnamed: 0.1,Unnamed: 0,GeoFips,GeoName,2020
0,0,0,United States,18384687.0
1,1,1000,Alabama,196906.1
2,2,2000,Alaska,50161.0
3,3,4000,Arizona,320550.6
4,4,5000,Arkansas,114943.5


In [120]:
# Checking the States in gdp_df
gdp_df.GeoName.unique()

array(['United States', 'Alabama', 'Alaska', 'Arizona', 'Arkansas',
       'California', 'Colorado', 'Connecticut', 'Delaware',
       'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
       'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
       'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
       'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
       'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
       'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming', 'New England', 'Mideast',
       'Great Lakes', 'Plains', 'Southeast', 'Southwest',
       'Rocky Mountain', 'Far West'], dtype=object)

In [121]:
# Drop the non-used States/Regions, converting column names, reset column index
gdp_df = gdp_df[(gdp_df.GeoName != 'United States') & (gdp_df.GeoName != 'Southwest') & (gdp_df.GeoName != 'Southeast') & (gdp_df.GeoName != 'Plains') & (gdp_df.GeoName != 'Mideast') & (gdp_df.GeoName != 'Great Lakes') & (gdp_df.GeoName != 'Rocky Mountain') & (gdp_df.GeoName != 'Far West') & (gdp_df.GeoName != 'District of Columbia') & (gdp_df.GeoName != 'New England') & (gdp_df.GeoName != 'West Virginia') & (gdp_df.GeoName != 'Wyoming')]
gdp_df.rename(columns={'GeoName':'State','2020':'GDP'},inplace=True)
gdp_df = gdp_df[['State', 'GDP']]
gdp_df.reset_index(inplace=True,drop=True)
gdp_df

Unnamed: 0,State,GDP
0,Alabama,196906.1
1,Alaska,50161.0
2,Arizona,320550.6
3,Arkansas,114943.5
4,California,2663665.9
5,Colorado,346011.3
6,Connecticut,235888.6
7,Delaware,62056.2
8,Florida,944000.8
9,Georgia,536693.0


In [122]:
# Read population_df
population_df.head()

Unnamed: 0,rank,State,Pop,Growth,Pop2018,Pop2010,growthSince2010,Percent,density
0,1,California,39613493,0.0,39461588,37319502,0.06,0.12,254.29
1,2,Texas,29730311,0.04,28628666,25241971,0.18,0.09,113.81
2,3,Florida,21944577,0.03,21244317,18845537,0.16,0.07,409.22
3,4,New York,19299981,-0.01,19530351,19399878,-0.01,0.06,409.54
4,5,Pennsylvania,12804123,0.0,12800922,12711160,0.01,0.04,286.17


In [123]:
# Checking states in population_df
population_df.State.unique()


array(['California', 'Texas', 'Florida', 'New York', 'Pennsylvania',
       'Illinois', 'Ohio', 'Georgia', 'North Carolina', 'Michigan',
       'New Jersey', 'Virginia', 'Washington', 'Arizona', 'Tennessee',
       'Massachusetts', 'Indiana', 'Missouri', 'Maryland', 'Colorado',
       'Wisconsin', 'Minnesota', 'South Carolina', 'Alabama', 'Louisiana',
       'Kentucky', 'Oregon', 'Oklahoma', 'Connecticut', 'Utah',
       'Puerto Rico', 'Nevada', 'Iowa', 'Arkansas', 'Mississippi',
       'Kansas', 'New Mexico', 'Nebraska', 'Idaho', 'West Virginia',
       'Hawaii', 'New Hampshire', 'Maine', 'Montana', 'Rhode Island',
       'Delaware', 'South Dakota', 'North Dakota', 'Alaska',
       'District of Columbia', 'Vermont', 'Wyoming'], dtype=object)

In [124]:
# Filter out some regions and states, Grabbing State and Pop columns, reset index
population_df = population_df[(population_df.State != 'Wyoming') & (population_df.State != 'West Virginia') & (population_df.State != 'District of Columbia') & (population_df.State != 'Puerto Rico')]
population_df = population_df[['State','Pop']]
population_df.reset_index(inplace=True,drop=True)
population_df

Unnamed: 0,State,Pop
0,California,39613493
1,Texas,29730311
2,Florida,21944577
3,New York,19299981
4,Pennsylvania,12804123
5,Illinois,12569321
6,Ohio,11714618
7,Georgia,10830007
8,North Carolina,10701022
9,Michigan,9992427


In [125]:
income_df.head()

Unnamed: 0,GeoFips,GeoName,2020
0,0,United States,19607447.0
1,1000,Alabama,228748.8
2,2000,Alaska *,46430.3
3,4000,Arizona,368458.6
4,5000,Arkansas,143147.9


In [126]:
income_df = income_df[(income_df.GeoName != 'United States') & (income_df.GeoName != 'Southwest') & (income_df.GeoName != 'Southeast') & (income_df.GeoName != 'Plains') & (income_df.GeoName != 'Mideast') & (income_df.GeoName != 'Great Lakes') & (income_df.GeoName != 'Rocky Mountain') & (income_df.GeoName != 'Far West') & (income_df.GeoName != 'District of Columbia') & (income_df.GeoName != 'New England') & (income_df.GeoName != 'West Virginia') & (income_df.GeoName != 'Wyoming')]
income_df.rename(columns={'GeoName':'State','2020':'income'},inplace=True)
income_df = income_df[['State', 'income']]
income_df.reset_index(inplace=True,drop=True)
income_df

Unnamed: 0,State,income
0,Alabama,228748.8
1,Alaska *,46430.3
2,Arizona,368458.6
3,Arkansas,143147.9
4,California,2763312.0
5,Colorado,370392.1
6,Connecticut,279612.4
7,Delaware,55357.4
8,Florida,1209995.9
9,Georgia,554566.5


In [127]:
spending_df.head()

Unnamed: 0.1,Unnamed: 0,GeoFips,GeoName,LineCode,Description,2020
0,0,0,United States,1,Personal consumption expenditures,14047565.0
1,1,0,United States,2,Goods,4653822.0
2,2,0,United States,3,Durable goods,1616408.0
3,3,0,United States,4,Motor vehicles and parts,541265.0
4,4,0,United States,5,Furnishings and durable household equipment,390382.0


In [128]:
spending_df = spending_df[spending_df['Description'] == 'Personal consumption expenditures']
spending_df = spending_df[(spending_df.GeoName != 'United States') & (spending_df.GeoName != 'Southwest') & (spending_df.GeoName != 'Southeast') & (spending_df.GeoName != 'Plains') & (spending_df.GeoName != 'Mideast') & (spending_df.GeoName != 'Great Lakes') & (spending_df.GeoName != 'Rocky Mountain') & (spending_df.GeoName != 'Far West') & (spending_df.GeoName != 'District of Columbia') & (spending_df.GeoName != 'New England') & (spending_df.GeoName != 'West Virginia') & (spending_df.GeoName != 'Wyoming')]
spending_df.rename(columns={'GeoName':'State','2020':'spending'},inplace=True)
spending_df = spending_df[['State', 'spending']]
spending_df.reset_index(inplace=True,drop=True)
spending_df

Unnamed: 0,State,spending
0,Alabama,176479.8
1,Alaska,35635.7
2,Arizona,287090.1
3,Arkansas,104488.8
4,California,1835980.6
5,Colorado,270883.0
6,Connecticut,179405.9
7,Delaware,44834.5
8,Florida,947905.9
9,Georgia,408752.8


In [129]:
merged_df = gdp_df.merge(spending_df, how = 'inner', on = 'State')
merged_df = gdp_df.merge(population_df, how = 'inner', on = 'State')
merged_df = gdp_df.merge(income_df, how = 'inner', on = 'State')
merged_df

Unnamed: 0,State,GDP,income
0,Alabama,196906.1,228748.8
1,Arizona,320550.6,368458.6
2,Arkansas,114943.5,143147.9
3,California,2663665.9,2763312.0
4,Colorado,346011.3,370392.1
5,Connecticut,235888.6,279612.4
6,Delaware,62056.2,55357.4
7,Florida,944000.8,1209995.9
8,Georgia,536693.0,554566.5
9,Idaho,73655.4,89077.7


In [130]:
#created a function that finds states with not enough data to produce analysis
def little_data_states(tab):
    little_info_state = []
    for x in full_state_list:
        if len(tab[tab['State'] == x]) < 30:
            little_info_state.append(x)
    return little_info_state

states_exempted= little_data_states(df3)
        

In [131]:
#created a new dataframe that would filter out these states from the df3 dataframe
df4 = df3[~df3['State'].isin(states_exempted)]
df4.head()

Unnamed: 0,Bedrooms,Bathrooms,Size,Type,Price,City,State,ZipCode,perSqFt,Region
0,2.0,1.0,814.0,House,1.0,Indianapolis,IN,46217,0.0,MidWest
1,2.0,1.0,1728.0,House,775.0,Indianapolis,IN,46218,0.45,MidWest
2,3.0,1.0,1000.0,House,1050.0,Allentown,PA,18102,1.05,North
3,3.0,2.0,1418.0,House,1600.0,Troy,MI,48083,1.13,MidWest
4,2.0,2.0,1060.0,Townhouse,1700.0,Henderson,NV,89014,1.6,West


In [132]:
pd.set_option('display.float_format', '{:.2f}'.format)
pd.pivot_table(df4, values=['perSqFt'], index=['State'])


Unnamed: 0_level_0,perSqFt
State,Unnamed: 1_level_1
AK,175.69
AL,136.81
AR,142.48
AZ,578.91
CA,498.29
CO,336.63
CT,212.42
DC,574.93
DE,147.46
FL,231.73


### Analyze Data

### Data Visualization

In [None]:
'''
NOT WORKING CORRECTLY


data = dict(type = 'choropleth',colorscale='Portland',locations=df3['State'],locationmode='USA-states', z=df3['Price'],text=df3['Price'], colorbar={'title':'perSqFt'})
choromap = go.Figure(data = [data],layout = dict(geo = {'scope':'usa'}))

iplot(choromap,validate=False)
'''

In [None]:
# for GeoPandas later use
# df2 = df2[df2['ZipCode'].between(10000,99999,inclusive='both')]

### Conclusion

In [None]:
'''



'''

### References

In [None]:
'''
References

Data of:    Housing Prices
https://www.kaggle.com/dataranch/zillow-1

Data of:    GDP,Income,Spending 
https://apps.bea.gov/iTable/iTable.cfm?reqid=70&step=1&isuri=1&acrdn=1#reqid=70&step=1&isuri=1&acrdn=1

Data of:    Population
https://worldpopulationreview.com/states


'''