In [None]:
# Dependencies
import hvplot.pandas

import requests
import pandas as pd
from census import Census
import numpy as np
import matplotlib.pyplot as plt
from config import api_key
import seaborn as sns


In [None]:
## Pulling Median Income By County (CA)
# Queries the Census ACS 1-year endpoint once per year and stacks the results
# into a single frame with numeric State / County / Median Income columns.

years = [2018, 2019, 2021, 2022]  # no 2020 data available due to COVID-19
med_income_var = 'B19013_001E'
base_url = 'https://api.census.gov/data/'
geo = '&for=county:*&in=state:06'  # California = 06
dfs = []

for year in years:
    url = f'{base_url}{year}/acs/acs1?get={med_income_var}{geo}&key={api_key}'

    # Fail fast on an HTTP error instead of trying to parse an error page as
    # JSON; the timeout keeps a stalled request from hanging the notebook.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    rows = response.json()

    # The first row is skipped (presumably the header row); the remaining
    # rows are positional records renamed below.
    data = pd.DataFrame(rows[1:])
    data["Year"] = year
    data = data.rename(columns={0: 'Median Income', 1: 'State', 2: 'County'})
    dfs.append(data)

income_data_county = pd.concat(dfs, ignore_index=True)
# The API payload is converted to numbers here so the codes can be merged
# against the numeric FIPS columns of the geocode table later on.
income_data_county['State'] = pd.to_numeric(income_data_county['State'])
income_data_county['County'] = pd.to_numeric(income_data_county['County'])
income_data_county["Median Income"] = income_data_county["Median Income"].astype('int')
income_data_county



In [None]:
## Pulling Median Income By State (CA)
# Same ACS 1-year query as the county pull, but for the state as a whole,
# giving one row per year.

years = [2018, 2019, 2021, 2022]  # no 2020 data available due to COVID-19
med_income_var = 'B19013_001E'
base_url = 'https://api.census.gov/data/'
geo = '&for=state:06'  # California = 06
dfs = []

for year in years:
    url = f'{base_url}{year}/acs/acs1?get={med_income_var}{geo}&key={api_key}'

    # Fail fast on an HTTP error instead of trying to parse an error page as
    # JSON; the timeout keeps a stalled request from hanging the notebook.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    rows = response.json()

    # The first row is skipped (presumably the header row). Only the value
    # and state columns are renamed: a state-level query has no county column.
    data = pd.DataFrame(rows[1:])
    data["Year"] = year
    data = data.rename(columns={0: 'Median Income', 1: 'State'})
    dfs.append(data)

income_data_state = pd.concat(dfs, ignore_index=True)
# Numeric state code so it can be merged against the geocode table.
income_data_state['State'] = pd.to_numeric(income_data_state['State'])
income_data_state["Median Income"] = income_data_state["Median Income"].astype('int')
income_data_state

In [None]:
#Pulling Geocodes for State(CA)
# Reads the geocode table and keeps only the row that names the state itself.
geocodes_csv = "../Josh/Resources/geocodes.csv"
geo_state = pd.read_csv(geocodes_csv)
geo_state = geo_state.rename(columns={"Area Name (including legal/statistical area description)":"Area","State Code (FIPS)":"State"})

# State code 6 with county and place codes of 0. All three conditions are
# combined with `&`; the earlier `*` only worked because multiplying boolean
# Series happens to behave like a logical AND.
is_state_row = (
    (geo_state['State'] == 6)
    & (geo_state["County Code (FIPS)"] == 0)
    & (geo_state["Place Code (FIPS)"] == 0)
)
geo_state = geo_state[is_state_row]
geo_state = geo_state[["State","Area"]]
geo_state

In [None]:
#Pulling Geocodes for County(CA)
# Same geocode table, reduced to rows with state code 6 and a non-zero
# county code, keeping just the two codes and the area name.
geocodes_csv = "../Josh/Resources/geocodes.csv"
geo_county = (
    pd.read_csv(geocodes_csv)
    .rename(columns={
        "Area Name (including legal/statistical area description)": "Area",
        "State Code (FIPS)": "State",
        "County Code (FIPS)": "County",
    })
    [['State', 'County', 'Area']]
    .loc[lambda codes: (codes['State'] == 6) & (codes["County"] > 0)]
)
geo_county.head()

In [None]:
# Region lookup CSV, loaded in case we want to do a SoCal/NorCal analysis
state_split_csv = "../Josh/Resources/California - Counties.csv"
state_split = pd.read_csv(filepath_or_buffer=state_split_csv)
state_split.head(n=5)

In [None]:
# Attach county names and regions to the county income data.
# The numeric county code is swapped for the geocode table's area name, the
# state code is replaced by the literal state name, and the region lookup is
# joined on the county name.
data_county = (
    income_data_county
    .merge(geo_county, on=['State', 'County'], how='left')
    .drop(columns=["County"])
    .rename(columns={'Area': 'County'})
    .assign(State='California')
    .merge(state_split, on="County", how='left')
    [['Year', 'State', 'Region', 'County', 'Median Income']]
)
data_county.head()


In [None]:
# Attach the state name to the state income data. The column could simply be
# set to 'California', but merging against the geocode table also confirms
# that the pull returned the expected state code.
data_state = (
    income_data_state
    .merge(geo_state, on=['State'], how='left')
    .drop(columns=["State"])
    .rename(columns={"Area": "State"})
    [["Year", "State", "Median Income"]]
)
data_state

In [None]:
#Working with Median Housing Price Data - Cleaning up and getting Mean by year

pd.set_option('display.float_format', '{:.2f}'.format)

csv = "../Josh/Resources/Median(New) - MedianPricesofExistingDetachedHomesHistoricalData.csv"


def clean_currency(value):
    """Convert a currency-formatted value such as '$1,234' to an int.

    Commas and dollar signs are stripped from the string form of ``value``
    before conversion. Returns ``np.nan`` when what remains is not a valid
    integer literal.
    """
    try:
        return int(str(value).replace(',', '').replace('$', ''))
    except ValueError:
        # Return np.nan for non-convertible values
        return np.nan


# Statewide column plus the county columns that are carried into the analysis.
county_columns = ['CA', 'Alameda', 'Amador', 'Butte', 'Calaveras',
       'Contra-Costa', 'Del Norte', 'El Dorado', 'Fresno', 'Glenn', 'Humboldt',
       'Kern', 'Kings', 'Lake', 'Lassen', 'Los Angeles', 'Madera', 'Marin',
       'Mariposa', 'Mendocino', 'Merced', 'Mono', 'Monterey', 'Napa', 'Nevada',
       'Orange', 'Placer', 'Plumas', 'Riverside', 'Sacramento', 'San Benito',
       'San Bernardino', 'San Diego', 'San Francisco', 'San Joaquin',
       'San Luis Obispo', 'San Mateo', 'Santa Barbara', 'Santa Clara',
       'Santa Cruz', 'Shasta', 'Siskiyou', 'Solano', 'Sonoma', 'Stanislaus',
       'Sutter', 'Tehama', 'Trinity', 'Tulare', 'Tuolumne', 'Ventura', 'Yolo',
       'Yuba']
# Remaining price columns: cleaned like the rest but dropped before averaging.
other_price_columns = ['Condo', 'LA Metro', 'Central Coast', 'Central Valley',
       'Far North', 'Inland Empire', 'S.F. Bay Area', 'SoCal']
columns_to_clean = county_columns + other_price_columns

median_data = pd.read_csv(csv)
median_data = median_data.drop('Unnamed: 54', axis=1)
# Everything is cast to str so clean_currency sees uniform input.
median_data = median_data.astype('str')

# Column-wise Series.map gives the same element-wise conversion as
# DataFrame.applymap, which pandas 2.1 deprecated.
median_data[columns_to_clean] = median_data[columns_to_clean].apply(
    lambda column: column.map(clean_currency)
)
median_data["Mon-Yr"] = pd.to_datetime(median_data["Mon-Yr"], format='%b-%y')

# .copy() makes the filtered rows an independent frame, so adding "Year"
# below does not write into a slice of the full table.
median_data = median_data[median_data["Mon-Yr"] >= "2018-01-01"].copy()
median_data["Year"] = median_data["Mon-Yr"].dt.year
median_data = median_data[['Year'] + county_columns]

# No index reset is needed before grouping: the row index plays no part in
# the per-year mean, and reset_index() below rebuilds it anyway.
median_grp = median_data.groupby("Year")

median_data = median_grp.mean().reset_index()
median_data


In [None]:
# Melt (turn columns into rows) the housing median price frame, split it into
# a California frame and a per-county frame, then merge each with the
# matching median income frame.

melted_df = pd.melt(median_data, id_vars=['Year'], var_name='Region', value_name='Median Housing Price')
# Column headers are bare names; append " County" to line them up with the
# county names used on the income side, then restore the statewide label.
melted_df["Region"] = melted_df["Region"] + " County"
melted_df.loc[melted_df['Region'] == 'CA County', 'Region'] = 'California'

#create county DF(Housing Prices)
housing_county = melted_df[melted_df['Region']!= 'California'].copy()
housing_county.rename(columns={"Region":"County"},inplace=True)

#create state DF (Housing Prices)
# 2020 and 2023 are excluded explicitly from the state rows.
housing_state = melted_df[(melted_df['Region']== 'California') & (melted_df['Year'] != 2020) & (melted_df["Year"] != 2023)].copy()

housing_state = housing_state.reset_index().drop(columns=['index'])
housing_state.rename(columns={"Region":"State"},inplace=True)
state_df = pd.merge(data_state,housing_state,on=["Year","State"])
# Each join key is listed once ("County" used to appear twice in this list).
county_df = pd.merge(data_county,housing_county,on=["Year","County"])



In [None]:
# Preview the first rows of the merged county-level income / housing frame
county_df.head(n=5)

In [None]:
# Display the merged state-level income / housing frame
state_df

In [None]:
# County-level scatter: one point per county-year, income against home price
county_scatter = county_df.plot.scatter(
    x='Median Income',
    y='Median Housing Price',
    marker='o',
    linestyle='-',
)
county_scatter.set_title("Median Income VS Median Housing Price (2018-2022 Data)")

In [None]:
# State-level scatter: one point per year, income against home price
state_df.plot.scatter(
    x='Median Income',
    y='Median Housing Price',
    marker='o',
    linestyle='-',
)


In [None]:
# Statewide median housing price over time
ax = state_df.plot(x='Year', y='Median Housing Price', marker='o', linestyle='-')
# Put one tick on each year present in the data so the x axis shows whole
# years instead of automatically chosen fractional positions.
ax.set_xticks(state_df['Year'].tolist())
# The plotted series is the housing price, so the title names it as such.
ax.set_title("Median Housing Price VS Year")

In [None]:
# Per-year slices of the county frame used as heatmap inputs.
# Income slices are indexed by county; housing slices keep County as a column.
rows_2018 = county_df['Year'] == 2018
rows_2022 = county_df['Year'] == 2022

heat_df_income_2018 = county_df.loc[rows_2018, ['County', 'Median Income']].set_index('County')
heat_df_housing_2018 = county_df.loc[rows_2018, ['County', 'Median Housing Price']]

heat_df_income_2022 = county_df.loc[rows_2022, ['County', 'Median Income']].set_index('County')
heat_df_housing_2022 = county_df.loc[rows_2022, ['County', 'Median Housing Price']]

In [None]:
# import geopandas as gpd

# # Load the GeoDataFrame for California counties
# california_counties_geojson = "../Josh/Resources/California_County_Boundaries.geojson"  # Replace with the actual path
# gdf_counties = gpd.read_file(california_counties_geojson)
# gdf_counties = gdf_counties[gdf_counties["ISLAND"] != 'Channel Islands']

# # Display the GeoDataFrame to see its structure
# print(gdf_counties.head())

# # Save the GeoDataFrame to a CSV file containing coordinates
# coordinates_table = gdf_counties[['COUNTY_NAME', 'geometry']].copy()
# coordinates_table['Latitude'] = coordinates_table['geometry'].centroid.y
# coordinates_table['Longitude'] = coordinates_table['geometry'].centroid.x
# coordinates_table = coordinates_table[['COUNTY_NAME', 'Latitude', 'Longitude']]

# # Save to CSV
# coordinates_table.to_csv("../Josh/Resources/california_counties_coordinates.csv", index=False)

# # Display the coordinates table
# coordinates_table.head()


In [None]:
# Rename the county column to COUNTY_NAME, the key used by the boundary data
county_name_key = {'County': 'COUNTY_NAME'}
df3_2018 = heat_df_housing_2018.rename(columns=county_name_key)
df3_2022 = heat_df_housing_2022.rename(columns=county_name_key)

In [None]:
# Choropleth of 2022 median housing price by California county
import plotly.express as px
import geopandas as gpd

california_counties_geojson = "../Josh/Resources/California_County_Boundaries.geojson"  # Replace with the actual path
gdf_counties = gpd.read_file(california_counties_geojson)

# Append " County" directly so the names match the COUNTY_NAME values in the
# housing frame; no temporary helper column is needed.
gdf_counties["COUNTY_NAME"] = gdf_counties["COUNTY_NAME"] + " County"

heatmap_data = df3_2022
df_heatmap = pd.DataFrame(heatmap_data)

# Left merge keeps every boundary row, including counties without a price.
gdf_counties_heatmap = pd.merge(gdf_counties, df_heatmap, on='COUNTY_NAME', how='left')
color_scale_range = [0,1000000]  # fixed scale, identical to the 2018 map, so the two are comparable
fig = px.choropleth_mapbox(
    gdf_counties_heatmap,
    geojson=gdf_counties_heatmap.geometry,
    locations=gdf_counties_heatmap.index,
    color='Median Housing Price',
    hover_name='COUNTY_NAME',
    hover_data={'COUNTY_NAME': False, 'Median Housing Price': ':.2f'},
    color_continuous_scale="RdYlGn_r",
    range_color=color_scale_range,
    mapbox_style="carto-positron",
    center={"lat": 36.7783, "lon": -119.4179},  # Center of California
    zoom=5,
    # Title states the year; without it this map is indistinguishable from
    # the 2018 map.
    title="Median Housing Price by County (2022)",
)

fig.show()


In [None]:
# Choropleth of 2018 median housing price by California county
import plotly.express as px
import geopandas as gpd

california_counties_geojson = "../Josh/Resources/California_County_Boundaries.geojson"  # Replace with the actual path
gdf_counties = gpd.read_file(california_counties_geojson)

# Append " County" directly so the names match the COUNTY_NAME values in the
# housing frame; no temporary helper column is needed.
gdf_counties["COUNTY_NAME"] = gdf_counties["COUNTY_NAME"] + " County"

heatmap_data = df3_2018
df_heatmap = pd.DataFrame(heatmap_data)

# Left merge keeps every boundary row, including counties without a price.
gdf_counties_heatmap = pd.merge(gdf_counties, df_heatmap, on='COUNTY_NAME', how='left')
color_scale_range = [0,1000000]  # fixed scale, identical to the 2022 map, so the two are comparable

fig = px.choropleth_mapbox(
    gdf_counties_heatmap,
    geojson=gdf_counties_heatmap.geometry,
    locations=gdf_counties_heatmap.index,
    color='Median Housing Price',
    hover_name='COUNTY_NAME',
    hover_data={'COUNTY_NAME': False, 'Median Housing Price': ':.2f'},
    color_continuous_scale="RdYlGn_r",
    range_color=color_scale_range,
    mapbox_style="carto-positron",
    center={"lat": 36.7783, "lon": -119.4179},  # Center of California
    zoom=5,
    # Title states the year; without it this map is indistinguishable from
    # the 2022 map.
    title="Median Housing Price by County (2018)",
)

fig.show()
