In [30]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2

In [31]:
# Default connection settings for the database session. Every value can be
# overridden through an environment variable so that credentials (the password
# in particular) never have to be written into the notebook; the fallbacks are
# the values this notebook has always used.
connection_params = {"user": os.environ.get("HMDA_DB_USER", "shleiferj"),
                     "password": os.environ.get("HMDA_DB_PASSWORD", ""),
                     "dbname": os.environ.get("HMDA_DB_NAME", "hmda"),
                     "host": os.environ.get("HMDA_DB_HOST", "localhost")}

def connect(params=connection_params):
    """
    Open a database session with psycopg2 and hand back a cursor on it.

    params is a dictionary that is unpacked into keyword arguments for
    psycopg2.connect and must include:
    - user: the username to be used for the database session
    - password: the user's password
    - dbname: the name of the database for connection
    - host: the host location of the database

    Returns a cursor created on the new connection. If psycopg2 raises an
    error, a failure message and the error are printed and None is returned.
    """
    try:
        # try to open the session with the supplied parameters
        session = psycopg2.connect(**params)
        print("I'm connected")  # success message
        cursor = session.cursor()
        return cursor
    except psycopg2.Error as err:
        # report the failure together with the driver's error
        print("I am unable to connect to the database: ", err)
        return None

In [32]:
def valueCounty(year):
    """
    Load property value, county code and derived race for one year from the
    hmda_public.lar_<year> table.

    Only rows with action_taken = '1', total_units = '1' and a property_value
    other than 'Exempt' or 'NA' are selected (see the query below).

    Parameters
    - year: the year suffix of the table name; anything int() accepts
      (e.g. 2018 or "2018")

    Returns a DataFrame with the columns 'value', 'county' and 'race'. The
    frame is empty, but still has those three columns, when no rows match.

    Raises
    - ValueError / TypeError if year cannot be converted to an integer
    - ConnectionError if connect() did not return a cursor
    """
    # A table name cannot be passed as a bound query parameter, so coerce the
    # year to an int before it is interpolated into the SQL text.
    year = int(year)

    cur = connect()
    if cur is None:
        # connect() prints the driver error and returns None on failure
        raise ConnectionError(f"could not connect to the database to read lar_{year}")

    query = f"""
    SELECT property_value, county_code, derived_race
    FROM hmda_public.lar_{year}
    where action_taken='1' and not property_value='Exempt' and not property_value='NA' and total_units = '1';
    """

    # connect() only hands back the cursor, so the connection has to be reached
    # through it; without closing it every call would leak one open session.
    conn = getattr(cur, 'connection', None)
    try:
        cur.execute(query)
        results = cur.fetchall()
    finally:
        # release the cursor and the session even when the query fails
        cur.close()
        if conn is not None:
            conn.close()

    # Naming the columns in the constructor keeps an empty result usable;
    # assigning df.columns afterwards raises when there are no rows.
    return pd.DataFrame(results, columns=['value', 'county', 'race'])

In [33]:
# Load one frame per year, 2018 through 2023; each call opens its own
# database connection and runs the query for that year's table.
df18 = valueCounty(year=2018)
df19 = valueCounty(year=2019)
df20 = valueCounty(year=2020)
df21 = valueCounty(year=2021)
df22 = valueCounty(year=2022)
df23 = valueCounty(year=2023)

I'm connected
I'm connected
I'm connected
I'm connected
I'm connected
I'm connected


In [34]:
# Mean and median by year
def med(list):
    """Return the median of the entries of list, read as ints, skipping every 'NA' entry."""
    parsed = []
    for raw in list:
        if raw != 'NA':
            parsed.append(int(raw))
    return np.median(parsed)
def mean(list):
    """Return the arithmetic mean of the entries of list, read as ints, skipping every 'NA' entry."""
    parsed = []
    for raw in list:
        if raw != 'NA':
            parsed.append(int(raw))
    return np.mean(parsed)

# Median and mean of the 'value' column for each yearly frame, in
# chronological order (2018 first, 2023 last).
medians = [med(frame['value'].tolist()) for frame in (df18, df19, df20, df21, df22, df23)]
means = [mean(frame['value'].tolist()) for frame in (df18, df19, df20, df21, df22, df23)]

# One row per year (labelled with the year as a string), one column per statistic.
yearMean = pd.DataFrame({'mean': means, 'median': medians},
                        index=['2018', '2019', '2020', '2021', '2022', '2023'])

In [41]:
import functools as ft
from collections import Counter

def num(df, column = 'value'):
    """
    Convert one column of df to a numeric dtype, modifying df in place.

    Entries that cannot be parsed as numbers become NaN (errors='coerce').

    Parameters
    - df: the DataFrame whose column is converted
    - column: name of the column to convert (default 'value')

    Returns the same DataFrame object that was passed in, so the call can be chained.
    """
    # pd.to_numeric already produces a new Series, so there is no need to copy
    # the whole frame just to read a single column from it.
    df[column] = pd.to_numeric(df[column], errors='coerce')
    return df

def count(series):
    """Return a plain dict mapping each distinct element of series to the number of times it occurs."""
    tally = Counter()
    tally.update(series)
    return dict(tally)

def size(series):
    """Return the number of elements in series, as reported by len()."""
    n_items = len(series)
    return n_items

def normalCount(series):
    """
    Return a dict mapping each distinct element of series to its share of
    all elements (occurrences divided by the total number of elements).

    An empty series yields an empty dict.
    """
    tally = Counter(series)
    grand_total = sum(tally.values())
    return {label: occurrences / grand_total for label, occurrences in tally.items()}

def _summarizeByCounty(df, year):
    """
    Aggregate one yearly frame to one row per county.

    The 'value' column of df is first converted to numeric by num(), which
    modifies df in place. The result is indexed by 'county' and has the columns
    mean<year>, median<year>, race<year>, raceNormal<year> and count<year>.
    """
    return num(df).groupby('county').agg(**{
        f'mean{year}': ('value', 'mean'),
        f'median{year}': ('value', 'median'),
        f'race{year}': ('race', count),              # occurrences per race label
        f'raceNormal{year}': ('race', normalCount),  # share per race label
        f'count{year}': ('value', 'size'),           # rows in the county, NaN values included
    })

# Inner-join the yearly summaries on county. pd.merge defaults to an inner
# join, so a county that is missing from any single year is dropped entirely.
# The author expects that to affect only the US / Puerto Rico entries.
meansMeds = ft.reduce(
    lambda left, right: pd.merge(left, right, on='county'),
    [_summarizeByCounty(frame, year)
     for year, frame in zip(range(2018, 2024), (df18, df19, df20, df21, df22, df23))]
)

In [None]:
# Year-over-year growth factor of each county's median value,
# e.g. medIncrease2019 = median2019 / median2018.
for year in range(2019, 2024):
    meansMeds[f'medIncrease{year}'] = meansMeds[f'median{year}']/meansMeds[f'median{year-1}']

# Growth factor over the whole 2018-2023 window. The baseline has to be the
# 2018 median to match the column name; dividing by median2019 left the first
# year out of the comparison.
meansMeds['medIncrease18-23'] = meansMeds['median2023']/meansMeds['median2018']

In [48]:
# Write the merged per-county table to a CSV file in the working directory.
meansMeds.to_csv(path_or_buf='meanMeds.csv')

In [42]:
# Work on a copy so meansMeds stays untouched, then keep only the rows whose
# count column is at least 25 for every year from 2018 through 2023.
temp = meansMeds.copy()
for i in range(2018, 2024):
    temp = temp.loc[temp[f'count{i}'].ge(25)]

In [None]:
# Write the count-filtered table to a CSV file in the working directory.
temp.to_csv(path_or_buf='meanMedsFiltered.csv')