In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats

In [None]:
# Zip-code table; thousands=',' makes pandas parse comma-grouped numbers as
# numeric. Its 'Zip Code' column is renamed to match the join key used below.
df_zip = (
    pd.read_csv("core/AustinZipCodes.csv", thousands=',')
    .rename(columns={'Zip Code': 'Zip_Code_Housing'})
)

# Main table joined to the zip-code table on the shared key. The default
# inner join keeps only rows whose key appears in both tables.
df = pd.read_csv("core/crime-housing-austin-2015.csv").merge(
    df_zip, on='Zip_Code_Housing'
)

In [None]:
# Convert formatted string columns (currency / percent) to floats.
arr_currency = [
    'Medianhouseholdincome',
    'Medianhomevalue'
]
arr_percent = [
    'Populationbelowpovertylevel',
    'Populationwithdisability',
    'Unemployment',
    'Largehouseholds(5+members)',
    'Rentalunitsaffordabletoaverageretail/serviceworker',
    'Rentalunitsaffordabletoaverageartist',
    'Rentalunitsaffordabletoaverageteacher',
    'Rentalunitsaffordabletoaveragetechworker'
]

# Each conversion is skipped when the column is already numeric so that this
# cell can be re-run on the same `df`: the `.str` accessor raises on a column
# that has already been converted to float.
for col in arr_currency:
    if not pd.api.types.is_numeric_dtype(df[col]):
        # Strip the literal '$' prefix, then parse the remainder as a float.
        df[col] = df[col].str.replace('$', '', regex=False).astype('float')
for col in arr_percent:
    if not pd.api.types.is_numeric_dtype(df[col]):
        # Strip '%' and rescale to a proportion (e.g. '12%' -> 0.12).
        df[col] = df[col].str.replace('%', '', regex=False).astype('float') / 100

# Cast to float; a no-op when the column already is float.
df['People / Sq. Mile'] = df['People / Sq. Mile'].astype('float')

In [None]:
# Scatter plot: population density (x) against median household income (y).
# `plt.subplots` already returns the axes, so no second `plt.subplot` call
# is needed.
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.scatter(
    x=df['People / Sq. Mile'],
    y=df['Medianhouseholdincome'],
)
ax.set_xlabel('People / Sq. Mile')
ax.set_ylabel('Median Household Income')
ax.set_title('Population Density vs. Median Income')
# Render the figure explicitly so the cell does not echo the Text repr
# returned by `set_title`.
plt.show()

In [None]:
# Scatter plot: proportion of population with a disability (x) against
# median household income (y).
# `plt.subplots` already returns the axes, so no second `plt.subplot` call
# is needed.
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.scatter(
    x=df['Populationwithdisability'],
    y=df['Medianhouseholdincome'],
)
ax.set_xlabel('Proportion of Population with Disability')
ax.set_ylabel('Median Household Income')
ax.set_title('Disability vs. Median Income')
# Render the figure explicitly so the cell does not echo the Text repr
# returned by `set_title`.
plt.show()

In [None]:
# Scatter plot: share of large households (x) against median household
# income (y).
# `plt.subplots` already returns the axes, so no second `plt.subplot` call
# is needed.
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.scatter(
    x=df['Largehouseholds(5+members)'],
    y=df['Medianhouseholdincome'],
)
ax.set_xlabel('Large households (5+members)')
ax.set_ylabel('Median Household Income')
ax.set_title('Large Households vs. Median Income')
# Render the figure explicitly so the cell does not echo the Text repr
# returned by `set_title`.
plt.show()

In [None]:
# Pearson correlation of median household income with each predictor.
# Missing values are dropped pairwise (on the two columns together) so that
# both inputs to `pearsonr` have the same length and every x stays matched
# with the y from the same row. Dropping NaNs from each column separately
# breaks that pairing whenever the columns are missing in different rows.
for predictor in (
    'People / Sq. Mile',
    'Populationwithdisability',
    'Largehouseholds(5+members)',
):
    pair = df[['Medianhouseholdincome', predictor]].dropna()
    display(stats.pearsonr(pair['Medianhouseholdincome'], pair[predictor]))

In [None]:
# Rental-affordability columns mapped to the short occupation labels used
# on the plot.
occupation_labels = {
    'Rentalunitsaffordabletoaverageretail/serviceworker': 'Serviceworker',
    'Rentalunitsaffordabletoaverageartist': 'Artist',
    'Rentalunitsaffordabletoaverageteacher': 'Teacher',
    'Rentalunitsaffordabletoaveragetechworker': 'Techworker',
}
columns_to_plot = list(occupation_labels)

# New frame with only the selected columns, renamed for plotting.
# `rename` returns a new DataFrame, so `df` itself is left untouched.
df_plot = (
    df[['Medianhouseholdincome'] + columns_to_plot]
    .rename(columns=occupation_labels)
)

# Long format: one row per (original row, occupation) with the affordability
# share in 'Value'.
df_melted = pd.melt(df_plot, id_vars='Medianhouseholdincome', var_name='Occupation', value_name='Value')

# Bar height is the mean of 'Value' per occupation; error bars show one
# standard deviation.
sns.set(style="whitegrid")
bar_plot = sns.barplot(x='Occupation', y='Value', hue='Occupation', data=df_melted, errorbar='sd')

# The y-axis is the affordable-rental share ('Value'), not household income,
# so the label and title describe that quantity.
bar_plot.set_xlabel('Occupation')
bar_plot.set_ylabel('Share of Rental Units Affordable')
bar_plot.set_title('Rental Units Affordability by Occupation')
plt.show()

# Numeric summary matching the plot: mean and standard deviation per occupation.
grouped_data = df_melted.groupby('Occupation')['Value']
means = grouped_data.mean()
std_devs = grouped_data.std()
print('Mean:',means)
print('\nStandard Deviation:',std_devs)

In [None]:
# Compare the affordable-rental share for artists against that for teachers.
columns_to_plot = [
    'Rentalunitsaffordabletoaverageartist',
    'Rentalunitsaffordabletoaverageteacher'
]

# New frame with only the selected columns.
df_plot = df[['Medianhouseholdincome'] + columns_to_plot].copy()

# Drop rows with missing values, then rows where either share equals 0.
df_plot = df_plot.dropna()
df_plot = df_plot[(df_plot[columns_to_plot] != 0).all(axis=1)]

# Long format with a 'Category' column holding the original column name.
df_melted = pd.melt(df_plot, id_vars='Medianhouseholdincome', var_name='Category', value_name='Value')

# KDE plot. `sns.displot` is figure-level and creates its own figure, so no
# `plt.figure()` call precedes it (that would leave a stray empty figure).
sns.displot(data=df_melted, x='Value', hue='Category', kind='kde')
# The plotted quantity is the affordability share, so the title names it.
plt.title('Distribution of Rental Units Affordability: Artist vs. Teacher')
plt.xlabel('Value')
plt.ylabel('Density')
plt.show()

# Split the melted values back into one sample per category.
category1 = df_melted.loc[df_melted['Category'] == columns_to_plot[0], 'Value']
category2 = df_melted.loc[df_melted['Category'] == columns_to_plot[1], 'Value']

# Independent two-sample t-test on the two shares.
# NOTE(review): both samples come from the same rows of `df_plot`, so the
# observations are paired; a paired test (`stats.ttest_rel`) may be the more
# appropriate choice -- confirm before interpreting the p-value.
t_stat = stats.ttest_ind(category1, category2)
print(f"{t_stat}")
print(f"Mean for Artist: {category1.mean()}")
print(f"Mean for Teacher: {category2.mean()}")
