# Commute Analysis

## Questions to Answer:
1. Which neighborhoods have the longest commute times?
2. Is low income correlated with longer commute time?

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr

# Load the census data and, in the same step, discard every row that is
# missing either of the two columns the analysis needs.
df = pd.read_csv('../data/raw/transportation.csv').dropna(subset=['commute_time', 'income'])

# First look at the relationship: one point per record.
# seaborn returns the axes it drew on, so labels are set on that object
# instead of through the pyplot state machine.
ax = sns.scatterplot(data=df, x='income', y='commute_time')
ax.set_title('Income vs Commute Time')
ax.set_xlabel('Income ($)')
ax.set_ylabel('Commute Time (minutes)')
plt.show()

# Headline numbers: record count and the mean of each variable
print(f"Number of records: {len(df)}")
print(f"Average commute time: {df['commute_time'].mean():.1f} minutes")
print(f"Average income: ${df['income'].mean():,.0f}")

## Quick Start

In [None]:
# Box plot of commute time for each income quartile.
#
# The quartile labels are derived here whenever `df` does not already carry an
# `income_bracket` column, so this cell also runs on a fresh kernel (the column
# is otherwise only created by a later cell). `df` itself is left unmodified.
if 'income_bracket' in df.columns:
    boxplot_data = df
else:
    boxplot_data = df.assign(
        income_bracket=pd.qcut(df['income'], q=4, labels=['Very Low', 'Low', 'Medium', 'High'])
    )

# Create the figure once and hand its axes to pandas. Calling plt.figure()
# first and then DataFrame.boxplot(by=...) without `ax` makes pandas open a
# second figure, leaving the first one blank in the output.
fig, ax = plt.subplots(figsize=(10, 6))
boxplot_data.boxplot(column='commute_time', by='income_bracket', ax=ax)
ax.set_xlabel('Income Bracket')
ax.set_ylabel('Commute Time (minutes)')
ax.set_title('Commute Time Distribution by Income Level')
fig.suptitle('')  # Remove the "Boxplot grouped by ..." title pandas adds
fig.tight_layout()
fig.savefig('../visuals/charts/commute_by_income_bracket.png', dpi=300)
plt.show()

## Box Plot: Commute Time by Income Level

In [None]:
# Scatter plot of income against commute time with a least-squares trend line.
income = df['income']
commute = df['commute_time']

# Pearson r for the legend label. It is computed here (Series.corr defaults to
# Pearson) so the cell does not depend on a `correlation` variable that is only
# assigned in a later cell.
r_value = income.corr(commute)

# Degree-1 least-squares fit: commute ~ slope * income + intercept
slope, intercept = np.polyfit(income, commute, 1)

# A straight line is fully determined by its two endpoints, so there is no
# need to sort and evaluate every income value.
x_line = np.array([income.min(), income.max()])
y_line = slope * x_line + intercept

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(income, commute, alpha=0.5)
ax.plot(x_line, y_line, 'r-', linewidth=2, label=f'Trend line (r={r_value:.2f})')
ax.set_xlabel('Annual Income ($)')
ax.set_ylabel('Commute Time (minutes)')
ax.set_title('Income vs Commute Time')
ax.legend()
fig.tight_layout()
fig.savefig('../visuals/charts/income_vs_commute.png', dpi=300)
plt.show()

## Scatter Plot with Trend Line

In [None]:
# Pearson correlation between income and commute time, with its p-value.
correlation, p_value = pearsonr(df['income'], df['commute_time'])
print(f'Correlation between income and commute time: {correlation:.3f}')
# Four significant digits rather than four decimals: a very small p-value
# would otherwise be printed as 0.0000.
print(f'P-value: {p_value:.4g}')
print(f'Significant: {"Yes" if p_value < 0.05 else "No"}')  # 5% significance level

# Split records into income quartiles (q=4) and label them from lowest to highest.
df['income_bracket'] = pd.qcut(df['income'], q=4, labels=['Very Low', 'Low', 'Medium', 'High'])

# Commute-time summary per bracket. `income_bracket` is categorical, so
# `observed` is passed explicitly: this keeps every bracket in the result and
# avoids the FutureWarning newer pandas emits when the argument is omitted.
income_commute = df.groupby('income_bracket', observed=False)['commute_time'].agg(['mean', 'median', 'std'])
print(income_commute)

## Full Analysis: Q2 - Is low income correlated with longer commute time?

In [None]:
import numpy as np
from scipy.stats import pearsonr

# Commute-time summary per neighborhood, longest average commute first.
commute_by_neighborhood = (
    df.groupby('neighborhood')['commute_time']
    .agg(['mean', 'median', 'std', 'count'])
    .sort_values('mean', ascending=False)
)
print(commute_by_neighborhood)

# Horizontal bar chart of the mean commute time per neighborhood.
fig, ax = plt.subplots(figsize=(12, 6))
commute_by_neighborhood['mean'].plot(kind='barh', color='steelblue', ax=ax)
# barh draws the first row at the bottom; flip the axis so the chart reads in
# the same order as the table above (longest commute at the top).
ax.invert_yaxis()
ax.set_xlabel('Average Commute Time (minutes)')
ax.set_ylabel('Neighborhood')
ax.set_title('Average Commute Times by Neighborhood')
fig.tight_layout()
fig.savefig('../visuals/charts/commute_by_neighborhood.png', dpi=300)
plt.show()

## Full Analysis: Q1 - Which neighborhoods have the longest commute times?