# Exploratory Data Analysis (EDA) for Trip Purpose Prediction

In this notebook, we perform exploratory data analysis on the household, person, and trip datasets to understand the distribution of trip purposes and how household and individual characteristics relate to them.

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Set visualisation style: configure seaborn with the 'whitegrid' style
# up front, before any figure is drawn.
sns.set(style='whitegrid')

In [ ]:
# Read the three processed tables (household, person, trip) from CSV.
household_data = pd.read_csv('../data/processed/household_data.csv')
person_data = pd.read_csv('../data/processed/person_data.csv')
trip_data = pd.read_csv('../data/processed/trip_data.csv')

# Preview the leading rows of every table, each under its own heading.
for heading, frame in (
    ('Household Data:', household_data),
    ('Person Data:', person_data),
    ('Trip Data:', trip_data),
):
    print(heading)
    display(frame.head())

In [ ]:
# Count the null entries in every column of each table.
missing_household = household_data.isna().sum()
missing_person = person_data.isna().sum()
missing_trip = trip_data.isna().sum()

# Report only the columns that contain at least one missing value.
for banner, null_counts in (
    ('Missing Values in Household Data:', missing_household),
    ('\nMissing Values in Person Data:', missing_person),
    ('\nMissing Values in Trip Data:', missing_trip),
):
    print(banner)
    print(null_counts[null_counts > 0])

In [ ]:
# Bar chart of the number of trips per purpose, most frequent purpose first.
purpose_order = trip_data['trip_purpose'].value_counts().index

plt.figure(figsize=(10, 6))
purpose_ax = sns.countplot(data=trip_data, x='trip_purpose', order=purpose_order)

# Label the axes that countplot drew on (the current axes of the new figure).
purpose_ax.set_title('Distribution of Trip Purposes')
purpose_ax.set_xlabel('Trip Purpose')
purpose_ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [ ]:
# Analyze the relationship between household income and trip purpose.
#
# Each trip is joined to its household's attributes on 'household_id'.
# validate='many_to_one' makes pandas raise a MergeError if a household_id
# occurs more than once in household_data; without the check, such duplicates
# would silently repeat trip rows and distort the boxes.
trips_with_household = trip_data.merge(
    household_data,
    on='household_id',
    validate='many_to_one',
)

# NOTE(review): HHFAMINC looks like a survey-coded income field (category
# codes, possibly with negative "not reported" sentinels) rather than a dollar
# amount — confirm against the data dictionary before interpreting the boxes.
plt.figure(figsize=(12, 6))
sns.boxplot(data=trips_with_household, x='trip_purpose', y='HHFAMINC')
plt.title('Household Income by Trip Purpose')
plt.xlabel('Trip Purpose')
plt.ylabel('Household Income')
plt.xticks(rotation=45)
plt.show()

In [ ]:
# Analyze the relationship between age and trip purpose.
#
# Each trip is joined to the traveller's attributes on 'person_id'.
# validate='many_to_one' makes pandas raise a MergeError if a person_id occurs
# more than once in person_data (e.g. ids that are only unique within a
# household); without the check, trips would be matched to several people and
# the age distribution per purpose would be wrong without any warning.
trips_with_person = trip_data.merge(
    person_data,
    on='person_id',
    validate='many_to_one',
)

# NOTE(review): confirm AGE holds real ages only; survey sentinel values
# (such as negative "not reported" codes) would drag the boxes down.
plt.figure(figsize=(12, 6))
sns.boxplot(data=trips_with_person, x='trip_purpose', y='AGE')
plt.title('Age by Trip Purpose')
plt.xlabel('Trip Purpose')
plt.ylabel('Age')
plt.xticks(rotation=45)
plt.show()

### Conclusion

In this exploratory analysis, we checked each dataset for missing values, visualized the distribution of trip purposes, and examined how household income and age vary across trip purposes. Further analysis can refine this understanding and prepare the data for predictive modeling.