In [19]:
import pandas as pd
import plotly.express as px

# Read the two source tables: the journey-to-education survey extract
# and the school locations list.
journey_df = pd.read_csv("journey_to_education_vista_2023_2024.csv")
schools_df = pd.read_csv("dv402-SchoolLocations2025.csv")

# Show the first rows of the journey table to check its structure.
print("Journey to Education:")
journey_preview = journey_df.head()
display(journey_preview)

# For schools, preview only the name, type, town and X/Y columns
# rather than the full-width table.
print("\nSchool Locations:")
school_fields = ['School_Name', 'School_Type', 'Address_Town', 'X', 'Y']
display(schools_df.loc[:, school_fields].head())

Journey to Education:


Unnamed: 0,hhid,persid,jteid,dayType,start_loc,start_stopid,start_time,start_LGA,end_loc,end_stopid,...,travtime_13,travtime_14,travtime_15,main_journey_mode,journey_travel_time,journey_distance,journey_elapsed_time,journey_weight,homesubregion_ASGS,homeregion_ASGS
0,Y24H5740109,Y24H5740109P03,Y24H5740109P03JTE001,Weekday,FROM_HOME,Y24H5740109P03S01,490,Casey (C),TO_EDUCATION,Y24H5740109P03S02,...,,,,School Bus,21,4.78611,23.0,1404.266673,MELB - Outer,Greater Melbourne
1,Y24H5740109,Y24H5740109P04,Y24H5740109P04JTE002,Weekday,FROM_HOME,Y24H5740109P04S01,490,Casey (C),TO_EDUCATION,Y24H5740109P04S02,...,,,,School Bus,21,4.78611,23.0,2949.606308,MELB - Outer,Greater Melbourne
2,Y24H5740109,Y24H5740109P05,Y24H5740109P05JTE003,Weekday,FROM_HOME,Y24H5740109P05S01,490,Casey (C),TO_EDUCATION,Y24H5740109P05S01,...,,,,Walking,20,0.93423,20.0,1165.139176,MELB - Outer,Greater Melbourne
3,Y24H5740131,Y24H5740131P03,Y24H5740131P03JTE004,Weekday,FROM_HOME,Y24H5740131P03S01,440,Casey (C),TO_EDUCATION,Y24H5740131P03S02,...,,,,School Bus,47,24.92903,50.0,1099.835321,MELB - Outer,Greater Melbourne
4,Y24H5740131,Y24H5740131P04,Y24H5740131P04JTE005,Weekday,FROM_HOME,Y24H5740131P04S01,410,Casey (C),TO_EDUCATION,Y24H5740131P04S01,...,,,,Vehicle Passenger,10,1.48545,10.0,1937.859267,MELB - Outer,Greater Melbourne



School Locations:


Unnamed: 0,School_Name,School_Type,Address_Town,X,Y
0,Parade College,Secondary,BUNDOORA,145.066978,-37.690178
1,Simonds Catholic College,Secondary,WEST MELBOURNE,144.952883,-37.805971
2,St Mary’s College Melbourne,Secondary,ST KILDA EAST,144.997001,-37.859365
3,St Patrick's College Ballarat,Secondary,BALLARAT,143.831558,-37.559711
4,St Patrick's School,Primary,BALLARAT,143.847147,-37.564397


In [9]:
# Travel time to school in VIC
#
# Build the cleaned journey table in one idempotent chain:
#   1. coerce `journey_travel_time` to numeric (unparseable values -> NaN),
#   2. drop rows whose travel time is missing,
#   3. flag trips that take 20 minutes or less.
# Coercion has to happen BEFORE the drop: otherwise values that only become
# NaN during coercion stay in the frame, inflate the trip total and are
# classed as "over 20 minutes" (NaN <= 20 evaluates to False).
journey_df = (
    journey_df
    .assign(
        journey_travel_time=lambda d: pd.to_numeric(
            d['journey_travel_time'], errors='coerce'
        )
    )
    .dropna(subset=['journey_travel_time'])
    .assign(within_20_min=lambda d: d['journey_travel_time'] <= 20)
)

# Headline numbers: all trips with a valid travel time vs. those within 20 min.
total_trips = len(journey_df)
within_20 = journey_df['within_20_min'].sum()
print(f"{within_20} out of {total_trips} trips are within 20 minutes")

478 out of 684 trips are within 20 minutes


In [10]:
import plotly.express as px

# Tally trips on each side of the 20-minute cut-off and attach a
# human-readable label for the chart legend.
slice_labels = {True: '<= 20 mins', False: '> 20 mins'}
time_summary = (
    journey_df['within_20_min']
    .value_counts()
    .reset_index()
    .set_axis(['Within_20_Min', 'Count'], axis='columns')
    .assign(Label=lambda d: d['Within_20_Min'].map(slice_labels))
)

# Palette used for the charts in this notebook.
colors = ['#003366', '#2E7D6C', '#0099CC', '#CCCCCC', '#333333']

# Donut chart: a pie with a hole in the middle.
fig = px.pie(
    time_summary,
    values='Count',
    names='Label',
    hole=0.4,
    color_discrete_sequence=colors,
    title='Travel Time to School in VIC: Within vs Over 20 Minutes',
)
fig.show()


In [13]:
# Save the chart currently held in `fig` to an HTML file.
# NOTE: `fig` is reassigned by several chart cells, so run this right after
# the donut-chart cell.
donut_html_path = "school_travel_time_donut.html"
fig.write_html(donut_html_path)

In [None]:
# Travel mode for education trips

# Number of trips recorded under each main journey mode
mode_column = journey_df['main_journey_mode']
mode_column.value_counts()

main_journey_mode
Vehicle Passenger    397
Walking              104
Train                 50
School Bus            40
Public Bus            33
Vehicle Driver        27
Bicycle               16
Tram                  13
Other                  4
Name: count, dtype: int64

In [6]:
# Tabulate trips per travel mode as a two-column frame (Mode, Count).
mode_counts = (
    journey_df['main_journey_mode']
    .value_counts()
    .reset_index()
    .set_axis(['Mode', 'Count'], axis='columns')
)

# Palette used for the charts in this notebook.
colors = ['#003366', '#2E7D6C', '#0099CC', '#CCCCCC', '#333333']

# Pie chart of mode share.
fig = px.pie(
    mode_counts,
    values='Count',
    names='Mode',
    color_discrete_sequence=colors,
    title='Mode of Travel to School',
)
fig.show()


In [11]:
# Save the chart currently held in `fig` to an HTML file.
# NOTE: `fig` is reassigned by several chart cells, so run this right after
# the mode-share pie cell.
mode_pie_html_path = "travel_mode_share_pie_chart.html"
fig.write_html(mode_pie_html_path)


In [32]:
# Avg travel time by LGA

# `lga_col` and `clean` were previously picked up from kernel state that no
# cell in this notebook defines, so the cell failed on a fresh kernel.
# Define both here so the cell runs under Restart & Run All.
lga_col = 'start_LGA'  # LGA column of the journey table (matches the table shown below)

# NOTE(review): the original definition of `clean` is not in the notebook;
# this assumes it was the journey table restricted to trips with a valid
# numeric travel time -- confirm. Rows with a missing LGA are kept on purpose,
# because the groupby below uses dropna=False.
clean = (
    journey_df
    .assign(
        journey_travel_time=lambda d: pd.to_numeric(
            d['journey_travel_time'], errors='coerce'
        )
    )
    .dropna(subset=['journey_travel_time'])
)

# Count how many unique LGAs are present (missing LGA values excluded)
total_lgas = clean[lga_col].nunique(dropna=True)
print(f"Total unique LGAs in this dataset: {total_lgas}")

# Aggregate by LGA: mean travel time and number of trips behind each mean
agg = (
    clean
    .groupby(lga_col, dropna=False)
    .agg(
        avg_time=('journey_travel_time', 'mean'),
        n_trips=('journey_travel_time', 'count')
    )
    .reset_index()
)

# Sort so the longest average time appears first
agg = agg.sort_values('avg_time', ascending=False)

# Round for nicer labels
agg['avg_time'] = agg['avg_time'].round(1)

agg.head()





Total unique LGAs in this dataset: 36


Unnamed: 0,start_LGA,avg_time,n_trips
22,Mitchell (S),48.7,3
14,Kingston (C),29.8,9
27,Mornington Peninsula (S),27.3,23
4,Campaspe (S),25.5,11
2,Boroondara (C),24.5,28


In [34]:
import plotly.express as px

# One bar per LGA; bar height and colour both encode the average time.
all_lga_axis_labels = {lga_col: 'LGA', 'avg_time': 'Average Travel Time (mins)'}
fig = px.bar(
    agg,
    x=lga_col,
    y='avg_time',
    color='avg_time',  # colour by value to get a gradient
    color_continuous_scale=colors,
    labels=all_lga_axis_labels,
    title='Average Travel Time to School by LGA',
)

# Hover text: LGA name, average time, and the number of trips behind it.
trip_counts = agg[['n_trips']].to_numpy()
hover_text = "<b>%{x}</b><br>Avg time: %{y} mins<br>Trips: %{customdata[0]}<extra></extra>"
fig.update_traces(customdata=trip_counts, hovertemplate=hover_text)

# Layout tweaks: slanted tick labels and a deeper bottom margin so the
# LGA names have room.
fig.update_layout(
    height=600,
    xaxis_tickangle=-45,
    coloraxis_colorbar={'title': 'Minutes'},
    margin={'l': 40, 'r': 20, 't': 60, 'b': 120},
)

fig.show()


In [30]:
# Save the chart currently held in `fig` to an HTML file.
# NOTE: `fig` is reassigned by several chart cells, so run this right after
# the all-LGA bar chart cell.
all_lga_html_path = "avg_travel_time_by_all_LGA.html"
fig.write_html(all_lga_html_path)


In [None]:
# The five LGAs with the lowest average travel time
top5_best = agg.nsmallest(5, 'avg_time')

best_axis_labels = {
    lga_col: 'LGA(Local Government Area)',
    'avg_time': 'Average Travel Time (mins)',
}
fig_best = px.bar(
    top5_best,
    x=lga_col,
    y='avg_time',
    color='avg_time',
    color_continuous_scale=colors,
    labels=best_axis_labels,
    title='Top 5 LGAs with Shortest Avg School Travel Time',
)

# Slant the LGA names so they do not overlap.
fig_best.update_layout(height=500, xaxis_tickangle=-45)
fig_best.show()



In [43]:
# Save the shortest-travel-time chart to an HTML file.
top5_best_html_path = "top5_best_LGAs.html"
fig_best.write_html(top5_best_html_path)

In [42]:
# The five LGAs with the highest average travel time
top5_worst = agg.nlargest(5, 'avg_time')

worst_axis_labels = {
    lga_col: 'LGA(Local Government Area)',
    'avg_time': 'Average Travel Time (mins)',
}
fig_worst = px.bar(
    top5_worst,
    x=lga_col,
    y='avg_time',
    color='avg_time',
    color_continuous_scale=colors,
    labels=worst_axis_labels,
    title='Top 5 LGAs with Longest Avg School Travel Time',
)

# Slant the LGA names so they do not overlap.
fig_worst.update_layout(height=500, xaxis_tickangle=-45)
fig_worst.show()



In [44]:
# Save the longest-travel-time chart to an HTML file.
top5_worst_html_path = "top5_worst_LGAs.html"
fig_worst.write_html(top5_worst_html_path)

In [None]:
# Combine into one dataframe
top5_best['Category'] = 'Best (Shortest Avg Time)'
top5_worst['Category'] = 'Worst (Longest Avg Time)'

combined = pd.concat([top5_best, top5_worst])


In [None]:
import plotly.express as px

# Best vs worst: the five shortest and five longest average travel times
# by LGA in one chart, coloured by group.
category_colors = {
    'Best (Shortest Avg Time)': '#2E7D6C',
    'Worst (Longest Avg Time)': '#003366',
}
combined_axis_labels = {
    lga_col: 'LGA(Local Government Area)',
    'avg_time': 'Average Travel Time (mins)',
}

fig_combined = px.bar(
    combined,
    x=lga_col,
    y='avg_time',
    color='Category',
    color_discrete_map=category_colors,
    barmode='group',
    labels=combined_axis_labels,
    title='Top 5 Best vs Worst LGAs for School Travel Time',
)

# Slanted tick labels plus a deeper bottom margin for the LGA names.
fig_combined.update_layout(
    height=600,
    xaxis_tickangle=-45,
    margin={'l': 40, 'r': 20, 't': 60, 'b': 120},
)

fig_combined.show()


In [None]:
# Save the best-vs-worst comparison chart to an HTML file.
combined_html_path = "top5_best_vs_worst_LGAs.html"
fig_combined.write_html(combined_html_path)