In [2]:
import pandas as pd
import geopandas as gpd
import altair as alt
from shapely.geometry import Point
import numpy as np

# --- Load the taxi trips and build a reproducible spatial sample -------------
# NOTE(review): absolute, machine-specific path; point this at a shared data
# directory before handing the notebook to someone else.
TAXI_TRIPS_CSV = "C:/Users/udayv/Desktop/lab3/Taxi_Trips.csv"
PICKUP_LON = "Pickup Centroid Longitude"
PICKUP_LAT = "Pickup Centroid Latitude"
SAMPLE_SIZE = 1000   # number of trips kept for the spatial join
SAMPLE_SEED = 42     # fixed seed so Restart & Run All reproduces the same sample

# Full, untouched trip table; later cells read from `df` directly.
df = pd.read_csv(TAXI_TRIPS_CSV)

# Only trips with both pickup coordinates can be placed on the map, so drop the
# rest *before* sampling; otherwise part of the sample is silently wasted on
# points that can never fall inside a polygon.
trips_located = df.dropna(subset=[PICKUP_LON, PICKUP_LAT])

# Sample first, then build geometry for just the sampled rows (vectorised)
# instead of constructing one Point per row of the whole file.
trips_sample = trips_located.sample(
    n=min(SAMPLE_SIZE, len(trips_located)),  # never ask for more rows than exist
    random_state=SAMPLE_SEED,
)

gdf = gpd.GeoDataFrame(
    trips_sample,
    geometry=gpd.points_from_xy(trips_sample[PICKUP_LON], trips_sample[PICKUP_LAT]),
    crs=4326,  # centroid columns are longitude/latitude
).rename(columns={"Trip Seconds": "Trip_Seconds", "Trip Miles": "Trip_Miles"})

In [3]:
# Quick schema check of the raw trip table: per-column dtype and non-null
# count, which shows how many rows are missing location and fare fields.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94620 entries, 0 to 94619
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Trip ID                     94620 non-null  object 
 1   Taxi ID                     94613 non-null  object 
 2   Trip Start Timestamp        94620 non-null  object 
 3   Trip End Timestamp          94617 non-null  object 
 4   Trip Seconds                94600 non-null  float64
 5   Trip Miles                  94619 non-null  float64
 6   Pickup Census Tract         36858 non-null  float64
 7   Dropoff Census Tract        37522 non-null  float64
 8   Pickup Community Area       87054 non-null  float64
 9   Dropoff Community Area      85464 non-null  float64
 10  Fare                        94559 non-null  float64
 11  Tips                        94559 non-null  float64
 12  Tolls                       94559 non-null  float64
 13  Extras                      945

In [4]:


# --- Average fare per ZIP code ------------------------------------------------
# NOTE(review): absolute, machine-specific path; move to a shared data directory.
CHICAGO_ZIPS_GEOJSON = 'C:/Users/udayv/Desktop/lab3/chicago.geojson'

# ZIP-code polygons; the 'zip' column is the key used for grouping and merging.
chicago = gpd.read_file(CHICAGO_ZIPS_GEOJSON)

# Attach to every sampled pickup point the attributes of the polygon it lies in.
# Points outside every polygon are dropped by the (inner) spatial join.
trips_in_zip = gpd.sjoin(gdf, chicago, predicate='within')

# Mean fare per ZIP. Only 'Fare' is needed, so coerce just that column;
# coercing every column would overwrite IDs, timestamps and the geometry
# column with NaN for no benefit.
fare = pd.to_numeric(trips_in_zip['Fare'], errors='coerce')
joined = fare.groupby(trips_in_zip['zip']).mean().to_frame('Fare')

# Bring the averages back onto the polygons. `joined` is indexed by 'zip', which
# pandas accepts as a merge key; ZIPs with no sampled trips are left out.
merged = chicago.merge(joined, on='zip')

# Quick static check of the result, if needed:
#   merged.plot('Fare', cmap='Reds', legend=True, figsize=(10, 6))

In [30]:
import altair as alt
import pandas as pd
import warnings

# --- Linked charts: trip duration by pickup area, trip counts by payment type --

# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation notices; narrow it to a specific category once the
# warning it was meant to hide is identified.
warnings.filterwarnings("ignore")

AREA_COL = 'Pickup Community Area'
MAX_AREAS = 50    # cap on x-axis categories so the axis stays readable
AREA_SEED = 42    # fixed seed so the same areas are drawn on every run

# Work on a derived frame instead of overwriting the column in `df`, so the
# cell can be re-run safely and earlier cells keep seeing the raw data.
# Missing areas are dropped *before* converting to text; converting first would
# turn NaN into the literal string 'nan' and plot it as a community area.
area_ids = pd.to_numeric(df[AREA_COL], errors='coerce')
has_area = area_ids.notna()
trips_with_area = df.loc[has_area].assign(
    # 8.0 -> '8': whole-number labels read better than float reprs on the axis
    **{AREA_COL: area_ids[has_area].astype(int).astype(str)}
)

# Pick at most MAX_AREAS areas, then order them numerically for the x-axis.
selected_areas = pd.Series(trips_with_area[AREA_COL].unique())
if len(selected_areas) > MAX_AREAS:
    selected_areas = selected_areas.sample(MAX_AREAS, random_state=AREA_SEED)
selected_areas = pd.Series(sorted(selected_areas, key=int), name=AREA_COL)

# Trips restricted to the selected pickup areas
df_filtered = trips_with_area[trips_with_area[AREA_COL].isin(selected_areas)]

# One grouping, two summaries: mean duration and trip count per (area, payment type)
by_area_and_payment = df_filtered.groupby([AREA_COL, "Payment Type"])
grouped_duration = by_area_and_payment['Trip Seconds'].mean().reset_index()
grouped_count = by_area_and_payment.size().reset_index(name='Count')

# Horizontal brush over the pickup-area axis; it drives both charts
brush = alt.selection_interval(encodings=['x'])

# Line chart: average trip duration per pickup area, one line per payment type.
# Points outside the brush are greyed out.
line_chart = alt.Chart(grouped_duration).mark_line(point=True).encode(
    x=alt.X('Pickup Community Area:N', title='Pickup Community Area',
            sort=selected_areas.tolist()),  # numeric order, not alphabetical
    y=alt.Y('Trip Seconds:Q', title='Average Trip Duration (seconds)'),
    color=alt.condition(brush, 'Payment Type:N', alt.value('lightgray')),
    tooltip=['Pickup Community Area', 'Payment Type', 'Trip Seconds']
).properties(
    width=600,
    height=300
).add_params(
    brush
)

# Bar chart: number of trips per payment type within the brushed areas.
# Counts are summed across the brushed areas so each payment type is a single
# bar whose tooltip reports the total, not one stacked fragment per area.
bar_chart = alt.Chart(grouped_count).mark_bar().encode(
    x=alt.X('Payment Type:N', title='Payment Type'),
    y=alt.Y('sum(Count):Q', title='Number of Trips'),
    color='Payment Type:N',
    tooltip=['Payment Type', alt.Tooltip('sum(Count):Q', title='Number of Trips')]
).transform_filter(
    brush
).properties(
    width=600,
    height=300
)

# Stack the two charts; brushing the top one filters the bottom one
combined = alt.vconcat(line_chart, bar_chart)

combined
