# Descriptive Analytics

## Data Overview

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry.polygon import Polygon
from h3 import h3 
from shapely.geometry import shape
from shapely.geometry import Point

In [None]:
# Load the preprocessed 2017 taxi trips; the trip start/end columns are parsed as datetimes.
df=pd.read_csv('taxi_2017_complete_preprocessed.csv', parse_dates=["start_time","end_time"])

In [None]:
# Print the column names, dtypes and non-null counts of the trip data.
df.info()

In [None]:
# Preview the first three rows.
df.head(3)

In [None]:
# Preview the last three rows.
df.tail(3)

## Temporal Demand Patterns

In [None]:
# Derive calendar features from the trip start time.
# The vectorised ``.dt`` accessor is used instead of a per-row Python lambda,
# which is considerably slower on a large trip table.
df["Date"] = df["start_time"].dt.date
df["Weekday"] = df["start_time"].dt.weekday  # Monday=0 ... Sunday=6
df["Hour"] = df["start_time"].dt.hour        # 0-23
df["Month"] = df["start_time"].dt.month      # 1-12
df.head(3)

### Usage During a Day

In [None]:
# Number of trips for every hour of the day.
# A single value_counts pass replaces 24 separate boolean-mask scans of df.
# reindex orders the counts by hour and fills hours without any trip with 0;
# the counts are stored as floats.
trips_day = (
    df['Hour']
    .value_counts()
    .reindex(range(24), fill_value=0)
    .to_numpy(dtype=float)
)

time = pd.DataFrame({'Time_Period': range(24), 'Trips': trips_day})
time.head()

In [None]:
# Hourly demand: line plot of the trip counts over the 24 hours of a day.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(
    time['Time_Period'],
    time['Trips'],
    marker='o',
    linestyle='-',
    color='blue',
)
ax.set_xlabel("Time of Day (in hours)")
ax.set_ylabel("Number of Trips")
ax.set_title("Number of Trips per Time of Day")
#plt.savefig("daily demand.png")
plt.show()

### Usage During a Week

In [None]:
# Number of trips per weekday; df["Weekday"] is 0-based with Monday=0, which
# matches the order of the labels below.
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# A single value_counts pass replaces seven boolean-mask scans of df.
# reindex orders the counts Monday..Sunday and fills weekdays without any trip
# with 0; the counts are stored as floats.
trips_week = (
    df["Weekday"]
    .value_counts()
    .reindex(range(len(weekdays)), fill_value=0)
    .to_numpy(dtype=float)
)

weekdaysize = pd.DataFrame({'Weekday': weekdays, 'Trips': trips_week})
weekdaysize

In [None]:
# Weekly demand: line plot of the trip counts per weekday.
plt.figure(figsize=(8,4))
plt.plot(weekdaysize["Weekday"], weekdaysize["Trips"],'o-', color='blue')
# Label the x-axis as well, consistent with the hourly demand plot.
plt.xlabel("Day of the Week")
plt.ylabel("Number of Trips")
plt.title("Number of Trips on Different Days")
#plt.savefig("weekly demand.png")
plt.show()

In [None]:
# Why are there so few trips on Saturday and Sunday?

### Usage During a Year

In [None]:
# English month names in calendar order; used as labels for the monthly trip counts.
months = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]

In [None]:
# Number of trips per calendar month (df["Month"] is 1-based).
# A single value_counts pass replaces twelve boolean-mask scans of df.
# reindex orders the counts January..December and fills months without any
# trip with 0; the counts are stored as floats.
trips_month = (
    df["Month"]
    .value_counts()
    .reindex(range(1, 13), fill_value=0)
    .to_numpy(dtype=float)
)

year = pd.DataFrame({'Months': months, 'Trips': trips_month})
year

In [None]:
# Yearly demand: line plot of the trip counts per month.
plt.figure(figsize=(12,4))
plt.plot(year["Months"], year["Trips"],'o-', color='blue')
# Label the x-axis as well, consistent with the hourly demand plot.
plt.xlabel("Month")
plt.ylabel("Number of Trips")
plt.title("Number of Trips in Different Months")
#plt.savefig("yearly demand.png")
plt.show()

In [None]:
# Why are there so few trips in November and December?

In [None]:
# TODO: consider adding a seasonal comparison.

## Spatial Analysis

### Load Data and Data Overview

In [None]:
#df_geo = gpd.read_file('taxi_2017_complete_preprocessed.csv')

In [None]:
# Community-area boundaries of Chicago, loaded from the "extra_dataframes" folder
communities_gdf=gpd.read_file('extra_dataframes/Community Areas (current).geojson')
communities_gdf.head(2)

In [None]:
# Print the column names, dtypes and non-null counts of the community-area data.
communities_gdf.info()

In [None]:
# Plot the community areas of Chicago
communities_gdf.plot()

In [None]:
# Census-tract boundaries (2010), loaded from the "extra_dataframes" folder
census=gpd.read_file('extra_dataframes/Census Tracts - 2010.geojson')
census.head(2)

In [None]:
# Plot the census tracts of Chicago
census.plot()

In [None]:
# Print the column names, dtypes and non-null counts of the census-tract data.
census.info()

In [None]:
# Ensure start_time has a datetime dtype.
# NOTE(review): start_time is already listed in parse_dates when the CSV is read,
# so this is presumably a no-op — confirm before removing.
df['start_time'] = pd.to_datetime(df['start_time'])

In [None]:
# Convert the trip DataFrame to a GeoDataFrame with one point per trip,
# located at the trip's start coordinates.
# The coordinates are longitude/latitude, so the CRS is declared explicitly as
# WGS84 (EPSG:4326). Without a CRS the spatial join against the census tracts
# would run on a frame with no coordinate reference system.
taxi_trips = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df.start_longitude, df.start_latitude),
    crs="EPSG:4326",
)

In [None]:
# Attach to every trip the census tract ('geoid10') that contains its start point.
# how='left' keeps trips that fall into no tract (their 'geoid10' is missing).
# `predicate` is the current name of the spatial-relationship argument; the
# older `op` keyword is deprecated in geopandas.
taxi_trips_tracts = gpd.sjoin(
    taxi_trips,
    census[['geoid10', 'geometry']],
    how='left',
    predicate='within',
)

In [None]:
# Print the column names, dtypes and non-null counts of the joined trip/tract data.
taxi_trips_tracts.info()

### Start Time Variation

In [None]:
# Convert the 'start_time' column to Unix timestamps (whole seconds since
# 1970-01-01) so that it can be averaged numerically.
# The dtype guard makes this cell safe to re-run: after the first run the
# column is numeric, and subtracting a Timestamp from it again would raise.
if pd.api.types.is_datetime64_any_dtype(taxi_trips_tracts['start_time']):
    taxi_trips_tracts['start_time'] = (
        taxi_trips_tracts['start_time'] - pd.Timestamp('1970-01-01')
    ) // pd.Timedelta(seconds=1)

In [None]:
# Calculate the average start time for each census tract.
# The result is a Series indexed by 'geoid10'; start_time holds Unix seconds at
# this point, so the mean is a Unix timestamp as well.
tract_avg_start_time = taxi_trips_tracts.groupby('geoid10')['start_time'].mean()
tract_avg_start_time.info()

In [None]:
# Merge the average start time data with the census tract boundaries.
# merge defaults to an inner join, so only tracts that have an average start
# time (i.e. appear in the index of tract_avg_start_time) are kept.
census_avg_start_time = census.merge(tract_avg_start_time, left_on='geoid10', right_index=True)

In [None]:
import matplotlib.dates as mdates  # unused in this cell; kept in scope for other cells that may reference mdates
import matplotlib.ticker as mticker


def _format_unix_seconds(value, _pos=None):
    """Render a Unix timestamp (seconds) as 'YYYY-MM-DD HH:MM:SS' for a colorbar tick."""
    return pd.to_datetime(value, unit='s').strftime('%Y-%m-%d %H:%M:%S')


# Create the plot.
# 'start_time' is a numeric column (Unix seconds), so geopandas draws a
# colorbar rather than a matplotlib Legend: ax.get_legend() returns None and
# cannot be used to relabel the scale. The colorbar title and a tick formatter
# are therefore passed through legend_kwds, which geopandas forwards to the
# colorbar. The formatter converts the Unix seconds back to readable datetimes.
fig, ax = plt.subplots(figsize=(10, 10))
census_avg_start_time.plot(
    column='start_time',
    cmap='viridis',
    linewidth=0.8,
    ax=ax,
    edgecolor='0.8',
    legend=True,
    legend_kwds={
        'label': 'Average Start Time',
        'format': mticker.FuncFormatter(_format_unix_seconds),
    },
)

ax.set_title('Average Start Time by Census Tract')
plt.show()

### Trip Length Variation

In [None]:
# Calculate the average trip length (mean of 'trip_seconds') for each census tract.
# The result is a Series indexed by 'geoid10'.
tract_avg_trip_length = taxi_trips_tracts.groupby('geoid10')['trip_seconds'].mean()
tract_avg_trip_length.info()

In [None]:
# Merge the average trip length data with the census tract boundaries.
# how='left' keeps every census tract; tracts without any trip get a missing
# 'trip_seconds' value, which the map can then draw in its "missing" style
# instead of leaving them out of the frame.
census_avg_trip_length = census.merge(
    tract_avg_trip_length,
    how='left',
    left_on='geoid10',
    right_index=True,
)

In [None]:
# Choropleth of the average trip length ('trip_seconds') per census tract.
# Tracts with a missing value are drawn in light grey.
fig, ax = plt.subplots(figsize=(10, 10))
census_avg_trip_length.plot(
    column='trip_seconds',
    cmap='viridis',
    linewidth=0.8,
    ax=ax,
    legend=True,
    legend_kwds={'label': "Average Trip Length by Start Census Tract"},
    missing_kwds={'color': 'lightgrey'},
)
# The data is aggregated per census tract, so the title names census tracts.
ax.set_title('Average Trip Length by Census Tract')
plt.show()

In [None]:
# Experiment: read the preprocessed trip CSV directly with geopandas.
# NOTE(review): parse_dates is a pandas.read_csv option; verify that
# gpd.read_file accepts it and that the result has the expected dtypes/geometry.
geo_df = gpd.read_file('taxi_2017_complete_preprocessed.csv', parse_dates=["start_time","end_time"])