In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import seaborn as sns
import os
from PIL import Image
from sklearn.ensemble import RandomForestRegressor
import warnings 
warnings.filterwarnings("ignore")

# Load the CSV file
df = pd.read_csv('/mnt/c/wo_pessoal/uber_assessment/data_final/train_df_cleaned_full.csv', low_memory=False)

path_results = "/mnt/c/wo_pessoal/uber_assessment/PowerPoint/Graphs/"

In [4]:
# Add colors from Uber's palette
uber_colors = {
    'primary': '#FFFFFF',   # White
    'secondary': '#7F7F7F', # Secondary gray
    'highlight': '#55E6C1', # Green
}

# Helper function to convert seconds into "minutes:seconds" format and add 'min'
def format_duration_in_min_sec(seconds):
    minutes = int(seconds // 60)  # Integer part of minutes
    sec = int(seconds % 60)  # Seconds part
    return f'{minutes:02}:{sec:02} min'

# 1. Plot for is_weekend - with high resolution and average trip_duration in "minutes:seconds"
plt.figure(figsize=(8, 6))
value_counts_weekend = df['is_weekend'].value_counts()
mean_trip_duration_by_weekend = df.groupby('is_weekend')['trip_duration'].mean()

bars_weekend = value_counts_weekend.plot(kind='bar', color=[uber_colors['primary'], uber_colors['secondary']], edgecolor='white')

# Adding the average trip duration on top of the bars in "minutes:seconds" format
for i, (index, value) in enumerate(value_counts_weekend.items()):
    mean_duration_min_sec = format_duration_in_min_sec(mean_trip_duration_by_weekend.loc[index])
    bars_weekend.text(i, value + 0.05 * value, mean_duration_min_sec, ha='center', color='white', fontsize=12)

# Title and formatting
plt.title('Number of Pickups on Weekdays vs Weekends', fontsize=14, color='white', pad=20)
plt.xlabel('Is Weekend (0 = Weekday, 1 = Weekend)', fontsize=12, color='white')
plt.ylabel('Count', fontsize=12, color='white')
plt.xticks(color='white')
plt.yticks(color='white')
plt.grid(False)
plt.tight_layout()
plt.savefig(path_results + 'is_weekend_plot_high_res_with_avg_duration_min_sec.png', transparent=True, dpi=300)
plt.close()

# 2. Plot for is_holiday - with high resolution and average trip_duration in "minutes:seconds"
plt.figure(figsize=(8, 6))
value_counts_holiday = df['is_holiday'].value_counts()
mean_trip_duration_by_holiday = df.groupby('is_holiday')['trip_duration'].mean()

bars_holiday = value_counts_holiday.plot(kind='bar', color=[uber_colors['primary'], uber_colors['secondary']], edgecolor='white')

# Adding the average trip duration on top of the bars in "minutes:seconds" format
for i, (index, value) in enumerate(value_counts_holiday.items()):
    mean_duration_min_sec = format_duration_in_min_sec(mean_trip_duration_by_holiday.loc[index])
    bars_holiday.text(i, value + 0.05 * value, mean_duration_min_sec, ha='center', color='white', fontsize=12)

# Title and formatting
plt.title('Number of Pickups on Holidays vs Non-Holidays', fontsize=14, color='white', pad=20)
plt.xlabel('Is Holiday (0 = No, 1 = Yes)', fontsize=12, color='white')
plt.ylabel('Count', fontsize=12, color='white')
plt.xticks(color='white')
plt.yticks(color='white')
plt.grid(False)
plt.tight_layout()
plt.savefig(path_results + 'is_holiday_plot_high_res_with_avg_duration_min_sec.png', transparent=True, dpi=300)
plt.close()

# 3. Plot for is_rush_hour - with high resolution and average trip_duration in "minutes:seconds"
plt.figure(figsize=(8, 6))
value_counts_rush = df['is_rush_hour'].value_counts()
mean_trip_duration_by_rush_hour = df.groupby('is_rush_hour')['trip_duration'].mean()

bars_rush = value_counts_rush.plot(kind='bar', color=[uber_colors['primary'], uber_colors['secondary']], edgecolor='white')

# Adding the average trip duration on top of the bars in "minutes:seconds" format
for i, (index, value) in enumerate(value_counts_rush.items()):
    mean_duration_min_sec = format_duration_in_min_sec(mean_trip_duration_by_rush_hour.loc[index])
    bars_rush.text(i, value + 0.05 * value, mean_duration_min_sec, ha='center', color='white', fontsize=12)

# Title and formatting
plt.title('Number of Pickups During Rush Hour vs Non-Rush Hour', fontsize=14, color='white', pad=20)
plt.xlabel('Is Rush Hour (0 = No, 1 = Yes)', fontsize=12, color='white')
plt.ylabel('Count', fontsize=12, color='white')
plt.xticks(color='white')
plt.yticks(color='white')
plt.grid(False)
plt.tight_layout()
plt.savefig(path_results + 'is_rush_hour_plot_high_res_with_avg_duration_min_sec.png', transparent=True, dpi=300)
plt.close()

# 4. Plot for pickup_hour - with average trip_duration in "minutes:seconds" and high resolution
fig, ax1 = plt.subplots(figsize=(8, 6))

# Bar chart for the number of pickups per hour
ax1.bar(df['pickup_hour'].value_counts().sort_index().index, 
        df['pickup_hour'].value_counts().sort_index().values, 
        color=uber_colors['primary'], edgecolor='white')

# Settings for the left Y-axis (for the number of pickups)
ax1.set_xlabel('Hour of the Day (0-23)', fontsize=12, color='white')
ax1.set_ylabel('Number of Pickups', fontsize=12, color='white')
ax1.tick_params(axis='x', colors='white')
ax1.tick_params(axis='y', colors='white')
ax1.set_xticks(range(24))
ax1.set_xticklabels(range(24), color='white')
ax1.set_yticklabels(ax1.get_yticks(), color='white')

# Right Y-axis for the average trip_duration
ax2 = ax1.twinx()
avg_trip_duration_by_hour = df.groupby('pickup_hour')['trip_duration'].mean()
avg_trip_duration_by_hour_min_sec = avg_trip_duration_by_hour / 60  # Convert to minutes and seconds
ax2.plot(avg_trip_duration_by_hour.index, avg_trip_duration_by_hour_min_sec.values, color=uber_colors['highlight'], linewidth=2, marker='o')

# Settings for the right Y-axis (for the average trip_duration)
ax2.set_ylabel('Average Trip Duration (minutes:seconds)', fontsize=12, color=uber_colors['highlight'])
ax2.tick_params(axis='y', colors=uber_colors['highlight'])
ax2.set_yticklabels(ax2.get_yticks(), color=uber_colors['highlight'])

# Titles and formatting
plt.title('Number of Pickups and Average Trip Duration by Hour of the Day', fontsize=14, color='white')

# Adjustments and save the plot
fig.tight_layout()
fig.patch.set_facecolor('black')
plt.savefig(path_results + 'pickup_hour_with_avg_trip_duration_min_sec.png', transparent=True, dpi=300)
plt.close()

# Return the name of generated files
output_files = [
    'is_weekend_plot_high_res_with_avg_duration_min_sec.png',
    'is_holiday_plot_high_res_with_avg_duration_min_sec.png',
    'is_rush_hour_plot_high_res_with_avg_duration_min_sec.png',
    'pickup_hour_with_avg_trip_duration_min_sec.png'
]

output_files


  ax1.set_yticklabels(ax1.get_yticks(), color='white')
  ax2.set_yticklabels(ax2.get_yticks(), color=uber_colors['highlight'])


['is_weekend_plot_high_res_with_avg_duration_min_sec.png',
 'is_holiday_plot_high_res_with_avg_duration_min_sec.png',
 'is_rush_hour_plot_high_res_with_avg_duration_min_sec.png',
 'pickup_hour_with_avg_trip_duration_min_sec.png']

In [5]:
# Example of feature importance using RandomForest

# Define the features and target (X and y)
X = df[['is_weekend', 'is_rush_hour', 'is_holiday', 'pickup_hour']]
y = df['trip_duration']

# Train the model
model = RandomForestRegressor()
model.fit(X, y)

# Get the feature importance and sort by importance
feature_importances = model.feature_importances_
features = X.columns

# Create a dataframe for easier sorting and plotting
importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Plot the sorted feature importance
plt.figure(figsize=(8, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'], color=uber_colors['primary'])
plt.title('Feature Importance', fontsize=14, color='white')
plt.xlabel('Importance', fontsize=12, color='white')
plt.ylabel('Features', fontsize=12, color='white')
plt.xticks(color='white')
plt.yticks(color='white')
plt.gca().invert_yaxis()  # To keep the highest importance at the top
plt.tight_layout()

# Save the plot with higher resolution (set high DPI)
plt.savefig(path_results + 'feature_importance_ordered_high_res.png', transparent=True, dpi=300)  # Set DPI to 300
plt.close()

# Return the path of the generated file
output_file = path_results + 'feature_importance_ordered_high_res.png'
output_file

'/mnt/c/wo_pessoal/uber_assessment/PowerPoint/Graphs/feature_importance_ordered_high_res.png'

In [7]:
# Convert the pickup timestamp to datetime
df['pickup_ts'] = pd.to_datetime(df['pickup_ts'])
df['pickup_hour'] = df['pickup_ts'].dt.hour  # Collecting the hour of the ride
df['pickup_day_of_the_week'] = df['pickup_ts'].dt.day_name()  # Day of the week

# Set up the shapefile for São Paulo
shapefile_path = '/mnt/c/wo_pessoal/uber_assessment/data_extra/SP-SÃO_PAULO.shp'
gdf_city = gpd.read_file(shapefile_path)

# Set consistent latitude and longitude limits for all plots
longitude_limits = (-46.9, -46.1)
latitude_limits = (-24.0, -23.3)

# Dictionary to ensure weekdays are ordered correctly
weekday_order = {
    'Monday': 0,
    'Tuesday': 1,
    'Wednesday': 2,
    'Thursday': 3,
    'Friday': 4,
    'Saturday': 5,
    'Sunday': 6
}

# Sort days of the week by their natural order
days_of_week = sorted(df['pickup_day_of_the_week'].unique(), key=lambda x: weekday_order[x])
hours = range(24)

# Loop through each day of the week and each hour
for day in days_of_week:
    output_dir = f"/mnt/c/wo_pessoal/uber_assessment/PowerPoint/Graphs/combined_plots_{day}"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for hour in hours:
        # Filter the DataFrame for the current day and hour
        df_filtered = df[(df['pickup_day_of_the_week'] == day) & (df['pickup_hour'] == hour)]

        if df_filtered.empty:
            # Skip if there's no data for that day and hour
            continue

        # Sample 100 rows for points (pickup and dropoff) for visualization clarity
        df_sampled_points = df_filtered.sample(100) if len(df_filtered) > 100 else df_filtered

        # Prepare data for the heatmap using the full filtered dataset
        pickup_latitudes = df_filtered['pick_lat']
        pickup_longitudes = df_filtered['pick_lng']

        # Set up the plot for the heatmap and points
        plt.style.use('dark_background')
        fig, ax = plt.subplots(figsize=(10, 10))

        # Create a 2D KDE heatmap using seaborn
        cmap = 'hot'
        sns.kdeplot(
            x=pickup_longitudes, y=pickup_latitudes, ax=ax,
            cmap=cmap, fill=True, thresh=0, alpha=0.5, levels=100
        )

        # Set consistent axis limits
        ax.set_xlim(longitude_limits)
        ax.set_ylim(latitude_limits)

        # Plot the city boundary
        gdf_city.boundary.plot(ax=ax, color='white', linewidth=1.5)

        # Plot pickup points (sampled data) in white
        gdf_pickup = gpd.GeoDataFrame(df_sampled_points, geometry=gpd.points_from_xy(df_sampled_points['pick_lng'], df_sampled_points['pick_lat']), crs="EPSG:4326")
        gdf_pickup.plot(ax=ax, color='white', marker='o', label='Pickup', alpha=0.6)

        # Plot dropoff points (sampled data) in green
        gdf_dropoff = gpd.GeoDataFrame(df_sampled_points, geometry=gpd.points_from_xy(df_sampled_points['dropoff_lng'], df_sampled_points['dropoff_lat']), crs="EPSG:4326")
        gdf_dropoff.plot(ax=ax, color='green', marker='o', label='Dropoff', alpha=0.6)

        # Plot lines between each pickup and dropoff point (sampled data)
        for _, row in df_sampled_points.iterrows():
            plt.plot([row['pick_lng'], row['dropoff_lng']],
                     [row['pick_lat'], row['dropoff_lat']],
                     color='red', linestyle='-', linewidth=1, alpha=0.7)

        # Remove axis labels and titles
        ax.axis('off')

        # Add day of the week and hour in the bottom right corner
        ax.text(0.95, 0.05, f"{day}, {hour:02d}:00", color='white', fontsize=20,
                ha='right', va='center', transform=ax.transAxes,
                bbox=dict(facecolor='black', alpha=0.75, pad=5))

        # Display the legend
        plt.legend(facecolor='black', edgecolor='white')

        # Save the combined plot as an image with 300 dpi
        image_path = os.path.join(output_dir, f"{weekday_order[day]}_{hour:02d}_{day}_{hour}.png")
        plt.savefig(image_path, bbox_inches='tight', facecolor='black', dpi=300)
        plt.close()

    # Create a GIF from the saved images for the specific day of the week
    images = []

    # Sorting the images by hour to ensure correct ordering in the GIF
    for hour in hours:
        image_path = os.path.join(output_dir, f"{weekday_order[day]}_{hour:02d}_{day}_{hour}.png")
        if os.path.exists(image_path):
            images.append(Image.open(image_path))

    # Save the GIF for the specific day
    if images:
        gif_path = os.path.join(output_dir, f"combined_pickup_heatmap_{day}.gif")
        images[0].save(gif_path, save_all=True, append_images=images[1:], duration=1000, loop=0)

    print(f"GIF for {day} saved at: {gif_path}")

GIF for Monday saved at: /mnt/c/wo_pessoal/uber_assessment/PowerPoint/Graphs/combined_plots_Monday/combined_pickup_heatmap_Monday.gif
GIF for Tuesday saved at: /mnt/c/wo_pessoal/uber_assessment/PowerPoint/Graphs/combined_plots_Tuesday/combined_pickup_heatmap_Tuesday.gif
GIF for Wednesday saved at: /mnt/c/wo_pessoal/uber_assessment/PowerPoint/Graphs/combined_plots_Wednesday/combined_pickup_heatmap_Wednesday.gif
GIF for Thursday saved at: /mnt/c/wo_pessoal/uber_assessment/PowerPoint/Graphs/combined_plots_Thursday/combined_pickup_heatmap_Thursday.gif
GIF for Friday saved at: /mnt/c/wo_pessoal/uber_assessment/PowerPoint/Graphs/combined_plots_Friday/combined_pickup_heatmap_Friday.gif
GIF for Saturday saved at: /mnt/c/wo_pessoal/uber_assessment/PowerPoint/Graphs/combined_plots_Saturday/combined_pickup_heatmap_Saturday.gif
GIF for Sunday saved at: /mnt/c/wo_pessoal/uber_assessment/PowerPoint/Graphs/combined_plots_Sunday/combined_pickup_heatmap_Sunday.gif
