### Reading data

In [3]:
import pandas as pd
import numpy as np
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image

import os

from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

# Locations: the input CSV and the folder that receives the exported figures
train_csv_path = '/mnt/c/wo_pessoal/uber_assessment/data_final/train_df_cleaned_full.csv'
path_results = "/mnt/c/wo_pessoal/uber_assessment/PowerPoint/Graphs/"

# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk
df = pd.read_csv(train_csv_path, low_memory=False)

#### Generates 4 graphs for datetime features

In [None]:
# Palette shared by the charts in this notebook (hex RGB strings keyed by role)
uber_colors = dict(
    primary='#FFFFFF',    # white
    secondary='#7F7F7F',  # secondary gray
    highlight='#55E6C1',  # green
)

# Helper function to convert seconds into "minutes:seconds" format and add 'min'
def format_duration_in_min_sec(seconds):
    """Format a duration given in seconds as an ``'MM:SS min'`` label.

    Fractional seconds are truncated, not rounded. A negative duration keeps a
    single leading minus sign (``-30`` -> ``'-00:30 min'``). Missing or
    non-finite input (``None``, NaN, infinity) yields the placeholder
    ``'--:-- min'`` instead of raising, so a category without data can still
    be labelled.
    """
    import math  # local import: the notebook's import cell does not provide math

    if seconds is None or not math.isfinite(seconds):
        return '--:-- min'

    sign = '-' if seconds < 0 else ''
    # Work on the magnitude so floor division cannot borrow a minute for negatives
    minutes, sec = divmod(int(abs(seconds)), 60)
    return f'{sign}{minutes:02}:{sec:02} min'

# Create the 'weekday' column by extracting the weekday name from 'pickup_ts'
df['pickup_ts'] = pd.to_datetime(df['pickup_ts'])  # Ensure 'pickup_ts' is in datetime format
# day_name() returns English names by default, whereas strftime('%A') follows the
# process locale and would stop matching `weekday_order` on a non-English system.
df['weekday'] = df['pickup_ts'].dt.day_name()

# Define the custom order for the weekdays (from Monday to Sunday)
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

def _save_count_bar_chart(counts, mean_durations, title, xlabel, filename, xtick_rotation=None):
    """Save a bar chart of pickup counts, labelling each bar with its mean trip duration.

    Parameters
    ----------
    counts : pandas.Series
        Pickups per category; bars are drawn in index order.
    mean_durations : pandas.Series
        Mean ``trip_duration`` per category, looked up by the labels of
        ``counts`` and rendered with ``format_duration_in_min_sec``.
    title, xlabel : str
        Chart title and x-axis label.
    filename : str
        File name appended to ``path_results``.
    xtick_rotation : float, optional
        Rotation for the x tick labels; ``None`` keeps the rotation pandas applied.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the bars were drawn on (its figure is closed before returning).
    """
    plt.figure(figsize=(8, 6))
    ax = counts.plot(kind='bar', color=[uber_colors['primary'], uber_colors['secondary']], edgecolor='white')

    # Write the average trip duration just above each bar
    for position, (category, count) in enumerate(counts.items()):
        if count == 0:
            # Category absent from the data: there is no mean duration to show
            continue
        label = format_duration_in_min_sec(mean_durations.loc[category])
        ax.text(position, count + 0.01 * count, label, ha='center', va='bottom', color='white', fontsize=12)

    # Remove the border/spines around the plot
    for spine in ax.spines.values():
        spine.set_visible(False)

    # Title and formatting
    plt.title(title, fontsize=14, color='white', pad=20)
    plt.xlabel(xlabel, fontsize=12, color='white')
    plt.ylabel('Count', fontsize=12, color='white')
    if xtick_rotation is None:
        plt.xticks(color='white')
    else:
        plt.xticks(color='white', rotation=xtick_rotation)
    plt.yticks(color='white')
    plt.grid(False)
    plt.tight_layout()
    plt.savefig(path_results + filename, transparent=True, dpi=300)
    plt.close()
    return ax


# 1. Plot for weekday - with high resolution and average trip_duration in "minutes:seconds"
# fill_value=0 keeps a weekday that never occurs in the data as an empty bar instead of NaN
value_counts_weekday = df['weekday'].value_counts().reindex(weekday_order, fill_value=0)
mean_trip_duration_by_weekday = df.groupby('weekday')['trip_duration'].mean().reindex(weekday_order)
bars_weekday = _save_count_bar_chart(
    value_counts_weekday,
    mean_trip_duration_by_weekday,
    title='Number of Pickups per Weekday',
    xlabel='Weekday',
    filename='weekday_plot_no_border_high_res.png',
    xtick_rotation=45,
)

# 2. Plot for is_holiday - with high resolution and average trip_duration in "minutes:seconds"
value_counts_holiday = df['is_holiday'].value_counts()
mean_trip_duration_by_holiday = df.groupby('is_holiday')['trip_duration'].mean()
bars_holiday = _save_count_bar_chart(
    value_counts_holiday,
    mean_trip_duration_by_holiday,
    title='Number of Pickups on Holidays vs Non-Holidays',
    xlabel='Is Holiday (0 = No, 1 = Yes)',
    filename='is_holiday_plot_no_border_high_res.png',
)

# 3. Plot for is_rush_hour - with high resolution and average trip_duration in "minutes:seconds"
value_counts_rush = df['is_rush_hour'].value_counts()
mean_trip_duration_by_rush_hour = df.groupby('is_rush_hour')['trip_duration'].mean()
bars_rush = _save_count_bar_chart(
    value_counts_rush,
    mean_trip_duration_by_rush_hour,
    title='Number of Pickups During Rush Hour vs Non-Rush Hour',
    xlabel='Is Rush Hour (0 = No, 1 = Yes)',
    filename='is_rush_hour_plot_no_border_high_res.png',
)

# 4. Plot for pickup_hour - with average trip_duration in "minutes:seconds" and high resolution
from matplotlib.ticker import FuncFormatter


def _minutes_to_min_sec_tick(value, _pos=None):
    """Tick formatter: render a value in decimal minutes as 'MM:SS'."""
    total_seconds = int(round(value * 60))
    sign = '-' if total_seconds < 0 else ''
    minutes, seconds = divmod(abs(total_seconds), 60)
    return f'{sign}{minutes:02}:{seconds:02}'


fig, ax1 = plt.subplots(figsize=(8, 6))

# Bar chart for the number of pickups per hour (counted once, reused for x and height)
pickups_by_hour = df['pickup_hour'].value_counts().sort_index()
ax1.bar(pickups_by_hour.index, pickups_by_hour.values,
        color=uber_colors['primary'], edgecolor='white')

# Settings for the left Y-axis (for the number of pickups)
ax1.set_xlabel('Hour of the Day (0-23)', fontsize=12, color='white')
ax1.set_ylabel('Number of Pickups', fontsize=12, color='white')
ax1.tick_params(axis='x', colors='white')
# tick_params already colors the y tick labels. Freezing them with
# set_yticklabels(get_yticks()) would pin raw float strings to ticks that can
# still move when the layout is recomputed.
ax1.tick_params(axis='y', colors='white')
ax1.set_xticks(range(24))
ax1.set_xticklabels(range(24), color='white')

# Right Y-axis for the average trip_duration
ax2 = ax1.twinx()
avg_trip_duration_by_hour = df.groupby('pickup_hour')['trip_duration'].mean()
avg_trip_duration_by_hour_min_sec = avg_trip_duration_by_hour / 60  # seconds -> decimal minutes
ax2.plot(avg_trip_duration_by_hour.index, avg_trip_duration_by_hour_min_sec.values,
         color=uber_colors['highlight'], linewidth=2, marker='o')

# Settings for the right Y-axis (for the average trip_duration)
ax2.set_ylabel('Average Trip Duration (minutes:seconds)', fontsize=12, color=uber_colors['highlight'])
ax2.tick_params(axis='y', colors=uber_colors['highlight'])
# The plotted values are decimal minutes; format the ticks as MM:SS so they match the axis label
ax2.yaxis.set_major_formatter(FuncFormatter(_minutes_to_min_sec_tick))

# Remove the border/spines around the plot
for spine in ax1.spines.values():
    spine.set_visible(False)
for spine in ax2.spines.values():
    spine.set_visible(False)

# Titles and formatting
plt.title('Number of Pickups and Average Trip Duration by Hour of the Day', fontsize=14, color='white')

# Adjustments and save the plot
fig.tight_layout()
# NOTE: savefig(transparent=True) writes the figure patch as transparent, so this
# black face color does not reach the saved PNG.
fig.patch.set_facecolor('black')
plt.savefig(path_results + 'pickup_hour_with_avg_trip_duration_no_border.png', transparent=True, dpi=300)
plt.close()

#### Generates feature importance for the new datetime columns

In [None]:
# Target and predictors: trip duration explained by the four datetime-derived columns
y = df['trip_duration']
X = df[['weekday', 'is_rush_hour', 'is_holiday', 'pickup_hour']]

# 'weekday' is one-hot encoded; every other column is passed through unchanged
weekday_encoder = OneHotEncoder()
preprocessor = ColumnTransformer(
    transformers=[('weekday', weekday_encoder, ['weekday'])],
    remainder='passthrough',
)

# Chain the preprocessing and the regressor so a single fit() runs both
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor()),
])

# Train on the full dataset (no hold-out split in this cell)
model.fit(X, y)

# Get feature names after one-hot encoding.
# The encoded weekday columns come first and the passthrough columns follow in
# their original order, matching how the ColumnTransformer above is configured.
encoded_columns = model.named_steps['preprocessor'].transformers_[0][1].get_feature_names_out(['weekday'])
all_columns = list(encoded_columns) + ['is_rush_hour', 'is_holiday', 'pickup_hour']

# Get the feature importance and sort by importance
feature_importances = model.named_steps['regressor'].feature_importances_

# Create a dataframe for easier sorting and plotting
importance_df = pd.DataFrame({
    'Feature': all_columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Plot the sorted feature importance
plt.figure(figsize=(8, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'], color=uber_colors['primary'])
plt.title('Feature Importance (XGBoost)', fontsize=14, color='white')
plt.xlabel('Importance', fontsize=12, color='white')
plt.ylabel('Features', fontsize=12, color='white')
plt.xticks(color='white')
plt.yticks(color='white')
plt.gca().invert_yaxis()  # To keep the highest importance at the top
plt.tight_layout()

# Build the output path once so the reported path is exactly the file that is written
output_file = path_results + 'feature_importance_xgboost_high_res.png'

# Save the plot with higher resolution (set high DPI)
plt.savefig(output_file, transparent=True, dpi=300)  # Set DPI to 300
plt.close()

# Show the path of the generated file as the cell output
output_file

#### Generates sample map plot

In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

# Reference coordinates for São Paulo's city center
CITY_CENTER_LAT, CITY_CENTER_LNG = -23.55052, -46.633308

# Derive the hour and the weekday name of each ride from the parsed pickup timestamp
pickup_times = pd.to_datetime(df['pickup_ts'])
df['pickup_ts'] = pickup_times
df['pickup_hour'] = pickup_times.dt.hour
df['pickup_day_of_the_week'] = pickup_times.dt.day_name()

# City outline read from the São Paulo shapefile
shapefile_path = '/mnt/c/wo_pessoal/uber_assessment/data_extra/SP-SÃO_PAULO.shp'
gdf_city = gpd.read_file(shapefile_path)

# Fixed map window so every plot shares the same longitude/latitude extent
longitude_limits, latitude_limits = (-46.9, -46.1), (-24.0, -23.3)

# Keep at most 100 randomly chosen trips so the points and lines stay readable
df_sampled_points = df if len(df) <= 100 else df.sample(100)

# Set up the plot for the points (pickup and dropoff)
plt.style.use('dark_background')
fig, ax = plt.subplots(figsize=(10, 10))

# Set consistent axis limits
ax.set_xlim(longitude_limits)
ax.set_ylim(latitude_limits)

# Plot the city boundary
gdf_city.boundary.plot(ax=ax, color='white', linewidth=1.5)

# Plot pickup points (sampled data) in white
gdf_pickup = gpd.GeoDataFrame(
    df_sampled_points,
    geometry=gpd.points_from_xy(df_sampled_points['pick_lng'], df_sampled_points['pick_lat']),
    crs="EPSG:4326",
)
gdf_pickup.plot(ax=ax, color='white', marker='o', label='Pickup', alpha=0.6)

# Plot dropoff points (sampled data) in green
gdf_dropoff = gpd.GeoDataFrame(
    df_sampled_points,
    geometry=gpd.points_from_xy(df_sampled_points['dropoff_lng'], df_sampled_points['dropoff_lat']),
    crs="EPSG:4326",
)
gdf_dropoff.plot(ax=ax, color='green', marker='o', label='Dropoff', alpha=0.6)

# Plot lines between each pickup and dropoff point (sampled data).
# Drawn on `ax` explicitly rather than through pyplot's implicit current axes.
for _, row in df_sampled_points.iterrows():
    ax.plot([row['pick_lng'], row['dropoff_lng']],
            [row['pick_lat'], row['dropoff_lat']],
            color='red', linestyle='-', linewidth=1, alpha=0.7)

# Plot the city center marker (the legend entry identifies it; no text label is drawn)
ax.plot(CITY_CENTER_LNG, CITY_CENTER_LAT, color='yellow', marker='o', markersize=25, label='City Center', alpha=0.5)

# Remove axis labels and titles
ax.axis('off')

# Display the legend with consistent marker sizes
ax.legend(facecolor='black', edgecolor='white', markerscale=2)  # Adjust markerscale as needed for uniform size

# Save the plot as an image with high resolution, in the same results folder as the other figures
fig.tight_layout()
fig.savefig(path_results + 'pickup_dropoff_sample_plot_with_center.png', bbox_inches='tight', facecolor='black', dpi=300)
plt.close(fig)

print("Plot with city center indicator saved successfully!")

#### Generates feature importance for the geographic features

In [None]:
# Reference coordinates used as the city center
CITY_CENTER_LAT, CITY_CENTER_LNG = -23.55052, -46.633308

# Latitude / longitude bounding box checked by is_outside_sao_paulo
SP_LAT_MIN, SP_LAT_MAX = -23.68, -23.35
SP_LNG_MIN, SP_LNG_MAX = -46.83, -46.40


def is_outside_sao_paulo(lat, lng):
    """Return 1 if (lat, lng) falls outside the SP_* bounding box, otherwise 0.

    A point exactly on an edge of the box counts as inside.
    """
    if lat < SP_LAT_MIN or lat > SP_LAT_MAX:
        return 1
    if lng < SP_LNG_MIN or lng > SP_LNG_MAX:
        return 1
    return 0

# Flag pickups and drop-offs that fall outside the bounding box.
# Iterating the two coordinate columns with zip avoids DataFrame.apply(axis=1),
# which constructs a Series object for every row.
df['pickup_outside_sp'] = [
    is_outside_sao_paulo(lat, lng)
    for lat, lng in zip(df['pick_lat'], df['pick_lng'])
]
df['dropoff_outside_sp'] = [
    is_outside_sao_paulo(lat, lng)
    for lat, lng in zip(df['dropoff_lat'], df['dropoff_lng'])
]


def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometers between two points.

    Coordinates are given in degrees. Every operation is a NumPy ufunc, so the
    arguments may be scalars or array-likes (NumPy arrays, pandas Series) that
    broadcast against each other; the result has the broadcast shape.
    """
    R = 6371  # Radius of the earth in kilometers
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    sin_half_dlat = np.sin(dlat / 2)
    sin_half_dlon = np.sin(dlon / 2)
    a = (sin_half_dlat * sin_half_dlat + np.cos(np.radians(lat1)) *
         np.cos(np.radians(lat2)) * sin_half_dlon * sin_half_dlon)
    # Rounding can push `a` marginally outside [0, 1] (e.g. near-antipodal
    # points), which would make sqrt(1 - a) return NaN; clamp it first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

# Pickup-to-dropoff haversine distance for every trip (kilometers)
df['trip_distance_haversine'] = haversine(df['pick_lat'], df['pick_lng'],
                                          df['dropoff_lat'], df['dropoff_lng'])
# Calculate distance from city center.
# haversine is built from NumPy ufuncs, so the scalar center coordinates broadcast
# against the pickup columns; no row-wise apply is needed.
df['distance_from_center'] = haversine(df['pick_lat'], df['pick_lng'],
                                       CITY_CENTER_LAT, CITY_CENTER_LNG)

# mean_squared_error is not imported in the notebook's import cell; without this
# import the RMSE line below raises NameError.
from sklearn.metrics import mean_squared_error

# Select the target and features for XGBoost
X = df[['pickup_outside_sp', 'dropoff_outside_sp', 'distance_from_center']]
y = df['trip_duration']

# Split the data into training and test sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the XGBoost model
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions and calculate RMSE to evaluate performance
y_pred = xgb_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.2f}")

# Get the feature importance, highest first
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': xgb_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Plot the sorted feature importance
plt.figure(figsize=(8, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'], color='white', edgecolor='black')
plt.title('Feature Importance using XGBoost', fontsize=14, color='white')
plt.xlabel('Importance', fontsize=12, color='white')
plt.ylabel('Features', fontsize=12, color='white')
plt.xticks(color='white')
plt.yticks(color='white')
plt.gca().invert_yaxis()  # To keep the highest importance at the top
plt.tight_layout()

# Save the plot with higher resolution (set high DPI) in the shared results folder,
# then close the figure as the other plotting cells do so it does not stay open
plt.savefig(path_results + 'feature_importance_xgboost_high_res_geo.png', transparent=True, dpi=300)
plt.close()

#### Generates a GIF map plot for each weekday, with one frame per hour

In [None]:
# Derive the hour and the weekday name of each ride from the parsed pickup timestamp
pickup_times = pd.to_datetime(df['pickup_ts'])
df['pickup_ts'] = pickup_times
df['pickup_hour'] = pickup_times.dt.hour
df['pickup_day_of_the_week'] = pickup_times.dt.day_name()

# City outline read from the São Paulo shapefile
shapefile_path = '/mnt/c/wo_pessoal/uber_assessment/data_extra/SP-SÃO_PAULO.shp'
gdf_city = gpd.read_file(shapefile_path)

# Fixed map window so every frame shares the same longitude/latitude extent
longitude_limits, latitude_limits = (-46.9, -46.1), (-24.0, -23.3)

# Weekday name -> position in the week (Monday = 0 ... Sunday = 6)
weekday_order = {
    name: position
    for position, name in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    )
}

# Weekdays present in the data, in calendar order, and the 24 hours of a day
days_of_week = sorted(df['pickup_day_of_the_week'].unique(), key=weekday_order.__getitem__)
hours = range(24)

def _render_hour_frame(df_hour, day, hour, image_path):
    """Draw the pickup heatmap plus a sample of trips for one (day, hour) slice and save it as a PNG."""
    # Sample 100 rows for points (pickup and dropoff) for visualization clarity;
    # the heatmap below still uses every row of the slice
    df_sampled_points = df_hour.sample(100) if len(df_hour) > 100 else df_hour

    fig, ax = plt.subplots(figsize=(10, 10))

    # Create a 2D KDE heatmap of the pickup locations using seaborn
    sns.kdeplot(
        x=df_hour['pick_lng'], y=df_hour['pick_lat'], ax=ax,
        cmap='hot', fill=True, thresh=0, alpha=0.5, levels=100
    )

    # Set consistent axis limits
    ax.set_xlim(longitude_limits)
    ax.set_ylim(latitude_limits)

    # Plot the city boundary
    gdf_city.boundary.plot(ax=ax, color='white', linewidth=1.5)

    # Plot pickup points (sampled data) in white
    gdf_pickup = gpd.GeoDataFrame(
        df_sampled_points,
        geometry=gpd.points_from_xy(df_sampled_points['pick_lng'], df_sampled_points['pick_lat']),
        crs="EPSG:4326",
    )
    gdf_pickup.plot(ax=ax, color='white', marker='o', label='Pickup', alpha=0.6)

    # Plot dropoff points (sampled data) in green
    gdf_dropoff = gpd.GeoDataFrame(
        df_sampled_points,
        geometry=gpd.points_from_xy(df_sampled_points['dropoff_lng'], df_sampled_points['dropoff_lat']),
        crs="EPSG:4326",
    )
    gdf_dropoff.plot(ax=ax, color='green', marker='o', label='Dropoff', alpha=0.6)

    # Plot lines between each pickup and dropoff point (sampled data)
    for _, row in df_sampled_points.iterrows():
        ax.plot([row['pick_lng'], row['dropoff_lng']],
                [row['pick_lat'], row['dropoff_lat']],
                color='red', linestyle='-', linewidth=1, alpha=0.7)

    # Remove axis labels and titles
    ax.axis('off')

    # Add day of the week and hour in the bottom right corner
    ax.text(0.95, 0.05, f"{day}, {hour:02d}:00", color='white', fontsize=20,
            ha='right', va='center', transform=ax.transAxes,
            bbox=dict(facecolor='black', alpha=0.75, pad=5))

    # Display the legend
    ax.legend(facecolor='black', edgecolor='white')

    # Save the combined plot as an image with 300 dpi
    fig.savefig(image_path, bbox_inches='tight', facecolor='black', dpi=300)
    plt.close(fig)


def _build_gif(frame_paths, gif_path):
    """Combine the PNG frames, in the given order, into a looping GIF and close every frame file."""
    frames = [Image.open(path) for path in frame_paths]
    try:
        frames[0].save(gif_path, save_all=True, append_images=frames[1:], duration=700, loop=0)
    finally:
        for frame in frames:
            frame.close()


# The style does not change between frames, so set it once
plt.style.use('dark_background')

# Loop through each day of the week and each hour
for day in days_of_week:
    output_dir = f"/mnt/c/wo_pessoal/uber_assessment/PowerPoint/Graphs/combined_plots_{day}"
    os.makedirs(output_dir, exist_ok=True)

    # Frames rendered in this run, already in hour order. Only these go into the
    # GIF, so leftover PNGs from an earlier run are not picked up.
    frame_paths = []
    for hour in hours:
        # Filter the DataFrame for the current day and hour
        df_filtered = df[(df['pickup_day_of_the_week'] == day) & (df['pickup_hour'] == hour)]

        if df_filtered.empty:
            # Skip if there's no data for that day and hour
            continue

        image_path = os.path.join(output_dir, f"{weekday_order[day]}_{hour:02d}_{day}_{hour}.png")
        _render_hour_frame(df_filtered, day, hour, image_path)
        frame_paths.append(image_path)

    # Save the GIF for the specific day; report success only when a GIF was actually written
    if frame_paths:
        gif_path = os.path.join(output_dir, f"combined_pickup_heatmap_{day}.gif")
        _build_gif(frame_paths, gif_path)
        print(f"GIF for {day} saved at: {gif_path}")
    else:
        print(f"No frames rendered for {day}; GIF not created")

#### Generates the XGBoost gradient boosting illustration (GIF)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from matplotlib.animation import PillowWriter
from PIL import Image
import os

# Folder that receives the individual animation frames (created if missing)
output_dir = "Graphs/xgboost_frames"
os.makedirs(output_dir, exist_ok=True)

# Toy regression data with a fixed seed: 40 sorted x-values and a noisy cosine target
np.random.seed(0)
X = np.sort(np.random.rand(40))
noise = np.random.normal(0, 0.1, X.shape)
y = np.cos(1.5 * np.pi * X) + noise

# Boosting parameters: shrinkage applied to each tree and the number of trees
learning_rate, n_estimators = 0.5, 5

# F0, the starting model, predicts the mean of y for every sample
initial_prediction = y.mean()
predictions = np.full(y.shape, initial_prediction, dtype=y.dtype)

# Set up the figure with dark background style. The axes are configured inside
# the loop, because ax.cla() resets limits and labels on every frame.
plt.style.use('dark_background')
fig, ax = plt.subplots(figsize=(6, 4))

# Generate each frame and save it as an image
for frame in range(n_estimators + 1):
    ax.cla()
    ax.set_xlim(0, 1)
    ax.set_ylim(-1.5, 1.5)
    ax.set_xlabel("X")
    ax.set_ylabel("y")

    if frame == 0:
        # Frame 0 shows only the constant starting model F0
        ax.plot(X, [initial_prediction] * len(X), label=r"$F_0$", color='blue', lw=2)
        ax.scatter(X, y, color="red", s=40, zorder=5)
    else:
        # Fit a shallow tree to the residuals of the previous model, then take
        # a shrunken step towards them: F_m = F_{m-1} + learning_rate * tree
        residuals = y - predictions
        tree = DecisionTreeRegressor(max_depth=2, random_state=frame)
        tree.fit(X.reshape(-1, 1), residuals)

        step_predictions = learning_rate * tree.predict(X.reshape(-1, 1))
        predictions[:] += step_predictions

        # The dashed lines run from the *updated* model F_m to the data, so they
        # show r_m = y - F_m; the label uses the same index as the curve.
        ax.vlines(X, predictions, y, color="green", linestyle="--", label=r"$r_{%d} = y - F_{%d}$" % (frame, frame))
        ax.plot(X, predictions, label=r"$F_%d$" % frame, color="blue", lw=2)
        ax.scatter(X, y, color="red", s=40, zorder=5)

    ax.legend(loc="upper right")

    # Save each frame as an image
    frame_path = os.path.join(output_dir, f"frame_{frame}.png")
    fig.savefig(frame_path, dpi=300)

# All frames are on disk; close the figure so it does not stay open
plt.close(fig)

# Combine frames into a GIF
frame_paths = [os.path.join(output_dir, f"frame_{i}.png") for i in range(n_estimators + 1)]
frames = [Image.open(path) for path in frame_paths]
gif_path = "Graphs/xgboost_gradient_boosting_example.gif"
try:
    frames[0].save(gif_path, save_all=True, append_images=frames[1:], duration=500, loop=0)
finally:
    # Image.open keeps each file open until the image is closed; close them
    # whether or not the GIF was written
    for frame_image in frames:
        frame_image.close()

print(f"GIF saved as '{gif_path}'")