In [13]:
#Question 9: Distance Matrix Calculation
# To create a function that calculates a distance matrix based on a given dataset, we need to follow these steps:

# Load the Data: Read the distance information from the CSV file.
# Create the Distance Matrix: Initialize a DataFrame where each entry represents the distance between two IDs.
# Fill in Distances: Populate the DataFrame with known distances and calculate cumulative distances for routes that are connected.
# Ensure Symmetry: Make sure that the distance from A to B is equal to the distance from B to A.
# Set Diagonal Values to Zero: The distance from any ID to itself should be zero.



In [2]:
import pandas as pd
import numpy as np

def calculate_distance_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate a symmetric shortest-path distance matrix from an edge list.

    Each row of ``df`` is treated as an undirected edge between ``id1`` and
    ``id2`` with length ``distance``. The result holds, for every pair of IDs,
    the length of the shortest route that can be assembled from those edges
    (Floyd-Warshall). Pairs with no connecting route are left at ``inf`` and
    the diagonal is 0.

    If the same pair appears more than once, the shortest listed distance is
    used. Rows whose distance is NaN are ignored.

    Args:
        df (pandas.DataFrame): DataFrame with columns 'id1', 'id2' and 'distance'.

    Returns:
        pandas.DataFrame: Square float matrix indexed (rows and columns) by the
        unique IDs, in order of first appearance in the flattened id columns.
    """
    # Extract unique IDs from the dataframe (same ordering rule as before).
    unique_ids = pd.unique(df[['id1', 'id2']].values.ravel('K'))
    id_index = pd.Index(unique_ids)
    size = len(id_index)

    # Work on a plain NumPy array: writing through ``DataFrame.values`` is not
    # reliable (the array is read-only under pandas Copy-on-Write), and
    # element-wise ``.at`` access makes the O(n^3) relaxation very slow.
    distances = np.full((size, size), np.inf)

    # Translate ID labels to matrix positions without going through iterrows,
    # which would upcast integer IDs to float.
    rows = id_index.get_indexer(df['id1'])
    cols = id_index.get_indexer(df['id2'])
    weights = df['distance'].to_numpy(dtype=float)

    # fmin ignores NaN and keeps the smallest value for repeated pairs;
    # writing both (i, j) and (j, i) keeps the matrix symmetric.
    np.fmin.at(distances, (rows, cols), weights)
    np.fmin.at(distances, (cols, rows), weights)

    # Distance from any ID to itself is zero.
    np.fill_diagonal(distances, 0.0)

    # Floyd-Warshall: allow each ID in turn to act as an intermediate stop.
    # The candidate matrix is materialised before the in-place minimum.
    for k in range(size):
        via_k = distances[:, k, None] + distances[None, k, :]
        np.minimum(distances, via_k, out=distances)

    return pd.DataFrame(distances, index=unique_ids, columns=unique_ids)

# Example usage
# df = pd.read_csv('dataset-2.csv')  # Load your dataset
# distance_matrix = calculate_distance_matrix(df)
# print(distance_matrix)


In [12]:
#Question 10: Unroll Distance Matrix
# To unroll a distance matrix into a DataFrame that resembles the
# original dataset format, we need to perform the following steps:

# Extract Unique IDs: We need to get the unique IDs from the distance matrix.
# Collect the Rows: Build a list of records (rather than growing a DataFrame row by row) with
# the specified columns: id_start, id_end, and distance.
# Fill the Records: Iterate over the distance matrix,
# and for each combination of IDs, add a record with the corresponding distance while excluding pairs
# where the id_start is the same as id_end.
# Return the New DataFrame: The resulting DataFrame should contain every ordered pair of distinct IDs
# (both A -> B and B -> A) along with their distances.

In [4]:
import pandas as pd

def unroll_distance_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """
    Unroll a distance matrix to a DataFrame in the style of the initial dataset.

    Every off-diagonal cell of the matrix becomes one output row, so both
    directions (A -> B and B -> A) are present. Rows are ordered by the matrix
    index first and by the matrix columns second.

    Args:
        df (pandas.DataFrame): Distance matrix DataFrame with IDs as both rows and columns.

    Returns:
        pandas.DataFrame: Unrolled DataFrame containing columns 'id_start', 'id_end', and 'distance'.
        The three columns are always present, even when the matrix has no
        off-diagonal cells (empty or 1x1 matrix).
    """
    output_columns = ['id_start', 'id_end', 'distance']

    # One record per ordered pair of distinct IDs.
    records = [
        {'id_start': id_start, 'id_end': id_end, 'distance': df.at[id_start, id_end]}
        for id_start in df.index
        for id_end in df.columns
        if id_start != id_end  # Exclude the same id pairs
    ]

    # Passing ``columns`` keeps the schema stable when ``records`` is empty;
    # otherwise downstream column lookups would fail with KeyError.
    return pd.DataFrame(records, columns=output_columns)

# Example usage
# distance_matrix = calculate_distance_matrix(df)  # Assume this is your distance matrix from Question 9
# unrolled_df = unroll_distance_matrix(distance_matrix)
# print(unrolled_df)


In [11]:
#Question 11: Finding IDs within Percentage Threshold
# To implement the function find_ids_within_ten_percentage_threshold, we need to follow these steps:

# Calculate the Average Distance for the Reference ID: First,
# we'll filter the DataFrame to find the rows corresponding to the reference_id and calculate its average distance.

# Determine the Threshold: Calculate the lower and upper bounds for the 10% threshold based on the reference ID's average distance.

# Filter IDs: Filter the DataFrame to find all IDs whose average distance lies within this threshold.

# Return the Result: Return a sorted DataFrame containing the IDs that meet the criteria.


In [6]:
import pandas as pd
import numpy as np

def find_ids_within_ten_percentage_threshold(df: pd.DataFrame, reference_id: int) -> pd.DataFrame:
    """
    Find all IDs whose average distance lies within 10% of the average distance of the reference ID.

    The average for an ID is the mean of 'distance' over all rows where that ID
    is the 'id_start'. The reference ID itself always satisfies the condition
    and is therefore part of the result.

    Args:
        df (pandas.DataFrame): DataFrame containing columns 'id_start', 'id_end', and 'distance'.
        reference_id (int): The ID for which to calculate the average distance.

    Returns:
        pandas.DataFrame: Columns 'id_start' and 'distance' (the per-ID average
        distance), restricted to IDs whose average is within 10% (inclusive) of
        the reference average and sorted by 'id_start'. An empty frame with the
        same two columns is returned when the reference ID has no usable
        distances.
    """
    # Single schema for both the empty and the populated result so callers can
    # rely on the same column names in either case.
    result_columns = ['id_start', 'distance']

    # Calculate the average distance for the reference_id
    ref_avg_distance = df.loc[df['id_start'] == reference_id, 'distance'].mean()

    # mean() of an empty selection is NaN: the reference ID is not present.
    if pd.isna(ref_avg_distance):
        return pd.DataFrame(columns=result_columns)

    # 10% band around the reference average; sorted so the interval stays
    # valid even if the average is negative.
    lower_bound, upper_bound = sorted((ref_avg_distance * 0.9, ref_avg_distance * 1.1))

    # Calculate the average distances for each id_start
    avg_distances = df.groupby('id_start')['distance'].mean().reset_index()

    # Keep IDs inside the band (both ends inclusive), ordered by ID.
    in_band = avg_distances['distance'].between(lower_bound, upper_bound)
    return avg_distances[in_band].sort_values(by='id_start')

# Example usage
# unrolled_df = unroll_distance_matrix(distance_matrix)  # Assume this is your unrolled DataFrame from Question 10
# result_df = find_ids_within_ten_percentage_threshold(unrolled_df, 1001400)
# print(result_df)


In [7]:
#Question 12: Calculate Toll Rate
# To create the function calculate_toll_rate, we need to take the unrolled DataFrame generated in
# Question 10 (columns id_start, id_end, and distance) and calculate the toll rates for different
# vehicle types by multiplying the distance with the provided rate coefficients.
# The function will add five new columns (moto, car, rv, bus, and truck) to the DataFrame,
# representing the toll rates for each vehicle type.


In [8]:
import pandas as pd

def calculate_toll_rate(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate toll rates for each vehicle type based on the unrolled DataFrame.

    Each toll rate is ``distance * coefficient`` for the vehicle type. The
    input frame is left untouched; a new frame is returned.

    Args:
        df (pandas.DataFrame): Input DataFrame with 'id_start', 'id_end', and 'distance'.

    Returns:
        pandas.DataFrame: Copy of ``df`` with the added columns 'moto', 'car',
        'rv', 'bus' and 'truck' (in that order).
    """
    # Define rate coefficients for each vehicle type
    rate_coefficients = {
        'moto': 0.8,
        'car': 1.2,
        'rv': 1.5,
        'bus': 2.2,
        'truck': 3.6
    }

    # Build all toll columns first, then attach them with assign() so the
    # caller's DataFrame is not modified as a side effect.
    toll_columns = {
        vehicle: df['distance'] * rate
        for vehicle, rate in rate_coefficients.items()
    }
    return df.assign(**toll_columns)

# Example usage
# unrolled_df = unroll_distance_matrix(distance_matrix)  # Assume this is your unrolled DataFrame from Question 10
# toll_rate_df = calculate_toll_rate(unrolled_df)
# print(toll_rate_df)


In [9]:
#Question 13: Calculate Time-Based Toll Rates

# To implement the function calculate_time_based_toll_rates,
# we'll need to modify the DataFrame produced in Question 12 to add the new columns
# (start day, start time, end day, and end time) and adjust the toll rates based on specified time intervals.
# The discounts will vary depending on whether the day is a weekday or a weekend.


In [10]:
import pandas as pd
import numpy as np
from datetime import time, timedelta

def calculate_time_based_toll_rates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate time-based toll rates for different time intervals within a day.

    Every input row is expanded into one row per (day, time interval):
    three intervals on each weekday and a single all-day interval on Saturday
    and Sunday, i.e. 17 output rows per input row. The vehicle toll columns are
    multiplied by the factor of the interval.

    Args:
        df (pandas.DataFrame): Input DataFrame with 'id_start', 'id_end',
            'distance' and the toll columns 'moto', 'car', 'rv', 'bus', 'truck'.

    Returns:
        pandas.DataFrame: Columns 'id_start', 'id_end', 'distance',
        'start_day', 'start_time', 'end_day', 'end_time', 'moto', 'car', 'rv',
        'bus', 'truck'. Rows keep the input order, then day order
        (Monday..Sunday), then interval order. ID columns keep their input
        dtype.
    """
    base_columns = ['id_start', 'id_end', 'distance']
    vehicle_columns = ['moto', 'car', 'rv', 'bus', 'truck']
    time_columns = ['start_day', 'start_time', 'end_day', 'end_time']

    # (start, end, factor) applied Monday to Friday.
    weekday_intervals = [
        (time(0, 0), time(10, 0), 0.8),   # 00:00 to 10:00
        (time(10, 0), time(18, 0), 1.2),  # 10:00 to 18:00
        (time(18, 0), time(23, 59, 59), 0.8)  # 18:00 to 23:59
    ]
    # A single all-day interval with a constant factor on weekends.
    weekend_intervals = [(time(0, 0), time(23, 59, 59), 0.7)]

    days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    weekend_days = ('Saturday', 'Sunday')

    # Weekly schedule: one row per (day, interval). Each interval starts and
    # ends on the same day.
    schedule = pd.DataFrame(
        [
            {
                'start_day': day,
                'start_time': start_time,
                'end_day': day,
                'end_time': end_time,
                '_factor': factor,
            }
            for day in days_of_week
            for start_time, end_time, factor in (
                weekend_intervals if day in weekend_days else weekday_intervals
            )
        ]
    )

    # Cross join instead of iterrows(): iterrows would upcast integer IDs to
    # float, and an empty input would lose its columns.
    expanded = df[base_columns + vehicle_columns].merge(schedule, how='cross')

    # Apply the interval factor to each vehicle type's toll rate.
    scaled = {
        vehicle: expanded[vehicle] * expanded['_factor']
        for vehicle in vehicle_columns
    }
    expanded = expanded.assign(**scaled)

    # Drop the helper factor column and fix the output column order.
    return expanded[base_columns + time_columns + vehicle_columns]

# Example usage
# toll_rate_df = calculate_toll_rate(unrolled_df)  # Assume this is your DataFrame from Question 12
# time_based_toll_rate_df = calculate_time_based_toll_rates(toll_rate_df)
# print(time_based_toll_rate_df)
