In [None]:
pip install networkx

In [1]:
## Question 1: Distance Matrix Calculation

import pandas as pd
import networkx as nx

def calculate_distance_matrix(dataframe):
    print("Columns in the DataFrame:")
    print(dataframe.columns)

    # Create a directed graph
    G = nx.DiGraph()

    # Add edges and distances from the DataFrame
    for index, row in dataframe.iterrows():
        G.add_edge(row['id_start'], row['id_end'], distance=row['distance'])
    
    # Create a DataFrame to store distances
    nodes = sorted(G.nodes())
    distance_matrix = pd.DataFrame(index=nodes, columns=nodes, dtype=float)

    # Calculate cumulative distances
    for node_from in nodes:
        for node_to in nodes:
            if node_from == node_to:
                distance_matrix.at[node_from, node_to] = 0
            else:
                try:
                    distance_matrix.at[node_from, node_to] = nx.shortest_path_length(G, node_from, node_to, weight='distance')
                except nx.NetworkXNoPath:
                    # Handle cases where there is no path between nodes
                    distance_matrix.at[node_from, node_to] = float('inf')

    return distance_matrix

# Example usage
csv_file_path = 'dataset-3.csv'
df = pd.read_csv(csv_file_path)

# Call the function
result_distance_matrix = calculate_distance_matrix(df)

# Display the result
print("Distance Matrix:")
print(result_distance_matrix)


Columns in the DataFrame:
Index(['id_start', 'id_end', 'distance'], dtype='object')
Distance Matrix:
           1001400.0  1001402.0  1001404.0  1001406.0  1001408.0  1001410.0  \
1001400.0        0.0        9.7       29.9       45.9       67.6       78.7   
1001402.0        inf        0.0       20.2       36.2       57.9       69.0   
1001404.0        inf        inf        0.0       16.0       37.7       48.8   
1001406.0        inf        inf        inf        0.0       21.7       32.8   
1001408.0        inf        inf        inf        inf        0.0       11.1   
1001410.0        inf        inf        inf        inf        inf        0.0   
1001412.0        inf        inf        inf        inf        inf        inf   
1001414.0        inf        inf        inf        inf        inf        inf   
1001416.0        inf        inf        inf        inf        inf        inf   
1001418.0        inf        inf        inf        inf        inf        inf   
1001420.0        inf        in

In [2]:
## Question 2: Unroll Distance Matrix
import pandas as pd

def unroll_distance_matrix(distance_matrix):
    # Initialize an empty list to store rows
    unrolled_rows = []

    # Iterate over the rows of the distance matrix
    for id_start in distance_matrix.index:
        for id_end in distance_matrix.columns:
            # Skip rows where id_start is equal to id_end
            if id_start != id_end:
                distance = distance_matrix.at[id_start, id_end]
                # Append a dictionary with the row information to the list
                unrolled_rows.append({'id_start': id_start, 'id_end': id_end, 'distance': distance})

    # Create a DataFrame from the list of rows
    unrolled_df = pd.DataFrame(unrolled_rows)

    return unrolled_df

# Example usage
# Assuming 'result_distance_matrix' is the DataFrame from Question 1
unrolled_distance_df = unroll_distance_matrix(result_distance_matrix)

# Display the resulting DataFrame
print("Unrolled Distance DataFrame:")
print(unrolled_distance_df)


Unrolled Distance DataFrame:
       id_start     id_end  distance
0     1001400.0  1001402.0       9.7
1     1001400.0  1001404.0      29.9
2     1001400.0  1001406.0      45.9
3     1001400.0  1001408.0      67.6
4     1001400.0  1001410.0      78.7
...         ...        ...       ...
1801  1004356.0  1001470.0     159.8
1802  1004356.0  1001472.0     175.8
1803  1004356.0  1001488.0       inf
1804  1004356.0  1004354.0       2.0
1805  1004356.0  1004355.0       4.0

[1806 rows x 3 columns]


In [3]:
## Question 3: Finding IDs within Percentage Threshold
import pandas as pd

def find_ids_within_ten_percentage_threshold(df, reference_id):
    # Filter the DataFrame for rows with the given reference_id
    reference_df = df[df['id_start'] == reference_id]

    # Calculate the average distance for the reference_id
    reference_avg_distance = reference_df['distance'].mean()

    # Calculate the lower and upper bounds for the 10% threshold
    lower_bound = reference_avg_distance - 0.1 * reference_avg_distance
    upper_bound = reference_avg_distance + 0.1 * reference_avg_distance

    # Filter rows where the distance is within the 10% threshold
    within_threshold_df = df[(df['id_start'] != reference_id) & (df['distance'] >= lower_bound) & (df['distance'] <= upper_bound)]

    # Get unique values from the 'id_start' column and sort them
    result_ids = sorted(within_threshold_df['id_start'].unique())

    return result_ids

# Example usage
# Assuming 'unrolled_distance_df' is the DataFrame from the previous question
reference_value = 1  # Replace with the desired reference value
result_ids_within_threshold = find_ids_within_ten_percentage_threshold(unrolled_distance_df, reference_value)

# Display the result
print(f"IDs within 10% threshold of the average distance for {reference_value}: {result_ids_within_threshold}")


IDs within 10% threshold of the average distance for 1: []


In [4]:
## Question 4: Calculate Toll Rate

import pandas as pd

def calculate_toll_rate(df):
    # Define rate coefficients for each vehicle type
    rate_coefficients = {'moto': 0.8, 'car': 1.2, 'rv': 1.5, 'bus': 2.2, 'truck': 3.6}

    # Add columns for toll rates based on vehicle types
    for vehicle_type, rate_coefficient in rate_coefficients.items():
        df[vehicle_type] = df['distance'] * rate_coefficient

    return df

# Example usage
# Assuming 'unrolled_distance_df' is the DataFrame from the previous question
toll_rate_df = calculate_toll_rate(unrolled_distance_df)

# Display the resulting DataFrame
print(toll_rate_df)


       id_start     id_end  distance    moto     car      rv     bus   truck
0     1001400.0  1001402.0       9.7    7.76   11.64   14.55   21.34   34.92
1     1001400.0  1001404.0      29.9   23.92   35.88   44.85   65.78  107.64
2     1001400.0  1001406.0      45.9   36.72   55.08   68.85  100.98  165.24
3     1001400.0  1001408.0      67.6   54.08   81.12  101.40  148.72  243.36
4     1001400.0  1001410.0      78.7   62.96   94.44  118.05  173.14  283.32
...         ...        ...       ...     ...     ...     ...     ...     ...
1801  1004356.0  1001470.0     159.8  127.84  191.76  239.70  351.56  575.28
1802  1004356.0  1001472.0     175.8  140.64  210.96  263.70  386.76  632.88
1803  1004356.0  1001488.0       inf     inf     inf     inf     inf     inf
1804  1004356.0  1004354.0       2.0    1.60    2.40    3.00    4.40    7.20
1805  1004356.0  1004355.0       4.0    3.20    4.80    6.00    8.80   14.40

[1806 rows x 8 columns]


In [5]:
## Question 5: Calculate Time-Based Toll Rates

import pandas as pd

def calculate_time_based_toll_rates(df):
    # Ensure that the 'start_timestamp' and 'end_timestamp' columns are in datetime format
    # If these columns don't exist, use the appropriate column names from your DataFrame
    # For example, if your columns are 'id_start', 'id_end', you should replace 'start_timestamp' and 'end_timestamp' with 'id_start' and 'id_end'
    # df['start_timestamp'] = pd.to_datetime(df['start_timestamp'])
    # df['end_timestamp'] = pd.to_datetime(df['end_timestamp'])
    
    # Replace 'start_timestamp' and 'end_timestamp' with your actual column names
    df['start_timestamp'] = pd.to_datetime(df['id_start'])
    df['end_timestamp'] = pd.to_datetime(df['id_end'])
    
    # Initialize lists to store the modified columns
    start_day_list, end_day_list, start_time_list, end_time_list = [], [], [], []

    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():
        # Extract start and end timestamps
        start_timestamp, end_timestamp = row['start_timestamp'], row['end_timestamp']
        
        # Determine day of the week for start and end timestamps
        start_day = start_timestamp.day_name()
        end_day = end_timestamp.day_name()

        # Determine time of day for start and end timestamps
        start_time = start_timestamp.time()
        end_time = end_timestamp.time()

        # Append values to the lists
        start_day_list.append(start_day)
        end_day_list.append(end_day)
        start_time_list.append(start_time)
        end_time_list.append(end_time)

    # Add new columns to the DataFrame
    df['start_day'] = start_day_list
    df['end_day'] = end_day_list
    df['start_time'] = start_time_list
    df['end_time'] = end_time_list
    
    

    return df

# Example usage
time_based_toll_df = calculate_time_based_toll_rates(toll_rate_df)
print(time_based_toll_df)
print(time_based_toll_df.columns)




       id_start     id_end  distance    moto     car      rv     bus   truck  \
0     1001400.0  1001402.0       9.7    7.76   11.64   14.55   21.34   34.92   
1     1001400.0  1001404.0      29.9   23.92   35.88   44.85   65.78  107.64   
2     1001400.0  1001406.0      45.9   36.72   55.08   68.85  100.98  165.24   
3     1001400.0  1001408.0      67.6   54.08   81.12  101.40  148.72  243.36   
4     1001400.0  1001410.0      78.7   62.96   94.44  118.05  173.14  283.32   
...         ...        ...       ...     ...     ...     ...     ...     ...   
1801  1004356.0  1001470.0     159.8  127.84  191.76  239.70  351.56  575.28   
1802  1004356.0  1001472.0     175.8  140.64  210.96  263.70  386.76  632.88   
1803  1004356.0  1001488.0       inf     inf     inf     inf     inf     inf   
1804  1004356.0  1004354.0       2.0    1.60    2.40    3.00    4.40    7.20   
1805  1004356.0  1004355.0       4.0    3.20    4.80    6.00    8.80   14.40   

                   start_timestamp     