# Import Libraries

In [19]:
# Import the libraries needed for the API request and for the calculations.
# The High Volume API is accessed through the sodapy library.
# If sodapy is not installed yet, uncomment the next line and run it once.
# %pip install sodapy

import pandas as pd
from sodapy import Socrata
from datetime import datetime
import numpy as np
from scipy.stats import t

# API Request


In [20]:
# Set up the Socrata client for accessing the API
# NOTE(review): the second argument is None — presumably "no app token"
# (unauthenticated access); confirm against the sodapy documentation.
client = Socrata("data.cityofnewyork.us", None)

# Fetch data from the API (adjust the limit as needed)
# `limit` is passed to the request as the `limit` keyword; the shape printed
# in the next cell shows how many rows actually came back.
limit = 200000
results = client.get("4p5c-cbgn", limit=limit)

# Convert the results to a DataFrame
# `results_df` is the raw frame that the rest of the notebook works from.
results_df = pd.DataFrame.from_records(results)



**Checking DataFrame Shape and First Rows**


In [21]:
# Display the shape of the DataFrame as (rows, columns) to sanity-check the fetch
print("DataFrame Shape:", results_df.shape)

# Display the first few rows of the DataFrame (last expression -> rich table output)
results_df.head()

DataFrame Shape: (200000, 7)


Unnamed: 0,hvfhs_license_num,dispatching_base_num,pickup_datetime,dropoff_datetime,pulocationid,dolocationid,sr_flag
0,HV0003,B02879,2019-05-27T18:14:01.000,2019-05-27T18:28:37.000,67,11,
1,HV0003,B02864,2019-05-27T18:22:18.000,2019-05-27T18:52:47.000,17,39,
2,HV0003,B02877,2019-05-27T18:12:45.000,2019-05-27T18:20:09.000,144,113,
3,HV0003,B02877,2019-05-27T18:29:08.000,2019-05-27T18:35:33.000,164,233,
4,HV0003,B02877,2019-05-27T18:44:17.000,2019-05-27T19:00:05.000,233,68,


`trip_duration` function

In [22]:
def trip_duration(data_frame):
    """
    Calculates trip duration for taxi rides.

    The computation runs on a copy, so the frame passed in is left untouched
    (its timestamp columns keep their original values and no column is added).
    This keeps the notebook cell idempotent and the raw data recoverable.

    Parameters:
    - data_frame: DataFrame containing 'pickup_datetime' and 'dropoff_datetime' columns,
      with timestamps formatted like '2019-05-27T18:14:01.000'.

    Returns:
    - A new DataFrame with both timestamp columns converted to datetimes and a new
      column named 'trip_duration' (in minutes).

    Raises:
    - KeyError: if either timestamp column is missing.
    - ValueError: if a timestamp does not match the expected format.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S.%f"

    # Work on a copy so the caller's (raw) frame is never modified.
    result = data_frame.copy()

    result['dropoff_datetime'] = pd.to_datetime(result['dropoff_datetime'], format=timestamp_format)
    result['pickup_datetime'] = pd.to_datetime(result['pickup_datetime'], format=timestamp_format)

    # Timedelta -> seconds -> minutes.
    elapsed = result['dropoff_datetime'] - result['pickup_datetime']
    result['trip_duration'] = elapsed.dt.total_seconds() / 60

    return result

`add_features_time` function


In [23]:
def add_features_time(data_frame):
    """
    Adds time-related features to the DataFrame.

    The input frame is not modified; a new frame carrying the extra columns is returned.

    Parameters:
    - data_frame: DataFrame containing a 'pickup_datetime' column of datetime dtype.

    Returns:
    - A new DataFrame with the added columns 'hour' (hour of the pickup) and
      'day_of_week' (pandas `dt.dayofweek` of the pickup, where Monday is 0).

    Raises:
    - KeyError: if 'pickup_datetime' is missing.
    - AttributeError: if 'pickup_datetime' is not datetime-like (no `.dt` accessor).
    """
    pickup = data_frame['pickup_datetime']

    # `assign` returns a new frame, so the caller's data is left as it was.
    return data_frame.assign(
        hour=pickup.dt.hour,
        day_of_week=pickup.dt.dayofweek,
    )

`calculate_confidence_interval` function

In [24]:
def calculate_confidence_interval(data_frame, confidence=0.95):
    """
    Computes mean trip duration and margin of error for a two-sided confidence interval.

    Rows are grouped by ('pulocationid', 'dolocationid', 'day_of_week', 'hour'). For each
    group the margin of error is

        t_crit * std / sqrt(n)

    where t_crit is the Student-t quantile at 1 - (1 - confidence) / 2 with n - 1 degrees
    of freedom. For the default 95% interval that is the 0.975 quantile (the 0.95 quantile
    would only give a 90% two-sided interval).

    Parameters:
    - data_frame: DataFrame containing 'trip_duration' plus the four grouping columns.
    - confidence: two-sided confidence level, strictly between 0 and 1 (default 0.95).

    Returns:
    - DataFrame 'predictions', indexed by the grouping columns, with columns:
      'mean_trip_duration' and 'margin_of_error'. Groups with a single observation
      have no defined standard deviation, so their 'margin_of_error' is NaN.

    Raises:
    - ValueError: if `confidence` is not strictly between 0 and 1.
    - KeyError: if a required column is missing.
    """
    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")

    grouped_data = data_frame.groupby(
        ['pulocationid', 'dolocationid', 'day_of_week', 'hour']
    )['trip_duration']

    mean_duration = grouped_data.mean()
    std_dev = grouped_data.std()
    sample_size = grouped_data.count()

    # Two-sided interval: split the tail probability evenly between both sides.
    upper_quantile = 1 - (1 - confidence) / 2
    critical_value = t.ppf(upper_quantile, df=(sample_size - 1))

    margin_of_error = critical_value * (std_dev / np.sqrt(sample_size))

    predictions = pd.DataFrame({
        'mean_trip_duration': mean_duration,
        'margin_of_error': margin_of_error
    })

    return predictions


`generate_predictions` function

In [25]:
def generate_predictions(results_df):
    """
    Runs the full pipeline on the raw trip records and builds the 'predictions' data frame.

    The steps are applied in order: trip_duration, then add_features_time, then
    calculate_confidence_interval.

    Parameters:
    - results_df: DataFrame containing raw data.

    Returns:
    - DataFrame 'predictions' with mean trip duration and margin of error.
    """
    with_durations = trip_duration(results_df)
    with_time_features = add_features_time(with_durations)
    return calculate_confidence_interval(with_time_features)

In [26]:
# Run the whole pipeline on the fetched records and show the resulting table
# (the bare last expression renders the DataFrame as the cell output).
predicted_df = generate_predictions(results_df)
predicted_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mean_trip_duration,margin_of_error
pulocationid,dolocationid,day_of_week,hour,Unnamed: 4_level_1,Unnamed: 5_level_1
1,106,1,2,43.800000,
1,107,0,21,36.066667,
1,113,0,20,39.166667,
1,113,0,21,30.083333,
1,13,0,21,25.533333,
...,...,...,...,...,...
98,98,1,1,4.566667,
99,23,1,2,9.283333,
99,44,1,1,10.816667,
99,5,0,21,6.083333,
