In [2]:
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction import DictVectorizer
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
train_data = pd.read_parquet(
    "data/yellow_tripdata_2023-01.parquet")

test_data = pd.read_parquet(
    "data/yellow_tripdata_2023-02.parquet")

### Q1. Downloading the data

In [8]:
## number of rows and columns in train data 
print(f"number of samples is {train_data.shape[0]}\
 and number of attributes {train_data.shape[1]}")

number of samples is 3066766 and number of attributes 19


### Computing Duration

In [9]:
trip_duration_datetime = train_data['tpep_dropoff_datetime']-train_data['tpep_pickup_datetime']

In [10]:
minute_data = []
for i in tqdm(range(train_data.shape[0])):
    minute_data.append(
        trip_duration_datetime.iloc[i].total_seconds()/60
    )

100%|██████████| 3066766/3066766 [00:46<00:00, 65357.67it/s]


In [11]:
duration_std = np.array(minute_data).std()
print(f"standard deviation of the duration is {duration_std}")

standard deviation of the duration is 42.59434429744777


### Dropping outliers

In [12]:
minute_data = np.array(minute_data)
train_data['duration'] = minute_data

In [13]:
size_before_filter = len(minute_data)
size_after_filter = len(minute_data[(minute_data>=1) & (minute_data<=60)])
print(f"data shape before {train_data.shape}")
train_data = train_data[(train_data.duration>=1) & (train_data.duration<=60)]
print(f"data shape after {train_data.shape}")


data shape before (3066766, 20)
data shape after (3009173, 20)


In [14]:
frac_left = (size_after_filter/size_before_filter)*100
print(f"fraction of the records left after dropping outliers {frac_left}")

fraction of the records left after dropping outliers 98.1220282212598


In [15]:
categorical_col =["PULocationID","DOLocationID"]
numerical_col = ['trip_distance']
train_data[categorical_col] = train_data[categorical_col].astype(str)


In [16]:
train_dicts = train_data[categorical_col + numerical_col].to_dict(orient='records')

vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_dicts)

In [17]:
print(f"one hot encoding feature {X_train.shape[1]}")

one hot encoding feature 516


In [18]:
target = 'duration'
y_train = train_data[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

In [19]:
np.sqrt(
    mean_squared_error(y_train,
                        y_pred,
                        squared=False)
)

2.7673821210203418

In [20]:
def evaluate(data:pd.DataFrame, categorical_col:list, numerical_col:list):

    trip_duration_datetime = data['tpep_dropoff_datetime']-data['tpep_pickup_datetime']
    ## remove outliers
    minute_data = []
    for i in tqdm(range(data.shape[0])):
        minute_data.append(
            trip_duration_datetime.iloc[i].total_seconds()/60
        )

    minute_data = np.array(minute_data)
    data['duration'] = minute_data

    data = data[(data.duration>=1) & (data.duration<=60)]

    data[categorical_col] = data[categorical_col].astype(str)
    
    val_dicts = data[categorical_col + numerical_col].to_dict(orient='records')

    X_valid = vectorizer.transform(val_dicts)
    target = 'duration'
    y_valid = data[target].values
    y_pred = lr.predict(X_valid)

    print(mean_squared_error(y_valid, y_pred, squared=False))

In [21]:
categorical_col =["PULocationID","DOLocationID"]
numerical_col = ['trip_distance']
evaluate(test_data,
        categorical_col=categorical_col,
        numerical_col=numerical_col
        )

100%|██████████| 2913955/2913955 [00:43<00:00, 66679.51it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_col] = data[categorical_col].astype(str)


7.819950695172521
