In [1]:
!python --version

Python 3.9.0


In [2]:
# import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

# import the linear model and the feature extraction
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer

# metrics
from sklearn.metrics import mean_squared_error

In [3]:
#load the dataset
train = pd.read_parquet('./Data/fhv_tripdata_2021-01.parquet')
test = pd.read_parquet('./Data/fhv_tripdata_2021-02.parquet')

In [4]:
# display five 5 instances of the January/train data
train.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1154112 entries, 0 to 1154111
Data columns (total 8 columns):
 #   Column                  Non-Null Count    Dtype          
---  ------                  --------------    -----          
 0   dispatching_base_num    1154112 non-null  object         
 1   pickup_datetime         1154112 non-null  datetime64[ns] 
 2   dropOff_datetime        1154112 non-null  datetime64[ns] 
 3   PUlocationID            195845 non-null   float64        
 4   DOlocationID            991892 non-null   float64        
 5   SR_Flag                 0 non-null        object         
 6   Affiliated_base_number  1153227 non-null  object         
 7   duration                1154112 non-null  timedelta64[ns]
dtypes: datetime64[ns](2), float64(2), object(3), timedelta64[ns](1)
memory usage: 70.4+ MB


In [5]:
# Q1- How many records are there in January data/ train dataframe
print(f"The size of the January/train dataset is {train.shape[0]}")

The size of the January/train dataset is 1154112


In [13]:
# Q2- Compute the duration in January
train['duration'] = train['dropOff_datetime'] - train['pickup_datetime']
test['duration'] = test['dropOff_datetime'] - test['pickup_datetime']



In [14]:
# Q2- Compute the average trip duration in January in minutes by dividing seconds by 60
train['duration'] = train['duration'].apply(lambda x: x.total_seconds()/ 60 )
test['duration'] = test['duration'].apply(lambda x: x.total_seconds()/ 60 )

print('The average trip duration in January is {}'.format(train['duration'].mean()))

The average trip duration in January is 19.167224093791006


In [15]:
# Check the distribution of duration variable in January
train.duration.describe()

count    1.154112e+06
mean     1.916722e+01
std      3.986922e+02
min      1.666667e-02
25%      7.766667e+00
50%      1.340000e+01
75%      2.228333e+01
max      4.233710e+05
Name: duration, dtype: float64

In [16]:
# The data attributes within the duration of 1 and 60 minutes inclusive
df_train = train[(train['duration'] >= 1) & (train['duration']<= 60)]
df_test = test[(test['duration'] >= 1) & (test['duration']<= 60)]

In [17]:
# Q2 - The number of records dropped
print('The number of records dropped after outlier removal for January/Train data is : {}'.format(np.abs(len(train)-len(df_train)))) 
print('The number of records dropped after outlier removal for February/Test data is : {}'.format(np.abs(len(test)-len(df_test)))) 

The number of records dropped after outlier removal for January/Train data is : 44286
The number of records dropped after outlier removal for February/Test data is : 47579


In [18]:
train.columns

Index(['dispatching_base_num', 'pickup_datetime', 'dropOff_datetime',
       'PUlocationID', 'DOlocationID', 'SR_Flag', 'Affiliated_base_number',
       'duration'],
      dtype='object')

In [34]:
# Q3 - Fractions of missing values in the data after selecting features to be used
features_to_be_used = ['PUlocationID', 'DOlocationID']

train_df = df_train[features_to_be_used]
test_df = df_test[features_to_be_used]

In [35]:
# fill the null value with -1
train_df.fillna(-1, inplace= True)
test_df.fillna(-1, inplace=True)

pickup_frac = ((train_df['PUlocationID'] == -1).sum() / len(train_df))* 100
dropp_off_frac = ((test_df['DOlocationID'] == -1).sum() / len(test_df)) * 100

# Q3 - Fractions of missing values in the data after selecting features to be used
print('The fraction of missing values for the Pickup Location ID is : {}'.format(pickup_frac))
print('The fraction of missing values for the Drop off Location ID is : {}'.format(dropp_off_frac))

The fraction of missing values for the Pickup Location ID is : 83.52732770722618
The fraction of missing values for the Drop off Location ID is : 13.610567682678642


In [36]:
# Q4 - Dimensionality of the matrix after after one-hot encoding with DictVectorizer 
train_df = train_df.astype(str)
test_df = test_df.astype(str)

# Turn the dataframe into dictionary

train_dict = train_df.to_dict(orient='records')
test_dict =  test_df.to_dict(orient = 'records')

vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_dict)
X_test = vectorizer.transform(test_dict)

# dimension of the encoded feature
print('The length of the encoded features is : {}'.format(len(vectorizer.feature_names_)))

The length of the encoded features is : 525


In [37]:
# Q5 - Training a model and evaluating the model with RMSE
y_train = df_train.duration.values
y_test = df_test.duration.values

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

y_pred = lin_reg.predict(X_train)
mean_squared_error(y_train, y_pred, squared=False)

10.528519388409808

In [38]:
test_pred = lin_reg.predict(X_test)
mean_squared_error(y_test, test_pred, squared= False)

11.014287519486222