In [1]:
!python --version

Python 3.9.0


In [1]:
# import required libraries
import pandas as pd
import numpy as np

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

# import the linear model and the feature extraction
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer

# metrics
from sklearn.metrics import mean_squared_error

In [2]:
#load the dataset
train = pd.read_parquet('./Data/fhv_tripdata_2021-01.parquet')
test = pd.read_parquet('./Data/fhv_tripdata_2021-02.parquet')

In [3]:
# display five 5 instances of the January/train data
train.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1154112 entries, 0 to 1154111
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   dispatching_base_num    1154112 non-null  object        
 1   pickup_datetime         1154112 non-null  datetime64[ns]
 2   dropOff_datetime        1154112 non-null  datetime64[ns]
 3   PUlocationID            195845 non-null   float64       
 4   DOlocationID            991892 non-null   float64       
 5   SR_Flag                 0 non-null        object        
 6   Affiliated_base_number  1153227 non-null  object        
dtypes: datetime64[ns](2), float64(2), object(3)
memory usage: 61.6+ MB


In [5]:
# Q1- How many records are there in January data/ train dataframe
print(f"The size of the January/train dataset is {train.shape[0]}")

The size of the January/train dataset is 1154112


In [6]:
# Q2- Compute the duration in January
train['duration'] = train['dropOff_datetime'] - train['pickup_datetime']
test['duration'] = test['dropOff_datetime'] - test['pickup_datetime']



In [7]:
# Q2- Compute the average trip duration in January in minutes by dividing seconds by 60
train['duration'] = train['duration'].apply(lambda x: x.total_seconds()/ 60 )
test['duration'] = test['duration'].apply(lambda x: x.total_seconds()/ 60 )

print('The average trip duration in January is {}'.format(train['duration'].mean()))

The average trip duration in January is 19.167224093791006


In [8]:
# Check the distribution of duration variable in January
train.duration.describe()

count    1.154112e+06
mean     1.916722e+01
std      3.986922e+02
min      1.666667e-02
25%      7.766667e+00
50%      1.340000e+01
75%      2.228333e+01
max      4.233710e+05
Name: duration, dtype: float64

In [9]:
# The data attributes within the duration of 1 and 60 minutes inclusive
df_train = train[(train['duration'] >= 1) & (train['duration']<= 60)]
df_test = test[(test['duration'] >= 1) & (test['duration']<= 60)]

In [10]:
# Q2 - The number of records dropped
print('The number of records dropped after outlier removal for January/Train data is : {}'.format(np.abs(len(train)-len(df_train)))) 
print('The number of records dropped after outlier removal for February/Test data is : {}'.format(np.abs(len(test)-len(df_test)))) 

The number of records dropped after outlier removal for January/Train data is : 44286
The number of records dropped after outlier removal for February/Test data is : 47579


In [11]:
train.columns

Index(['dispatching_base_num', 'pickup_datetime', 'dropOff_datetime',
       'PUlocationID', 'DOlocationID', 'SR_Flag', 'Affiliated_base_number',
       'duration'],
      dtype='object')

In [12]:
# Q3 - Fractions of missing values in the data after selecting features to be used
features_to_be_used = ['PUlocationID', 'DOlocationID']

train_df = df_train[features_to_be_used]
test_df = df_test[features_to_be_used]

In [13]:
# fill the null value with -1
train_df.fillna(-1, inplace= True)
test_df.fillna(-1, inplace=True)

pickup_frac = ((train_df['PUlocationID'] == -1).sum() / len(train_df))* 100
dropp_off_frac = ((test_df['DOlocationID'] == -1).sum() / len(test_df)) * 100

# Q3 - Fractions of missing values in the data after selecting features to be used
print('The fraction of missing values for the Pickup Location ID is : {}'.format(pickup_frac))
print('The fraction of missing values for the Drop off Location ID is : {}'.format(dropp_off_frac))

The fraction of missing values for the Pickup Location ID is : 83.52732770722618
The fraction of missing values for the Drop off Location ID is : 13.610567682678642


In [30]:
# Q4 - Dimensionality of the matrix after after one-hot encoding with DictVectorizer 
train_df = train_df.astype(str)
test_df = test_df.astype(str)

# Turn the dataframe into dictionary

train_dict = train_df.to_dict(orient='records')
test_dict =  test_df.to_dict(orient = 'records')

vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_dict)
X_test = vectorizer.transform(test_dict)

# dimension of the encoded feature
print('The length of the encoded features is : {}'.format(len(vectorizer.feature_names_)))

The length of the encoded features is : 525


In [47]:
vectorizer.get_feature_names()

['DOlocationID=-1.0',
 'DOlocationID=1.0',
 'DOlocationID=10.0',
 'DOlocationID=100.0',
 'DOlocationID=101.0',
 'DOlocationID=102.0',
 'DOlocationID=105.0',
 'DOlocationID=106.0',
 'DOlocationID=107.0',
 'DOlocationID=108.0',
 'DOlocationID=109.0',
 'DOlocationID=11.0',
 'DOlocationID=111.0',
 'DOlocationID=112.0',
 'DOlocationID=113.0',
 'DOlocationID=114.0',
 'DOlocationID=115.0',
 'DOlocationID=116.0',
 'DOlocationID=117.0',
 'DOlocationID=118.0',
 'DOlocationID=119.0',
 'DOlocationID=12.0',
 'DOlocationID=120.0',
 'DOlocationID=121.0',
 'DOlocationID=122.0',
 'DOlocationID=123.0',
 'DOlocationID=124.0',
 'DOlocationID=125.0',
 'DOlocationID=126.0',
 'DOlocationID=127.0',
 'DOlocationID=128.0',
 'DOlocationID=129.0',
 'DOlocationID=13.0',
 'DOlocationID=130.0',
 'DOlocationID=131.0',
 'DOlocationID=132.0',
 'DOlocationID=133.0',
 'DOlocationID=134.0',
 'DOlocationID=135.0',
 'DOlocationID=136.0',
 'DOlocationID=137.0',
 'DOlocationID=138.0',
 'DOlocationID=139.0',
 'DOlocationID=14.

In [31]:
# Q5 - Training a model and evaluating the model with RMSE
y_train = df_train.duration.values
y_test = df_test.duration.values

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

y_pred = lin_reg.predict(X_train)
mean_squared_error(y_train, y_pred, squared=False)

10.52851938897048

In [32]:
y_pred[:10]

array([23.05208364, 23.05208364, 14.10568643, 13.81722016, 14.87819181,
       17.84736266, 15.23307303, 16.62515383, 15.23307303, 14.10568643])

In [35]:
import pickle
with open('models/model.bin', 'wb') as f_out:
    pickle.dump((vectorizer, lin_reg), f_out)

In [33]:
test_pred = lin_reg.predict(X_test)
mean_squared_error(y_test, test_pred, squared= False)

11.014286453780494

In [34]:
test_pred[:10]

array([14.53936989, 13.74013328, 15.59373566, 15.18800233, 13.81722016,
       15.75754193, 14.10568643, 12.91114435, 23.46588943, 15.10486697])

In [36]:
with open('models/model.bin', 'rb') as f_out:
    dv, lr = pickle.load(f_out)

In [41]:
lr.coef_

array([  5.35189041,  15.62854163,  -1.02816356,  10.20091647,
         7.11656881,  -5.16101194,  -0.16221905,   0.92809564,
         5.28882648,   0.83382825,   7.02513643,  -1.69004394,
        -4.6146448 ,   3.88844262,   4.15166427,   2.42216982,
         2.33609202,  -3.70899963,  16.47641852,   4.98136639,
        -4.00717141,   4.93031765,  -5.04369994,  -2.53930563,
         0.20976645,  -0.45178344,  -4.77227554,   1.75082417,
        -4.55293858,  -5.58844254,  -6.23931557,  -4.48310154,
         5.84313256,  -5.29401553,  -1.21615771,   6.44268842,
        -0.45610652,  -5.37570148,  -0.50657694,  -4.21312188,
         6.81673489,   2.58789471,   4.29999488,  -3.78999915,
         9.43596983,   6.50263751,   6.87580382,   6.36029875,
         5.11289292,   3.04582639,   2.67471937,  -3.66037544,
         2.50656804,   0.02945102,  -4.90203402,   2.24670837,
         0.29302853,  -3.85664561,  -4.59421122,   2.45064672,
         0.47982449,   6.82025484,  -3.93533875,   4.08

In [53]:
categorical = ['PUlocationID', 'DOlocationID']

def read_data(filename):
    df = pd.read_parquet(filename)
    
    df['duration'] = df.dropOff_datetime - df.pickup_datetime
    df['duration'] = df.duration.dt.total_seconds() / 60

    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
    
    return df

In [56]:
# for february 2021
df = read_data('./Data/fhv_tripdata_2021-02.parquet')
dicts = df[categorical].to_dict(orient='records')
X_val = vectorizer.transform(dicts)
y_pred = lin_reg.predict(X_val)
y_pred[:10]

array([21.57399889, 21.57399889, 21.57399889, 21.57399889, 21.57399889,
       21.57399889, 21.57399889, 21.57399889, 21.57399889, 21.57399889])

In [61]:
categorical = ['PUlocationID', 'DOlocationID']

def read_data(filename):
    df = pd.read_parquet(filename)
    
    df['duration'] = df.dropOff_datetime - df.pickup_datetime
    df['duration'] = df.duration.dt.total_seconds() / 60

    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
    
    return df

def prepare_dict(df):
    dicts = df[categorical].to_dict(orient='records')

    return dicts

def prepare_features(train_dicts, test_dicts):
    dv = DictVectorizer()



    X_train = dv.fit_transform(train_dicts)
    X_test = dv.transform(test_dicts)

    y_train = df_train.duration.values
    y_test = df_test.duration.values

    return X_train, X_test, y_train, y_test, dv

def apply_model(X_train, X_test, y_train, y_test):
    lr = LinearRegression()

    lr.fit(X_train, y_train)

    y_train_pred = lr.predict(X_train)
    y_test_pred = lr.predict(X_test)
    print(mean_squared_error(y_train, y_train_pred, squared=False))
    print(mean_squared_error(y_test, y_test_pred, squared=False))
    print(y_test_pred[:10])
    return lr

def main():
    train_df = read_data('./Data/fhv_tripdata_2021-01.parquet')
    test_df = read_data('./Data/fhv_tripdata_2021-02.parquet')
    train_dict = prepare_dict(train_df)
    test_dict = prepare_dict(test_df)
    X_train, X_test, y_train, y_test, dv = prepare_features(train_dict, test_dict)
    lr = apply_model(X_train, X_test, y_train, y_test)
    return lr, dv


lr, dv = main()


10.52851938897048
11.014286453780494
[14.53936989 13.74013328 15.59373566 15.18800233 13.81722016 15.75754193
 14.10568643 12.91114435 23.46588943 15.10486697]


In [65]:
with open('models/regressor.bin', 'wb') as f_out:
    pickle.dump((lr, dv), f_out)

In [70]:
def test_pred():
    with open('models/regressor.bin', 'rb') as f_in:
        lr, dv = pickle.load(f_in)
    test_data = read_data('./Data/fhv_tripdata_2021-02.parquet')
    test_dict = prepare_dict(test_data)
    X_val = dv.transform(test_dict)
    y_pred = lr.predict(X_val)
    print(y_pred[:10])
    print(y_pred.mean())

    return 
    
test_pred()

    

[14.53936989 13.74013328 15.59373566 15.18800233 13.81722016 15.75754193
 14.10568643 12.91114435 23.46588943 15.10486697]
16.1916970458043
