In [37]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

In [61]:
import os
import pickle

from sklearn.metrics import mean_squared_error

In [26]:
def data_read_pre_processing(file_path):
    """Load a trip-record parquet file and prepare it for duration modelling.

    Parameters
    ----------
    file_path : str or path-like
        Location handed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The rows whose trip duration lies between 1 and 60 minutes
        (inclusive). ``lpep_pickup_datetime`` / ``lpep_dropoff_datetime`` are
        parsed as datetimes and a ``duration`` column (minutes, float) is
        added. The result is an independent copy, so callers can add columns
        to it without pandas chained-assignment warnings.
    """
    df = pd.read_parquet(file_path)

    # Make sure both timestamp columns are real datetimes before subtracting.
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])

    # Trip length in minutes. The vectorised .dt accessor always yields floats,
    # whereas a per-row apply() keeps the timedelta dtype on an empty frame and
    # makes the numeric comparison below fail.
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Keep only trips lasting between 1 and 60 minutes.
    within_range = (df['duration'] >= 1) & (df['duration'] <= 60)

    # .copy() detaches the result from the unfiltered frame.
    return df[within_range].copy()

In [27]:
# Training data: the 2021-01 green_tripdata file.
df_train = data_read_pre_processing(
    'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet'
)

# Validation data: the 2021-02 green_tripdata file.
df_val = data_read_pre_processing(
    'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet'
)

In [49]:
# Combine the pickup and drop-off location IDs into one categorical feature.
# The ID columns are cast to str first: they are not guaranteed to be strings
# when read from parquet, and adding '_' to a numeric Series raises a TypeError.
df_train['PU_DO'] = df_train['PULocationID'].astype(str) + '_' + df_train['DOLocationID'].astype(str)
df_val['PU_DO'] = df_val['PULocationID'].astype(str) + '_' + df_val['DOLocationID'].astype(str)

In [50]:
# Feature columns fed to the vectorizer.
categorical = ['PU_DO']
numerical = ['trip_distance']

# Cast the categorical columns to str in both frames so the vectorizer
# treats their values as categories rather than numbers.
df_train[categorical] = df_train[categorical].astype(str)
df_val[categorical] = df_val[categorical].astype(str)

# Export the selected columns as one dict per row, keyed by column name.
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

In [51]:
# Vectorizer that turns the per-row feature dicts into a feature matrix.
dv = DictVectorizer()

In [52]:
# Fit the vectorizer on the training records and encode them.
X_train = dv.fit_transform(train_dicts)
X_train

<73908x13221 sparse matrix of type '<class 'numpy.float64'>'
	with 147816 stored elements in Compressed Sparse Row format>

In [53]:
# Encode the validation records with the mapping learned from the training
# records (transform only -- the vectorizer is not refitted here).
X_val = dv.transform(val_dicts)
X_val

<61921x13221 sparse matrix of type '<class 'numpy.float64'>'
	with 118585 stored elements in Compressed Sparse Row format>

In [54]:
# Regression target: the trip duration column, extracted as NumPy arrays.
target = 'duration'
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()

In [55]:
# Fit a plain linear-regression baseline on the training features.
lr = LinearRegression()
lr.fit(X_train,y_train)

LinearRegression()

In [56]:
# Predict durations for the training set.
y_predict_train = lr.predict(X_train)

# Predict durations for the validation set.
y_predict_val = lr.predict(X_val)

In [57]:
# Training RMSE. Uses `y_predict_train` (the name defined by the prediction
# cell; `y_predict` is not defined anywhere in this notebook) and takes the
# square root explicitly because `squared=False` is deprecated since
# scikit-learn 1.4.
mean_squared_error(y_train, y_predict_train) ** 0.5

9.775464552087287

In [58]:
# Validation RMSE. The square root is taken explicitly because the
# `squared=False` flag is deprecated since scikit-learn 1.4.
mean_squared_error(y_val, y_predict_val) ** 0.5

7.479562160810692

In [63]:
# Persist the fitted vectorizer together with the model so both can be
# reloaded as a pair.
model_path = 'models/lin_reg.bin'

# open() does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

# Checking a different model, such as Lasso or Ridge

In [60]:
# Fit a Lasso model with a small regularisation strength (alpha) on the same
# training features.
lr_new = Lasso(alpha=0.0001)
lr_new.fit(X_train, y_train)

# Validation RMSE for the Lasso model. The square root is taken explicitly
# because the `squared=False` flag is deprecated since scikit-learn 1.4.
y_pred_new = lr_new.predict(X_val)
mean_squared_error(y_val, y_pred_new) ** 0.5

7.616617761096093