In [1]:
# Record the Python version available on the shell PATH for this run
!python -V

Python 3.9.23


In [2]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
import mlflow

# Keep run metadata in a local SQLite database file (mlflow.db)
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# NOTE(review): "eperiment" looks like a typo for "experiment". Renaming it would
# create a new experiment and leave earlier runs under the old name, so it is
# left unchanged here - confirm before fixing.
mlflow.set_experiment("nyc-taxi-eperiment")



2025/12/10 23:45:34 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/10 23:45:34 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


<Experiment: artifact_location='/workspaces/Mlops_Zoom_camp/02-Experiment Tracking/mlruns/1', creation_time=1765408879095, experiment_id='1', last_update_time=1765408879095, lifecycle_stage='active', name='nyc-taxi-eperiment', tags={}>

In [4]:
def data_read_pre_processing(file_path):
    """Load trip records from a parquet file and keep trips of 1 to 60 minutes.

    Parameters
    ----------
    file_path : str or path-like
        Local path or URL of the parquet file; passed straight to
        ``pd.read_parquet``. The data must contain the columns
        ``lpep_pickup_datetime`` and ``lpep_dropoff_datetime``.

    Returns
    -------
    pandas.DataFrame
        The input rows with an added ``duration`` column (trip length in
        minutes), restricted to ``1 <= duration <= 60``. The result is an
        independent copy, so callers can add columns to it without
        triggering ``SettingWithCopyWarning``.
    """
    # Read the data from the given path
    df = pd.read_parquet(file_path)

    # Make sure both timestamp columns are datetime64
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])

    # Trip duration in minutes, computed with the vectorised .dt accessor
    # instead of a per-row Python lambda
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Keep only trips between 1 and 60 minutes (both bounds inclusive);
    # .copy() detaches the result from the unfiltered frame
    in_range = (df['duration'] >= 1) & (df['duration'] <= 60)
    return df.loc[in_range].copy()

In [5]:
# Green-taxi trip records: January 2021 for training, February 2021 for validation
train_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet'
val_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet'

df_train = data_read_pre_processing(train_url)
df_val = data_read_pre_processing(val_url)

In [6]:
import sys
import sklearn

# Show which interpreter and scikit-learn version this kernel is running
print(sys.executable, sklearn.__version__, sep="\n")


/home/codespace/anaconda3/envs/exp-tracking-env/bin/python
1.3.0


In [7]:
# Combine pickup and drop-off location IDs into a single "<PU>_<DO>" feature
for trips in (df_train, df_val):
    trips['PU_DO'] = trips['PULocationID'].astype(str) + '_' + trips['DOLocationID'].astype(str)


In [8]:
# Feature columns: the combined pickup/drop-off pair plus the trip distance
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

# Cast the categorical columns to str in both frames
for trips in (df_train, df_val):
    trips[categorical] = trips[categorical].astype(str)

# One dict per row, ready for DictVectorizer
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

In [9]:
# Vectorizer that turns the per-row feature dicts into a feature matrix
dv = DictVectorizer()

In [10]:
# Fit the vectorizer on the training dicts and build the training feature matrix
X_train = dv.fit_transform(train_dicts)
X_train

<73908x13221 sparse matrix of type '<class 'numpy.float64'>'
	with 147816 stored elements in Compressed Sparse Row format>

In [11]:
# Build the validation feature matrix with the already-fitted vectorizer
# (transform only, no refit)
X_val = dv.transform(val_dicts)
X_val

<61921x13221 sparse matrix of type '<class 'numpy.float64'>'
	with 118585 stored elements in Compressed Sparse Row format>

In [12]:
# Regression target: the trip duration column, as arrays for train and validation
target = 'duration'
y_train, y_val = df_train[target].values, df_val[target].values

In [13]:
# Baseline model: linear regression fitted on the training features
lr = LinearRegression()
lr.fit(X_train,y_train)

In [14]:
# Predict durations for the training set
y_predict_train = lr.predict(X_train)

# Predict durations for the validation set
y_predict_val = lr.predict(X_val)

In [15]:
# Inspect the installed mean_squared_error signature (e.g. to see whether the
# `squared` keyword is available in this scikit-learn version)
import inspect
inspect.signature(mean_squared_error)


<Signature (y_true, y_pred, *, sample_weight=None, multioutput='uniform_average', squared=True)>

In [16]:
# Training RMSE: square root of the mean squared error on the training set
mse_train = mean_squared_error(y_train, y_predict_train)
rmse = np.sqrt(mse_train)
rmse

5.699564118198979

In [17]:
# Validation RMSE, computed as sqrt(MSE). The `squared=False` keyword is
# deprecated in newer scikit-learn releases, and taking the square root
# explicitly gives the same value on every version.
rmse_val = np.sqrt(mean_squared_error(y_val, y_predict_val))
rmse_val

7.758715209663881

In [18]:
# Save the fitted vectorizer together with the model so both can be reloaded
# as a pair. Create the target directory first: open() does not create
# missing parent directories.
model_path = Path('models/lin_reg.bin')
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f_out:
    pickle.dump((dv, lr), f_out)

# Trying a different model: Lasso (Ridge is another option)

In [20]:
# Train a Lasso model and record its parameters and validation RMSE as one MLflow run
with mlflow.start_run():

    mlflow.set_tag("developer", "Tej")

    # Data provenance for this run
    mlflow.log_param("train-data-url", "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet")
    mlflow.log_param("validation-data-url", "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet")

    # Regularisation strength, logged so runs with different values can be compared
    alpha = 0.01
    mlflow.log_param("alpha", alpha)
    lr_new = Lasso(alpha=alpha)
    lr_new.fit(X_train, y_train)

    y_pred_new = lr_new.predict(X_val)
    # sqrt(MSE) instead of the deprecated `squared=False` keyword; same value.
    # Note: this rebinds the notebook-level name `rmse` to the validation RMSE.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred_new))
    mlflow.log_metric("rmse", rmse)

11.167275941179728