In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer #For creating Full Pipelines

# Train and Test Models on the Training Set
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv("../data/train.csv")

In [3]:
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   id                  1458644 non-null  object 
 1   vendor_id           1458644 non-null  int64  
 2   pickup_datetime     1458644 non-null  object 
 3   dropoff_datetime    1458644 non-null  object 
 4   passenger_count     1458644 non-null  int64  
 5   pickup_longitude    1458644 non-null  float64
 6   pickup_latitude     1458644 non-null  float64
 7   dropoff_longitude   1458644 non-null  float64
 8   dropoff_latitude    1458644 non-null  float64
 9   store_and_fwd_flag  1458644 non-null  object 
 10  trip_duration       1458644 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB


In [5]:
def shuffle_and_split(data, test_ratio):
    np.random.seed(42) # set the seed for reproducibility
    shuffled_indices = np.random.permutation(len(data)) # this return the shuffled indices
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [6]:
train , test = shuffle_and_split(df, 0.4)

In [7]:
df = train.copy()

<h1>For Getting the Distance From Cordinates</h1>

In [9]:
locs = pd.DataFrame({
    'lat1':df['pickup_longitude'],
    'lon1':df['pickup_latitude'],
    'lat2':df['dropoff_longitude'],
    'lon2':df['pickup_latitude']
})

In [10]:

def haversine_vector(lat1, lon1, lat2, lon2, radius=6371):
    """
    Calculate the great-circle distance between two sets of coordinates
    using the Haversine formula (vectorized).

    Parameters
    ----------
    lat1, lon1 : array-like
        Latitudes & longitudes of the first set of points (in degrees).
    lat2, lon2 : array-like
        Latitudes & longitudes of the second set of points (in degrees).
    radius : float
        Earth radius in kilometers (default 6371 km). Use 3956 for miles.

    Returns
    -------
    numpy.ndarray
        Distances in kilometers (same shape as input arrays).
    """
    # Convert degrees to radians
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    # Compute differences
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine formula
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return radius * c


In [11]:
df['distance_km'] = haversine_vector(locs.lat1, locs.lon1, locs.lat2, locs.lon2)

In [12]:
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,distance_km
670732,id0570804,2,2016-06-15 11:28:41,2016-06-15 13:07:17,2,-73.781631,40.644848,-73.98806,40.757236,N,5916,22.953805
1116926,id1240020,1,2016-02-23 07:00:39,2016-02-23 07:12:06,1,-73.985283,40.723553,-73.975433,40.755489,N,687,1.09522
573845,id1510937,1,2016-05-05 15:51:18,2016-05-05 16:29:14,1,-73.986282,40.742828,-73.871162,40.774216,N,2276,12.800753
64951,id3909687,2,2016-01-31 19:31:24,2016-01-31 19:54:28,2,-73.950233,40.779861,-73.996193,40.726009,N,1384,5.11046
35535,id3757092,1,2016-04-23 09:22:37,2016-04-23 09:25:11,1,-73.982208,40.740139,-73.977905,40.746174,N,154,0.478469


<h1>For The Pick And Drop Time </h1>

In [14]:
def add_day_and_duration(data,pic_time):
    data[pic_time] = pd.to_datetime(data[pic_time])
    data['week_day'] = data[pic_time].dt.day_name()
    return data

In [15]:
df = add_day_and_duration(df, 'pickup_datetime')

In [16]:
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,distance_km,week_day
670732,id0570804,2,2016-06-15 11:28:41,2016-06-15 13:07:17,2,-73.781631,40.644848,-73.98806,40.757236,N,5916,22.953805,Wednesday
1116926,id1240020,1,2016-02-23 07:00:39,2016-02-23 07:12:06,1,-73.985283,40.723553,-73.975433,40.755489,N,687,1.09522,Tuesday
573845,id1510937,1,2016-05-05 15:51:18,2016-05-05 16:29:14,1,-73.986282,40.742828,-73.871162,40.774216,N,2276,12.800753,Thursday
64951,id3909687,2,2016-01-31 19:31:24,2016-01-31 19:54:28,2,-73.950233,40.779861,-73.996193,40.726009,N,1384,5.11046,Sunday
35535,id3757092,1,2016-04-23 09:22:37,2016-04-23 09:25:11,1,-73.982208,40.740139,-73.977905,40.746174,N,154,0.478469,Saturday


In [19]:
df['timestamp'] = df['pickup_datetime'].astype('int64') // 10**9  # seconds

<h1>Making a copies</h1>

In [23]:
temp = train.copy()

In [24]:
train = df.copy()

In [25]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,distance_km,week_day,timestamp
670732,id0570804,2,2016-06-15 11:28:41,2016-06-15 13:07:17,2,-73.781631,40.644848,-73.98806,40.757236,N,5916,22.953805,Wednesday,1465990121
1116926,id1240020,1,2016-02-23 07:00:39,2016-02-23 07:12:06,1,-73.985283,40.723553,-73.975433,40.755489,N,687,1.09522,Tuesday,1456210839
573845,id1510937,1,2016-05-05 15:51:18,2016-05-05 16:29:14,1,-73.986282,40.742828,-73.871162,40.774216,N,2276,12.800753,Thursday,1462463478
64951,id3909687,2,2016-01-31 19:31:24,2016-01-31 19:54:28,2,-73.950233,40.779861,-73.996193,40.726009,N,1384,5.11046,Sunday,1454268684
35535,id3757092,1,2016-04-23 09:22:37,2016-04-23 09:25:11,1,-73.982208,40.740139,-73.977905,40.746174,N,154,0.478469,Saturday,1461403357


In [26]:
df = temp.copy()

<h1>Extracting the 'id' for the ids in id column</h1>

In [28]:
train['id'] = train['id'].str.extract("(\d+)").astype(int)

  train['id'] = train['id'].str.extract("(\d+)").astype(int)


In [29]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,distance_km,week_day,timestamp
670732,570804,2,2016-06-15 11:28:41,2016-06-15 13:07:17,2,-73.781631,40.644848,-73.98806,40.757236,N,5916,22.953805,Wednesday,1465990121
1116926,1240020,1,2016-02-23 07:00:39,2016-02-23 07:12:06,1,-73.985283,40.723553,-73.975433,40.755489,N,687,1.09522,Tuesday,1456210839
573845,1510937,1,2016-05-05 15:51:18,2016-05-05 16:29:14,1,-73.986282,40.742828,-73.871162,40.774216,N,2276,12.800753,Thursday,1462463478
64951,3909687,2,2016-01-31 19:31:24,2016-01-31 19:54:28,2,-73.950233,40.779861,-73.996193,40.726009,N,1384,5.11046,Sunday,1454268684
35535,3757092,1,2016-04-23 09:22:37,2016-04-23 09:25:11,1,-73.982208,40.740139,-73.977905,40.746174,N,154,0.478469,Saturday,1461403357


In [31]:
train.drop(["pickup_datetime","dropoff_datetime", "store_and_fwd_flag" ],axis=1,inplace=True)
train.head()

Unnamed: 0,id,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,distance_km,week_day,timestamp
670732,570804,2,2,-73.781631,40.644848,-73.98806,40.757236,5916,22.953805,Wednesday,1465990121
1116926,1240020,1,1,-73.985283,40.723553,-73.975433,40.755489,687,1.09522,Tuesday,1456210839
573845,1510937,1,1,-73.986282,40.742828,-73.871162,40.774216,2276,12.800753,Thursday,1462463478
64951,3909687,2,2,-73.950233,40.779861,-73.996193,40.726009,1384,5.11046,Sunday,1454268684
35535,3757092,1,1,-73.982208,40.740139,-73.977905,40.746174,154,0.478469,Saturday,1461403357


<h1>Seperating Features and labels for Dataset</h1>

In [33]:
train_features = train.drop("trip_duration", axis=1)
train_labels = train["trip_duration"].copy()

In [34]:
train_features.head()

Unnamed: 0,id,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,distance_km,week_day,timestamp
670732,570804,2,2,-73.781631,40.644848,-73.98806,40.757236,22.953805,Wednesday,1465990121
1116926,1240020,1,1,-73.985283,40.723553,-73.975433,40.755489,1.09522,Tuesday,1456210839
573845,1510937,1,1,-73.986282,40.742828,-73.871162,40.774216,12.800753,Thursday,1462463478
64951,3909687,2,2,-73.950233,40.779861,-73.996193,40.726009,5.11046,Sunday,1454268684
35535,3757092,1,1,-73.982208,40.740139,-73.977905,40.746174,0.478469,Saturday,1461403357


<h1>For Handling missing data we can use imputer : but my don't have missing values</h1>

In [36]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

<h1>Handling Categorical data : in this case ists the Week days</h1>

In [38]:
train_cat = train_features[["week_day"]]
train_cat.head()

Unnamed: 0,week_day
670732,Wednesday
1116926,Tuesday
573845,Thursday
64951,Sunday
35535,Saturday


#so for handling we use sklearn preprocessing we have ordinal and one-hot encoding and 
i will use one hot encoder as its best choice

In [40]:
cat_encoder = OneHotEncoder()
train_cathot = cat_encoder.fit_transform(train_cat) # this gives a sparse matrix(efficient storage for mostly zeros)

In [41]:
cat_encoder.categories_

[array(['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday',
        'Wednesday'], dtype=object)]

<h1>Constructing Pipelines in sklearn</h1>

Not really needed as the dataset i have dont the missing values and its standardiz

In [44]:
num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("standardize", StandardScaler()),
])

In [45]:
train_num = train_features.select_dtypes(include=[np.number])

In [46]:
train_num_prepared = num_pipeline.fit_transform(train_num)
print(train_num_prepared[:2].round(2))

[[-1.24  0.93  0.26  2.89 -3.16 -0.22  0.15  4.68  1.45]
 [-0.66 -1.07 -0.51 -0.18 -0.82 -0.03  0.1  -0.37 -0.74]]


In [47]:
#To turn the result back into a DataFrame with feature names:
df_train_num_prepared = pd.DataFrame(
    train_num_prepared,
    columns=num_pipeline.get_feature_names_out(),
    index=train_num.index
)

<h1>Heres the main stuff : Creating pipelines for num and cat then combining it</h1>

In [49]:
#Seperate numerical and categorical columns 
num_attribs = train_features.drop("week_day", axis=1)
cat_attribs = train_cat

In [50]:
#PipeLInes
# NUmerical pipeline
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

#Categorical pipeline
cat_pipeline = Pipeline([
    # ("ordinal", OrdinalEncoder())  # Use this if you prefer ordinal encoding
       ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Full Pipeline 
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs.columns),
    ("cat", cat_pipeline, cat_attribs.columns)
])


# Transform the data
train_prepared = full_pipeline.fit_transform(train_features)

train_prepared.shape

(875187, 16)

<h1>Traing and Testing Models</h1>

In [52]:
#Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(train_prepared, train_labels)

In [53]:
#Decission Tree
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(train_prepared,train_labels)

In [100]:
# Predict using training data
lin_preds = lin_reg.predict(train_prepared)
tree_preds = tree_reg.predict(train_prepared)

In [102]:
# Calculate RMSE
lin_rmse = mean_squared_error(train_labels, lin_preds, squared=False)
tree_rmse = mean_squared_error(train_labels, tree_preds, squared=False)



In [None]:
print("Linear Regression RMSE:", lin_rmse)
print("Decision Tree RMSE:", tree_rmse)