In [None]:
# from colabcode import ColabCode
# SECURITY: do not commit a real password; read it from the environment instead.
# ColabCode(port=10000, password=os.environ['COLABCODE_PASSWORD'])

Use 1% of the data to speed up the initial analysis.

In [None]:
import pandas as pd
import random

url = 'https://raw.githubusercontent.com/PrinceKex/fare-prediction/refs/heads/main/taxifare.csv'

selected_cols = 'fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count'.split(',')

# Compact dtypes for the numeric columns (float32/uint8 instead of the 64-bit defaults).
# 'pickup_datetime' is deliberately absent: it holds timestamps and is converted
# through `parse_dates` in read_csv, so a float32 dtype is wrong for it.
dtypes = {
  'fare_amount': 'float32',
  "pickup_longitude": 'float32',
  'pickup_latitude': 'float32',
  'dropoff_longitude': 'float32',
  'dropoff_latitude': 'float32',
  'passenger_count': 'uint8'
}

# Row filter meant for read_csv's `skiprows`: keeps the header plus a random ~1% of rows.
def skip_row(row_idx):
  """Return True when row `row_idx` should be skipped.

  Row 0 (the header) is never skipped. Any other row is skipped whenever a
  fresh `random.random()` draw is greater than 0.01.
  """
  is_header = row_idx == 0
  return (not is_header) and random.random() > 0.01
# Seed the sampler before reading so the same ~1% of rows is selected on every run.
random.seed(42)

# `skiprows=skip_row` is what applies the 1% sampling; without it the whole
# file is loaded and the sampler defined above is never used.
df = pd.read_csv(url, usecols=selected_cols, parse_dates=['pickup_datetime'], dtype=dtypes, skiprows=skip_row)
df.shape

In [None]:
df.head()

In [None]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_df.info()

2. Explore the Dataset

a. Basic info about the training set

b. Basic info about the test set

c. Exploratory data analysis and 
 
d. visualization

In [None]:
test_df.info()

In [None]:
train_df.describe()


In [None]:
train_df['pickup_datetime'].max()


In [None]:
train_df['pickup_datetime'].min()

In [None]:
test_df.describe()


In [None]:
test_df['pickup_datetime'].max()


In [None]:
test_df['pickup_datetime'].min()

Exploratory Data Analysis and Visualization
Answer these questions about the data:
1. What is the busiest day of the week?
2. What is the busiest time of the day?
3. In which month are the fares the highest?
4. Which pickup locations have the highest fares?
5. Which dropoff locations have the highest fares?
6. What is the average ride distance?

3. Prepare Dataset for Training

a. split the training and validation set

b. fill/remove missing values

c. extract inputs and outputs

In [None]:
train_df, val_df = train_test_split(train_df, test_size=0.15, random_state=42)
train_df.info()


In [None]:
# Drop rows with missing values from every split. The test split is cleaned as
# well so that prediction cells never receive NaN inputs.
train_df = train_df.dropna()
val_df = val_df.dropna()
test_df = test_df.dropna()

In [None]:
train_df.columns

input_cols = ['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','passenger_count']
target_col = 'fare_amount'

train_inputs = train_df[input_cols]
train_targets = train_df[target_col]

In [None]:
train_inputs.info()

In [None]:
train_targets.info()

In [None]:
val_inputs = val_df[input_cols]
val_targets = val_df[target_col]

In [None]:
val_inputs.info()

In [None]:
val_targets.info()

4. Train hardcoded and Baseline Models

In [None]:
import numpy as np

In [None]:
class MeanRegressor:
  """Baseline model that predicts the mean training target for every input row."""

  def fit(self, inputs, targets):
    """Store the mean of `targets`; `inputs` is accepted but not used.

    Returns `self` so calls can be chained, e.g.
    `MeanRegressor().fit(X, y).predict(X)`.
    """
    self.mean = targets.mean()
    return self

  def predict(self, inputs):
    """Return an array holding the stored mean once per row of `inputs`."""
    return np.full(inputs.shape[0], self.mean)

In [None]:
mean_model = MeanRegressor()
mean_model.fit(train_inputs,  train_targets)
mean_model.mean

In [None]:
train_preds = mean_model.predict(train_inputs)
train_preds

In [None]:
val_preds = mean_model.predict(val_inputs)
val_preds

In [None]:
from sklearn.metrics import root_mean_squared_error

def rmse( targets, preds):
  """Return the root mean squared error between `targets` and `preds`.

  Thin wrapper around scikit-learn's `root_mean_squared_error`, imported in this cell.
  """
  return root_mean_squared_error(targets, preds)

train_rmse = rmse(train_targets, train_preds)
train_rmse

In [None]:
val_rmse = rmse(val_targets, val_preds)
val_rmse

Train a Linear Regression Model on the data

In [None]:
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
linear_model.fit(train_inputs, train_targets)

train_preds = linear_model.predict(train_inputs)
train_preds

In [None]:
train_targets

In [None]:
train_rmse = rmse(train_targets, train_preds)
train_rmse

In [None]:
# Recompute validation predictions with the linear model; `val_preds` would
# otherwise still hold the MeanRegressor output from the baseline cell.
val_preds = linear_model.predict(val_inputs)
val_rmse = rmse(val_targets, val_preds)
val_rmse

In [None]:
# `test_inputs` has not been built yet at this point in the notebook, so derive
# it here. Rows with missing values are dropped because LinearRegression rejects NaN.
test_inputs = test_df.dropna()[input_cols]
test_preds = linear_model.predict(test_inputs)

Create the sample submission file and submit to kaggle

In [None]:
def predict_and_submit(model, test_inputs, fname):
  """Predict fares for `test_inputs` and write them into a submission CSV.

  Reads 'sample_submission.csv' from the working directory as the template,
  replaces its 'fare_amount' column with the model's predictions, saves the
  result to `fname` without the index, and returns the DataFrame.
  `test_inputs` must have exactly as many rows as the template.
  """
  test_preds = model.predict(test_inputs)
  sub_df = pd.read_csv('sample_submission.csv')
  sub_df['fare_amount'] = test_preds
  sub_df.to_csv(fname, index=False)
  return sub_df

# The call previously omitted the `test_inputs` argument, which raised a TypeError.
predict_and_submit(linear_model, test_df[input_cols], 'linear_model_submission')

Feature Engineering.

a. extract part of the date

b. remove outliers and invalid data

c. add distance between pickups and drop

d. add distance from landmarks

In [None]:
def add_dateparts(df, col):
  """Add year, month, day, weekday and hour columns derived from datetime column `col`.

  The new columns are named `<col>_<part>` and are written to `df` in place;
  nothing is returned.
  """
  stamps = df[col].dt
  for part in ('year', 'month', 'day', 'weekday', 'hour'):
    df[f'{col}_{part}'] = getattr(stamps, part)

add_dateparts(train_df, 'pickup_datetime')
train_df.info()

In [None]:
add_dateparts(val_df, 'pickup_datetime')
val_df.info()

In [None]:
add_dateparts(test_df, 'pickup_datetime')
test_df.info()


Add distance between pickups and dropoffs

In [None]:

def hoversine_np(lon1, lat1, lon2, lat2):
  """Great-circle (haversine) distance in kilometres between two points.

  Arguments are in decimal degrees, in the order every caller in this notebook
  passes them: (lon1, lat1, lon2, lat2). Each may be a scalar or an array-like;
  array-likes must be broadcastable against each other.

  Fixes over the previous version:
  * the parameters were declared as (lat1, lat2, len1, len2), so positional
    callers had longitudes and latitudes bound to the wrong names;
  * the two haversine terms were multiplied instead of added.
  """
  lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

  dlon = lon2 - lon1
  dlat = lat2 - lat1

  # haversine: a = sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
  a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2

  c = 2 * np.arcsin(np.sqrt(a))
  km = 6367 * c  # 6367 km is the Earth radius the notebook already used
  return km

def add_trip_distance(df):
  """Add a 'trip_distance' column to `df` in place.

  The value is `hoversine_np` applied to the pickup and dropoff coordinate columns.
  """
  pickup = df['pickup_longitude'], df['pickup_latitude']
  dropoff = df['dropoff_longitude'], df['dropoff_latitude']
  df['trip_distance'] = hoversine_np(*pickup, *dropoff)


add_trip_distance(train_df)

In [None]:
add_trip_distance(val_df)

In [None]:
add_trip_distance(test_df)

Add distance from popular landmarks: airports, Times Square, Museum, World Trade Center

In [None]:
# Landmark coordinates as (longitude, latitude) in decimal degrees.
jfk_lonlat = -73.7781, 40.6413
lga_lonlat = -73.8740, 40.7769
ewr_lonlat = -74.1745, 40.6895
# Metropolitan Museum of Art. The longitude was previously -74.9612, which is
# more than 80 km west of Manhattan.
met_lonlat = -73.9632, 40.7794
wtc_lonlat = -74.0039, 40.7129

def add_landmark_dropoff_distance(df, landmark_name, landmark_lonlat):
  """Add `<landmark_name>_drop_distance` to `df` in place.

  The value is `hoversine_np` from the landmark to each row's dropoff point.
  """
  lon, lat = landmark_lonlat
  # The underscore separator matters: the modelling cells select columns such as 'jfk_drop_distance'.
  df[landmark_name + '_drop_distance'] = hoversine_np(lon, lat, df['dropoff_longitude'], df['dropoff_latitude'])

def add_landmarks(a_df):
  """Add a dropoff-distance column to `a_df` for each landmark in the list below."""
  # 'met' is included because 'met_drop_distance' is one of the model input columns.
  landmarks = [('jfk', jfk_lonlat), ('lga', lga_lonlat), ('ewr', ewr_lonlat), ('met', met_lonlat), ('wtc', wtc_lonlat)]
  for name, lonlat in landmarks:
    add_landmark_dropoff_distance(a_df, name, lonlat)

add_landmarks(train_df)

In [None]:
add_landmarks(test_df)

In [None]:
add_landmarks(val_df)

Remove Outliers and Invalid Data

In [None]:
def remove_outliers(df):
  """Return the rows of `df` whose fare, coordinates and passenger count are in range.

  Kept ranges (all inclusive): fare_amount 1 to 500, longitudes -75 to -72,
  latitudes 40 to 42, passenger_count 1 to 6.
  """
  # NOTE(review): the original one-line filter also tested pickup_latitude >= -75,
  # which is implied by pickup_latitude >= 40 and is therefore not repeated here.
  bounds = {
    'fare_amount': (1., 500.),
    'pickup_longitude': (-75, -72),
    'dropoff_longitude': (-75, -72),
    'pickup_latitude': (40, 42),
    'dropoff_latitude': (40, 42),
    'passenger_count': (1, 6),
  }
  masks = [df[column].between(low, high) for column, (low, high) in bounds.items()]
  keep = masks[0]
  for mask in masks[1:]:
    keep = keep & mask
  return df[keep]

train_df = remove_outliers(train_df)

In [None]:
test_df = remove_outliers(test_df)

In [None]:
val_df = remove_outliers(val_df)

Scaling and One Hot Encoding.
Try scaling numeric columns to the (0, 1) range and encoding categorical columns using a one hot encoder.
Not done because we are training tree-based models, which are generally effective without scaling or one-hot encoding.

Save Intermediate DataFrames
save the dataframes in the parquet format so it can be reloaded when needed to continue the ML

In [None]:
train_df.to_parquet('train.parquet')
test_df.to_parquet('test.parquet')
val_df.to_parquet('val.parquet')

Train and Evaluate different models
1. Ridge Regression
2. Random Forest
3. Gradient Boosting
4. Lasso
5. SVM
6. KNN
7. Decision Tree Models

In [None]:
input_cols = ['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','passenger_count', 'pickup_datetime_year', 'pickup_datetime_month', 'pickup_datetime_day', 'pickup_datetime_weekday', 'pickup_datetime_hour', 'trip_distance', 'jfk_drop_distance', 'lga_drop_distance', 'ewr_drop_distance', 'met_drop_distance', 'wtc_drop_distance' ]
target_col = 'fare_amount'

train_inputs = train_df[input_cols]
train_targets = train_df[target_col]

val_inputs = val_df[input_cols]
val_targets = val_df[target_col]

test_inputs = test_df[input_cols]


Define helper function to evaluate models

In [None]:
def evaluation(model):
  """Score `model` on the notebook's training and validation splits.

  Returns a 4-tuple: (train RMSE, validation RMSE, training predictions,
  validation predictions). Reads the notebook-level `train_inputs`,
  `train_targets`, `val_inputs` and `val_targets`.
  """
  fitted_train, fitted_val = model.predict(train_inputs), model.predict(val_inputs)
  return (
    rmse(train_targets, fitted_train),
    rmse(val_targets, fitted_val),
    fitted_train,
    fitted_val,
  )

Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

modelI = Ridge(random_state=42, alpha=0.9)

modelI.fit(train_inputs, train_targets)

evaluation(modelI)

Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
 
# Bind the forest to `modelII`, the name the fit/evaluation lines below use
# (it was previously assigned to `linear_modelII`, causing a NameError).
modelII = RandomForestRegressor(random_state=42, n_jobs=-1, max_depth=10, n_estimators=100)

modelII.fit(train_inputs, train_targets)

evaluation(modelII)

Gradient Boosting 

In [None]:
from xgboost import XGBRegressor

# XGBoost's squared-error objective is spelled 'reg:squarederror' (no underscore
# between "squared" and "error"); 'reg:squared_error' is not a known objective.
modelIII = XGBRegressor(max_depth=5, objective='reg:squarederror', n_estimators=200, random_state=42, n_jobs=-1)

modelIII.fit(train_inputs, train_targets)

evaluation(modelIII)

Tuning Hyperparameters

In [None]:
import matplotlib.pyplot as plt

def test_params(ModelClass, **params):
  """Fit `ModelClass(**params)` on the training split and return (train RMSE, validation RMSE).

  Uses the notebook-level `train_inputs`/`train_targets` and `val_inputs`/`val_targets`.
  """
  model = ModelClass(**params).fit(train_inputs, train_targets)
  train_rmse = rmse(train_targets, model.predict(train_inputs))
  # Validation predictions are scored against the validation targets
  # (they were previously compared with `train_targets`).
  val_rmse = rmse(val_targets, model.predict(val_inputs))
  return train_rmse, val_rmse

def test_params_and_plot(ModelClass, param_name, param_value, **other_params):
  """Sweep one hyperparameter and plot training vs. validation RMSE.

  `param_value` is the iterable of values tried for `param_name`; any extra
  keyword arguments are passed unchanged to every model that is trained.
  """
  train_errors, val_errors = [], []
  for value in param_value:
    # Copy the shared settings so each run gets its own parameter dict.
    params = dict(other_params)
    params[param_name] = value
    train_rmse, val_rmse = test_params(ModelClass, **params)
    train_errors.append(train_rmse)
    val_errors.append(val_rmse)

  plt.figure(figsize=(10, 6))
  plt.title('Overfitting curve: ' + param_name)
  # 'b-o' / 'r-o': solid line with circle markers ('-0' is not a valid matplotlib format string).
  plt.plot(param_value, train_errors, 'b-o')
  plt.plot(param_value, val_errors, 'r-o')
  plt.xlabel(param_name)
  plt.ylabel('RMSE')
  plt.legend(['Training', 'Validation'])

In [None]:
best_params = {
  'random_state': 42,
  'n_jobs': -1,
  'objective': 'reg:squarederror'
}

# XGBRegressor's tree-count parameter is `n_estimators`; 'num_estimators' is not
# a parameter it recognises, so the sweep would not vary the number of trees.
test_params_and_plot(XGBRegressor, 'n_estimators', [100, 200, 400], **best_params)

In [None]:
xgb_model_final = XGBRegressor(objective='reg:squarederror', n_jobs=-1, random_state=42, n_estimators=500, learning_rate=0.08, subsample=0.7, colsample_bytree=0.8)

In [None]:
xgb_model_final.fit(train_inputs, train_targets)
evaluation(xgb_model_final)