Plan

1. Preprocess data

2. Find the variables that are most strongly correlated with the tip amount (`tip_amount`)

3. Use a RandomForestRegressor + RandomizedSearchCV to build a prediction model

4. Compute the feature importance of each variable once the model has finished training

5. Summarize results

# Data preprocessing

In [None]:
!unzip cleaned_data.zip

In [None]:
import pandas as pd

# Read cleaned_data/1.parquet ... cleaned_data/12.parquet, keep at most the
# first ROWS_PER_FILE rows of each file, and stack them into a single dataframe.
ROWS_PER_FILE = 1_500_000

# Collect the per-file frames first and concatenate once.  Calling pd.concat
# inside the loop copies every row accumulated so far on each iteration, so the
# early files were re-copied eleven times.
frames = [
    pd.read_parquet(f"cleaned_data/{i}.parquet").head(ROWS_PER_FILE)
    for i in range(1, 13)
]

# NOTE(review): the index labels of the individual files are kept as-is, so
# labels may repeat across files; pass ignore_index=True if unique row labels
# are ever needed.
alldata = pd.concat(frames)

# Release the per-file frames so only the combined frame stays in memory.
del frames

In [None]:
# Turn the pickup / drop-off timestamps into numeric Unix time in whole seconds
# (stored as float64, the same dtype the previous `// 1e9` produced).
#
# Casting a datetime column to int64 yields a count in the column's own
# resolution, which is nanoseconds only for datetime64[ns].  pandas >= 2 keeps
# other resolutions (e.g. microseconds read from parquet), and then dividing by
# 1e9 no longer gives seconds.  Floor-dividing the offset from the epoch by a
# one-second Timedelta gives seconds whatever the stored resolution is.
unix_epoch = pd.Timestamp("1970-01-01", tz="UTC")
one_second = pd.Timedelta(seconds=1)

for column in ("tpep_pickup_datetime", "tpep_dropoff_datetime"):
    # utc=True interprets timezone-naive values as UTC, which matches what the
    # plain int64 cast did.
    since_epoch = pd.to_datetime(alldata[column], utc=True) - unix_epoch
    alldata[column] = (since_epoch // one_second).astype("float64")

In [None]:
# Encode the store-and-forward flag as an integer: "N" becomes 0 and every
# other value becomes 1.
# NOTE: "every other value" includes missing entries, not only "Y".
flag_is_not_n = alldata["store_and_fwd_flag"].ne("N")
alldata["store_and_fwd_flag"] = flag_is_not_n.astype("int64")

In [None]:
# The frame carries two airport-fee columns that differ only in capitalisation
# ("Airport_fee" and "airport_fee").
# NOTE(review): presumably the input files disagree on the spelling, so a row
# has its fee in only one of the two columns - confirm against the raw files.
# Take the value from "Airport_fee" wherever "airport_fee" is missing *before*
# that column is discarded; previously such fees were thrown away and replaced
# by 0.  Whatever is still missing afterwards becomes 0.
alldata["airport_fee"] = (
    alldata["airport_fee"].fillna(alldata["Airport_fee"]).fillna(0.00)
)

# drop unwanted columns
# NOTE(review): total_amount presumably already contains the tip, which would
# leak the target into the features - confirm.
alldata = alldata.drop(columns=["yyyy-mm", "Airport_fee", "total_amount"])

In [None]:
# One-hot encode the nominal (unordered) categorical columns.  Each listed
# column is replaced by integer 0/1 indicator columns named "<column>_<value>".
nominal_columns = ['VendorID', 'RatecodeID', 'payment_type']
alldata = pd.get_dummies(
    alldata,
    columns=nominal_columns,
    prefix_sep='_',
    dtype=int,
)

In [None]:
# Preview the first five rows of the preprocessed frame.
alldata.head(n=5)

# Variable Correlations

In [None]:
!pip install seaborn

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Keep only the float-typed columns; integer columns (such as the one-hot
# indicator columns created with dtype=int) are not part of this view.
numericals = alldata.select_dtypes(include="float")

# Pairwise correlations between the float columns, reduced to the single
# column that relates every variable to tip_amount.
correlation_matrix = numericals.corr()
corr = correlation_matrix.loc[:, ['tip_amount']]

# One annotated heatmap cell per variable.
sns.heatmap(corr, annot=True)

# Build a Random Forest model

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import randint

# Features are every column except the target; the target is the tip amount.
X = alldata.drop("tip_amount", axis=1)
y = alldata["tip_amount"]

# Hold out 20% of the rows for the final evaluation.  The split is seeded like
# the forest and the search below; without random_state every run would draw a
# different train/test partition, so the best parameters and the reported RMSE
# could not be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Base estimator: each split considers 9 candidate features, and trees are
# built with 8 parallel jobs.
forest = RandomForestRegressor(random_state=0, max_features=9, n_jobs=8)

# hyperparameters to search for
# (scipy's randint excludes the upper bound, e.g. n_estimators is 100..249)
param_dist = {
    'n_estimators': randint(100, 250),
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': randint(2, 7),
    'min_samples_leaf': randint(1, 5)
}

# Sample 5 parameter combinations and score each with 2-fold cross-validation
# on negative mean squared error (higher is better).
random_search = RandomizedSearchCV(
    estimator=forest,
    param_distributions=param_dist,
    n_iter=5,
    cv=2,
    scoring='neg_mean_squared_error',
    verbose=1,
    random_state=0
)

# Train the model with random search and find best parameter combination
random_search.fit(X_train, y_train)
print(f'Best Parameters: {random_search.best_params_}')

In [None]:
# Take the estimator the search selected and predict tips for the held-out
# test rows.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Root mean squared error: the square root of the mean squared difference
# between the actual and the predicted tip amounts.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse:.2f}')