## **Data Science 112 Final Project: Data Extraction**


# Tricks for Tips: A Data-Driven Analysis of Tipping Influences
***by Amelie and Spurti***

This project explores the variables that influence tipping percentages.

This section covers the development and evaluation of our machine learning model to predict tipping behavior.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from google.colab import drive
drive.mount('/content/drive')

# Location of the dataset on the mounted Drive; change this one constant to
# point the notebook at a different file.
DATA_PATH = "/content/drive/My Drive/data.csv"

df = pd.read_csv(DATA_PATH)

# Combine the separate date and time columns into full timestamps.
# errors='coerce' turns unparseable rows into NaT instead of raising.
df['pickup_datetime'] = pd.to_datetime(
    df['pickup_date'].astype(str) + ' ' + df['pickup_time'].astype(str),
    errors='coerce'
)
df['dropoff_datetime'] = pd.to_datetime(
    df['pickup_date'].astype(str) + ' ' + df['dropoff_time'].astype(str),
    errors='coerce'
)

# The drop-off timestamp is assembled from the *pickup* date, so a trip that
# crosses midnight would appear to end before it starts. Roll those rows
# forward by one day. Comparisons involving NaT are False, so rows that failed
# to parse are left untouched.
crossed_midnight = df['dropoff_datetime'] < df['pickup_datetime']
df['dropoff_datetime'] = df['dropoff_datetime'].mask(
    crossed_midnight, df['dropoff_datetime'] + pd.Timedelta(days=1)
)

# Target: tip as a fraction of the total amount. A zero total_amount produces
# +/-inf (or NaN for 0/0); convert inf to NaN so the fillna(0) below covers
# every undefined ratio and the regressors never see an infinite target.
df['tip_pct'] = (
    (df['tip_amount'] / df['total_amount'])
    .replace([np.inf, -np.inf], np.nan)
)
df['pickup_hour'] = df['pickup_datetime'].dt.hour

# NOTE(review): total_amount is both a feature and the denominator of the
# target. If total_amount already includes the tip, this may leak target
# information into the model -- confirm against the data dictionary.
feature_cols = [
    'pickup_income',
    'dropoff_income',
    'temperature_2m',
    'pickup_hour',
    'trip_duration',
    'trip_distance',
    'total_amount',
    'congestion_surcharge'
]
X = df[feature_cols]
# Rows with an undefined tip ratio are treated as a 0% tip.
y = df['tip_pct'].fillna(0)

# --- Baseline: random forest (100 trees) behind a standard scaler ---

# Hold out 20% of the rows for a single out-of-sample check; the fixed seed
# keeps the split identical across the model cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
pipeline = make_pipeline(StandardScaler(), rf_regressor)
pipeline.fit(X_train, y_train)

# RMSE on the held-out rows.
y_pred = pipeline.predict(X_test)
holdout_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(holdout_mse)
print("Test RMSE:",  test_rmse)

# 5-fold cross-validation over the full dataset. scikit-learn reports the
# *negative* MSE for this scorer, so flip the sign before taking the root,
# then average the per-fold RMSE values.
neg_mse_per_fold = cross_val_score(
    pipeline, X, y,
    cv=5,
    scoring='neg_mean_squared_error'
)
cv_mse = -neg_mse_per_fold
cv_rmse = np.sqrt(cv_mse).mean()
print("5 fold RMSE:",  cv_rmse)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Test RMSE: 0.034554561587719475
5 fold RMSE: 0.03884298827382569


In [None]:
from sklearn.neighbors import KNeighborsRegressor

# --- Comparison model: k-nearest neighbours (k=5) on standardised features ---

# Same 80/20 split and seed as the other model cells, so the test RMSE values
# are measured on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

knn_regressor = KNeighborsRegressor(n_neighbors=5)
pipeline = make_pipeline(StandardScaler(), knn_regressor)
pipeline.fit(X_train, y_train)

# RMSE on the held-out rows.
y_pred = pipeline.predict(X_test)
holdout_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(holdout_mse)
print("Test RMSE:",  test_rmse)

# 5-fold cross-validation over the full dataset; the scorer returns negative
# MSE, so negate it, take the root per fold, and average.
neg_mse_per_fold = cross_val_score(
    pipeline, X, y,
    cv=5,
    scoring='neg_mean_squared_error'
)
cv_mse = -neg_mse_per_fold
cv_rmse = np.sqrt(cv_mse).mean()
print("5-fold CV RMSE:",  cv_rmse)


Test RMSE: 0.07858550817432462
5-fold CV RMSE: 0.07834349413600236


In [None]:
from sklearn.model_selection import GridSearchCV

# --- Hyperparameter search for the random-forest pipeline ---

# Keys use the "<step name>__<parameter>" form; make_pipeline names each step
# after its lower-cased class name. 3 x 3 x 3 = 27 candidate settings.
param_grid = {
    'randomforestregressor__n_estimators': [100, 200, 300],
    'randomforestregressor__max_depth': [None, 10, 20],
    'randomforestregressor__min_samples_split': [2, 5, 10]
}

pipeline = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(random_state=42)
)

# Search on the training split only (X_train / y_train come from the earlier
# model cells) so the held-out rows stay unseen during tuning.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2
)
grid_search.fit(X_train, y_train)

# Score the refit best estimator on the held-out rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
holdout_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(holdout_mse)

# best_score_ is the mean negative MSE across folds for the winning candidate,
# so this is the root of the mean MSE (not the mean of per-fold RMSEs).
cv_rmse = np.sqrt(-grid_search.best_score_)

print("Best Parameters:", grid_search.best_params_)
print("Test RMSE:", test_rmse)
print("CV RMSE:", cv_rmse)


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Parameters: {'randomforestregressor__max_depth': 20, 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__n_estimators': 300}
Test RMSE: 0.04306692369141193
CV RMSE: 0.04263021528459892


In [None]:
# @markdown Run this cell to download this notebook as a webpage, `_NOTEBOOK.html`.

import google, json, nbformat
x
# Get the current notebook and write it to _NOTEBOOK.ipynb
raw_notebook = google.colab._message.blocking_request("get_ipynb",
                                                      timeout_sec=30)["ipynb"]
with open("_NOTEBOOK.ipynb", "w", encoding="utf-8") as ipynb_file:
  ipynb_file.write(json.dumps(raw_notebook))

# Use nbconvert to convert .ipynb to .html.
!jupyter nbconvert --to html --log-level WARN _NOTEBOOK.ipynb

# Download the .html file.
google.colab.files.download("_NOTEBOOK.html")