In [7]:
# notebooks/exploration.ipynb 
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score 
# Hardcoded paths and magic numbers (The Anti-Patterns) 
DATA_PATH = "data/green_tripdata_2023-01.parquet"
 
N_ESTIMATORS = 100 
SEED = 42 
 
print("Loading data...") 
df = pd.read_parquet(DATA_PATH) 
df = df.dropna(subset=['passenger_count', 'trip_distance', 'fare_amount', 
'tip_amount']) 
 
print("Feature Engineering...") 
df['tip_pct'] = df['tip_amount'] / (df['fare_amount'] + 0.0001) 
df['high_tip'] = (df['tip_pct'] > 0.15).astype(int) 
 
features = ['passenger_count', 'trip_distance', 'fare_amount', 
'PULocationID', 'DOLocationID'] 
target = 'high_tip' 
 
X = df[features] 
y = df[target] 
 
print("Training...") 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
random_state=SEED) 
 
rf = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=SEED) 
rf.fit(X_train, y_train) 
 
y_pred = rf.predict(X_test) 
print(f"Accuracy: {accuracy_score(y_test, y_pred)}") 

Loading data...
Feature Engineering...
Training...
Accuracy: 0.6295977461261544
