In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease



# Import Data

In [2]:
# Load the raw fares; the pickup timestamp is parsed so its `.dt` parts can be extracted.
taxi_raw = pd.read_csv("data/taxi_fare.csv", parse_dates=["pickup_datetime"])

# Raw coordinate columns, dropped once they have been collapsed into `distance`.
coordinate_cols = ['pickup_longitude', 'dropoff_longitude', 'pickup_latitude', 'dropoff_latitude']

df = (
    taxi_raw
    .dropna()  # discard every row that has at least one missing value
    .assign(
        year=lambda d: d.pickup_datetime.dt.year,
        month=lambda d: d.pickup_datetime.dt.month,
        day=lambda d: d.pickup_datetime.dt.dayofweek,  # day of week, not day of month
        hour=lambda d: d.pickup_datetime.dt.hour,
        # Sum of absolute longitude and latitude differences (L1 / Manhattan form),
        # expressed in the same units as the coordinate columns.
        distance=lambda d: (
            (d.pickup_longitude - d.dropoff_longitude).abs()
            + (d.pickup_latitude - d.dropoff_latitude).abs()
        ),
    )
    .drop(columns=["pickup_datetime"] + coordinate_cols)
)

df.head()

Unnamed: 0,fare_amount,passenger_count,year,month,day,hour,distance
0,4.5,1.0,2009,6,0,17,0.011742
1,16.9,1.0,2010,1,1,16,0.107481
2,5.7,2.0,2011,8,3,0,0.019212
3,7.7,1.0,2012,4,5,4,0.029386
4,5.3,1.0,2010,3,1,7,0.027194


In [3]:
# Keep only rows whose computed distance and recorded fare are both strictly positive.
has_positive_distance = df.distance.gt(0)
has_positive_fare = df.fare_amount.gt(0)
df = df.loc[has_positive_distance & has_positive_fare]

# Correlation matrix

**Pearson**
- linear relationship
- syarat: data berdistribusi normal

**Spearman**
- rank relationship
- syarat: subjek yang berbeda
- contoh: nilai yang diberikan oleh Guru A vs Guru B
- contoh: harga saham A vs harga saham B

**Kendall**
- rank relationship
- syarat: subjek yang sama
- contoh: korelasi nilai quiz vs nilai ujian untuk murid A

In [4]:
from jcopml.plot import plot_correlation_matrix

In [5]:
# Numeric columns to include in the correlation plot alongside `fare_amount`.
# NOTE(review): the second positional argument is presumably the target column —
# confirm against the jcopml `plot_correlation_matrix` signature.
correlation_features = ['passenger_count', 'year', 'distance']
plot_correlation_matrix(df, 'fare_amount', numeric_col=correlation_features)

interactive(children=(ToggleButtons(description='method', options=('spearman', 'kendall', 'pearson', 'pearson_…

# Dataset Splitting

In [6]:
# Separate the regression target from the predictor columns.
y = df["fare_amount"]
X = df.drop(columns=["fare_amount"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Display the shape of every partition as a quick sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((14705, 6), (3677, 6), (14705,), (3677,))

# Training

In [7]:
from xgboost import XGBRegressor

from sklearn.model_selection import RandomizedSearchCV
from jcopml.tuning import random_search_params as rsp



In [8]:
# Columns routed to each preprocessing branch. The order is kept as-is because it
# fixes the feature order handed to the regressor.
numeric_features = ['year', 'passenger_count', 'distance']
categoric_features = ['month', 'day', 'hour']

# Numeric columns go through jcopml's numeric pipeline; the calendar parts are
# treated as categories and one-hot encoded.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', num_pipe(), numeric_features),
        ('categoric', cat_pipe(encoder='onehot'), categoric_features),
    ]
)

# The step name 'algo' matters: the tuned parameter names printed below carry
# the `algo__` prefix.
pipeline = Pipeline(
    steps=[
        ('prep', preprocessor),
        ('algo', XGBRegressor(random_state=42)),
    ]
)

# Randomized search: 50 sampled candidates, each evaluated with 3-fold CV, run on
# all available cores. No `scoring` is passed, so the estimator's own `score` is used.
model = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=rsp.xgb_params,
    n_iter=50,
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
model.fit(X_train, y_train)

print(model.best_params_)

# Report train score, mean cross-validation score of the best candidate, and test score.
train_score = model.score(X_train, y_train)
cv_score = model.best_score_
test_score = model.score(X_test, y_test)
print(train_score, cv_score, test_score)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   22.1s finished


{'algo__colsample_bytree': 0.5261245937025092, 'algo__gamma': 1, 'algo__learning_rate': 0.08498604636076731, 'algo__max_depth': 5, 'algo__n_estimators': 122, 'algo__reg_alpha': 0.0015568103018717575, 'algo__reg_lambda': 6.2601238645330595, 'algo__subsample': 0.7433401936490238}
0.8421022883140867 0.7830748580159071 0.7953759529171327


# Masih banyak yang bisa ditingkatkan

Mari kita belajar dari peserta lain di Kaggle:

<https://www.kaggle.com/breemen/nyc-taxi-fare-data-exploration>

- Ternyata ada data yang salah (lokasinya terletak di air)
- Analisis lokasi penjemputan, apakah berada di landmark tertentu seperti airport
- dan lain-lain