In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's whitegrid style so the charts look tidier
sns.set(style="whitegrid")

# Load the dataset
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
csv_path = r"C:\Users\thesa\Documents\Sertifikasi\uber.csv"
uber = pd.read_csv(csv_path)
uber.head()


In [None]:
# Column dtypes and non-null counts (info() prints its report directly).
uber.info()
# Only the last expression of a cell is rendered automatically, so the summary
# statistics must be displayed explicitly or they are silently discarded.
display(uber.describe())
# Number of missing values per column (rendered as the cell's output).
uber.isnull().sum()


In [None]:
# Histogram of the raw fare amounts
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(uber['fare_amount'], bins=100, color='skyblue')
ax.set_title('Distribusi Fare Amount')
ax.set_xlabel('Fare Amount')
ax.set_ylabel('Frequency')
plt.show()

# Boxplot of the same column to make the outliers visible
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(x=uber['fare_amount'], ax=ax)
ax.set_title('Boxplot Fare Amount')
plt.show()


In [None]:
# 1. Keep only strictly positive fares
uber = uber[uber['fare_amount'].gt(0)]

# 2. Keep passenger counts from 1 to 6 (both ends included)
uber = uber[uber['passenger_count'].between(1, 6)]

# 3. Drop rows that contain any missing value
uber = uber.dropna()

# 4. Keep only trips whose pickup and dropoff both lie inside the NYC bounding box
lat_min, lat_max = 40.5, 41.0
lon_min, lon_max = -74.5, -72.5

pickup_in_box = (
    uber['pickup_latitude'].between(lat_min, lat_max)
    & uber['pickup_longitude'].between(lon_min, lon_max)
)
dropoff_in_box = (
    uber['dropoff_latitude'].between(lat_min, lat_max)
    & uber['dropoff_longitude'].between(lon_min, lon_max)
)
uber = uber[pickup_in_box & dropoff_in_box]

# 5. Compute the trip distance (Haversine)
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points given in degrees.

    Arguments may be scalars or array-likes (NumPy arrays, pandas Series);
    they are combined element-wise through NumPy ufuncs.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in degrees.
    lat2, lon2 : latitude and longitude of the second point, in degrees.

    Returns
    -------
    Distance(s) in km on a sphere of radius 6371 km.
    """
    R = 6371  # Earth radius in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    # Rounding can push `a` marginally above 1 for (near-)antipodal points,
    # which would make arcsin return NaN; clamp it to the valid domain.
    a = np.minimum(a, 1.0)
    c = 2*np.arcsin(np.sqrt(a))
    return R*c

# Attach the pickup-to-dropoff distance. `assign` returns a new frame, so the
# column is not written into a filtered slice (which triggers
# SettingWithCopyWarning).
uber = uber.assign(
    distance_km=haversine(
        uber['pickup_latitude'], uber['pickup_longitude'],
        uber['dropoff_latitude'], uber['dropoff_longitude']
    )
)

# 6. Keep plausible trip lengths only (0 < distance <= 100 km)
uber = uber[(uber['distance_km'] > 0) & (uber['distance_km'] <= 100)]

uber.shape


In [None]:
# Fare distribution after cleaning
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(uber['fare_amount'], bins=50, kde=True, color='green', ax=ax)
ax.set_xlabel("Fare Amount ($)")
ax.set_title("Distribusi Fare setelah Cleaning (NYC saja)")
plt.show()

# Distance distribution after cleaning
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(uber['distance_km'], bins=50, kde=True, color='orange', ax=ax)
ax.set_xlabel("Distance (km)")
ax.set_title("Distribusi Distance setelah Cleaning (NYC saja)")
plt.show()

# Distance against fare
fig, ax = plt.subplots(figsize=(8, 4))
ax.scatter(uber['distance_km'], uber['fare_amount'], alpha=0.3, color='purple')
ax.set_title("Scatter Distance vs Fare (NYC only)")
ax.set_xlabel("Distance (km)")
ax.set_ylabel("Fare Amount ($)")
plt.show()


In [None]:
# NOTE(review): this redefines the `haversine` already defined in the cleaning
# cell and shadows it from here on; consider keeping a single definition.
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points given in degrees.

    Arguments may be scalars or array-likes (NumPy arrays, pandas Series);
    they are combined element-wise through NumPy ufuncs.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in degrees.
    lat2, lon2 : latitude and longitude of the second point, in degrees.

    Returns
    -------
    Distance(s) in km on a sphere of radius 6371 km.
    """
    R = 6371  # Earth radius in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Rounding can push `a` marginally above 1 for (near-)antipodal points,
    # which would make arcsin return NaN; clamp it to the valid domain.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# Recompute the distance column. `uber` is a filtered slice at this point, so
# use `assign` (returns a new frame) rather than item assignment, which would
# trigger SettingWithCopyWarning.
uber = uber.assign(
    distance_km=haversine(
        uber['pickup_latitude'], uber['pickup_longitude'],
        uber['dropoff_latitude'], uber['dropoff_longitude']
    )
)

# Scatter of trip distance against fare
fig, ax = plt.subplots(figsize=(8, 4))
ax.scatter(uber['distance_km'], uber['fare_amount'], alpha=0.2)
ax.set_title('Scatter Distance vs Fare')
ax.set_xlabel('Distance (km)')
ax.set_ylabel('Fare Amount')
plt.show()


In [49]:
from sklearn.model_selection import train_test_split

# Predictors and target for the fare model
feature_cols = ['distance_km', 'passenger_count']
X = uber[feature_cols]
y = uber['fare_amount']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split


In [50]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Candidate regressors, keyed by the label shown in the results table
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.01),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42, verbosity=0),
}

# One [label, R2, RMSE, MAE] row per model, in the order the models are listed
results = []
for name, model in models.items():
    # Fit on the training split, then predict the held-out split
    y_pred = model.fit(X_train, y_train).predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    results.append([name, r2, rmse, mae])

results_df = pd.DataFrame(results, columns=['Model', 'R2', 'RMSE', 'MAE'])
results_df


Unnamed: 0,Model,R2,RMSE,MAE
0,Linear Regression,0.713522,5.325843,2.351216
1,Ridge Regression,0.713522,5.325843,2.351217
2,Lasso Regression,0.713522,5.325847,2.351395
3,Random Forest,0.663827,5.769312,2.753753
4,Gradient Boosting,0.733584,5.135979,2.331306
5,XGBoost,0.728657,5.183247,2.342511
