In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
#import xgboost as xgb
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, accuracy_score
import matplotlib.pyplot as plt
#import seaborn as sns

# Load data
data = pd.read_csv("CA_data.csv")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [5]:
# Check missing values as a percentage
missing_percentage = train.isna().sum() / len(train)
print(missing_percentage)

id                            0.000000
taxi_id                       0.000000
trip_start_timestamp          0.000000
trip_end_timestamp            0.000055
trip_seconds                  0.000250
trip_miles                    0.000055
pickup_community_area         0.072425
dropoff_community_area        0.097835
fare                          0.000100
tips                          0.000100
tolls                         0.050765
extras                        0.000100
trip_total                    0.000100
payment_type                  0.000000
company                       0.000000
pickup_centroid_latitude      0.072290
pickup_centroid_longitude     0.072290
dropoff_centroid_latitude     0.094375
dropoff_centroid_longitude    0.094375
dtype: float64


In [7]:
# Define public holidays
public_holidays = [
    "2018-01-01", "2018-01-15", "2018-02-19", "2018-03-05", "2018-05-28",
    "2018-07-04", "2018-09-03", "2018-10-08", "2018-11-11", "2018-11-22",
    "2018-12-25", "2018-02-12", "2019-01-01", "2019-01-21", "2019-02-18",
    "2019-03-04", "2019-05-27", "2019-07-04", "2019-09-02", "2019-10-14",
    "2019-11-11", "2019-11-28", "2019-12-25", "2019-02-12"
]

# Convert trip_start_timestamp to datetime
train['trip_start_timestamp'] = pd.to_datetime(train['trip_start_timestamp'])

# Add is_working_day
train['is_working_day'] = np.where(
    (train['trip_start_timestamp'].dt.weekday.isin(range(0, 5))) & 
    (~train['trip_start_timestamp'].dt.strftime('%Y-%m-%d').isin(public_holidays)), 
    1, 0
)

# Add is_rush_hour
train['is_rush_hour'] = np.where(
    ((train['trip_start_timestamp'].dt.hour.isin([7, 8, 9]) &
      (train['trip_start_timestamp'].dt.minute <= 59)) |
     (train['trip_start_timestamp'].dt.hour.isin([16, 17, 18]) &
      (train['trip_start_timestamp'].dt.minute <= 59))) &
    (train['is_working_day'] == 1),
    1, 0
)

In [8]:
# Fill missing values for numeric columns with mode
for col in ['trip_seconds', 'trip_miles']:
    train[col] = train[col].fillna(train[col].mode()[0])

In [9]:
# Function to get mode
def get_mode(series):
    return series.mode().iloc[0] if not series.mode().empty else np.nan

# Group by taxi_id and calculate modes
modes = train.groupby('taxi_id').agg({
    'pickup_community_area': get_mode,
    'dropoff_community_area': get_mode,
    'pickup_centroid_latitude': get_mode,
    'pickup_centroid_longitude': get_mode,
    'dropoff_centroid_latitude': get_mode,
    'dropoff_centroid_longitude': get_mode
}).reset_index()

# Merge with train
train = train.merge(modes, on='taxi_id', suffixes=('', '_mode'))

# Fill missing values with group modes
for col in [
    'pickup_community_area', 'dropoff_community_area',
    'pickup_centroid_latitude', 'pickup_centroid_longitude',
    'dropoff_centroid_latitude', 'dropoff_centroid_longitude'
]:
    train[col] = train[col].fillna(train[f"{col}_mode"]).fillna(0)

In [10]:
# Drop unnecessary columns
train = train.drop(columns=[
    'taxi_id', 'trip_start_timestamp', 'trip_end_timestamp', 
    'payment_type', 'tolls', 'id'
])

In [11]:
# Encode categorical columns
for col in train.select_dtypes('object').columns:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])

In [12]:
# Split data into training, validation, and test sets
X = train.drop(columns=['is_rush_hour'])
y = train['is_rush_hour']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=123)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=123)

In [None]:
# Convert data to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test)

# Train XGBoost model
xgb_model = xgb.train(
    {
        'max_depth': 5,
        'eta': 0.2,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'hist',
    },
    dtrain,
    num_boost_round=500,
    evals=[(dtrain, 'train'), (dval, 'val')],
    early_stopping_rounds=200
)

In [None]:
# Predict on test set
y_pred = xgb_model.predict(dtest)
y_pred_binary = (y_pred >= 0.5).astype(int)

# Confusion matrix and metrics
conf_matrix = confusion_matrix(y_test, y_pred_binary)
print("Confusion Matrix:")
print(conf_matrix)

print("Classification Report:")
print(classification_report(y_test, y_pred_binary))

# AUC Score
auc_score = roc_auc_score(y_test, y_pred)
print(f"AUC: {auc_score:.4f}")

In [None]:
# Plot density of predictions
sns.kdeplot(y_pred, label="Predicted Probabilities", fill=True)
plt.title("Density Plot of Predicted Probabilities")
plt.xlabel("Predicted Probability")
plt.ylabel("Density")
plt.show()

## TRAIN WITH LG INSTEAD

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

In [16]:
# Check missing values as a percentage
missing_percentage = train.isna().sum() / len(train)
print(missing_percentage)

trip_seconds                       0.000000
trip_miles                         0.000000
pickup_community_area              0.000000
dropoff_community_area             0.000000
fare                               0.000100
tips                               0.000100
extras                             0.000100
trip_total                         0.000100
company                            0.000000
pickup_centroid_latitude           0.000000
pickup_centroid_longitude          0.000000
dropoff_centroid_latitude          0.000000
dropoff_centroid_longitude         0.000000
is_working_day                     0.000000
is_rush_hour                       0.000000
pickup_community_area_mode         0.013535
dropoff_community_area_mode        0.013425
pickup_centroid_latitude_mode      0.013400
pickup_centroid_longitude_mode     0.013400
dropoff_centroid_latitude_mode     0.013425
dropoff_centroid_longitude_mode    0.013425
dtype: float64


In [14]:
# Define features (X) and target variable (y)
X = train.drop(columns=['is_rush_hour'])
y = train['is_rush_hour']

# Split into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=123)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=123)

In [15]:
# Initialize logistic regression model
logreg_model = LogisticRegression(random_state=42, max_iter=1000)

# Train the model
logreg_model.fit(X_train, y_train)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Predict on the validation set
y_val_pred = logreg_model.predict(X_val)
y_val_pred_prob = logreg_model.predict_proba(X_val)[:, 1]  # Predicted probabilities for the positive class

In [None]:
# Accuracy
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

# AUC (Area Under the ROC Curve)
auc = roc_auc_score(y_val, y_val_pred_prob)
print(f"Validation AUC: {auc:.4f}")

# Classification Report
print("Classification Report:")
print(classification_report(y_val, y_val_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_val, y_val_pred)
print("Confusion Matrix:")
print(conf_matrix)

In [None]:
# Predict on the test set
y_test_pred = logreg_model.predict(X_test)
y_test_pred_prob = logreg_model.predict_proba(X_test)[:, 1]

# Accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

# AUC
test_auc = roc_auc_score(y_test, y_test_pred_prob)
print(f"Test AUC: {test_auc:.4f}")

# Classification Report
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

# Confusion Matrix
conf_matrix_test = confusion_matrix(y_test, y_test_pred)
print("Test Confusion Matrix:")
print(conf_matrix_test)

In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Compute ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_test_pred_prob)

# Plot ROC curve
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {test_auc:.2f})")
plt.plot([0, 1], [0, 1], 'k--', label="Random Guess")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend()
plt.show()

In [None]:
import seaborn as sns

# Heatmap for Confusion Matrix
sns.heatmap(conf_matrix_test, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Rush', 'Rush'], yticklabels=['Non-Rush', 'Rush'])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Test Set Confusion Matrix")
plt.show()