In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report

In [None]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train.head()

train["payment_type"].value_counts()
train["company"].value_counts()

Unnamed: 0,id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude
0,1,fc7e6d61ad96136f2f84e65cb578719128a76777,2019-05-10T21:45:00Z,2019-05-10T22:00:00Z,420.0,0.0,8.0,8.0,6.0,0.0,0.0,0.0,6.0,Cash,Blue Ribbon Taxi Association Inc.,41.898332,-87.620763,41.90752,-87.626659
1,2,32e3320b2ad8fc0e4c90c95d962029be24d38ca2,2019-01-16T08:00:00Z,2019-01-16T08:00:00Z,419.0,1.0,8.0,8.0,6.25,1.0,0.0,0.0,7.75,Credit Card,City Service,41.892042,-87.631864,41.895033,-87.619711
2,3,39127d25654f081bbc273d7fe29e44c140d0c210,2018-10-27T14:30:00Z,2018-10-27T14:45:00Z,1175.0,3.3,33.0,8.0,13.25,3.31,,0.0,16.56,Credit Card,Flash Cab,41.85935,-87.617358,41.892042,-87.631864
3,4,90bd7186d83d9653bd0453dd16c143941e597f6b,2019-09-25T17:30:00Z,2019-09-25T17:45:00Z,609.0,2.52,28.0,24.0,9.75,2.0,0.0,0.0,12.25,Credit Card,Sun Taxi,41.874005,-87.663518,41.901207,-87.676356
4,5,455653ef8a20a3125f30a4938ec1eedadef50ab5,2018-12-14T15:15:00Z,2018-12-14T15:30:00Z,1026.0,1.6,,,10.0,2.0,0.0,0.0,12.5,Credit Card,Flash Cab,,,,


In [38]:
# Convert trip_start_timestamp to datetime
train['start_time'] = pd.to_datetime(train['trip_start_timestamp'])

# Extract hour and minute
train['hour'] = train['start_time'].dt.hour
train['minute'] = train['start_time'].dt.minute
train['weekday'] = train['start_time'].dt.weekday  # 0=Monday, 6=Sunday

# Define rush hour based on hour and minute
train['is_rush_hour'] = (
    ((train['hour'] == 7) | (train['hour'] == 8) | ((train['hour'] == 9) & (train['minute'] == 0))) |
    ((train['hour'] == 16) | (train['hour'] == 17) | ((train['hour'] == 18) & (train['minute'] == 0)))
) & (train['weekday'] < 5)  # Only weekdays

# Define public holidays
public_holidays = [
    "2018-01-01", "2018-01-15", "2018-02-19", "2018-03-05", "2018-05-28", "2018-07-04", "2018-09-03",
    "2018-10-08", "2018-11-11", "2018-11-22", "2018-12-25", "2018-02-12", "2019-01-01", "2019-01-21",
    "2019-02-18", "2019-03-04", "2019-05-27", "2019-07-04", "2019-09-02", "2019-10-14", "2019-11-11",
    "2019-11-28", "2019-12-25", "2019-02-12"
]

# Mark holidays
train['is_holiday'] = train['start_time'].dt.date.astype(str).isin(public_holidays)

# Exclude holidays from rush hour
train['is_rush_hour'] = train['is_rush_hour'] & (~train['is_holiday'])

# Convert company to category type (optional)
train['company'] = train['company'].astype('category')
# Calculate mode latitude and longitude for each pickup community area
lat_by_pickup_area = train.groupby('taxi_id')['pickup_centroid_latitude'].transform(
    lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan
)
long_by_pickup_area = train.groupby('taxi_id')['pickup_centroid_longitude'].transform(
    lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan
)

# Calculate mode latitude and longitude for each dropoff community area
lat_by_dropoff_area = train.groupby('taxi_id')['dropoff_centroid_latitude'].transform(
    lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan
)
pickup_community_area = train.groupby('taxi_id')['pickup_community_area'].transform(
    lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan
)
dropoff_community_area = train.groupby('taxi_id')['dropoff_community_area'].transform(
    lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan
)

# Fill missing pickup latitude/longitude based on pickup community area
train['pickup_centroid_latitude'] = train['pickup_centroid_latitude'].fillna(lat_by_pickup_area)
train['pickup_centroid_longitude'] = train['pickup_centroid_longitude'].fillna(long_by_pickup_area)

# Fill missing dropoff latitude/longitude based on dropoff community area
train['dropoff_centroid_latitude'] = train['dropoff_centroid_latitude'].fillna(lat_by_dropoff_area)
train['dropoff_centroid_longitude'] = train['dropoff_centroid_longitude'].fillna(long_by_dropoff_area)


# Fill missing dropoff latitude/longitude based on dropoff community area
train['pickup_community_area'] = train['pickup_community_area'].fillna(pickup_community_area)
train['dropoff_community_area'] = train['dropoff_community_area'].fillna(dropoff_community_area)

# Fill remaining missing values with 0
columns_to_fill = [

    'fare',
    'extras',
    'trip_total',
    'trip_seconds',
    'trip_miles',
    'tolls',
    'tips'
]
train[columns_to_fill] = train[columns_to_fill].fillna(0)

In [32]:
train[train['is_rush_hour']==1].count()

id                            40028
taxi_id                       40028
trip_start_timestamp          40028
trip_end_timestamp            40025
trip_seconds                  40028
trip_miles                    40028
pickup_community_area         40028
dropoff_community_area        40028
fare                          40028
tips                          40028
tolls                         40028
extras                        40028
trip_total                    40028
payment_type                  40028
company                       40028
pickup_centroid_latitude      40028
pickup_centroid_longitude     40028
dropoff_centroid_latitude     40028
dropoff_centroid_longitude    40028
start_time                    40028
hour                          40028
minute                        40028
weekday                       40028
is_rush_hour                  40028
is_holiday                    40028
dtype: int64

In [15]:
# Calculate mode latitude and longitude for each pickup community area
lat_by_pickup_area = train.groupby('taxi_id')['pickup_centroid_latitude'].transform(
    lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan
)
long_by_pickup_area = train.groupby('taxi_id')['pickup_centroid_longitude'].transform(
    lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan
)

# Calculate mode latitude and longitude for each dropoff community area
lat_by_dropoff_area = train.groupby('taxi_id')['dropoff_centroid_latitude'].transform(
    lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan
)
long_by_dropoff_area = train.groupby('taxi_id')['dropoff_centroid_longitude'].transform(
    lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan
)

# Fill missing pickup latitude/longitude based on pickup community area
train['pickup_centroid_latitude'] = train['pickup_centroid_latitude'].fillna(lat_by_pickup_area)
train['pickup_centroid_longitude'] = train['pickup_centroid_longitude'].fillna(long_by_pickup_area)

# Fill missing dropoff latitude/longitude based on dropoff community area
train['dropoff_centroid_latitude'] = train['dropoff_centroid_latitude'].fillna(lat_by_dropoff_area)
train['dropoff_centroid_longitude'] = train['dropoff_centroid_longitude'].fillna(long_by_dropoff_area)

# Fill remaining missing values with 0
columns_to_fill = [
    'pickup_community_area',
    'dropoff_community_area',
    'pickup_centroid_latitude', 
    'pickup_centroid_longitude',
    'dropoff_centroid_latitude',
    'dropoff_centroid_longitude',
    'fare',
    'extras',
    'trip_total',
    'trip_seconds',
    'trip_miles',
    'tolls',
    'tips'
]
#train[columns_to_fill] = train[columns_to_fill].fillna(0)

In [33]:
print(train.isna().sum())


id                             0
taxi_id                        0
trip_start_timestamp           0
trip_end_timestamp            11
trip_seconds                   0
trip_miles                     0
pickup_community_area          0
dropoff_community_area         0
fare                           0
tips                           0
tolls                          0
extras                         0
trip_total                     0
payment_type                   0
company                        0
pickup_centroid_latitude       0
pickup_centroid_longitude      0
dropoff_centroid_latitude      0
dropoff_centroid_longitude     0
start_time                     0
hour                           0
minute                         0
weekday                        0
is_rush_hour                   0
is_holiday                     0
dtype: int64


In [34]:
from sklearn.model_selection import train_test_split

# Define the target variable (y)
y = train['is_rush_hour']  # Replace 'is_rush_hour' with your actual target column name

# Define features (X) as all columns except the target
X = train.drop(columns=['is_rush_hour','taxi_id','tolls','trip_start_timestamp','trip_end_timestamp','id','payment_type','company','start_time'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Print shapes to verify
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (160000, 16)
X_test shape: (40000, 16)
y_train shape: (160000,)
y_test shape: (40000,)


In [35]:
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, confusion_matrix

# Initialize the Logistic Regression model
model = LogisticRegression(random_state=42, max_iter=5000)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate Accuracy and Balanced Accuracy
accuracy = accuracy_score(y_test, y_pred)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Balanced Accuracy: {balanced_accuracy:.4f}")

# Detailed Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.7996
Balanced Accuracy: 0.5017
Classification Report:
              precision    recall  f1-score   support

       False       0.80      1.00      0.89     32017
        True       0.37      0.01      0.01      7983

    accuracy                           0.80     40000
   macro avg       0.59      0.50      0.45     40000
weighted avg       0.72      0.80      0.71     40000

Confusion Matrix:
[[31940    77]
 [ 7937    46]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [41]:
model.summary

AttributeError: 'LogisticRegression' object has no attribute 'summary'

In [29]:
from collections import Counter

y_pred = model.predict(X_test)


In [30]:
# Accuracy and Balanced Accuracy
accuracy = accuracy_score(y_test, y_pred)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Balanced Accuracy: {balanced_accuracy:.4f}")

# Detailed Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7995
Balanced Accuracy: 0.5009
Classification Report:
              precision    recall  f1-score   support

       False       0.80      1.00      0.89     32017
        True       0.32      0.00      0.01      7983

    accuracy                           0.80     40000
   macro avg       0.56      0.50      0.45     40000
weighted avg       0.70      0.80      0.71     40000



In [48]:
import statsmodels.api as sm

from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, confusion_matrix


# Add a constant to the feature set
X_train_sm = sm.add_constant(X_train)
X_test_sm = sm.add_constant(X_test)

# Fit the logistic regression model
model_sm = sm.Logit(y_train, X_train_sm)
result = model_sm.fit()

# Print the summary
print(result.summary())

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [None]:
X_train_sm = pd.get_dummies(X_train_sm, drop_first=True)
X_test_sm = pd.get_dummies(X_test_sm, drop_first=True)

# Ensure columns in train and test match
X_test_sm = X_test_sm.reindex(columns=X_train_sm.columns, fill_value=0)

X_train_sm = X_train_sm.apply(pd.to_numeric, errors='coerce')
X_test_sm = X_test_sm.apply(pd.to_numeric, errors='coerce')

const                         float64
trip_seconds                  float64
trip_miles                    float64
pickup_community_area         float64
dropoff_community_area        float64
fare                          float64
tips                          float64
extras                        float64
trip_total                    float64
pickup_centroid_latitude      float64
pickup_centroid_longitude     float64
dropoff_centroid_latitude     float64
dropoff_centroid_longitude    float64
hour                            int32
minute                          int32
weekday                         int32
is_holiday                       bool
dtype: object


In [50]:
print(X_train_sm.dtypes)


const                         float64
trip_seconds                  float64
trip_miles                    float64
pickup_community_area         float64
dropoff_community_area        float64
fare                          float64
tips                          float64
extras                        float64
trip_total                    float64
pickup_centroid_latitude      float64
pickup_centroid_longitude     float64
dropoff_centroid_latitude     float64
dropoff_centroid_longitude    float64
hour                            int32
minute                          int32
weekday                         int32
is_holiday                       bool
dtype: object
