In [1]:
import pandas as pd

# Load the data
file_path = r'E:\Data_Practice\Taxi.csv'
df = pd.read_csv(file_path)

# Display the first few rows
print(df.head())


   event_date  hour  minute  community_id      community_name  picks  drops
0  2024-05-06    14      16         318.0           AL KARAMA    222    222
1  2024-05-06    23      22         922.0        AL YALAYIS 2     24      6
2  2024-05-22    11      46         369.0  AL QOUZ IND.FOURTH      1      0
3  2024-05-13     9      50         119.0            AL RIGGA      1      1
4  2024-05-07    12      46         264.0   MUHAISANAH SECOND      1      0


In [2]:
# Convert event_date, hour, and minute to a datetime column
df['datetime'] = pd.to_datetime(df['event_date'] + ' ' + df['hour'].astype(str) + ':' + df['minute'].astype(str))

# Drop the original date, hour, and minute columns (if no longer needed)
df.drop(['event_date', 'hour', 'minute'], axis=1, inplace=True)

# Handling missing values if any
df.fillna(0, inplace=True)

# Check the data types and ensure everything is correct
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3236905 entries, 0 to 3236904
Data columns (total 5 columns):
 #   Column          Dtype         
---  ------          -----         
 0   community_id    float64       
 1   community_name  object        
 2   picks           int64         
 3   drops           int64         
 4   datetime        datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 123.5+ MB
None


In [3]:
# Feature engineering: Create time-based features
df['hour'] = df['datetime'].dt.hour
df['day_of_week'] = df['datetime'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)

# Create lag features (for predicting pickups based on previous minutes)
df['lag_1'] = df['picks'].shift(1)
df['lag_60'] = df['picks'].shift(60)  # Lag by 60 minutes

# Drop NA values created by lag features
df.dropna(inplace=True)

# Check the updated dataframe
print(df.head())


    community_id                     community_name  picks  drops  \
60         115.0                          AL SABKHA      0      2   
61         412.0                         AL KHEERAN      2      1   
62         352.0                      JUMEIRA THIRD      0      1   
63         324.0                           AL KIFAF      2      0   
64         631.0  HADAEQ SHEIKH MOHAMMED BIN RASHID     43     25   

              datetime  hour  day_of_week  is_weekend  lag_1  lag_60  
60 2024-05-02 18:21:00    18            3           0   11.0   222.0  
61 2024-05-09 19:35:00    19            3           0    0.0    24.0  
62 2024-05-11 00:36:00     0            5           1    2.0     1.0  
63 2024-05-30 07:27:00     7            3           0    0.0     1.0  
64 2024-05-06 06:20:00     6            0           0    2.0     1.0  


In [4]:
from sklearn.model_selection import train_test_split

# Define your features (X) and target (y)
X = df[['hour', 'day_of_week', 'is_weekend', 'lag_1', 'lag_60']]
y = df['picks']

# Split the data into training and testing sets (train on past data, test on the current day)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Check the shapes of the train/test sets
print(X_train.shape, X_test.shape)


(2589476, 5) (647369, 5)


In [7]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [8]:
import xgboost as xgb

# Initialize the model
model = xgb.XGBRegressor()

# Train the model
model.fit(X_train, y_train)

# Predict the pickups for the current day
predictions = model.predict(X_test)

# Display the predictions
print(predictions)


[2.8448215 2.5210261 6.1964736 ... 3.170137  2.7183354 1.9609233]


In [10]:
from sklearn.metrics import root_mean_squared_error
# Calculate and print the evaluation metrics
mae = mean_absolute_error(y_test, predictions)
rmse = root_mean_squared_error(y_test, predictions)

print(f"MAE: {mae}, RMSE: {rmse}")


MAE: 7.395148410266548, RMSE: 27.65664760648458


In [12]:
import pandas as pd

# Assuming you have the original dataframe 'df' which includes community info
# We will merge the community information with the predictions

# Recreate the 'X_test' to include community information (like community_name)
X_test['community_id'] = df['community_id'].loc[y_test.index]
X_test['community_name'] = df['community_name'].loc[y_test.index]

# Add actual and predicted values to the test DataFrame
X_test['actual_picks'] = y_test
X_test['predicted_picks'] = predictions

# Save the DataFrame to CSV
save_path = r'E:\Data_Practice\Taxi_Predictions.csv'
X_test.to_csv(save_path, index=False)

print(f"Predictions saved to {save_path}")


Predictions saved to E:\Data_Practice\Taxi_Predictions.csv
