In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from scipy.stats import chi2_contingency
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import mutual_info_classif
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import KBinsDiscretizer
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import r2_score
import math
# Display every column when rendering DataFrames (None removes the column limit).
pd.set_option('display.max_columns', None)

In [2]:
# Read the rider table and the order table from CSV files in the working directory.
riders, train_raw = (pd.read_csv(csv_name) for csv_name in ('Riders.csv', 'Train.csv'))

In [3]:
# Print column names, non-null counts and dtypes of the order table.
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21201 entries, 0 to 21200
Data columns (total 29 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Order No                                   21201 non-null  object 
 1   User Id                                    21201 non-null  object 
 2   Vehicle Type                               21201 non-null  object 
 3   Platform Type                              21201 non-null  int64  
 4   Personal or Business                       21201 non-null  object 
 5   Placement - Day of Month                   21201 non-null  int64  
 6   Placement - Weekday (Mo = 1)               21201 non-null  int64  
 7   Placement - Time                           21201 non-null  object 
 8   Confirmation - Day of Month                21201 non-null  int64  
 9   Confirmation - Weekday (Mo = 1)            21201 non-null  int64  
 10  Confirmation - Time   

In [4]:
# Clock-time columns that arrive as 12-hour strings matching '%I:%M:%S %p'.
time_columns = [
    "Confirmation - Time",
    "Arrival at Pickup - Time",
    "Pickup - Time",
    "Arrival at Destination - Time",
]

# Replace each of those string columns with datetime.time values.
train_raw = train_raw.assign(**{
    name: pd.to_datetime(train_raw[name], format='%I:%M:%S %p').dt.time
    for name in time_columns
})

def time_to_seconds(t):
    """Return the seconds elapsed between midnight and the clock time `t`.

    `t` must expose `hour`, `minute` and `second` attributes
    (for example a `datetime.time`).
    """
    return (t.hour * 60 + t.minute) * 60 + t.second

# Add a "<column>(Seconds since midnight)" feature for every parsed time column.
for col in time_columns:
    train_raw[f"{col}(Seconds since midnight)"] = train_raw[col].apply(time_to_seconds)

# Parse the placement time once (no explicit format is passed, so pandas infers it)
# and keep its hour, which the temperature imputation groups by.
train_raw['Placement - Time'] = pd.to_datetime(train_raw['Placement - Time'])
train_raw['Placement_Hour'] = train_raw['Placement - Time'].dt.hour

# The original clock-time columns are now represented by their
# seconds-since-midnight counterparts, so drop them.
train_raw.drop(columns=time_columns, inplace=True)


In [5]:
# Mean temperature for each placement hour; used to impute missing readings.
hourly_avg_temp = train_raw.groupby('Placement_Hour')['Temperature'].mean()

# Fill every missing temperature with the mean of its placement hour (vectorised
# lookup instead of a row-by-row apply), then round to one decimal place.
train_raw['Temperature'] = (
    train_raw['Temperature']
    .fillna(train_raw['Placement_Hour'].map(hourly_avg_temp))
    .round(1)
)

# 'Placement_Hour' was only an auxiliary column for the imputation above.
train_raw.drop('Placement_Hour', axis=1, inplace=True)

In [6]:
# Remove the precipitation column from the working frame.
train_raw = train_raw.drop(columns=['Precipitation in millimeters'])

In [7]:
# Prediction target and the identifier columns that must not be used as features.
target_column = "Arrival at Destination - Time(Seconds since midnight)"
identifier_columns = ["Order No", "User Id", "Rider Id"]

# Features are everything except the identifiers and the target itself.
X = train_raw.drop(columns=identifier_columns + [target_column])
y = train_raw[target_column]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [8]:
# Show the column labels of the order table after the preprocessing cells above.
train_raw.columns

Index(['Order No', 'User Id', 'Vehicle Type', 'Platform Type',
       'Personal or Business', 'Placement - Day of Month',
       'Placement - Weekday (Mo = 1)', 'Placement - Time',
       'Confirmation - Day of Month', 'Confirmation - Weekday (Mo = 1)',
       'Arrival at Pickup - Day of Month',
       'Arrival at Pickup - Weekday (Mo = 1)', 'Pickup - Day of Month',
       'Pickup - Weekday (Mo = 1)', 'Arrival at Destination - Day of Month',
       'Arrival at Destination - Weekday (Mo = 1)', 'Distance (KM)',
       'Temperature', 'Pickup Lat', 'Pickup Long', 'Destination Lat',
       'Destination Long', 'Rider Id', 'Time from Pickup to Arrival',
       'Confirmation - Time(Seconds since midnight)',
       'Arrival at Pickup - Time(Seconds since midnight)',
       'Pickup - Time(Seconds since midnight)',
       'Arrival at Destination - Time(Seconds since midnight)'],
      dtype='object')

In [9]:
# Columns fed to the model, grouped by the preprocessing they receive.
numerical_features = [
    'Pickup - Day of Month',
    'Pickup - Weekday (Mo = 1)',
    'Distance (KM)',
    'Temperature',
    'Pickup - Time(Seconds since midnight)',
]
categorical_features = ["Vehicle Type", "Platform Type", "Personal or Business"]

# Fail fast if any selected column is absent from the training frame.
available_columns = set(X_train.columns)
missing_numerical = [name for name in numerical_features if name not in available_columns]
missing_categorical = [name for name in categorical_features if name not in available_columns]

if missing_numerical or missing_categorical:
    print("Missing numerical columns:", missing_numerical)
    print("Missing categorical columns:", missing_categorical)
    raise ValueError("Some columns are missing from the dataframe")

# Standardise numeric columns; one-hot encode categorical ones and ignore
# categories at predict time that were not seen during fitting.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Random forest regressor with a fixed seed.
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Chain preprocessing and model so both are fitted and applied together.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

# Fit on the training split.
pipeline.fit(X_train, y_train)

In [10]:
# Predict on the held-out split.
y_pred = pipeline.predict(X_test)

# RMSE (square root of the mean squared error) and R^2 on the held-out split.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

for metric_label, metric_value in (("Root Mean Squared Error", rmse), ("R^2 Score", r2)):
    print(f"{metric_label}: {metric_value}")

Root Mean Squared Error: 850.5423813058522
R^2 Score: 0.9912368676235768


In [None]:
def seconds_to_time(seconds):
    """Format a seconds-since-midnight value as a zero-padded "HH:MM:SS" string.

    Each field is truncated to an integer, so fractional seconds are dropped.
    Hours are not wrapped at 24.
    """
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{int(hours):02}:{int(minutes):02}:{int(secs):02}"

# Prompt for the raw feature values of a single order.
user_input = {
    "Vehicle Type": input("Enter Vehicle Type (e.g., Car, Bike): "),
    "Platform Type": int(input("Enter Platform Type (e.g., 1, 2): ")),
    "Personal or Business": input("Enter Personal or Business: "),
    "Pickup - Day of Month": int(input("Enter Pickup - Day of Month: ")),
    "Pickup - Weekday (Mo = 1)": int(input("Enter Pickup - Weekday (Mo = 1): ")),
    "Pickup - Time": input("Enter Pickup - Time (HH:MM:SS AM/PM): "),
    "Distance (KM)": float(input("Enter Distance (KM): ")),
    "Temperature": float(input("Enter Temperature: "))
}

# Derive the seconds-since-midnight feature from the 12-hour pickup time string.
pickup_clock = pd.to_datetime(user_input["Pickup - Time"], format='%I:%M:%S %p').time()
user_input["Pickup - Time(Seconds since midnight)"] = time_to_seconds(pickup_clock)

# Single-row frame for the pipeline, without the raw time string.
user_input_df = pd.DataFrame([user_input]).drop(columns=["Pickup - Time"])

# Predict the arrival time (seconds since midnight) and render it as HH:MM:SS.
new_prediction = pipeline.predict(user_input_df)
predicted_time = seconds_to_time(new_prediction[0])

print("Predicted Arrival at Destination Time:", predicted_time)

In [13]:
import pickle

# Persist the whole fitted pipeline (preprocessing + regressor) rather than the
# bare regressor, so the loaded object can predict from the raw feature columns.
# The file name is kept exactly as it was, in case other code loads it by this name.
# NOTE: only unpickle this file from a trusted location; pickle.load can execute
# arbitrary code.
with open('pipepline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)