In [1]:
import pandas as pd
from datetime import datetime, time

# Load your dataset
df = pd.read_csv("cab_booking_training_data.csv")

# Convert time strings
def to_time(t):
    try:
        return datetime.strptime(t.strip(), "%H:%M").time()
    except:
        return None

df['Shift_Start'] = df['Shift_Start'].apply(to_time)
df['Shift_End'] = df['Shift_End'].apply(to_time)

# Cab-required logic
def strict_cab_required(start, end):
    if not start or not end:
        return False

    night_start = time(22, 0)
    night_end = time(6, 0)

    if start >= night_start and end <= night_end:
        return True
    elif start == night_start or start == night_end:
        return True
    elif end == night_start or end == night_end:
        return True
    elif (start >= night_start or start <= night_end):
        return True
    elif (end >= night_start or end <= night_end):
        return True
    return False

# Apply logic
df['Cab_Required_Logic'] = df.apply(lambda row: strict_cab_required(row['Shift_Start'], row['Shift_End']), axis=1)

# Fix mismatches
df['Cab_Required'] = df['Cab_Required_Logic']

# Drop the helper column
df.drop(columns=['Cab_Required_Logic'], inplace=True)

# Save the corrected file
df.to_csv("cab_booking_training_data_corrected.csv", index=False)

print("✅ Cab_Required column corrected and saved to 'cab_booking_training_data_corrected.csv'")


✅ Cab_Required column corrected and saved to 'cab_booking_training_data_corrected.csv'


In [1]:
import pandas as pd
from datetime import datetime, time

# Load your dataset
df = pd.read_csv("balcab_booking_dataset.csv")

# Convert Shift_Start and Shift_End to datetime, then extract time
df['Shift_Start'] = pd.to_datetime(df['Shift_Start']).dt.time
df['Shift_End'] = pd.to_datetime(df['Shift_End']).dt.time

# Cab required logic (based on your full rules)
def strict_cab_required(start, end):
    if not start or not end:
        return False

    night_start = time(22, 0)
    night_end = time(6, 0)

    if start >= night_start and end <= night_end:
        return True
    elif start == night_start or start == night_end:
        return True
    elif end == night_start or end == night_end:
        return True
    elif (start >= night_start or start <= night_end):
        return True
    elif (end >= night_start or end <= night_end):
        return True
    return False

# Apply cab-required logic
df['Cab_Required_Logic'] = df.apply(lambda row: strict_cab_required(row['Shift_Start'], row['Shift_End']), axis=1)

# Compare against existing column
mismatch_df = df[df['Cab_Required'] != df['Cab_Required_Logic']]

# Show mismatches
print(mismatch_df[['Emp_ID', 'Shift_Start', 'Shift_End', 'Cab_Required', 'Cab_Required_Logic']])



Empty DataFrame
Columns: [Emp_ID, Shift_Start, Shift_End, Cab_Required, Cab_Required_Logic]
Index: []


  df['Shift_Start'] = pd.to_datetime(df['Shift_Start']).dt.time
  df['Shift_End'] = pd.to_datetime(df['Shift_End']).dt.time


In [15]:
df = pd.read_csv('balcab_booking_dataset.csv')
df.head(2)

Unnamed: 0,Emp_ID,Shift_Date,Shift_Start,Shift_End,Cab_Required,DayOfWeek,Booked_At,BookingTimeMin,AvgBookingTimeMin,MissedDeadline,PastBookingCount,PastLateBooking,PreviousRemainders,Risk
0,4387,12-07-2025,20:45,04:45,True,5,12-07-2025 19:09,96,96,1,15,4,5,1
1,1719,13-07-2025,17:00,01:00,True,6,13-07-2025 03:35,805,805,0,8,0,3,0


In [16]:
print(df.columns.tolist())  # View column names before

['Emp_ID', 'Shift_Date', 'Shift_Start', 'Shift_End', 'Cab_Required', 'DayOfWeek', 'Booked_At', 'BookingTimeMin', 'AvgBookingTimeMin', 'MissedDeadline', 'PastBookingCount', 'PastLateBooking', 'PreviousRemainders', 'Risk']


In [17]:
#If Columns contains leading and Trailing spaces
df.columns = df.columns.str.strip()
print(df.columns.tolist())  # View cleaned names

['Emp_ID', 'Shift_Date', 'Shift_Start', 'Shift_End', 'Cab_Required', 'DayOfWeek', 'Booked_At', 'BookingTimeMin', 'AvgBookingTimeMin', 'MissedDeadline', 'PastBookingCount', 'PastLateBooking', 'PreviousRemainders', 'Risk']


In [18]:
#Dropping Colummns as they are unusual
df_clean = df.copy()

df_clean = df_clean.drop(['Emp_ID','Shift_Date', 'Shift_Start', 'Shift_End','Booked_At'], axis=1) #not using inplace = true as it changes og dataset using a copy of it i.e df


In [19]:
df_clean.head(2)

Unnamed: 0,Cab_Required,DayOfWeek,BookingTimeMin,AvgBookingTimeMin,MissedDeadline,PastBookingCount,PastLateBooking,PreviousRemainders,Risk
0,True,5,96,96,1,15,4,5,1
1,True,6,805,805,0,8,0,3,0


In [20]:
print(df_clean["Risk"].value_counts())
print(df_clean["Cab_Required"].value_counts())


Risk
1    250
0    250
Name: count, dtype: int64
Cab_Required
True     364
False    136
Name: count, dtype: int64


In [21]:
df_clean = df_clean[df_clean['Cab_Required'] == True]


In [23]:
print(df_clean["Risk"].value_counts())


Risk
1    202
0    162
Name: count, dtype: int64


In [24]:
df_clean.head(2)

Unnamed: 0,Cab_Required,DayOfWeek,BookingTimeMin,AvgBookingTimeMin,MissedDeadline,PastBookingCount,PastLateBooking,PreviousRemainders,Risk
0,True,5,96,96,1,15,4,5,1
1,True,6,805,805,0,8,0,3,0


In [25]:
df_clean.isnull().sum()

Cab_Required          0
DayOfWeek             0
BookingTimeMin        0
AvgBookingTimeMin     0
MissedDeadline        0
PastBookingCount      0
PastLateBooking       0
PreviousRemainders    0
Risk                  0
dtype: int64

In [26]:
df_clean = df_clean.drop('Cab_Required', axis=1)


In [27]:
df_clean.head(2)

Unnamed: 0,DayOfWeek,BookingTimeMin,AvgBookingTimeMin,MissedDeadline,PastBookingCount,PastLateBooking,PreviousRemainders,Risk
0,5,96,96,1,15,4,5,1
1,6,805,805,0,8,0,3,0


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib


# --- Define Features and Target ---
X = df_clean.drop(['Risk'], axis=1)     # Features
y = df_clean['Risk']                    # Target (1 = risky, 0 = safe)

# --- Split into Train/Test Sets ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Random Forest Model ---
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# --- Evaluate Random Forest ---
rf_preds = rf_model.predict(X_test)
print("📊 Random Forest Performance:")
print(classification_report(y_test, rf_preds))
print("Accuracy:", accuracy_score(y_test, rf_preds))

# --- Save Random Forest Model ---
joblib.dump(rf_model, "rf_model.pkl")


📊 Random Forest Performance:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        30
           1       0.98      1.00      0.99        43

    accuracy                           0.99        73
   macro avg       0.99      0.98      0.99        73
weighted avg       0.99      0.99      0.99        73

Accuracy: 0.9863013698630136


['rf_model.pkl']

In [29]:
# --- XGBoost Model ---
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# --- Evaluate XGBoost ---
xgb_preds = xgb_model.predict(X_test)
print("\n📊 XGBoost Performance:")
print(classification_report(y_test, xgb_preds))
print("Accuracy:", accuracy_score(y_test, xgb_preds))

# --- Save XGBoost Model ---
joblib.dump(xgb_model, "xgb_model.pkl")


📊 XGBoost Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        43

    accuracy                           1.00        73
   macro avg       1.00      1.00      1.00        73
weighted avg       1.00      1.00      1.00        73

Accuracy: 1.0


Parameters: { "use_label_encoder" } are not used.



['xgb_model.pkl']

In [None]:
#SMOTE USe when dataset is imbalanced
# from sklearn.preprocessing import StandardScaler
# from imblearn.over_sampling import SMOTE

# # --- Step 4: Feature Scaling ---
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# # --- Step 5: Apply SMOTE on TRAINING data only ---
# smote = SMOTE(random_state=42)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# # --- Step 6: Train Models ---

# ## 🎯 Random Forest
# rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_model.fit(X_train_resampled, y_train_resampled)
# y_pred_rf = rf_model.predict(X_test_scaled)

# print("\n📊 Random Forest Performance:")
# print(classification_report(y_test, y_pred_rf))
# print("Accuracy:", accuracy_score(y_test, y_pred_rf))
# joblib.dump(rf_model, "rf_model.pkl")
