In [4]:
import pandas as pd
import numpy as np
from random import randint, choice
import datetime

# Set the number of rows
num_rows = 100000

# Define the parameters
vehicle_types = ['Auto', 'Prime Plus', 'Prime Sedan', 'Mini', 'Bike', 'eBike', 'Prime SUV']
locations = ['Location ' + str(i) for i in range(1, 51)]  # 50 dummy locations
cancel_reasons_customer = [
    'Driver is not moving towards pickup location',
    'Driver asked to cancel',
    'AC is not working (only for 4 wheelers)',
    'Change of plans',
    'Wrong Address'
]
cancel_reasons_driver = [
    'Personal & Car related issue',
    'Customer related issue',
    'Customer was coughing/sick',
    'More than permitted people in there'
]
incomplete_reasons = ['Vehicle Breakdown', 'Other Issue']

# Generate data
# One timestamp per row, spaced one hour apart. An offset object is used
# instead of the 'H' string alias, which newer pandas releases deprecate.
dates = pd.date_range('2025-02-01', periods=num_rows, freq=pd.offsets.Hour())
times = [datetime.time(randint(0, 23), randint(0, 59)) for _ in range(num_rows)]

# Booking IDs are identifiers, so they must not repeat. Independent draws from
# a 9e9-wide range collide fairly often at 100k rows, hence the rejection loop.
_used_id_numbers = set()
booking_ids = []
while len(booking_ids) < num_rows:
    _id_number = randint(1000000000, 9999999999)
    if _id_number not in _used_id_numbers:
        _used_id_numbers.add(_id_number)
        booking_ids.append('CNR' + str(_id_number))

# Percent-style draws use randint(1, 100): exactly 100 equally likely values,
# so `<= k` is true with probability k%. (randint(0, 100) has 101 values.)
statuses = ['Success' if randint(1, 100) <= 62 else 'Failed' for _ in range(num_rows)]
customer_ids = [randint(10000, 99999) for _ in range(num_rows)]
vehicle_types_column = [choice(vehicle_types) for _ in range(num_rows)]
pickup_locations = [choice(locations) for _ in range(num_rows)]
drop_locations = [choice(locations) for _ in range(num_rows)]

# Turnaround times only exist for successful bookings; failed ones get NaN.
avg_vtat = [randint(5, 30) if status == 'Success' else np.nan for status in statuses]
avg_ctat = [randint(5, 30) if status == 'Success' else np.nan for status in statuses]

# Failure flags are plain ints (0/1) on every row. Mixing bool and int in one
# list would give an object-dtype column that serialises as True/False/0.
cancelled_by_customer = [int(randint(1, 100) <= 7) if status == 'Failed' else 0 for status in statuses]
cancel_reasons = [choice(cancel_reasons_customer) if flag > 0 else np.nan for flag in cancelled_by_customer]
cancelled_by_driver = [int(randint(1, 100) <= 18) if status == 'Failed' else 0 for status in statuses]
incomplete_rides = [int(randint(1, 100) <= 6) if status == 'Failed' else 0 for status in statuses]
incomplete_reasons_column = [choice(incomplete_reasons) if flag > 0 else np.nan for flag in incomplete_rides]

# Monetary value, distance and ratings are only populated for successful rides.
booking_value = [round(randint(100, 1000) * 1.5) if status == 'Success' else 0 for status in statuses]
# NOTE(review): distance and ratings are whole numbers. They used to be wrapped
# in round(..., 1), which does nothing to an int; if one-decimal values were
# intended, draw e.g. randint(50, 500) / 10 instead -- confirm with the author.
ride_distance = [randint(5, 50) if status == 'Success' else np.nan for status in statuses]
driver_ratings = [randint(3, 5) if status == 'Success' else np.nan for status in statuses]
customer_ratings = [randint(3, 5) if status == 'Success' else np.nan for status in statuses]

# Prepare the DataFrame
data = pd.DataFrame({
    'Date': dates,
    'Time': times,
    'Booking ID': booking_ids,
    'Booking Status': statuses,
    'Customer ID': customer_ids,
    'Vehicle Type': vehicle_types_column,
    'Pickup Location': pickup_locations,
    'Drop Location': drop_locations,
    'Avg VTAT': avg_vtat,
    'Avg CTAT': avg_ctat,
    'Canceled Rides by Customer': cancelled_by_customer,
    'Reason for canceling by Customer': cancel_reasons,
    'Canceled Rides by Driver': cancelled_by_driver,
    'Incomplete Rides': incomplete_rides,
    'Incomplete Rides Reason': incomplete_reasons_column,
    'Booking Value': booking_value,
    'Ride Distance': ride_distance,
    'Driver Ratings': driver_ratings,
    'Customer Rating': customer_ratings
})

# Adjust values for specific conditions
# 'Booking Status' is intentionally NOT re-drawn here. Every status-dependent
# column (VTAT/CTAT, booking value, distance, ratings, failure flags) was
# derived from the status list during generation, so a second random draw
# would leave e.g. 'Success' rows with NaN metrics and 'Failed' rows with ratings.
row_count = len(data)
is_failed = data['Booking Status'] == 'Failed'
is_success = data['Booking Status'] == 'Success'

# Each failed booking is independently marked as customer-cancelled with
# probability 7%. One random number is drawn PER ROW; a single scalar draw
# would be broadcast and flag either all failed rows or none of them.
customer_cancelled = is_failed & (np.random.rand(row_count) < 0.07)
data['Canceled Rides by Customer'] = customer_cancelled.astype(int)
# Keep the reason column in step with the flag: a reason exactly where flagged.
data['Reason for canceling by Customer'] = pd.Series(
    np.random.choice(cancel_reasons_customer, size=row_count), index=data.index
).where(customer_cancelled)

# Same per-row draw for driver cancellations, at 18% of failed bookings.
# NOTE(review): the two flags are drawn independently, so a failed booking can
# carry both or neither -- confirm whether they should be mutually exclusive.
driver_cancelled = is_failed & (np.random.rand(row_count) < 0.18)
data['Canceled Rides by Driver'] = driver_cancelled.astype(int)

# Weekend order values adjustment
# Only successful bookings are touched so that failed bookings keep a value of 0.
weekends = [5, 6]  # Saturday, Sunday
is_weekend = data['Date'].dt.weekday.isin(weekends)
weekend_value = np.where(data['Booking Value'] > 500, 1000, 500)
data['Booking Value'] = np.where(is_success & is_weekend, weekend_value, data['Booking Value'])

# Match day orders (assuming a list of dates for match days)
match_days = ['2025-02-04', '2025-02-07', '2025-02-14']  # Example match dates
is_match_day = data['Date'].dt.strftime('%Y-%m-%d').isin(match_days)
data['Booking Value'] = np.where(is_success & is_match_day, 1000, data['Booking Value'])

# Persist the generated dataset as CSV (row index omitted), then report completion
output_file = 'indore.csv'
data.to_csv(output_file, index=False)

print("Data Generation Complete.")


Data Generation Complete.


In [5]:
from IPython.display import display, Markdown

# Markdown source for the contact card. Each line ends with two spaces,
# which Markdown renders as a hard line break -- keep them when editing.
contact_details = """
## 📞 Contact Details  
- 🔗 **LinkedIn:** [Vinay Kumar Panika](https://www.linkedin.com/in/vinaykumarpanika)  
- 💻 **GitHub:** [Vinaypanika](https://github.com/Vinaypanika)  
- 📧 **Email:** vinaypanika@gmail.com  
- 📱 **Phone:** +91-7415552944  
"""

# Wrap the text in a Markdown object so the notebook renders it as rich output
contact_card = Markdown(contact_details)
display(contact_card)



## 📞 Contact Details  
- 🔗 **LinkedIn:** [Vinay Kumar Panika](https://www.linkedin.com/in/vinaykumarpanika)  
- 💻 **GitHub:** [Vinaypanika](https://github.com/Vinaypanika)  
- 📧 **Email:** vinaypanika@gmail.com  
- 📱 **Phone:** +91-7415552944  
