In [2]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# ---------------------------------------------------------------------------
# Configuration for the synthetic ride-booking dataset
# ---------------------------------------------------------------------------

# Number of booking rows to generate.
NUM_ROWS = 100_000

CITIES = ["Indore"]

# Localities used as pickup/drop points. Order is significant for seeded runs
# because entries are selected by position in the random draw.
# NOTE(review): "Bhanwarkuan" and "Bhawarkua" look like two spellings of the
# same locality - confirm whether both are intended.
AREAS = [
    "Vijay Nagar",
    "Scheme No. 54",
    "Scheme No. 74",
    "Scheme No. 78",
    "Scheme No. 94",
    "Scheme No. 114",
    "Scheme No. 140",
    "Scheme No. 71",
    "Scheme No. 72",
    "Scheme No. 73",
    "Rajendra Nagar",
    "Palasia",
    "Mhow Naka",
    "Geeta Bhawan",
    "Sudama Nagar",
    "Bhanwarkuan",
    "Rau",
    "Bhawarkua",
    "Tilak Nagar",
    "Sapna Sangeeta",
    "Navlakha",
    "Chhoti Gwaltoli",
    "Bada Gwaltoli",
    "LIG Colony",
    "Malharganj",
    "Khajrana",
    "Aerodrome Road",
    "MR 10",
    "MR 9",
    "MR 4",
    "MR 3",
    "MR 2",
    "MR 1",
    "Pipliyahana",
    "Silicon City",
    "Super Corridor",
    "RNT Marg",
    "MG Road",
    "AB Road",
    "Ring Road",
    "Banganga",
    "Annapurna",
    "South Tukoganj",
    "Nanda Nagar",
    "Saket Nagar",
    "Patel Nagar",
    "Juni Indore",
    "Hira Nagar",
    "Lokmanya Nagar",
    "Lasudia Mori",
]

VEHICLE_TYPES = [
    "Auto",
    "Prime Plus",
    "Prime Sedan",
    "Mini",
    "Bike",
    "eBike",
    "Prime SUV",
]

# Possible outcomes of a booking; the generator pairs these positionally with
# its probability vector, so do not reorder without updating that vector.
BOOKING_STATUS = [
    "Success",
    "Canceled by Customer",
    "Canceled by Driver",
    "Incomplete",
]

# Reason catalogues, one per non-successful outcome.
CANCEL_REASONS_CUSTOMER = [
    "Driver is not moving towards pickup location", "Driver asked to cancel",
    "AC is not working", "Change of plans", "Wrong Address",
]
CANCEL_REASONS_DRIVER = [
    "Personal & Car related issue", "Customer related issue",
    "Customer was coughing/sick", "More than permitted people in there",
]
INCOMPLETE_RIDE_REASONS = ["Customer Demand", "Vehicle Breakdown", "Other Issue"]

# Example match days (ISO date strings). All four dates are Sundays in
# January 2025, so they are also covered by WEEKEND_DAYS below.
MATCH_DATES = [
    "2025-01-05",
    "2025-01-12",
    "2025-01-19",
    "2025-01-26",
]

# datetime.weekday() numbering: Monday is 0, so 5 = Saturday and 6 = Sunday.
WEEKEND_DAYS = [5, 6]

# Random booking date/time within January 2025
def generate_datetime():
    """Draw a random day in January 2025 and a random time of day.

    Returns:
        tuple[str, str]: ``(date, time)``. ``date`` is formatted as
        ``YYYY-MM-DD`` and lies between 2025-01-01 and 2025-01-31 inclusive;
        ``time`` is a zero-padded 24-hour ``HH:MM`` string.
    """
    first_day = datetime(2025, 1, 1)
    last_day = datetime(2025, 1, 31)
    span_in_days = (last_day - first_day).days

    # random.randint includes both endpoints, so the final day can be drawn.
    day_offset = random.randint(0, span_in_days)
    hour = random.randint(0, 23)
    minute = random.randint(0, 59)

    chosen_day = first_day + timedelta(days=day_offset)
    return chosen_day.strftime("%Y-%m-%d"), f"{hour:02d}:{minute:02d}"

# Random booking identifier
def generate_booking_id():
    """Return a booking ID: the prefix ``CNR`` followed by a random 9-digit number.

    Each ID is drawn independently, so nothing here prevents two calls from
    returning the same value.
    """
    nine_digit_number = random.randint(100000000, 999999999)
    return f"CNR{nine_digit_number}"

# Synthetic booking-record generator
def generate_data(num_rows):
    """Build ``num_rows`` synthetic ride-booking records.

    Args:
        num_rows: Number of records to produce.

    Returns:
        list[list]: One inner list per booking, with values in this order:
        date, time, booking id, booking status, customer id, vehicle type,
        pickup location, drop location, avg VTAT, avg CTAT,
        canceled-by-customer flag, customer cancel reason,
        canceled-by-driver flag, driver cancel reason, incomplete flag,
        incomplete reason, booking value, ride distance, driver rating,
        customer rating.

    Turnaround times and ratings are filled only for "Success" bookings and
    each reason only for its matching status; otherwise they are ``None``.
    Booking value and ride distance are generated for every status.
    """
    # Weights pair positionally with BOOKING_STATUS and sum to 1.
    status_weights = np.array([0.62, 0.07, 0.18, 0.13])
    # Weights for the low / mid / high fare bands drawn below.
    fare_band_weights = [0.7, 0.28, 0.02]

    def build_row():
        # NOTE: the sequence of RNG calls below is deliberate; reordering it
        # would change the output of a seeded run.
        ride_date, ride_time = generate_datetime()
        ride_id = generate_booking_id()
        status = np.random.choice(BOOKING_STATUS, p=status_weights)
        rider_id = f"CUST{random.randint(100000, 999999)}"
        vehicle = random.choice(VEHICLE_TYPES)

        # Sampling two distinct entries guarantees pickup != drop.
        origin, destination = random.sample(AREAS, 2)

        succeeded = status == "Success"
        vtat = random.randint(5, 20) if succeeded else None
        ctat = random.randint(5, 20) if succeeded else None

        customer_canceled = status == "Canceled by Customer"
        driver_canceled = status == "Canceled by Driver"
        left_incomplete = status == "Incomplete"

        customer_reason = None
        if customer_canceled:
            customer_reason = random.choice(CANCEL_REASONS_CUSTOMER)
        driver_reason = None
        if driver_canceled:
            driver_reason = random.choice(CANCEL_REASONS_DRIVER)
        unfinished_reason = None
        if left_incomplete:
            unfinished_reason = random.choice(INCOMPLETE_RIDE_REASONS)

        # One candidate fare is drawn per band, then a band is picked by weight.
        candidate_fares = [
            random.randint(100, 500),
            random.randint(501, 1000),
            random.randint(1001, 2000),
        ]
        fare = np.random.choice(candidate_fares, p=fare_band_weights)

        trip_distance = round(random.uniform(2, 20), 2)
        driver_score = round(random.uniform(3, 5), 1) if succeeded else None
        rider_score = round(random.uniform(3, 5), 1) if succeeded else None

        # Bookings on match days or weekends get a 1.5x booking value
        # (truncated to an integer).
        weekday = datetime.strptime(ride_date, "%Y-%m-%d").weekday()
        if ride_date in MATCH_DATES or weekday in WEEKEND_DAYS:
            fare = int(fare * 1.5)

        return [
            ride_date, ride_time, ride_id, status, rider_id, vehicle,
            origin, destination, vtat, ctat,
            customer_canceled, customer_reason,
            driver_canceled, driver_reason,
            left_incomplete, unfinished_reason,
            fare, trip_distance, driver_score, rider_score,
        ]

    return [build_row() for _ in range(num_rows)]

# Seed both random number generators used by the generator functions (the
# stdlib `random` module and NumPy's legacy global generator) so that
# Restart & Run All produces the same dataset every time.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Single source of truth for the output file, so the path written to and the
# path reported in the message below cannot drift apart.
OUTPUT_CSV = "indore ola dataset.csv"

# Generate the data
data = generate_data(NUM_ROWS)

# Create DataFrame; the column order matches the order of values in each row
# returned by generate_data().
df = pd.DataFrame(data, columns=[
    "Date", "Time", "Booking ID", "Booking Status", "Customer ID", "Vehicle Type",
    "Pickup Location", "Drop Location", "Avg VTAT", "Avg CTAT", "Canceled Rides by Customer",
    "Reason for canceling by Customer", "Canceled Rides by Driver", "Reason for canceling by Driver",
    "Incomplete Rides", "Incomplete Rides Reason", "Booking Value", "Ride Distance",
    "Driver Ratings", "Customer Rating"
])

# Sanity check: one DataFrame row per requested booking.
assert len(df) == NUM_ROWS, "generated row count does not match NUM_ROWS"

# Convert Date column to datetime format
df["Date"] = pd.to_datetime(df["Date"])

# Save to CSV
df.to_csv(OUTPUT_CSV, index=False)
print(f"Data generated and saved to '{OUTPUT_CSV}'")


Data generated and saved to 'indore ola dataset.csv'


In [4]:
from IPython.display import display, Markdown

# Contact card rendered as Markdown. The text starts with a newline, and every
# line ends with two spaces plus a newline (a Markdown hard line break).
contact_details = "\n" + "".join(
    f"{entry}  \n"
    for entry in (
        "## 📞 Contact Details",
        "- 🔗 **LinkedIn:** [Vinay Kumar Panika](https://www.linkedin.com/in/vinaykumarpanika)",
        "- 💻 **GitHub:** [Vinaypanika](https://github.com/Vinaypanika)",
        "- 📧 **Email:** vinaypanika@gmail.com",
        "- 📱 **Phone:** +91-7415552944",
    )
)

display(Markdown(contact_details))



## 📞 Contact Details  
- 🔗 **LinkedIn:** [Vinay Kumar Panika](https://www.linkedin.com/in/vinaykumarpanika)  
- 💻 **GitHub:** [Vinaypanika](https://github.com/Vinaypanika)  
- 📧 **Email:** vinaypanika@gmail.com  
- 📱 **Phone:** +91-7415552944  
