In [1]:
pip install pandas faker numpy


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
import csv
import random
from faker import Faker
from datetime import timedelta

# Shared Faker instance used by every generator function in this cell.
faker = Faker()

# Ordered list of stops for every route, keyed by route name. This mapping is
# the single source of truth for routes: `route_names` below is derived from
# it, so the two can never drift apart. Place names are spelled the same way
# everywhere they appear ("Negombo", "Kurunegala", "Kilinochchi") so that
# name-based station matching treats each station as one entity.
station_names = {
    "Colombo - Galle": ["Colombo Fort", "Mount Lavinia", "Moratuwa", "Panadura", "Kalutara", "Aluthgama", "Galle"],
    "Kandy - Colombo": ["Kandy", "Peradeniya", "Gampola", "Nawalapitiya", "Kadugannawa", "Rambukkana", "Polgahawela", "Mawanella", "Colombo Fort"],
    "Jaffna - Colombo": ["Jaffna", "Chavakachcheri", "Kilinochchi", "Vavuniya", "Anuradhapura", "Kandy", "Colombo Fort"],
    "Colombo - Kurunegala": ["Colombo Fort", "Gampaha", "Veyangoda", "Nittambuwa", "Kirindiwela", "Kurunegala"],
    "Matara - Colombo": ["Matara", "Weligama", "Ahangama", "Galle", "Hikkaduwa", "Ambalangoda", "Bentota", "Kalutara", "Colombo Fort"],
    "Colombo - Negombo": ["Colombo Fort", "Wattala", "Hendala", "Kelaniya", "Peliyagoda", "Negombo"],
    "Colombo - Anuradhapura": ["Colombo Fort", "Veyangoda", "Polgahawela", "Kurunegala", "Anuradhapura"],
    "Colombo - Trincomalee": ["Colombo Fort", "Veyangoda", "Kurunegala", "Habarana", "Trincomalee"],
    "Colombo - Ratnapura": ["Colombo Fort", "Maharagama", "Nugegoda", "Pannipitiya", "Kottawa", "Ratnapura"],
    "Colombo - Gampaha": ["Colombo Fort", "Pettah", "Colombo 7", "Gampaha"],
    "Colombo - Dambulla": ["Colombo Fort", "Kaduwela", "Kurunegala", "Dambulla"],
    "Kandy - NuwaraEliya": ["Kandy", "Peradeniya", "Gampola", "Nanu Oya", "Nuwara Eliya"],
    "Kandy - Badulla": ["Kandy", "Peradeniya", "Gampola", "Nuwara Eliya", "Badulla"],
    "Matara - Galle": ["Matara", "Weligama", "Galle"],
    "Kurunegala - Kandy": ["Kurunegala", "Mawanella", "Kandy"],
    "Jaffna - Kilinochchi": ["Jaffna", "Chavakachcheri", "Kilinochchi"],
    "Galle - Hikkaduwa": ["Galle", "Hikkaduwa"],
    "Colombo - Kaluthara": ["Colombo Fort", "Mount Lavinia", "Kalutara"],
    "Colombo - Beruwala": ["Colombo Fort", "Kalutara", "Beruwala"],
    "Colombo - Batticaloa": ["Colombo Fort", "Anuradhapura", "Polonnaruwa", "Batticaloa"]
}

# Route names in definition order (dicts preserve insertion order), taken
# straight from the keys above instead of being maintained as a second list.
route_names = list(station_names)

def generate_vehicle_data(num_records):
    """Return `num_records` synthetic vehicle rows.

    Each row is a list laid out as
    [vehicle_id, vehicle_type, fleet_number, capacity, manufacturer,
    fuel_type, year_of_make]. Vehicle IDs run V0001, V0002, ... in
    generation order; fleet numbers are drawn through `faker.unique`.
    """
    kinds = ['Bus', 'Train', 'Metro', 'Ferry']
    fuels = ['Diesel', 'Electric', 'Hybrid']

    def make_row(serial):
        # The element order fixes both the CSV column order and the order in
        # which random values are drawn.
        return [
            f"V{serial:04d}",
            random.choice(kinds),
            f"F{faker.unique.random_int(1000, 9999)}",
            faker.random_int(10, 200),
            faker.company(),
            random.choice(fuels),
            faker.random_int(1990, 2025),
        ]

    return [make_row(serial) for serial in range(1, num_records + 1)]

def generate_route_data():
    """Return one synthetic route row per entry of `route_names`.

    Row layout: [route_id, route_name, route_type, total_stations,
    total_distance]. `total_stations` is the length of the route's stop list
    in `station_names`; `total_distance` is a random value with up to three
    integer digits, rounded to two decimals.
    """
    kinds = ['Urban', 'Express', 'Intercity']
    rows = []
    for number, name in enumerate(route_names, start=1):
        kind = random.choice(kinds)
        stop_count = len(station_names[name])
        whole_part = faker.random_number(digits=3, fix_len=False)
        distance = round(whole_part + faker.random.random(), 2)
        rows.append([f"R{number:04d}", name, kind, stop_count, distance])
    return rows

def generate_station_data():
    """Return one synthetic station row per distinct station name.

    Row layout: [station_id, station_name, location_lat, location_long,
    station_type]. Stations are visited route by route in the order they
    appear in `station_names`, and IDs (S0001, S0002, ...) are assigned in
    order of first appearance.

    A station that lies on several routes (e.g. a shared terminus) is emitted
    only once. Emitting it once per route would give the same station several
    IDs, each with its own random coordinates and type.
    """
    data = []
    seen_names = set()
    for stops in station_names.values():
        for station_name in stops:
            if station_name in seen_names:
                continue  # already emitted for an earlier route
            seen_names.add(station_name)
            record = [
                f"S{len(data) + 1:04d}",  # station_id
                station_name,  # station_name
                round(faker.latitude(), 6),  # location_lat
                round(faker.longitude(), 6),  # location_long
                random.choice(['Bus Stop', 'Train Station', 'Metro Hub'])  # station_type
            ]
            data.append(record)
    return data

def generate_passenger_data(num_records):
    """Return `num_records` synthetic passenger rows.

    Row layout: [passenger_id, name, gender, age, registered_user,
    travel_card_id]. Passenger IDs run P0001, P0002, ... in generation order.

    `travel_card_id` is populated exactly when `registered_user` is 'Y' and
    is None otherwise. The flag is drawn once per passenger and reused for
    both columns, so the two can never contradict each other.
    """
    genders = ['Male', 'Female', 'Other']
    data = []
    for i in range(num_records):
        registered = random.choice(['Y', 'N'])
        record = [
            f"P{i + 1:04d}",  # passenger_id
            faker.name(),  # name
            random.choice(genders),  # gender
            faker.random_int(1, 100),  # age
            registered,  # registered_user
            faker.unique.bban() if registered == 'Y' else None  # travel_card_id
        ]
        data.append(record)
    return data

def generate_time_data(num_records):
    """Return `num_records` synthetic time-dimension rows.

    Row layout: [time_id, full_date, year, quarter, month, day, weekday].
    Each row is built from one date drawn with `faker.date_this_century()`;
    the remaining columns are derived from that date. Time IDs run T0001,
    T0002, ... in generation order.
    """
    rows = []
    for serial in range(1, num_records + 1):
        day = faker.date_this_century()
        # Months 1-3 -> quarter 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
        quarter = (day.month + 2) // 3
        weekday_name = day.strftime("%A")
        rows.append([
            f"T{serial:04d}",
            day,
            day.year,
            quarter,
            day.month,
            day.day,
            weekday_name,
        ])
    return rows

def generate_trip_data(num_records, vehicle_data, route_data, station_data, time_data):
    """Return `num_records` synthetic trip fact rows.

    Row layout: [trip_id, vehicle_id, route_id, start_station_id,
    end_station_id, time_id, departure_time, arrival_time, distance_covered,
    passengers_count, delay_minutes].

    Args:
        num_records: number of rows to generate.
        vehicle_data: vehicle rows; index 0 is the ID, index 3 the capacity.
        route_data: route rows; index 0 is the ID, index 1 the route name,
            which must be a key of the module-level `station_names`.
        station_data: station rows; index 0 is the ID, index 1 the name.
        time_data: time rows; index 0 is the ID.

    Returns:
        A list of row lists, with trip IDs TR0001, TR0002, ... in order.

    The start and end of a trip are always two differently named stations of
    the chosen route. Only when the route offers no second station name does
    the end fall back to the start station.
    """
    # Station rows per route name, filled on first use so the scan over
    # `station_data` happens once per route instead of once per trip.
    stations_by_route = {}
    data = []
    for i in range(num_records):
        vehicle = random.choice(vehicle_data)
        route = random.choice(route_data)
        route_name = route[1]
        if route_name not in stations_by_route:
            stop_names = set(station_names[route_name])
            stations_by_route[route_name] = [s for s in station_data if s[1] in stop_names]
        stations = stations_by_route[route_name]
        start_station = random.choice(stations)
        # Exclude by name, not by ID: `station_data` may hold several rows
        # for the same station, and a trip between two rows of one station
        # would still start and end at the same place.
        other_stations = [s for s in stations if s[1] != start_station[1]]
        end_station = random.choice(other_stations) if other_stations else start_station
        time_row = random.choice(time_data)
        # NOTE(review): departure_time is drawn independently of the chosen
        # time row, so its date generally differs from that row's date -
        # confirm whether the two are meant to agree.
        departure_time = faker.date_time_this_year()
        arrival_time = departure_time + timedelta(hours=faker.random_int(1, 5))
        record = [
            f"TR{i + 1:04d}",  # trip_id
            vehicle[0],  # vehicle_id
            route[0],  # route_id
            start_station[0],  # start_station_id
            end_station[0],  # end_station_id
            time_row[0],  # time_id
            departure_time,  # departure_time
            arrival_time,  # arrival_time
            round(faker.random_number(digits=3, fix_len=False) + faker.random.random(), 2),  # distance_covered
            faker.random_int(1, vehicle[3]),  # passengers_count (never above the vehicle's capacity)
            round(faker.random_number(digits=2, fix_len=False) + faker.random.random(), 2)  # delay_minutes
        ]
        data.append(record)
    return data

def generate_ticket_sales_data(num_records, passenger_data, trip_data, time_data):
    """Return `num_records` synthetic ticket-sale fact rows.

    Row layout: [ticket_id, passenger_id, trip_id, time_id, purchase_time,
    fare_amount, payment_method, discount_applied]. The passenger, trip and
    time references are each picked at random from the supplied rows, using
    index 0 of the picked row as the ID. Ticket IDs run TS0001, TS0002, ...
    """
    methods = ['Cash', 'Card', 'Online', 'Travel Card']

    def make_row(serial):
        # Foreign keys are drawn first, then the remaining columns in order.
        passenger_id = random.choice(passenger_data)[0]
        trip_id = random.choice(trip_data)[0]
        time_id = random.choice(time_data)[0]
        purchased_at = faker.date_time_this_year()
        fare = round(faker.random_number(digits=3, fix_len=False) + faker.random.random(), 2)
        method = random.choice(methods)
        discount = round(faker.random_number(digits=2, fix_len=False) + faker.random.random(), 2)
        return [
            f"TS{serial:04d}",
            passenger_id,
            trip_id,
            time_id,
            purchased_at,
            fare,
            method,
            discount,
        ]

    return [make_row(serial) for serial in range(1, num_records + 1)]

def generate_live_tracking_data(num_records, vehicle_data, time_data):
    """Return `num_records` synthetic live-tracking fact rows.

    Row layout: [tracking_id, vehicle_id, time_id, recorded_time,
    location_lat, location_long, speed, status]. The vehicle and time
    references are picked at random from the supplied rows, using index 0 of
    the picked row as the ID. Tracking IDs run LT0001, LT0002, ...
    """
    states = ['On Time', 'Delayed', 'Cancelled']
    rows = []
    serial = 0
    while serial < num_records:
        serial += 1
        vehicle_id = random.choice(vehicle_data)[0]
        time_id = random.choice(time_data)[0]
        recorded_at = faker.date_time_this_year()
        lat = round(faker.latitude(), 6)
        lon = round(faker.longitude(), 6)
        speed = round(faker.random_number(digits=2, fix_len=False) + faker.random.random(), 2)
        state = random.choice(states)
        rows.append([f"LT{serial:04d}", vehicle_id, time_id, recorded_at, lat, lon, speed, state])
    return rows

def write_to_csv(filename, data, headers, encoding='utf-8'):
    """Write a header row followed by `data` rows to a CSV file.

    Args:
        filename: path of the file to create; an existing file is overwritten.
        data: iterable of rows, each an iterable of cell values.
        headers: column names, written as the first row.
        encoding: text encoding of the output file. It defaults to UTF-8 so
            the result does not depend on the platform's locale encoding and
            non-ASCII values cannot fail to encode.
    """
    # newline='' lets the csv module control line endings itself.
    with open(filename, mode='w', newline='', encoding=encoding) as file:
        writer = csv.writer(file)
        writer.writerow(headers)
        writer.writerows(data)

if __name__ == "__main__":
    num_records = 600

    # Seed both sources of randomness so that re-running this cell (or a
    # Restart & Run All) regenerates identical CSV files. The unique-value
    # tracker is reset as well: otherwise a re-run in the same kernel would
    # be forced to skip values handed out by the previous run.
    SEED = 42
    random.seed(SEED)
    Faker.seed(SEED)
    faker.unique.clear()

    # Dimension rows first: the fact generators below pick their foreign keys
    # from these lists.
    vehicle_data = generate_vehicle_data(num_records)
    route_data = generate_route_data()
    station_data = generate_station_data()
    passenger_data = generate_passenger_data(num_records)
    time_data = generate_time_data(num_records)

    trip_data = generate_trip_data(num_records, vehicle_data, route_data, station_data, time_data)
    ticket_sales_data = generate_ticket_sales_data(num_records, passenger_data, trip_data, time_data)
    live_tracking_data = generate_live_tracking_data(num_records, vehicle_data, time_data)

    # (output file, rows, column headers) for every table, in write order.
    csv_outputs = [
        ('Dim_Vehicle.csv', vehicle_data, ['vehicle_id', 'vehicle_type', 'fleet_number', 'capacity', 'manufacturer', 'fuel_type', 'year_of_make']),
        ('Dim_Route.csv', route_data, ['route_id', 'route_name', 'route_type', 'total_stations', 'total_distance']),
        ('Dim_Station.csv', station_data, ['station_id', 'station_name', 'location_lat', 'location_long', 'station_type']),
        ('Dim_Passenger.csv', passenger_data, ['passenger_id', 'name', 'gender', 'age', 'registered_user', 'travel_card_id']),
        ('Dim_Time.csv', time_data, ['time_id', 'full_date', 'year', 'quarter', 'month', 'day', 'weekday']),
        ('Fact_Trips.csv', trip_data, ['trip_id', 'vehicle_id', 'route_id', 'start_station_id', 'end_station_id', 'time_id', 'departure_time', 'arrival_time', 'distance_covered', 'passengers_count', 'delay_minutes']),
        ('Fact_Ticket_Sales.csv', ticket_sales_data, ['ticket_id', 'passenger_id', 'trip_id', 'time_id', 'purchase_time', 'fare_amount', 'payment_method', 'discount_applied']),
        ('Fact_Live_Tracking.csv', live_tracking_data, ['tracking_id', 'vehicle_id', 'time_id', 'recorded_time', 'location_lat', 'location_long', 'speed', 'status']),
    ]
    for csv_filename, csv_rows, csv_headers in csv_outputs:
        write_to_csv(csv_filename, csv_rows, csv_headers)

    print("Data generation complete. CSV files are ready.")

Data generation complete. CSV files are ready.
