In [3]:
!pip install faker



In [3]:
!pip install gender_guesser

Collecting gender_guesser
  Obtaining dependency information for gender_guesser from https://files.pythonhosted.org/packages/13/fb/3f2aac40cd2421e164cab1668e0ca10685fcf896bd6b3671088f8aab356e/gender_guesser-0.4.0-py2.py3-none-any.whl.metadata
  Downloading gender_guesser-0.4.0-py2.py3-none-any.whl.metadata (3.0 kB)
Downloading gender_guesser-0.4.0-py2.py3-none-any.whl (379 kB)
   ---------------------------------------- 0.0/379.3 kB ? eta -:--:--
   -- ------------------------------------ 20.5/379.3 kB 640.0 kB/s eta 0:00:01
   ------- ------------------------------- 71.7/379.3 kB 787.7 kB/s eta 0:00:01
   --------------- ------------------------ 143.4/379.3 kB 1.4 MB/s eta 0:00:01
   ------------------------------ --------- 286.7/379.3 kB 1.6 MB/s eta 0:00:01
   ---------------------------------------- 379.3/379.3 kB 2.0 MB/s eta 0:00:00
Installing collected packages: gender_guesser
Successfully installed gender_guesser-0.4.0


In [10]:
from faker import Faker
import pandas as pd
import numpy as np
import gender_guesser.detector as gender
from datetime import timedelta
import random

# Seed every random source this notebook draws from so that a fresh
# "Restart & Run All" regenerates the same random draws.  Faker.seed() and
# np.random.seed() each seed only their own generator; the stdlib `random`
# module has separate global state and must be seeded explicitly as well.
SEED = 42

fake = Faker()
Faker.seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Name -> gender lookup used when building the passenger table.
detector = gender.Detector()

# -----------------------
# Passengers
# -----------------------
num_passengers = 1_000_000
first_names = [fake.first_name() for _ in range(num_passengers)]

# Map the detector's labels onto the single-letter codes stored in the table.
# Any label not listed here (e.g. "unknown", "andy") falls back to a random pick.
_GENDER_CODES = {
    "male": "M",
    "mostly_male": "M",
    "female": "F",
    "mostly_female": "F",
}

# The same first names recur many times in a sample this large, so run the
# detector once per distinct name instead of once per passenger.
_detected_by_name = {
    name: _GENDER_CODES.get(detector.get_gender(name))
    for name in set(first_names)
}

genders = []
for name in first_names:
    gender_code = _detected_by_name[name]
    if gender_code is None:
        # Drawn per passenger (not per name) and in row order, so the sequence
        # of np.random draws matches a plain row-by-row classification.
        gender_code = np.random.choice(["M", "F"])
    genders.append(gender_code)

# Draw the remaining Faker-generated columns first (last names, then dates of
# birth for ages 18-80), then assemble the passenger table in one step.
last_names = [fake.last_name() for _ in range(num_passengers)]
birth_dates = [
    fake.date_of_birth(minimum_age=18, maximum_age=80)
    for _ in range(num_passengers)
]

passengers = pd.DataFrame(
    {
        "passenger_id": range(1, num_passengers + 1),  # sequential ids starting at 1
        "first_name": first_names,
        "last_name": last_names,
        "gender": genders,
        "dob": birth_dates,
    }
)

# -----------------------
# Flights
# -----------------------
num_flights = 1_000_000
num_airlines = 45   # airline_id takes values 1-45
num_airports = 40   # airport ids take values 1-40

airline_ids = np.random.randint(1, num_airlines + 1, num_flights)
origin_ids = np.random.randint(1, num_airports + 1, num_flights)

# Drawing the destination independently of the origin would produce flights
# that take off and land at the same airport.  Instead, move 1..num_airports-1
# positions around the ring of airport ids: the result is uniform over the
# *other* airports and can never equal the origin.
airport_shift = np.random.randint(1, num_airports, num_flights)
destination_ids = (origin_ids - 1 + airport_shift) % num_airports + 1

flights = pd.DataFrame({
    "flight_id": range(1, num_flights + 1),
    "airline_id": airline_ids,
    "origin_airport_id": origin_ids,
    "destination_airport_id": destination_ids,
})

# Airline codes: airline_id -> two-letter code.  Ids are assigned by position
# in the listing below, starting at 1 (five codes per row, 45 in total).
airline_codes = {
    airline_id: code
    for airline_id, code in enumerate(
        (
            "AA", "BA", "EK", "LH", "SQ",
            "DL", "UA", "QF", "AF", "KL",
            "AC", "AZ", "CX", "JL", "NH",
            "AI", "SA", "OS", "IB", "SK",
            "VS", "NZ", "TG", "MH", "LX",
            "OU", "TP", "TK", "SV", "QR",
            "ET", "RO", "JU", "BI", "FM",
            "KE", "VN", "PR", "CI", "MS",
            "SN", "LY", "RJ", "MF", "GA",
        ),
        start=1,
    )
}

def generate_flight_number(airline_id, min_number=100, max_number=9999):
    """Build a random flight number (airline code + digits) for one airline.

    Args:
        airline_id: Key into the module-level ``airline_codes`` mapping.
        min_number: Smallest numeric part that may be drawn (inclusive).
        max_number: Largest numeric part that may be drawn (inclusive).

    Returns:
        The airline's code immediately followed by the drawn number,
        e.g. ``"BA1234"``.

    Raises:
        KeyError: If ``airline_id`` is not present in ``airline_codes``.
        ValueError: If ``min_number`` is greater than ``max_number``.
    """
    code = airline_codes[airline_id]
    # np.random.randint excludes its upper bound; add 1 so that max_number
    # itself (9999 by default) can actually be produced.
    number = np.random.randint(min_number, max_number + 1)
    return f"{code}{number}"

flights['flight_number'] = flights['airline_id'].apply(generate_flight_number)

# Flight dates with minutes ending with 0 or 5
valid_minutes = [i for i in range(0, 60, 5)]  # 0,5,10,...,55


def _floor_to_five_minutes(moment):
    """Round a datetime down to the previous 5-minute mark and drop seconds."""
    return moment.replace(
        minute=moment.minute - moment.minute % 5,
        second=0,
        microsecond=0
    )


# Departures fall in the last five years.  Flooring the minute (rather than
# overwriting it with an independent random value) keeps every departure at or
# before the moment Faker drew it, so none can land after "now", and the value
# comes only from the seeded Faker generator.
flights['departure_date'] = [
    _floor_to_five_minutes(fake.date_time_between(start_date='-5y', end_date='now'))
    for _ in range(num_flights)
]

# Flight duration: 1-15 whole hours plus 0-55 minutes in 5-minute steps, drawn
# for all flights at once from the seeded NumPy generator.
duration_minutes = (
    np.random.randint(1, 16, num_flights) * 60
    + np.random.choice(valid_minutes, num_flights)
)
flights['arrival_date'] = (
    flights['departure_date'] + pd.to_timedelta(duration_minutes, unit='min')
)

# -----------------------
# Bookings
# -----------------------
num_bookings = 2_000_000
flight_departures = flights.set_index('flight_id')['departure_date']
booking_flight_ids = np.random.choice(flights['flight_id'], num_bookings)

# Resolve every booking's departure time with one vectorized label lookup
# instead of two scalar Series lookups per booking inside the loop.
booked_departures = flight_departures.loc[booking_flight_ids]
# Bookings open 180 days before the flight departs.
booking_window_starts = booked_departures - timedelta(days=180)

# One Faker draw per booking, in booking order, between window start and
# the flight's departure.
booking_dates = [
    fake.date_time_between(start_date=window_start, end_date=departure)
    for window_start, departure in zip(booking_window_starts, booked_departures)
]

# Random per-booking attributes, drawn in a fixed order (passenger, price,
# status) before the table is assembled.
booking_passenger_ids = np.random.choice(passengers["passenger_id"], num_bookings)
ticket_prices = np.random.uniform(50, 2000, num_bookings).round(2)  # rounded to 2 decimals
booking_statuses = np.random.choice(
    ["Confirmed", "Cancelled", "Pending"], num_bookings
)

bookings = pd.DataFrame(
    {
        "booking_id": range(1, num_bookings + 1),  # sequential ids starting at 1
        "flight_id": booking_flight_ids,
        "passenger_id": booking_passenger_ids,
        "booking_date": booking_dates,
        "ticket_price": ticket_prices,
        "booking_status": booking_statuses,
    }
)

# -----------------------
# SAVE TO CSV
# -----------------------
# Write each table to its own CSV file in the working directory, omitting the
# DataFrame index column.
for table, csv_name in (
    (passengers, "d_passenger.csv"),
    (flights, "f_flight.csv"),
    (bookings, "f_booking.csv"),
):
    table.to_csv(csv_name, index=False)
