In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

np.random.seed(42)

# Train routes
routes = {
    "13149": [
        ("sealdah", "alipurduar"),  # GNWL
        ("malda", "alipurduar"),    # PQWL
        ("naxalbari", "alipurduar") # PQWL
    ],
    "12343": [
        ("sealdah", "njp"),         # GNWL
        ("malda", "njp"),           # PQWL
        ("kishanganj", "njp")       # PQWL
    ]
}

# Months mapping for rules
group_A = [1, 2, 3, 4, 12]  # Jan, Feb, Mar, Apr, Dec
group_B = [5, 10, 11]       # May, Oct, Nov
group_C = [6, 7, 8, 9]      # Jun, Jul, Aug, Sep

# Function to get confirmation probability
def get_prob(month, days_before, wl_type, is_weekend):
    prob = 0.0
    
    if month in group_A:
        if 60 >= days_before >= 30:
            prob = 1.0 if wl_type == "GNWL" else 0.5
        elif 15 <= days_before <= 10:
            prob = 0.5 if wl_type == "GNWL" else 0.0
    elif month in group_B:
        if 60 >= days_before > 40:
            prob = 1.0 if wl_type == "GNWL" else 0.2
        elif 40 >= days_before > 30:
            prob = 0.5 if wl_type == "GNWL" else 0.1
        elif 15 <= days_before <= 10:
            prob = 0.2 if wl_type == "GNWL" else 0.0
    elif month in group_C:
        if 60 >= days_before >= 30:
            prob = 1.0 if wl_type == "GNWL" else 0.5
        elif 15 <= days_before <= 10:
            prob = 0.5 if wl_type == "GNWL" else 0.2
    
    # Reduce by 10% if weekend
    if is_weekend:
        prob = max(0, prob - 0.1)
    
    return prob

# Generate data
records = []
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 12, 31)

for train, stations in routes.items():
    for _ in range(50000):  # large dataset
        # Random journey date
        journey_date = start_date + timedelta(days=random.randint(0, (end_date - start_date).days))
        month = journey_date.month
        day_of_week = journey_date.strftime("%A")
        is_weekend = day_of_week in ["Saturday", "Sunday"]
        
        # Days before booking
        days_before = random.randint(10, 60)
        booking_date = journey_date - timedelta(days=days_before)
        
        # Select route
        start_station, end_station = random.choice(stations)
        wl_type = "GNWL" if start_station == "sealdah" else "PQWL"
        
        # Confirmation chance
        prob = get_prob(month, days_before, wl_type, is_weekend)
        confirmed = random.random() < prob
        
        # Booking status format: "CNF" if confirmed else "12GNWL"/"11PQWL"
        if confirmed:
            booking_status = "CNF"
        else:
            wl_num = random.randint(1, 20)
            booking_status = f"{wl_num}{wl_type}"
        
        # Append record
        records.append([
            train, journey_date.date(), booking_date.date(), days_before,
            day_of_week, start_station, end_station, booking_status,
            1 if confirmed else 0
        ])

# Create DataFrame
df = pd.DataFrame(records, columns=[
    "train_number", "journey_date", "booking_date", "days_before",
    "day_of_week", "start_station", "end_station",
    "booking_status", "confirmed"
])

# Save to CSV
df.to_csv("train_booking_dataset.csv", index=False)

print(f"✅ Dataset generated: {df.shape[0]} rows saved as train_booking_dataset.csv")
print(df.head(10))


✅ Dataset generated: 100000 rows saved as train_booking_dataset.csv
  train_number journey_date booking_date  days_before day_of_week  \
0        13149   2024-04-17   2024-03-08           40   Wednesday   
1        13149   2024-07-22   2024-07-08           14      Monday   
2        13149   2024-12-18   2024-10-19           60   Wednesday   
3        13149   2024-02-07   2023-12-25           44   Wednesday   
4        13149   2024-06-23   2024-05-01           53      Sunday   
5        13149   2024-04-20   2024-03-04           47    Saturday   
6        13149   2024-10-30   2024-10-06           24   Wednesday   
7        13149   2024-05-08   2024-04-09           29   Wednesday   
8        13149   2024-03-23   2024-03-07           16    Saturday   
9        13149   2024-02-23   2024-01-12           42      Friday   

  start_station end_station booking_status  confirmed  
0       sealdah  alipurduar            CNF          1  
1         malda  alipurduar         15PQWL          0  
2   

In [5]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

np.random.seed(42)

# Train routes
routes = {
    "13149": ['sealdah', 'malda', 'naxalbari', 'alipurduar'],
    "12343": ['sealdah', 'malda', 'kishanganj', 'njp']
}

# Month groupings based on rules
group_A = [1, 2, 3, 4, 12]  # Jan, Feb, Mar, Apr, Dec
group_B = [5, 10, 11]       # May, Oct, Nov
group_C = [6, 7, 8, 9]      # Jun, Jul, Aug, Sep

# Probability function
def get_prob(month, days_before, wl_type, is_weekend):
    prob = 0.0
    if month in group_A:
        if 60 >= days_before >= 30:
            prob = 1.0 if wl_type == "GNWL" else 0.5
        elif 15 <= days_before <= 10:
            prob = 0.5 if wl_type == "GNWL" else 0.0
    elif month in group_B:
        if 60 >= days_before > 40:
            prob = 1.0 if wl_type == "GNWL" else 0.2
        elif 40 >= days_before > 30:
            prob = 0.5 if wl_type == "GNWL" else 0.1
        elif 15 <= days_before <= 10:
            prob = 0.2 if wl_type == "GNWL" else 0.0
    elif month in group_C:
        if 60 >= days_before >= 30:
            prob = 1.0 if wl_type == "GNWL" else 0.5
        elif 15 <= days_before <= 10:
            prob = 0.5 if wl_type == "GNWL" else 0.2
    
    # Weekend reduction
    if is_weekend:
        prob = max(0, prob - 0.1)
    return prob

# Generate dataset
records = []
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 12, 31)

for train, stations in routes.items():
    for _ in range(50000):  # large dataset per train
        # Journey date
        journey_date = start_date + timedelta(days=random.randint(0, (end_date - start_date).days))
        month = journey_date.month
        day_of_week = journey_date.strftime("%A")
        is_weekend = day_of_week in ["Saturday", "Sunday"]

        # Days before booking
        days_before = random.randint(10, 60)
        booking_date = journey_date - timedelta(days=days_before)

        # Pick start and end station (ensure different)
        start_station = random.choice(stations)
        possible_dest = [st for st in stations if st != start_station]
        end_station = random.choice(possible_dest)

        # Determine WL type
        wl_type = "GNWL" if start_station == "sealdah" else "PQWL"

        # Confirmation probability & status
        prob = get_prob(month, days_before, wl_type, is_weekend)
        confirmed = random.random() < prob

        # Booking status text
        if confirmed:
            booking_status = "CNF"
        else:
            wl_num = random.randint(1, 20)
            booking_status = f"{wl_num}{wl_type}"

        # Append record
        records.append([
            train, journey_date.date(), booking_date.date(), days_before,
            day_of_week, start_station, end_station, booking_status,
            1 if confirmed else 0
        ])

# DataFrame
df = pd.DataFrame(records, columns=[
    "train_number", "journey_date", "booking_date", "days_before",
    "day_of_week", "start_station", "end_station",
    "booking_status", "confirmed"
])

# Save CSV
df.to_csv("train_book_dataset.csv", index=False)
print(f"✅ Dataset generated: {df.shape[0]} rows saved as train_book_dataset.csv")
print(df.sample(10))


✅ Dataset generated: 100000 rows saved as train_book_dataset.csv
      train_number journey_date booking_date  days_before day_of_week  \
75721        12343   2024-07-16   2024-06-15           31     Tuesday   
80184        12343   2024-10-21   2024-09-06           45      Monday   
19864        13149   2024-06-09   2024-04-28           42      Sunday   
76699        12343   2024-03-30   2024-03-13           17    Saturday   
92991        12343   2024-05-09   2024-03-11           59    Thursday   
76434        12343   2024-10-08   2024-09-11           27     Tuesday   
84004        12343   2024-09-11   2024-07-29           44   Wednesday   
80917        12343   2024-12-04   2024-11-13           21   Wednesday   
60767        12343   2024-06-09   2024-05-05           35      Sunday   
50074        12343   2024-07-27   2024-07-15           12    Saturday   

      start_station end_station booking_status  confirmed  
75721    kishanganj         njp          1PQWL          0  
80184      

In [7]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

np.random.seed(42)

# Train routes
routes = {
    "13149": ['sealdah', 'malda', 'naxalbari', 'alipurduar'],
    "12343": ['sealdah', 'malda', 'kishanganj', 'njp']
}

# Month groupings based on rules
group_A = [1, 2, 3, 4, 12]  # Jan, Feb, Mar, Apr, Dec
group_B = [5, 10, 11]       # May, Oct, Nov
group_C = [6, 7, 8, 9]      # Jun, Jul, Aug, Sep

# Probability function
def get_prob(month, days_before, wl_type, is_weekend):
    prob = 0.0
    if month in group_A:
        if 60 >= days_before >= 30:
            prob = 1.0 if wl_type == "GNWL" else 0.5
        elif 15 <= days_before <= 10:
            prob = 0.5 if wl_type == "GNWL" else 0.0
    elif month in group_B:
        if 60 >= days_before > 40:
            prob = 1.0 if wl_type == "GNWL" else 0.2
        elif 40 >= days_before > 30:
            prob = 0.5 if wl_type == "GNWL" else 0.1
        elif 15 <= days_before <= 10:
            prob = 0.2 if wl_type == "GNWL" else 0.0
    elif month in group_C:
        if 60 >= days_before >= 30:
            prob = 1.0 if wl_type == "GNWL" else 0.5
        elif 15 <= days_before <= 10:
            prob = 0.5 if wl_type == "GNWL" else 0.2
    
    # Weekend reduction
    if is_weekend:
        prob = max(0, prob - 0.1)
    return prob

# Generate dataset
records = []
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 12, 31)

for train, stations in routes.items():
    for _ in range(50000):  # large dataset per train
        journey_date = start_date + timedelta(days=random.randint(0, (end_date - start_date).days))
        month = journey_date.month
        day_of_week = journey_date.strftime("%A")
        is_weekend = day_of_week in ["Saturday", "Sunday"]

        days_before = random.randint(10, 60)
        booking_date = journey_date - timedelta(days=days_before)

        start_station = random.choice(stations)
        possible_dest = [st for st in stations if st != start_station]
        end_station = random.choice(possible_dest)

        wl_type = "GNWL" if start_station == "sealdah" else "PQWL"

        prob = get_prob(month, days_before, wl_type, is_weekend)
        confirmed = random.random() < prob

        if confirmed:
            booking_status = "CNF"
        else:
            wl_num = random.randint(1, 20)
            booking_status = f"{wl_num}{wl_type}"

        records.append([
            train, journey_date.date(), booking_date.date(), days_before,
            day_of_week, start_station, end_station, booking_status,
            1 if confirmed else 0
        ])

# Convert to DataFrame
df = pd.DataFrame(records, columns=[
    "train_number", "journey_date", "booking_date", "days_before",
    "day_of_week", "start_station", "end_station",
    "booking_status", "confirmed"
])

# Oversample confirmed to match not confirmed
df_confirmed = df[df['confirmed'] == 1]
df_not_confirmed = df[df['confirmed'] == 0]

needed_confirms = len(df_not_confirmed) - len(df_confirmed)
if needed_confirms > 0:
    extra_confirms = df_confirmed.sample(needed_confirms, replace=True, random_state=42)
    df_balanced = pd.concat([df, extra_confirms], ignore_index=True)
else:
    df_balanced = df.copy()

# Shuffle rows
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Save
df_balanced.to_csv("train_booking_dataset_balanced.csv", index=False)

print(f"✅ Original dataset: {df.shape[0]} rows")
print(f"✅ Balanced dataset: {df_balanced.shape[0]} rows")
print(df_balanced['confirmed'].value_counts())
print(df_balanced.sample(10))


✅ Original dataset: 100000 rows
✅ Balanced dataset: 136998 rows
confirmed
0    68499
1    68499
Name: count, dtype: int64
       train_number journey_date booking_date  days_before day_of_week  \
87281         12343   2024-03-15   2024-02-14           30      Friday   
25691         13149   2024-06-04   2024-05-14           21     Tuesday   
101626        13149   2024-12-31   2024-11-17           44     Tuesday   
121448        13149   2024-03-29   2024-02-26           32      Friday   
104837        13149   2024-12-05   2024-10-30           36    Thursday   
9364          12343   2024-04-11   2024-02-13           58    Thursday   
36330         13149   2024-08-05   2024-07-06           30      Monday   
21675         12343   2024-05-15   2024-04-23           22   Wednesday   
61884         12343   2024-04-14   2024-03-10           35      Sunday   
131155        12343   2024-10-20   2024-09-13           37      Sunday   

       start_station end_station booking_status  confirmed  
87

In [9]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

np.random.seed(42)

# Train routes
routes = {
    "13149": ['sealdah', 'malda', 'naxalbari', 'alipurduar'],
    "12343": ['sealdah', 'malda', 'kishanganj', 'njp']
}

# Month groupings based on rules
group_A = [1, 2, 3, 4, 12]  # Jan, Feb, Mar, Apr, Dec
group_B = [5, 10, 11]       # May, Oct, Nov
group_C = [6, 7, 8, 9]      # Jun, Jul, Aug, Sep

# Probability function for Sleeper
def get_prob(month, days_before, wl_type, is_weekend):
    prob = 0.0
    if month in group_A:
        if 60 >= days_before >= 30:
            prob = 1.0 if wl_type == "GNWL" else 0.5
        elif 15 <= days_before <= 10:
            prob = 0.5 if wl_type == "GNWL" else 0.0
    elif month in group_B:
        if 60 >= days_before > 40:
            prob = 1.0 if wl_type == "GNWL" else 0.2
        elif 40 >= days_before > 30:
            prob = 0.5 if wl_type == "GNWL" else 0.1
        elif 15 <= days_before <= 10:
            prob = 0.2 if wl_type == "GNWL" else 0.0
    elif month in group_C:
        if 60 >= days_before >= 30:
            prob = 1.0 if wl_type == "GNWL" else 0.5
        elif 15 <= days_before <= 10:
            prob = 0.5 if wl_type == "GNWL" else 0.2
    
    # Weekend reduction
    if is_weekend:
        prob = max(0, prob - 0.1)
    return prob

# Generate dataset
records = []
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 12, 31)

for train, stations in routes.items():
    for _ in range(50000):  # large dataset per train
        journey_date = start_date + timedelta(days=random.randint(0, (end_date - start_date).days))
        month = journey_date.month
        day_of_week = journey_date.strftime("%A")
        is_weekend = day_of_week in ["Saturday", "Sunday"]

        days_before = random.randint(10, 60)
        booking_date = journey_date - timedelta(days=days_before)

        start_station = random.choice(stations)
        possible_dest = [st for st in stations if st != start_station]
        end_station = random.choice(possible_dest)

        wl_type = "GNWL" if start_station == "sealdah" else "PQWL"

        # Randomly assign coach type
        coach_type = random.choice(["Sleeper", "AC"])

        # Base probability for Sleeper
        prob = get_prob(month, days_before, wl_type, is_weekend)

        # If AC, increase by 10%
        if coach_type == "AC":
            prob = min(1.0, prob + 0.1)

        confirmed = random.random() < prob

        if confirmed:
            booking_status = "CNF"
        else:
            wl_num = random.randint(1, 20)
            booking_status = f"{wl_num}{wl_type}"

        records.append([
            train, journey_date.date(), booking_date.date(), days_before,
            day_of_week, start_station, end_station, coach_type, booking_status,
            1 if confirmed else 0
        ])

# Convert to DataFrame
df = pd.DataFrame(records, columns=[
    "train_number", "journey_date", "booking_date", "days_before",
    "day_of_week", "start_station", "end_station", "coach_type",
    "booking_status", "confirmed"
])

# Oversample confirmed to match not confirmed
df_confirmed = df[df['confirmed'] == 1]
df_not_confirmed = df[df['confirmed'] == 0]

needed_confirms = len(df_not_confirmed) - len(df_confirmed)
if needed_confirms > 0:
    extra_confirms = df_confirmed.sample(needed_confirms, replace=True, random_state=42)
    df_balanced = pd.concat([df, extra_confirms], ignore_index=True)
else:
    df_balanced = df.copy()

# Shuffle rows
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Save
df_balanced.to_csv("train_booking_dataset_balanced_with_coach.csv", index=False)

print(f"✅ Original dataset: {df.shape[0]} rows")
print(f"✅ Balanced dataset: {df_balanced.shape[0]} rows")
print(df_balanced['confirmed'].value_counts())
print(df_balanced.sample(10))


✅ Original dataset: 100000 rows
✅ Balanced dataset: 127946 rows
confirmed
0    63973
1    63973
Name: count, dtype: int64
       train_number journey_date booking_date  days_before day_of_week  \
14985         13149   2024-09-05   2024-07-15           52    Thursday   
111374        12343   2024-03-05   2024-01-24           41     Tuesday   
126604        12343   2024-06-13   2024-04-15           59    Thursday   
83219         12343   2024-08-13   2024-07-26           18     Tuesday   
66232         13149   2024-08-30   2024-08-02           28      Friday   
118477        12343   2024-04-15   2024-02-27           48      Monday   
54625         13149   2024-07-03   2024-05-22           42   Wednesday   
92907         12343   2024-04-29   2024-03-20           40      Monday   
1176          12343   2024-10-17   2024-09-21           26    Thursday   
78936         13149   2024-07-25   2024-06-11           44    Thursday   

       start_station end_station coach_type booking_status  con