In [2]:
import pandas as pd

# Input CSV read by this cell and the reduced CSV it writes.
input_filename = 'Railway_price_data.csv'
output_filename = 'passenger_train_data_expanded.csv' # <-- New output name

# How many of the most frequently occurring station codes are treated as
# "important" when filtering routes.
TOP_N_STATIONS = 125

# Columns carried into the output file. Every one of them must be present in
# the input CSV; this is verified before any column is accessed.
columns_to_keep = [
    'fromStnCode',
    'toStnCode',
    'classCode',
    'distance',
    'duration',
    'timeStamp',
    'totalFare',
]


def find_missing_columns(df, required):
    """Return the names in ``required`` that are not columns of ``df``.

    The order of ``required`` is preserved; an empty list means every
    required column is present.
    """
    return [col for col in required if col not in df.columns]


def count_station_usage(df):
    """Count how often each station code appears as an origin or a destination.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'fromStnCode' and 'toStnCode'.

    Returns
    -------
    tuple of pandas.Series
        ``(from_counts, to_counts, all_station_counts)``. The first two are
        the per-column ``value_counts()``; the third is their per-station
        sum, sorted in descending order.
    """
    from_counts = df['fromStnCode'].value_counts()
    to_counts = df['toStnCode'].value_counts()
    # fill_value=0 keeps stations that appear on only one side of a route;
    # without it their combined count would be NaN.
    all_station_counts = from_counts.add(to_counts, fill_value=0).sort_values(ascending=False)
    return from_counts, to_counts, all_station_counts


def filter_routes_by_station(df, stations):
    """Return the rows of ``df`` whose origin OR destination is in ``stations``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'fromStnCode' and 'toStnCode'.
    stations : list
        Station codes to match against either column.
    """
    touches_station = df['fromStnCode'].isin(stations) | df['toStnCode'].isin(stations)
    return df[touches_station]


try:
    # Load the full dataset
    print(f"Loading full dataset from '{input_filename}'...")
    df_full = pd.read_csv(input_filename)
    print(f"Original size: {len(df_full)} rows")

    # Validate the schema BEFORE touching any column, so a missing station
    # column is reported by the explicit message below instead of surfacing
    # as a bare KeyError through the generic handler.
    missing_cols = find_missing_columns(df_full, columns_to_keep)
    if missing_cols:
        print(f"Error: The original CSV is missing required columns: {missing_cols}")
    else:
        # --- Find the most important stations ---
        from_counts, to_counts, all_station_counts = count_station_usage(df_full)
        important_stations = all_station_counts.head(TOP_N_STATIONS).index.tolist()
        print(f"\nIdentified the top {len(important_stations)} most important stations.")

        # Keep rows where EITHER the origin OR the destination is in the
        # top-station list.
        df_filtered = filter_routes_by_station(df_full, important_stations)
        print(f"New expanded size: {len(df_filtered)} rows") # This will be much larger

        # --- Select only the columns we need ---
        df_final = df_filtered[columns_to_keep]

        # --- Save the new, final file ---
        df_final.to_csv(output_filename, index=False)
        print(f"\nSuccessfully created '{output_filename}'.")
        print("This file contains the new, expanded feature set for our model.")

except FileNotFoundError:
    print(f"Error: '{input_filename}' not found.")
except Exception as e:
    # Deliberate best-effort: report the problem and let the notebook continue.
    print(f"An error occurred: {e}")

Loading full dataset from 'Railway_price_data.csv'...
Original size: 326643 rows

Identified the top 125 most important stations.
New expanded size: 180839 rows

Successfully created 'passenger_train_data_expanded.csv'.
This file contains the new, expanded feature set for our model.
