In [1]:
import pandas as pd
import networkx as nx

In [None]:
# Relative path to the SBB CSV file that is loaded into a DataFrame below.
sbb_data = "datasets/switzerland/sbb-linie-mit-betriebspunkten.csv"

In [None]:
# Load the semicolon-separated CSV into a DataFrame.
df = pd.read_csv(sbb_data, sep=";")

# Quick look at the schema and the first few records, one after the other.
print(df.columns, df.head(), sep="\n")

Index(['Station abbreviation', 'Stop name', 'Line', 'KM', 'Line.1', 'Geopos',
       'Didok number', 'OPUIC', 'Stop name.1', 'lod', 'sloid'],
      dtype='object')
  Station abbreviation           Stop name  Line         KM  \
0                  ABO   Aarburg-Oftringen   500   43.00505   
1                 ABOW   Aarburg-Oftringen   451   43.97183   
2                   AD              Aadorf   850  122.89829   
3                 AESP              Aespli   400   12.27812   
4                   AF  Affoltern am Albis   711   24.84132   

                                 Line.1  \
0            Basel SBB - Olten - Luzern   
1  Aarburg-Oftringen - Rothrist Gleis 1   
2           St.Gallen - Winterthur Nord   
3   Lochligut - Wanzwil - Rothrist West   
4           ZH Hardbrucke - Kollermuhle   

                                  Geopos  Didok number    OPUIC  \
0  47.320268469495055, 7.908222606719322          2000  8502000   
1    47.3136469221806, 7.906871810514809          8141  8508141 

In [8]:
# Columns of `df` that drive the graph construction.
LINE_COL = "Line"
STATION_COL = "Station abbreviation"
ORDER_COL = "KM"

# Order the rows by line, then by the KM value within each line, so that
# neighbouring rows of one line are neighbouring stops.
df_sorted = df.sort_values([LINE_COL, ORDER_COL])

# Undirected infrastructure graph: nodes are station abbreviations, and every
# edge carries a "lines" attribute holding the set of line ids that use it.
G = nx.Graph()

for line_id, line_rows in df_sorted.groupby(LINE_COL):
    ordered_stops = line_rows[STATION_COL].to_list()

    # Walk over each pair of consecutive stops on this line.
    for here, there in zip(ordered_stops, ordered_stops[1:]):
        if not G.has_edge(here, there):
            # First line to use this link: create the edge.
            G.add_edge(here, there, lines={line_id})
            continue

        # Link already present: just record the additional line on it.
        G.edges[here, there].setdefault("lines", set()).add(line_id)


In [10]:
# Basic size statistics of the graph, printed one per line.
graph_stats = {
    "Nodes:": G.number_of_nodes(),
    "Edges:": G.number_of_edges(),
    "Connected components:": nx.number_connected_components(G),
}
for label, value in graph_stats.items():
    print(label, value)


Nodes: 1354
Edges: 1512
Connected components: 9


In [13]:
# Materialise the connected components of G (each one is a set of node ids).
components = list(nx.connected_components(G))

# Collect one record per component with its node and edge counts.
rows = []
for member_nodes in components:
    component_graph = G.subgraph(member_nodes)
    rows.append(
        dict(
            num_nodes=component_graph.number_of_nodes(),
            num_edges=component_graph.number_of_edges(),
        )
    )

# Largest component first, with a fresh 0..n-1 index.
cc_df = (
    pd.DataFrame(rows)
    .sort_values("num_nodes", ascending=False)
    .reset_index(drop=True)
)

# Leading "rank" column: 1 for the largest component, 2 for the next, ...
cc_df.insert(0, "rank", cc_df.index + 1)

In [14]:
# Print the component-size table (one row per connected component, largest first).
print(cc_df)

   rank  num_nodes  num_edges
0     1       1338       1504
1     2          2          1
2     3          2          1
3     4          2          1
4     5          2          1
5     6          2          1
6     7          2          1
7     8          2          1
8     9          2          1


In [18]:
# Order the components from largest to smallest. sorted() is stable, so
# components of equal size keep the order they have in `components`.
components_by_size = sorted(components, key=len, reverse=True)

# Every component except the largest one (which sits at index 0).
small_components = components_by_size[1:]

# rank -> node set. Rank 1 is the largest component, so small ones start at 2.
small_by_rank = dict(enumerate(small_components, start=2))

# --- 1) Per-station table (one row per station of every small component) ---

# First row of `df` for each station, indexed by abbreviation. Built once so
# the loop below is a label lookup instead of a full scan of `df` per station.
station_info = df.drop_duplicates(subset=STATION_COL).set_index(STATION_COL)

rows = []  # IMPORTANT: reset rows so nothing old leaks in

for rank, nodes in small_by_rank.items():
    # sort the node ids to have a stable order
    for abbr in sorted(nodes):
        row = station_info.loc[abbr]

        rows.append({
            "component_rank": rank,
            "station_abbreviation": abbr,
            "stop_name": row["Stop name"],
            "didok": row["Didok number"],
            "line": row[LINE_COL],
            "km": row[ORDER_COL],
        })

# Explicit columns keep the frame well-formed (and sortable) even when there
# are no small components at all, i.e. when `rows` is empty.
small_cc_df = (
    pd.DataFrame(
        rows,
        columns=[
            "component_rank",
            "station_abbreviation",
            "stop_name",
            "didok",
            "line",
            "km",
        ],
    )
    .sort_values(["component_rank", "km"])
    .reset_index(drop=True)
)

# --- 2) Aggregate to one row per small component ---

pair_rows = []
for comp_rank, group in small_cc_df.groupby("component_rank"):
    group = group.sort_values("km")
    # station1 / station2 are the two stations with the lowest "km" values.
    a, b = group.iloc[0], group.iloc[1]

    # Measure the component instead of assuming it is a 2-node / 1-edge pair,
    # so larger small components are reported with their real size.
    member_nodes = small_by_rank[int(comp_rank)]

    pair_rows.append({
        "component_rank": comp_rank,
        "num_nodes": len(member_nodes),
        "num_edges": G.subgraph(member_nodes).number_of_edges(),
        "station1_abbr": a["station_abbreviation"],
        "station1_name": a["stop_name"],
        "station2_abbr": b["station_abbreviation"],
        "station2_name": b["stop_name"],
    })

small_pairs_df = (
    pd.DataFrame(
        pair_rows,
        columns=[
            "component_rank",
            "num_nodes",
            "num_edges",
            "station1_abbr",
            "station1_name",
            "station2_abbr",
            "station2_name",
        ],
    )
    .sort_values("component_rank")
    .reset_index(drop=True)
)

In [19]:
# Print the summary table of the small components (one row per component).
print(small_pairs_df)

   component_rank  num_nodes  num_edges station1_abbr  \
0               2          2          1          ASZW   
1               3          2          1          ASKO   
2               4          2          1          SEZU   
3               5          2          1          FACO   
4               6          2          1          BOCS   
5               7          2          1          SIFO   
6               8          2          1          BOZS   
7               9          2          1          LNTO   

                   station1_name station2_abbr                   station2_name  
0     Amsteg Zugangsstollen West          ASZO       Amsteg Zugangsstollen Ost  
1        Amsteg Kabelstollen Ost          ASSW        Amsteg Kabelstollen West  
2          Sedrun Zugangsstollen          SETS       Sedrun Entluftungsstollen  
3   Faido cunicolo di acc. ovest          FACE      Faido cunicolo di acc. est  
4  Bodio cunicolo di aggira. sud          BOAN  Bodio cunicolo di aggira. nord  


In [20]:
# Recompute the connected components and pick the one with the most nodes.
components = list(nx.connected_components(G))
largest_nodes = max(components, key=len)

# Keep only the rows of df whose station belongs to that largest component.
in_largest = df["Station abbreviation"].isin(largest_nodes)
df_big = df.loc[in_largest].copy()

# A station can occur in several rows, so report rows and distinct stations.
n_rows = len(df_big)
n_stations = df_big["Station abbreviation"].nunique()
print(f"Rows in big component: {n_rows}")
print(f"Unique nodes in big component: {n_stations}")

Rows in big component: 1865
Unique nodes in big component: 1338
