In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

In [2]:
def show_scrollable(df, max_rows=20, height=400):
    html = df.head(max_rows).to_html(notebook=True)
    display(HTML(f'<div style="overflow:auto; height:{height}px">{html}</div>'))

In [3]:
df_session = pd.read_csv('Data\TuSession_Nicola.csv', encoding='latin1')
df_Delture = pd.read_csv('Data\TuDelture_Nicola.csv', encoding='latin1')
df_Ture = pd.read_csv('Data\TuTure_Nicola.csv', encoding='latin1')
df_AdresseLog = pd.read_csv('Data\TuAdresseLog_Nicola.csv', encoding='latin1')

In [4]:
after_session = pd.read_csv('Data\Efterkodsession_Nicola.csv', encoding='latin1')
after_Delture = pd.read_csv('Data\EfterkodDeltur_Nicola.csv', encoding='latin1')
after_Ture = pd.read_csv('Data\EfterkodTur_Nicola.csv', encoding='latin1')

  after_Delture = pd.read_csv('Data\EfterkodDeltur_Nicola.csv', encoding='latin1')


In [5]:
cols_to_int_delture = ['transportmiddel','forer', 'totkm','tidtur']
for col in cols_to_int_delture:
    if col in df_Delture.columns:
        df_Delture[col] = pd.to_numeric(df_Delture[col], errors='coerce').round().astype('Int64')

In [6]:
cols_to_int = ['stagemode','stagedrivpass','stagelength','stagedurationmin']
for col in cols_to_int:
    if col in after_Delture.columns:
        after_Delture[col] = pd.to_numeric(after_Delture[col], errors='coerce').round().astype('Int64')

In [7]:
df_Delture.rename(columns={'totkm':'stagelength'}, inplace=True)
df_Delture.rename(columns={'tidtur':'stagedurationmin'}, inplace=True)

In [8]:
# Merge on turid
d = df_Delture.merge(after_Delture, on="turid", suffixes=("_raw","_mod"), how="inner")
print("Merged shape:", d.shape)

Merged shape: (235764, 15)


In [9]:
from itertools import combinations
import re

raw_cols = [c for c in d.columns if c.endswith("_raw")]
mod_cols = [c for c in d.columns if c.endswith("_mod")]

base_raw = {re.sub(r"_raw$", "", c) for c in raw_cols}
base_mod = {re.sub(r"_mod$", "", c) for c in mod_cols}
compare_bases = sorted(base_raw.intersection(base_mod))   # e.g., ['delturnr','stagelength','stagedurationmin']

print("Compare bases:", compare_bases)

# (Optional) keys/identifiers to carry in outputs
key_cols = [c for c in ["turid", "delturnr_raw", "delturnr_mod"] if c in d.columns]

Compare bases: ['delturnr', 'stagedurationmin', 'stagelength']


In [10]:
delture_columns = "stagedurationmin", "stagelength", "delturnr"

In [11]:
diff_masks = {}
for c in delture_columns:
    left  = d[f"{c}_raw"]
    right = d[f"{c}_mod"]
    # "changed" means values are different AND modified is not NA
    changed = right.notna() & (~right.eq(left))
    diff_masks[c] = changed

# Trip-level flag
any_changed_delture = pd.DataFrame(diff_masks).any(axis=1)
d["flag"] = d["turid"].where(any_changed_delture, "OK").mask(any_changed_delture, "WRONG")

print("Total trips:", len(d))
print("Corrected trips:", (d["flag"]=="WRONG").sum())
print("% corrected:", (d["flag"]=="WRONG").mean()*100)
show_scrollable(d, max_rows=30)

Total trips: 235764
Corrected trips: 6640
% corrected: 2.81637569773163


Unnamed: 0,turid,delturnr_raw,transportmiddel,forer,stagelength_raw,stagedurationmin_raw,linie,delturnr_mod,stagemode,stagedrivpass,stagelength_mod,stagedurationmin_mod,route,fromstation,medtagdeltur,flag
0,2128335,1,11,1,34,30,,1,,,,,,,0,OK
1,2128336,1,11,1,34,30,,1,,,,,,,0,OK
2,2128337,1,11,2,26,30,,1,,,,,,,0,OK
3,2128338,1,11,2,1,3,,1,,,,,,,0,OK
4,2128342,1,2,0,4,15,,1,,,,,,,0,OK
5,2128343,1,2,0,4,15,,1,,,,,,,0,OK
6,2128346,1,2,0,5,20,,1,,,,,,,0,OK
7,2128347,1,2,0,5,20,,1,,,,,,,0,OK
8,2128348,1,2,0,2,8,,1,,,,,,,0,OK
9,2128349,1,2,0,2,8,,1,,,,,,,0,OK


In [12]:
# Build a boolean DataFrame: rows=trips, cols=fields, value=True if that field changed
diff_df_delture = pd.DataFrame(diff_masks)

# List of changed columns per row - handle NA values by replacing them with False
d["changed_cols"] = diff_df_delture.apply(lambda r: [c for c, v in r.items() if pd.notna(v) and v], axis=1)

# Count of changed fields per row
d["n_changed"] = diff_df_delture.sum(axis=1, skipna=True).astype(int)

# Keep only rows with at least one change for quick inspection
changed_rows_delture = d[d["n_changed"] > 0].copy()
print("Rows with >=1 change:", len(changed_rows_delture))
print(changed_rows_delture[["turid", "n_changed", "changed_cols"]].head(10))

Rows with >=1 change: 6640
       turid  n_changed                     changed_cols
47   2128398          1                    [stagelength]
74   2118093          2  [stagedurationmin, stagelength]
92   2118129          1                    [stagelength]
93   2118130          1                    [stagelength]
101  2118139          2  [stagedurationmin, stagelength]
244  2118318          1                    [stagelength]
245  2118319          1                    [stagelength]
246  2118320          1                    [stagelength]
247  2118321          1                    [stagelength]
248  2118322          1                    [stagelength]


In [13]:
records = []
for c in delture_columns:
    mask = diff_df_delture[c]            # strictly boolean
    if mask.any():
        sub = d.loc[mask, ["turid", f"{c}_raw", f"{c}_mod"]].copy()
        sub.columns = ["turid", "raw_value", "mod_value"]
        sub["column_affected"] = c
        records.append(sub)

diff_long_delture = (
    pd.concat(records, ignore_index=True)
    if records else pd.DataFrame(columns=["turid","raw_value","mod_value","column_affected"])
)

# Optional: sort for easier reading
diff_long_delture = diff_long_delture.sort_values(["turid","column_affected"]).reset_index(drop=True)

# Quick peek
print(diff_long_delture.head(10))
print("Total changed fields (rows):", len(diff_long_delture))
print("Trips with ≥1 change:", diff_long_delture["turid"].nunique())

     turid  raw_value  mod_value   column_affected
0  2118093         40         70  stagedurationmin
1  2118093         70        119       stagelength
2  2118129          0          1       stagelength
3  2118130          0          1       stagelength
4  2118139          9          5  stagedurationmin
5  2118139         16          2       stagelength
6  2118318          2          1       stagelength
7  2118319         13         16       stagelength
8  2118320         20         16       stagelength
9  2118321         24         32       stagelength
Total changed fields (rows): 7261
Trips with ≥1 change: 5447


In [19]:
d.to_csv('Data/TuDelture_Edited_Nicola.csv', index=False, encoding='latin1')

In [15]:
d.columns

Index(['turid', 'delturnr_raw', 'transportmiddel', 'forer', 'stagelength_raw',
       'stagedurationmin_raw', 'linie', 'delturnr_mod', 'stagemode',
       'stagedrivpass', 'stagelength_mod', 'stagedurationmin_mod', 'route',
       'fromstation', 'medtagdeltur', 'flag', 'changed_cols', 'n_changed'],
      dtype='object')

In [16]:
delture_edit = d.copy()  # Use merged dataframe for further analysis
delture_edit.drop(columns=['delturnr_raw', 'forer', 'linie', 'delturnr_mod', 'stagemode',
       'stagedrivpass', 'route',
       'fromstation', 'medtagdeltur', 'flag', 'changed_cols', 'n_changed'], inplace=True)

In [18]:
delture_edit.to_csv('Data/EfterkodDelture_Edited_Nicola.csv', index=False, encoding='latin1')