In [1]:
# This notebook floors the start times in the HH time matrix to whole seconds, so that every word falls within a one-second bin.
# The maximum word duration in this story is 1 s, so this one-second binning should be adequate for the HH story.

# By Frank Hu, 2025-09

In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# File locations used throughout the notebook (relative to the notebook's folder)
# Raw timing matrix that gets loaded
input_file = "../Example_data/timing_matrix/HH_time_matrix.csv"
# Timing matrix with the floored start-time column added
output_file = "../Example_data/timing_matrix/HH_time_matrix_round.csv"
# Anchor-shifted timing matrix
output_file_shift = "../Example_data/timing_matrix/HH_time_matrix_round-shift.csv"

In [3]:
# Load the raw timing matrix
df = pd.read_csv(input_file)

# Keep only the columns needed here; .copy() gives an independent frame so the
# column added below does not write into a slice of `df`
new_df = df[['Section', 'Text', 'Start_time_adjusted']].copy()

# Floor Start_time_adjusted down to the whole second.
# np.floor works on the whole column at once (no per-row Python call);
# the cast keeps the new column integer-typed, as math.floor did.
new_df['Start_time_adjusted_round'] = np.floor(new_df['Start_time_adjusted']).astype('int64')

# Show a (truncated) preview of the result
print(new_df)

# Save the matrix with the floored column to a new CSV file
new_df.to_csv(output_file, index=False)

      Section          Text  Start_time_adjusted  Start_time_adjusted_round
0           1   when                        0.00                          0
1           1   henry                       0.15                          0
2           1   left                        0.42                          0
3           1   the                         0.76                          0
4           1   gym                         0.83                          0
...       ...           ...                  ...                        ...
1952       12   the                       835.44                        835
1953       12   back                      835.54                        835
1954       12   of                        835.86                        835
1955       12   the                       835.98                        835
1956       12   bus                       836.07                        836

[1957 rows x 4 columns]


In [None]:
# Additional step: make shifted time matrix based on the time matrix
def make_v2_anchor_deltas_no_section(
    df: pd.DataFrame,
    n_shifts: int = 25,
    time_col: str = "Start_time_adjusted",
) -> pd.DataFrame:
    """
    Build anchor-based time deltas over the whole table, with NO sectioning.

    Result columns:
      ID            = row position 0..n-1
      Text          = df["Text"] cast to str
      Time_shift_0  = df[time_col]
      For k >= 1, anchored at the global k-th row (0-indexed):
        Time_shift_k[i] = df[time_col][i] - df[time_col][k]   (i >= k)
                        = NaN                                  (i <  k)

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a "Text" column and the column named by `time_col`.
    n_shifts : int, default 25
        Number of shifted columns requested. Clamped to len(df) - 1 so that
        no anchor lies beyond the last row.
    time_col : str, default "Start_time_adjusted"
        Name of the column holding the times the deltas are computed from.

    Returns
    -------
    pd.DataFrame
        A new frame with the columns described above; `df` is not modified.

    Raises
    ------
    ValueError
        If "Text" or `time_col` is missing from `df`.
    """
    required = {"Text", time_col}
    if missing := (required - set(df.columns)):
        raise ValueError(f"Missing required columns: {missing}")

    n = len(df)
    starts = df[time_col].to_numpy(dtype=float)

    n_shifts = min(n_shifts, max(0, n - 1))  # avoid anchors beyond last row

    # Collect every column first and build the frame once: inserting columns
    # into an existing DataFrame one by one fragments it and makes pandas emit
    # a PerformanceWarning when many shifts are requested.
    columns = {
        "ID": np.arange(n),
        "Text": df["Text"].astype(str).to_numpy(),
        "Time_shift_0": starts,
    }
    for k in range(1, n_shifts + 1):
        col = np.full(n, np.nan, dtype=float)  # rows before the anchor stay NaN
        col[k:] = starts[k:] - starts[k]       # delta relative to anchor row k
        columns[f"Time_shift_{k}"] = col
    return pd.DataFrame(columns)



In [None]:
# Example usage: reload the rounded matrix written to `output_file`, build 25
# anchor-shifted delta columns, keep two decimals, and save the result.
# NOTE(review): the deltas are computed from `Start_time_adjusted`, not from
# `Start_time_adjusted_round`, even though the output file is named
# "round-shift" — confirm this is intended.
round_df = pd.read_csv(output_file)
v2 = make_v2_anchor_deltas_no_section(round_df, n_shifts=25).round(2)
v2.to_csv(output_file_shift, index=False)
