In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt

In [2]:
# Input file, given as a relative path: when running Jupyter locally the CSV
# must sit in the same folder as this notebook.
CSV_PATH = 'geant-flat-tms.csv'
# Number of rows per mini-batch handed to the DataLoader.
BATCH_SIZE = 64

print("Loading and Processing Data with Time Feature...")

Loading and Processing Data with Time Feature...


In [3]:
# 1. Load Data
# header=None: the first line of the file is treated as data rather than
# column names, so columns are addressed by integer position.
df = pd.read_csv(filepath_or_buffer=CSV_PATH, header=None)

In [4]:
print("Processing Traffic Data...")
# Every column after column 0 (the timestamp) is treated as a traffic value.
# Cells that cannot be parsed as numbers become NaN (errors='coerce') and are
# then replaced by 0 via np.nan_to_num.
traffic_raw = (
    df.iloc[:, 1:]
    .apply(pd.to_numeric, errors='coerce')
    .to_numpy(dtype=np.float32)
)
traffic_raw = np.nan_to_num(traffic_raw)

# Log-scale the traffic (log(1 + x)) to compress its very large dynamic range.
traffic_log = np.log1p(traffic_raw)

# Global extrema of the log-scaled data. These are also what is needed later
# to map network outputs back to real traffic values.
t_max = np.max(traffic_log)
t_min = np.min(traffic_log)

# Min-max scale to [-1, 1] for the neural network.
log_range = t_max - t_min
if log_range > 0:
    traffic_norm = (traffic_log - t_min) / log_range
    traffic_norm = (traffic_norm * 2) - 1
else:
    # Flat data (t_max == t_min): min-max scaling would divide by zero.
    # Use 0, the midpoint of [-1, 1], instead of passing the unscaled log
    # values through, which could lie outside the range the network expects.
    traffic_norm = np.zeros_like(traffic_log)

Processing Traffic Data...


In [12]:
# --- 3. Process Time Column (Column 0) ---
print("Processing Time Column...")


def parse_time_to_float(time_str):
    """Convert a dash-separated timestamp into fractional hours of the day.

    Only the last two dash-separated fields are used; they are read as the
    hour and the minute. Everything before them (the date) is ignored.

    Examples:
        "2005-01-01-00-30" -> 0.5   (00:30)
        "2005-01-01-01-45" -> 1.75  (01:45)

    Args:
        time_str: Timestamp such as "2005-01-01-00-30". Non-string input is
            converted with str() first.

    Returns:
        float: hour + minute / 60.

    Raises:
        IndexError: if the value contains fewer than two dash-separated fields.
        ValueError: if the hour or minute field is not a valid number.
    """
    # Split from the right: only the trailing hour and minute fields matter.
    fields = str(time_str).rsplit('-', 2)
    hours = float(fields[-2])
    minutes = float(fields[-1])
    return hours + minutes / 60.0

# Convert every timestamp in column 0 to fractional hours of the day.
time_floats = df.iloc[:, 0].map(parse_time_to_float).to_numpy(dtype=np.float32)

# Scale hour-of-day linearly: 0.0 h maps to -1 and 24.0 h would map to +1.
# Real timestamps are always earlier than 24:00, so values stay below +1.
# The result is reshaped to a single column so it can be stacked next to the
# traffic matrix.
time_norm = ((time_floats / 24.0) * 2 - 1).reshape(-1, 1)

# Spot-check one arbitrary row: the float should equal HH + MM / 60
# (e.g. "...-05-30" -> 5.5).
sample_row = 20
print("Sample Check:")
print(f"Original: {df.iloc[sample_row, 0]}")
print(f"Float:    {time_floats[sample_row]}")

Processing Time Column...
Sample Check:
Original: 2005-01-01-05-30
Float:    5.5


In [13]:
# --- 4. Merge Time + Traffic ---
# Join column-wise: column 0 is the normalized time, the remaining columns
# are the normalized traffic values.
data_combined = np.concatenate((time_norm, traffic_norm), axis=1)

print(f"Traffic Shape: {traffic_norm.shape}")
print(f"Time Shape:    {time_norm.shape}")
print(f"Merged Shape:  {data_combined.shape} (Input for WGAN)")

# --- 5. Create PyTorch Loader ---
tensor_x = torch.from_numpy(data_combined)
dataset = TensorDataset(tensor_x)
# drop_last=True discards a final incomplete batch, so every batch the
# training loop receives has exactly BATCH_SIZE rows.
dataloader = DataLoader(
    dataset=dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)

# --- 6. Verification ---
# Show the first row at each stage of the time conversion.
print("\n--- Verification: Row 0 ---")
print(f"Original Time String: {df.iloc[0, 0]}")
print(f"Converted Float:      {time_floats[0]}")
print(f"Normalized Time Input: {time_norm[0, 0]:.4f} (Should be between -1 and 1)")

# These extrema are only printed here, not persisted. They are required to
# invert the log/min-max scaling when converting generator output back to
# real traffic values.
print("\nNOTE: Save these values for the Generator later:")
print(f"Log Min: {t_min}")
print(f"Log Max: {t_max}")

Traffic Shape: (10772, 529)
Time Shape:    (10772, 1)
Merged Shape:  (10772, 530) (Input for WGAN)

--- Verification: Row 0 ---
Original Time String: 2005-01-01-00-30
Converted Float:      0.5
Normalized Time Input: -0.9583 (Should be between -1 and 1)

NOTE: Save these values for the Generator later:
Log Min: 0.0
Log Max: 22.94443130493164
