In [None]:
import pandas as pd
import requests
from datetime import datetime, timedelta
import json
import time
from tqdm import tqdm
import urllib3

# Manual download

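We only need the disruptions that involve the ASD station (Amsterdam Centraal), so the manually downloaded CSV files are filtered on that station code below.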

In [None]:
import os

In [None]:
def process_traffic_file(file_path):
    """
    Load one disruptions CSV and reduce it to the columns used for analysis.

    Steps:
      1. Read the CSV, skipping malformed rows (``on_bad_lines='skip'``).
      2. Clean ``duration_minutes``: strip every character except digits,
         ``.`` and ``-``, convert to a number, and replace values that are
         missing or cannot be parsed with 0.
      3. If a ``start_time`` column exists, parse it and derive
         ``start_time_date`` (the calendar date of each start time).
      4. Keep only the analysis columns that are actually present.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding whichever of ``rdt_station_codes``,
        ``cause_en``, ``cause_group``, ``start_time_date`` and
        ``duration_minutes`` exist for this file, in that order. Because it
        is a copy, callers may add columns to it without pandas warning
        about writing to a slice of another frame.

    Raises
    ------
    Errors raised by ``pd.read_csv`` (e.g. a missing file) and by
    ``pd.to_datetime`` (an unparseable ``start_time``) propagate unchanged.
    """
    df = pd.read_csv(file_path, on_bad_lines='skip')

    numeric_columns = ["duration_minutes"]

    for col in numeric_columns:
        if col in df.columns:
            # Going through str lets us strip units/labels such as "45 min";
            # NaN becomes the text "nan", which is stripped to "" and then
            # coerced back to NaN before being filled with 0.
            cleaned = df[col].astype(str).str.replace(r'[^\d.-]', '', regex=True)
            df[col] = pd.to_numeric(cleaned, errors='coerce').fillna(0)

    # Guard the date derivation like every other column access in this
    # function, so a file without start_time still yields its other columns.
    if "start_time" in df.columns:
        df["start_time"] = pd.to_datetime(df["start_time"])
        df["start_time_date"] = df["start_time"].dt.date

    # Select important columns for analysis
    important_columns = [
        "rdt_station_codes", "cause_en", "cause_group", "start_time_date", "duration_minutes"
    ]

    # Filter to only include columns that exist in the dataframe
    available_columns = [col for col in important_columns if col in df.columns]

    # Return a real copy rather than a selection tied to the full frame.
    return df[available_columns].copy()

In [None]:
# Folder holding the manually downloaded disruption CSV files.
directory_path = "../Data_Raw/Traffic/"

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the combined frame reproducible between runs and machines.
csv_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.csv'))

# Collect the per-file frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies all previously loaded rows every time.
frames = []
for file in csv_files:
    file_path = os.path.join(directory_path, file)
    print(f"Processing: {file}")
    # assign() builds a new frame, so the function's result is not mutated.
    df = process_traffic_file(file_path).assign(source_file=file)
    frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder contains no CSV files (same result the loop-based version produced).
combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Inspect the combined disruptions from all processed files.
combined_df

In [None]:
# Keep only disruptions whose station list contains the exact code "ASD".
# A plain substring search would also select longer codes that merely contain
# "ASD" (such as "ASDZ"); the word boundaries restrict the match to the
# stand-alone code. Rows with a missing station list are excluded (na=False).
filtered_df = combined_df[
    combined_df['rdt_station_codes'].str.contains(r'\bASD\b', regex=True, na=False)
]

In [None]:
# Inspect the rows kept by the 'ASD' station-code filter.
filtered_df

In [None]:
# Destination folder (relative to the notebook) and full path of the CSV
# that stores the station-filtered disruptions.
cleaned_data_path = "../../Data_Sources/Data_Cleaned/Traffic/"
output_file = os.path.join(cleaned_data_path, "disruptions_data_historical.csv")

# Ensure the folder is present before writing; an existing folder is fine.
os.makedirs(cleaned_data_path, exist_ok=True)

# Write the filtered rows without the DataFrame index column.
filtered_df.to_csv(output_file, index=False)

# Work with API

In [None]:
# # Disable SSL warnings for OVAPI
# urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# class TransportDisruptionCollector:
#     def __init__(self, ns_api_key=None):
#         # NS API configuration
#         self.ns_api_key = ns_api_key
#         if ns_api_key:
#             self.ns_headers = {
#                 'Ocp-Apim-Subscription-Key': self.ns_api_key,
#                 'Accept': 'application/json'
#             }
        
#         # OVAPI configuration - use HTTP to avoid SSL issues
#         self.ovapi_base = "http://v0.ovapi.nl"
        
#         # Amsterdam Central Station TPC codes
#         self.central_station_tpcs = ["30009501", "30009500"]
        
#         # Data storage
#         self.disruptions_data = []
        
#     def get_ns_disruptions(self, start_date, end_date):
#         """
#         Fetch disruptions from NS API for a date range.
#         """
#         current_date = datetime.now()
        
#         if end_date.year <= 2024:
#             # Use rijdendetreinen.nl open data for historical data
#             print("Getting historical NS disruption data references...")
#             return self.get_historical_ns_references(start_date, end_date)
#         else:
#             # Use NS API for current/future data
#             print("Fetching current NS disruption data from NS API...")
#             return self.get_current_ns_disruptions()
    
#     def get_historical_ns_references(self, start_date, end_date):
#         """
#         Return references to historical NS disruption data files.
#         """
#         historical_references = []
        
#         for year in range(start_date.year, end_date.year + 1):
#             if year <= 2023:
#                 url = f"https://www.rijdendetreinen.nl/en/open-data/disruptions/{year}"
#                 historical_references.append({
#                     'type': 'NS_HISTORICAL_REFERENCE',
#                     'year': year,
#                     'source': 'rijdendetreinen.nl',
#                     'url': url,
#                     'description': f"Download CSV file for {year} disruptions",
#                     'note': 'Manual download required'
#                 })
#             else:
#                 print(f"Historical data for {year} not yet available")
        
#         return historical_references
    
#     def load_ns_historical_csv(self, csv_path):
#         """
#         Load NS historical disruption data from downloaded CSV file.
#         """
#         try:
#             df = pd.read_csv(csv_path)
#             historical_data = []
            
#             for _, row in df.iterrows():
#                 historical_data.append({
#                     'type': 'NS_HISTORICAL',
#                     'source': 'rijdendetreinen.nl',
#                     'disruption_id': row.get('id'),
#                     'start_time': row.get('start_time'),
#                     'end_time': row.get('end_time'),
#                     'station': row.get('stations'),
#                     'cause': row.get('cause'),
#                     'description': row.get('title'),
#                     'statistical_cause': row.get('statistical_cause'),
#                     'cause_group': row.get('cause_group')
#                 })
            
#             return historical_data
            
#         except Exception as e:
#             print(f"Error loading historical CSV: {e}")
#             return []
    
#     def get_current_ns_disruptions(self):
#         """
#         Fetch current NS disruptions from the API.
#         """
#         if not self.ns_api_key:
#             print("No NS API key provided. Cannot fetch current disruptions.")
#             return []
            
#         url = "https://gateway.apiportal.ns.nl/reisinformatie-api/api/v3/disruptions"
        
#         try:
#             response = requests.get(url, headers=self.ns_headers)
#             response.raise_for_status()
#             data = response.json()
            
#             current_disruptions = []
            
#             # Check if data is a list or dictionary with 'payload'
#             if isinstance(data, list):
#                 disruptions_list = data
#             elif isinstance(data, dict) and 'payload' in data:
#                 disruptions_list = data.get('payload', [])
#             else:
#                 print(f"Unexpected NS API response format")
#                 return []
            
#             for disruption in disruptions_list:
#                 current_disruptions.append({
#                     'type': 'NS_REALTIME',
#                     'source': 'NS_API',
#                     'disruption_id': disruption.get('id'),
#                     'start_time': disruption.get('start'),
#                     'end_time': disruption.get('end'),
#                     'station': disruption.get('location', {}).get('name'),
#                     'cause': disruption.get('cause'),
#                     'description': disruption.get('title')
#                 })
            
#             return current_disruptions
            
#         except requests.RequestException as e:
#             print(f"Error fetching NS disruptions: {e}")
#             return []
    
#     def get_gvb_disruptions(self):
#         """
#         Fetch GVB (Amsterdam public transport) disruptions from OVAPI.
#         """
#         disruptions = []
        
#         for stop_code in self.central_station_tpcs:
#             url = f"{self.ovapi_base}/tpc/{stop_code}"
            
#             try:
#                 response = requests.get(url)
#                 response.raise_for_status()
#                 data = response.json()
                
#                 # Navigate through the correct data structure
#                 for area_code, area_data in data.items():
#                     for tpc, tpc_data in area_data.items():
#                         stop_info = tpc_data.get('Stop', {})
                        
#                         if 'Passes' in tpc_data:
#                             for pass_id, pass_data in tpc_data['Passes'].items():
#                                 # Check for disruptions in the pass data
                                
#                                 # Check if journey is cancelled
#                                 if pass_data.get('TripStopStatus') == 'CANCEL':
#                                     disruptions.append({
#                                         'type': 'GVB',
#                                         'source': 'OVAPI',
#                                         'stop_code': stop_code,
#                                         'stop_name': stop_info.get('TimingPointName', 'Amsterdam Centraal'),
#                                         'line': pass_data.get('LinePublicNumber'),
#                                         'status': 'CANCELLED',
#                                         'expected_departure': pass_data.get('ExpectedDepartureTime'),
#                                         'target_departure': pass_data.get('TargetDepartureTime'),
#                                         'destination': pass_data.get('DestinationName50'),
#                                         'timestamp': datetime.now().isoformat()
#                                     })
                                
#                                 # Check for delays
#                                 expected_time = pass_data.get('ExpectedDepartureTime')
#                                 target_time = pass_data.get('TargetDepartureTime')
                                
#                                 if expected_time and target_time:
#                                     try:
#                                         expected_dt = datetime.fromisoformat(expected_time.replace('T', ' '))
#                                         target_dt = datetime.fromisoformat(target_time.replace('T', ' '))
#                                         delay_minutes = (expected_dt - target_dt).total_seconds() / 60
                                        
#                                         if delay_minutes > 5:  # Consider significant delays
#                                             disruptions.append({
#                                                 'type': 'GVB',
#                                                 'source': 'OVAPI',
#                                                 'stop_code': stop_code,
#                                                 'stop_name': stop_info.get('TimingPointName', 'Amsterdam Centraal'),
#                                                 'line': pass_data.get('LinePublicNumber'),
#                                                 'status': 'DELAYED',
#                                                 'delay_minutes': delay_minutes,
#                                                 'expected_departure': expected_time,
#                                                 'target_departure': target_time,
#                                                 'destination': pass_data.get('DestinationName50'),
#                                                 'timestamp': datetime.now().isoformat()
#                                             })
#                                     except ValueError:
#                                         print(f"Error parsing datetime for {pass_id}")
                        
#                         # Check for general messages (disruptions)
#                         if 'GeneralMessages' in tpc_data and tpc_data['GeneralMessages']:
#                             for msg_id, message in tpc_data['GeneralMessages'].items():
#                                 disruptions.append({
#                                     'type': 'GVB',
#                                     'source': 'OVAPI',
#                                     'stop_code': stop_code,
#                                     'stop_name': stop_info.get('TimingPointName', 'Amsterdam Centraal'),
#                                     'line': None,
#                                     'status': 'GENERAL_MESSAGE',
#                                     'message': message,
#                                     'timestamp': datetime.now().isoformat()
#                                 })
                
#             except requests.RequestException as e:
#                 print(f"Error fetching GVB data for stop {stop_code}: {e}")
        
#         return disruptions
    
#     def collect_current_data(self):
#         """
#         Collect current disruption data from both NS and GVB.
#         """
#         print("Collecting current data for transport disruptions...")
        
#         # Collect NS disruptions
#         ns_disruptions = self.get_current_ns_disruptions()
#         self.disruptions_data.extend(ns_disruptions)
        
#         # Collect GVB disruptions
#         gvb_disruptions = self.get_gvb_disruptions()
#         self.disruptions_data.extend(gvb_disruptions)
        
#         print(f"Collected {len(self.disruptions_data)} disruption records")
        
#         return self.create_dataframe()
    
#     def collect_historical_data(self, start_date, end_date):
#         """
#         Collect historical disruption data references.
#         """
#         print("Collecting historical data references...")
        
#         # Get NS historical references
#         ns_references = self.get_historical_ns_references(start_date, end_date)
#         self.disruptions_data.extend(ns_references)
        
#         print(f"Collected {len(self.disruptions_data)} records")
        
#         return self.create_dataframe()
    
#     def create_dataframe(self):
#         """
#         Convert collected data to a pandas DataFrame with proper error handling.
#         """
#         if not self.disruptions_data:
#             return pd.DataFrame()
        
#         df = pd.DataFrame(self.disruptions_data)
        
#         # Rename columns for consistency
#         column_mapping = {
#             'start_time': 'start_datetime',
#             'end_time': 'end_datetime',
#             'expected_departure': 'start_datetime',
#             'target_departure': 'planned_departure'
#         }
        
#         for old_col, new_col in column_mapping.items():
#             if old_col in df.columns and new_col not in df.columns:
#                 df = df.rename(columns={old_col: new_col})
        
#         # Process datetime columns more carefully
#         datetime_columns = ['start_datetime', 'end_datetime', 'planned_departure']
        
#         for col in datetime_columns:
#             if col in df.columns:
#                 # First replace None/NaN values
#                 df[col] = df[col].fillna('')
                
#                 # Convert to string and handle special cases
#                 df[col] = df[col].astype(str)
#                 df[col] = df[col].replace(['None', 'nan', 'NaT', ''], '')
                
#                 # Try to convert to datetime
#                 try:
#                     df[col] = pd.to_datetime(df[col], errors='coerce')
#                 except Exception as e:
#                     print(f"Error converting {col} to datetime: {e}")
#                     continue
                
#                 # Double-check if conversion was successful
#                 if not pd.api.types.is_datetime64_any_dtype(df[col]):
#                     print(f"Warning: {col} is not datetime type after conversion")
#                     # Create empty columns for derived features
#                     df['date'] = None
#                     df['hour'] = None
#                     df['day_of_week'] = None
#                     df['day_name'] = None
#                     continue
        
#         # Add derived columns only if we have valid datetime data
#         if 'start_datetime' in df.columns:
#             if pd.api.types.is_datetime64_any_dtype(df['start_datetime']) and df['start_datetime'].notna().any():
#                 # Extract date components safely
#                 try:
#                     df['date'] = df['start_datetime'].dt.date
#                     df['hour'] = df['start_datetime'].dt.hour
#                     df['day_of_week'] = df['start_datetime'].dt.dayofweek
#                     df['day_name'] = df['start_datetime'].dt.day_name()
#                 except Exception as e:
#                     print(f"Error creating derived date columns: {e}")
#             else:
#                 # Create empty columns if datetime conversion failed
#                 df['date'] = None
#                 df['hour'] = None
#                 df['day_of_week'] = None
#                 df['day_name'] = None
        
#         # Fill in missing values for key columns
#         for col in ['status', 'cause', 'description']:
#             if col in df.columns:
#                 df[col] = df[col].fillna('Unknown')
        
#         return df
    
#     def save_disruptions(self, df, filepath):
#         """
#         Save disruptions to CSV file.
#         """
#         df.to_csv(filepath, index=False)
#         print(f"Data saved to {filepath}")
    
#     def generate_summary_report(self, df):
#         """
#         Generate a summary report of the disruptions.
#         """
#         if len(df) == 0:
#             print("No data to summarize")
#             return
        
#         print("\n" + "="*50)
#         print("TRANSPORT DISRUPTION SUMMARY REPORT")
#         print("="*50)
        
#         # Basic statistics
#         print(f"\nTotal disruptions: {len(df)}")
        
#         # By type
#         if 'type' in df.columns:
#             print("\nDisruptions by type:")
#             type_counts = df['type'].value_counts()
#             for dtype, count in type_counts.items():
#                 print(f"  {dtype}: {count}")
        
#         # By status
#         if 'status' in df.columns:
#             print("\nDisruptions by status:")
#             status_counts = df['status'].value_counts()
#             for status, count in status_counts.items():
#                 print(f"  {status}: {count}")
        
#         # For GVB disruptions, show line statistics
#         gvb_data = df[df['type'] == 'GVB'] if 'type' in df.columns else pd.DataFrame()
#         if not gvb_data.empty and 'line' in gvb_data.columns:
#             print("\nGVB disruptions by line:")
#             line_counts = gvb_data['line'].value_counts()
#             for line, count in line_counts.items():
#                 if pd.notna(line):
#                     print(f"  Line {line}: {count}")
        
#         # Time analysis
#         if 'date' in df.columns and df['date'].notna().any():
#             print("\nDisruptions by date:")
#             date_counts = df['date'].value_counts().sort_index()
#             for date, count in date_counts.items():
#                 print(f"  {date}: {count}")
        
#         if 'hour' in df.columns and df['hour'].notna().any():
#             print("\nDisruptions by hour of day:")
#             hour_counts = df['hour'].value_counts().sort_index()
#             for hour, count in hour_counts.items():
#                 print(f"  {hour:02d}:00: {count}")
        
#         if 'day_name' in df.columns and df['day_name'].notna().any():
#             print("\nDisruptions by day of week:")
#             day_counts = df['day_name'].value_counts()
#             for day, count in day_counts.items():
#                 print(f"  {day}: {count}")
        
#         print("\n" + "="*50)

# # Example usage
# def main():
#     """
#     Main function to demonstrate usage of the TransportDisruptionCollector
#     """
#     # Initialize collector; the NS API key is read from the NS_API_KEY environment variable (never hard-code credentials)
#     collector = TransportDisruptionCollector(ns_api_key=os.environ.get("NS_API_KEY"))
    
#     # Option 1: Collect current disruption data
#     print("=== Collecting Current Disruptions ===")
#     current_df = collector.collect_current_data()
    
#     # Generate summary report
#     collector.generate_summary_report(current_df)
    
#     # Save to CSV
#     current_filepath = "nemo_transport_disruptions_current.csv"
#     collector.save_disruptions(current_df, current_filepath)
    
#     # Option 2: Get historical data references
#     print("\n=== Getting Historical Data References ===")
#     start_date = datetime(2022, 1, 1)
#     end_date = datetime(2023, 12, 31)
    
#     historical_df = collector.collect_historical_data(start_date, end_date)
    
#     # Generate summary for historical references
#     collector.generate_summary_report(historical_df)
    
#     # Save historical references
#     historical_filepath = "nemo_transport_disruptions_references.csv"
#     collector.save_disruptions(historical_df, historical_filepath)
    
#     # Option 3: Load a specific historical CSV (example)
#     # Uncomment and modify path when you have downloaded historical data
#     # historical_data = collector.load_ns_historical_csv("path/to/downloaded_file.csv")
#     # historical_df_loaded = pd.DataFrame(historical_data)
#     # collector.save_disruptions(historical_df_loaded, "nemo_historical_disruptions_2022.csv")
    
#     return current_df, historical_df

# if __name__ == "__main__":
#     current_data, historical_data = main()