In [None]:
# Test case: the target date is outside the GTFS calendar; the script must find a matching weekday inside the GTFS.
import pandas as pd
from datetime import datetime, timedelta

# Locations of the GTFS input files (Windows-style paths under one folder)
folder = r'C:\Users\tiend\Downloads\GTFS_Hanoi'
calendar_path, trips_path, stop_times_path = (
    f"{folder}\\{file_name}"
    for file_name in ("calendar.txt", "trips.txt", "stop_times.txt")
)

# Target date (same value as the one configured in GAMA): Friday 4 July 2025
target_date = "2025-07-04"
target_datetime = datetime.strptime(target_date, "%Y-%m-%d")
# datetime.weekday(): 0 = Monday, ..., 4 = Friday, 6 = Sunday
target_weekday = target_datetime.weekday()

# 1. Reproduction de la logique GAMA : collecte de toutes les dates possibles
def collect_all_gtfs_dates():
    """Return every calendar day covered by at least one calendar.txt row.

    Reads the file at the module-level ``calendar_path`` and expands each
    row's inclusive ``start_date``..``end_date`` range (``YYYYMMDD`` strings)
    into individual ``datetime`` values.

    Returns:
        A sorted list of distinct ``datetime`` objects (midnight of each day).
    """
    service_days = set()
    calendar_df = pd.read_csv(calendar_path, dtype=str)
    for _, service in calendar_df.iterrows():
        first_day = datetime.strptime(service['start_date'], "%Y%m%d")
        last_day = datetime.strptime(service['end_date'], "%Y%m%d")
        # Inclusive range; a row whose end precedes its start contributes nothing.
        span_in_days = (last_day - first_day).days
        service_days.update(
            first_day + timedelta(days=offset) for offset in range(span_in_days + 1)
        )
    return sorted(service_days)

# 2. Recherche du premier jour avec le même weekday
def find_first_same_weekday(target_weekday, all_dates):
    """Return the first entry of ``all_dates`` falling on ``target_weekday``.

    Args:
        target_weekday: Weekday index as returned by ``datetime.weekday()``
            (0 = Monday, 6 = Sunday).
        all_dates: Iterable of date-like objects exposing ``weekday()``,
            scanned in the order given.

    Returns:
        The first matching date, or ``None`` when no entry matches.
    """
    matching_days = (day for day in all_dates if day.weekday() == target_weekday)
    return next(matching_days, None)

# 3. Apply the fallback logic: pick the first feed date that shares the
#    target's weekday.
all_gtfs_dates = collect_all_gtfs_dates()
fallback_date = find_first_same_weekday(target_weekday, all_gtfs_dates)
if fallback_date is None:
    # Stop with an explicit message (and a non-zero status) instead of
    # exiting silently through the site-provided exit() helper.
    raise SystemExit(
        f"Aucune date GTFS ne correspond au jour de la semaine de {target_date}"
    )

# 4. Services active on the fallback date: the date must lie inside the
#    service's validity range and the matching weekday flag must be '1'.
day_columns = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]
day_column = day_columns[fallback_date.weekday()]

calendar = pd.read_csv(calendar_path, dtype=str)
active_services = []
for _, row in calendar.iterrows():
    start_date = datetime.strptime(row['start_date'], "%Y%m%d")
    end_date = datetime.strptime(row['end_date'], "%Y%m%d")
    if start_date <= fallback_date <= end_date and row[day_column] == '1':
        active_services.append(row['service_id'])

# 5. Keep only the trips that belong to an active service
trips = pd.read_csv(trips_path, dtype=str)
active_trips = trips[trips['service_id'].isin(active_services)]
active_trip_ids = set(active_trips['trip_id'])

# 6. Keep only the stop_times rows of those trips
stop_times = pd.read_csv(stop_times_path, dtype=str)
active_stop_times = stop_times[stop_times['trip_id'].isin(active_trip_ids)]

# 7. Final indicators. The "first stop" mask is computed once and reused.
is_first_stop = active_stop_times['stop_sequence'].astype(int) == 1
first_stop_rows = active_stop_times[is_first_stop]
departure_stops = set(first_stop_rows['stop_id'])
trips_with_departure = set(first_stop_rows['trip_id'])
stops_total = active_stop_times[active_stop_times['trip_id'].isin(trips_with_departure)].shape[0]
unique_stops = set(active_stop_times['stop_id'])

# 8. Print the indicators only
print(f"1. Nombre de stops de départ (stop_sequence==1, distincts) : {len(departure_stops)}")
print(f"2. Nombre de trips associés à ces stops de départ : {len(trips_with_departure)}")
print(f"3. Nombre total de stops (pour ces trips) : {stops_total}")
print(f"4. Nombre d'arrêts uniques associés aux trips actifs : {len(unique_stops)}")


1. Nombre de stops de départ (stop_sequence==1, distincts) : 224
2. Nombre de trips associés à ces stops de départ : 6713
3. Nombre total de stops (pour ces trips) : 224166
4. Nombre d'arrêts uniques associés aux trips actifs : 7670


In [5]:
# Test case: the target date is inside the GTFS calendar; the script must find the active trips and the departure stops.
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict

def analyze_trip_difference(gtfs_folder, target_date):
    """
    Count the trips, stops and departure stops of a GTFS feed for one service day.

    The function is written to mirror the counting done by the GAML/Java side
    (per the original author's notes) so that both outputs can be compared:

    1. Pick the reference day: the earliest date covered by calendar.txt that
       has the same weekday as ``target_date``. Only the weekday of
       ``target_date`` is used; the date itself is never looked up.
    2. Collect the services valid on that day whose weekday flag is '1'.
    3. Keep the trips of those services and group their stop_times rows.
    4. Determine each trip's departure stop (the row with stop_sequence '1',
       otherwise the lowest stop_sequence) and drop trips that share the same
       departure stop, departure time and ordered stop list.
    5. Print four indicators and return them.

    Args:
        gtfs_folder: Directory holding calendar.txt, trips.txt, stop_times.txt.
        target_date: Date as a ``YYYY-MM-DD`` string.

    Returns:
        A dict with keys ``fallback_date``, ``total_trips``,
        ``nombre_arrets_crees``, ``nombre_stops_depart`` and
        ``arrets_uniques_departure``; or ``None`` when anything fails (the
        error and its traceback are printed instead of being raised).
    """
    import os

    # os.path.join instead of a hard-coded "\\" so the function also works
    # outside Windows.
    calendar_path = os.path.join(gtfs_folder, "calendar.txt")
    trips_path = os.path.join(gtfs_folder, "trips.txt")
    stop_times_path = os.path.join(gtfs_folder, "stop_times.txt")

    def find_column(df, *names):
        """Return the first column matching one of ``names`` (case/space-insensitive), else None."""
        for name in names:
            wanted = name.lower().strip()
            for col in df.columns:
                if col.lower().strip() == wanted:
                    return col
        return None

    def require_column(df, name):
        """Like find_column, but fail loudly when the column is absent."""
        col = find_column(df, name)
        if col is None:
            raise ValueError(f"Colonne '{name}' introuvable dans stop_times.txt")
        return col

    def to_seconds(hms):
        """Convert 'HH:MM:SS' to seconds (as a string); None when malformed."""
        try:
            hours, minutes, seconds = (int(part) for part in hms.split(':')[:3])
        except ValueError:
            # Non-numeric component or fewer than three components.
            return None
        return str(hours * 3600 + minutes * 60 + seconds)

    try:
        # Parse the target date
        target_datetime = datetime.strptime(target_date, "%Y-%m-%d")
        target_weekday = target_datetime.weekday()

        print(f"Date cible: {target_date} ({target_datetime.strftime('%A')})")
        print(f"Jour de la semaine cible: {target_weekday} (0=lundi, 6=dimanche)")

        # 1. REFERENCE ("FALLBACK") DATE
        # calendar.txt is parsed once; rows with unusable dates are skipped.
        calendar_df = pd.read_csv(calendar_path, dtype=str)
        service_periods = []
        for _, row in calendar_df.iterrows():
            try:
                start_date = datetime.strptime(row['start_date'], "%Y%m%d")
                end_date = datetime.strptime(row['end_date'], "%Y%m%d")
            except (KeyError, TypeError, ValueError):
                continue
            service_periods.append((start_date, end_date, row))

        all_gtfs_dates = set()
        for start_date, end_date, _ in service_periods:
            current_date = start_date
            while current_date <= end_date:
                all_gtfs_dates.add(current_date)
                current_date += timedelta(days=1)

        fallback_date = next(
            (day for day in sorted(all_gtfs_dates) if day.weekday() == target_weekday),
            None,
        )
        if fallback_date is None:
            raise ValueError(
                "Aucune date du calendrier GTFS ne correspond au jour de la semaine cible"
            )

        print(f"✅ Fallback trouvé: {fallback_date.strftime('%Y-%m-%d %A')}")

        # 2. ACTIVE SERVICES on the reference date
        day_columns = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]
        day_column = day_columns[fallback_date.weekday()]

        active_services = set()
        for start_date, end_date, row in service_periods:
            if not (start_date <= fallback_date <= end_date):
                continue
            if row.get(day_column) != '1':
                continue
            service_id = row.get('service_id')
            # A missing service_id is read as NaN (a float); skip such rows.
            if isinstance(service_id, str):
                active_services.add(service_id.strip())

        # 3. TRIPS OF THE ACTIVE SERVICES
        trips_df = pd.read_csv(trips_path, dtype=str)
        trips_df['service_id'] = trips_df['service_id'].str.strip()
        trips_df['trip_id'] = trips_df['trip_id'].str.strip()

        active_trip_ids = set(
            trips_df.loc[trips_df['service_id'].isin(active_services), 'trip_id']
        )

        # 4. STOP_TIMES
        stop_times_df = pd.read_csv(stop_times_path, dtype=str)

        trip_id_st_col = require_column(stop_times_df, "trip_id")
        stop_id_col = require_column(stop_times_df, "stop_id")
        departure_time_col = require_column(stop_times_df, "departure_time")
        stop_sequence_col = require_column(stop_times_df, "stop_sequence")

        stop_times_df[trip_id_st_col] = stop_times_df[trip_id_st_col].str.strip()
        stop_times_df[stop_id_col] = stop_times_df[stop_id_col].str.strip()

        # 5. PHASE 1: group the stop_times rows of active trips by trip.
        # Active trips all come from trips.txt, so no separate existence
        # check is needed.
        trips_with_stops = defaultdict(list)
        stop_time_rows = zip(
            stop_times_df[trip_id_st_col],
            stop_times_df[stop_id_col],
            stop_times_df[departure_time_col],
            stop_times_df[stop_sequence_col],
        )
        for raw_trip, raw_stop, raw_time, raw_sequence in stop_time_rows:
            trip_id = str(raw_trip).strip()
            if trip_id not in active_trip_ids:
                continue

            time_in_seconds = to_seconds(str(raw_time).strip())
            if time_in_seconds is None:
                # Unparseable times are kept and counted as 0 seconds.
                time_in_seconds = "0"

            stop_sequence = str(raw_sequence).strip()
            trips_with_stops[trip_id].append((
                int(stop_sequence) if stop_sequence.isdigit() else 0,
                str(raw_stop).strip(),
                time_in_seconds,
            ))

        # 6. DEPARTURE_TRIPS_INFO: per trip, (stop_id, seconds) pairs ordered
        # by stop_sequence (stable sort keeps file order on ties).
        departure_trips_info = {}
        for trip_id, stops_list in trips_with_stops.items():
            ordered_stops = sorted(stops_list, key=lambda stop: stop[0])
            departure_trips_info[trip_id] = [
                (stop_id, seconds) for _, stop_id, seconds in ordered_stops
            ]

        # 7. PHASE 2: departure stop of each trip = its stop_sequence '1' row.
        # Rows with an unparseable time are ignored; when a trip has several
        # such rows the last one wins.
        is_first_stop = stop_times_df[stop_sequence_col].astype(str).str.strip() == '1'
        is_active_trip = stop_times_df[trip_id_st_col].isin(active_trip_ids)
        departure_rows = stop_times_df[is_first_stop & is_active_trip]

        first_stop_by_trip = {}
        first_stop_rows = zip(
            departure_rows[trip_id_st_col],
            departure_rows[stop_id_col],
            departure_rows[departure_time_col],
        )
        for raw_trip, raw_stop, raw_time in first_stop_rows:
            trip_id = str(raw_trip).strip()
            if trip_id not in departure_trips_info:
                continue
            seconds = to_seconds(str(raw_time).strip())
            if seconds is not None:
                first_stop_by_trip[trip_id] = (str(raw_stop).strip(), seconds)

        # 8. STOP_TO_TRIP_IDS with de-duplication: two trips with the same
        # departure stop, departure time and ordered stop list count once.
        stop_to_trip_ids = defaultdict(list)
        seen_trip_signatures = set()

        for trip_id, stop_pairs in departure_trips_info.items():
            first_stop_id, departure_time = first_stop_by_trip.get(trip_id, (None, None))

            # No usable stop_sequence '1' row: use the lowest-sequence stop.
            if not first_stop_id:
                first_stop_id, departure_time = stop_pairs[0]

            stop_sequence_str = ";".join([pair[0] for pair in stop_pairs])
            signature = f"{first_stop_id}_{departure_time}_{stop_sequence_str};"

            if signature not in seen_trip_signatures:
                seen_trip_signatures.add(signature)
                stop_to_trip_ids[first_stop_id].append(trip_id)

        # 9. FINAL COUNTS
        # Trips that survived de-duplication
        trips_in_stop_to_trip = set()
        for trip_list in stop_to_trip_ids.values():
            trips_in_stop_to_trip.update(trip_list)

        # Every distinct stop_id of stop_times.txt, active or not
        nombre_arrets_crees = len(set(stop_times_df[stop_id_col]))

        # Distinct departure stops
        nombre_stops_depart = len(stop_to_trip_ids)

        # Distinct stops served by the active trips
        unique_stops_in_departure = {
            stop_id
            for stop_pairs in departure_trips_info.values()
            for stop_id, _ in stop_pairs
        }
        arrets_uniques_departure = len(unique_stops_in_departure)

        # Output: exactly these four lines, in the GAML format
        print(f"Nombre total de trips créés: {len(trips_in_stop_to_trip)}")
        print(f"Nombre d'arrêts créés: {nombre_arrets_crees}")
        print(f"Nombre de stops de départ (departureStopsInfo non null): {nombre_stops_depart}")
        print(f"Nombre d'arrêts uniques dans departureStopsInfo: {arrets_uniques_departure}")

        return {
            'fallback_date': fallback_date.strftime('%Y-%m-%d %A'),
            'total_trips': len(trips_in_stop_to_trip),
            'nombre_arrets_crees': nombre_arrets_crees,
            'nombre_stops_depart': nombre_stops_depart,
            'arrets_uniques_departure': arrets_uniques_departure
        }

    except Exception as e:
        # Top-level boundary of this notebook helper: report and return None.
        print(f"❌ Erreur: {e}")
        import traceback
        traceback.print_exc()
        return None

def main():
    """Run the trip analysis on the locally configured GTFS feed and date."""
    # 🔧 Edit the GTFS folder and the target date here
    run_settings = {
        "gtfs_folder": r"C:\Users\tiend\Downloads\gtfs-tan",
        "target_date": "2025-05-17",  # Saturday 17 May 2025
    }

    analyze_trip_difference(**run_settings)


if __name__ == "__main__":
    main()

Date cible: 2025-05-17 (Saturday)
Jour de la semaine cible: 5 (0=lundi, 6=dimanche)
✅ Fallback trouvé: 2025-05-17 Saturday
Nombre total de trips créés: 6471
Nombre d'arrêts créés: 2556
Nombre de stops de départ (departureStopsInfo non null): 117
Nombre d'arrêts uniques dans departureStopsInfo: 2225


In [4]:
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict

def test_cas3_tous_trips(gtfs_folder):
    """
    Count trips and stops of a GTFS feed when every trip is used (case 2).

    Case 2 is the scenario where no starting date is defined on the GAML
    side (per the original author's notes): there is no service/calendar
    filtering, and a stop_times row is kept as soon as its trip exists in
    trips.txt. Steps:

    1. Group the stop_times rows by trip.
    2. Determine each trip's departure stop (the row with stop_sequence '1',
       otherwise the lowest stop_sequence) and drop trips that share the same
       departure stop, departure time and ordered stop list.
    3. Print four indicators and return them.

    Despite its ``test_`` prefix this is a notebook helper, not a pytest test.

    Args:
        gtfs_folder: Directory holding trips.txt and stop_times.txt.

    Returns:
        A dict with keys ``total_trips_crees``, ``nombre_arrets_crees``,
        ``nombre_stops_depart`` and ``arrets_uniques_departure``; or ``None``
        when anything fails (the error and its traceback are printed instead
        of being raised).
    """
    import os

    # os.path.join instead of a hard-coded "\\" so the function also works
    # outside Windows.
    trips_path = os.path.join(gtfs_folder, "trips.txt")
    stop_times_path = os.path.join(gtfs_folder, "stop_times.txt")

    def find_column(df, *names):
        """Return the first column matching one of ``names`` (case/space-insensitive), else None."""
        for name in names:
            wanted = name.lower().strip()
            for col in df.columns:
                if col.lower().strip() == wanted:
                    return col
        return None

    def require_column(df, name):
        """Like find_column, but fail loudly when the column is absent."""
        col = find_column(df, name)
        if col is None:
            raise ValueError(f"Colonne '{name}' introuvable dans stop_times.txt")
        return col

    def to_seconds(hms):
        """Convert 'HH:MM:SS' to seconds (as a string); None when malformed."""
        try:
            hours, minutes, seconds = (int(part) for part in hms.split(':')[:3])
        except ValueError:
            # Non-numeric component or fewer than three components.
            return None
        return str(hours * 3600 + minutes * 60 + seconds)

    try:
        # 1. TRIPS.TXT: in case 2 every trip of the feed is considered active
        trips_df = pd.read_csv(trips_path, dtype=str)
        trips_df['trip_id'] = trips_df['trip_id'].str.strip()
        all_trip_ids = set(trips_df['trip_id'])

        # 2. STOP_TIMES.TXT
        stop_times_df = pd.read_csv(stop_times_path, dtype=str)

        trip_id_st_col = require_column(stop_times_df, "trip_id")
        stop_id_col = require_column(stop_times_df, "stop_id")
        departure_time_col = require_column(stop_times_df, "departure_time")
        stop_sequence_col = require_column(stop_times_df, "stop_sequence")

        stop_times_df[trip_id_st_col] = stop_times_df[trip_id_st_col].str.strip()
        stop_times_df[stop_id_col] = stop_times_df[stop_id_col].str.strip()

        # 3. Group the stop_times rows by trip; the only filter is that the
        # trip must exist in trips.txt.
        trips_with_stops = defaultdict(list)
        stop_time_rows = zip(
            stop_times_df[trip_id_st_col],
            stop_times_df[stop_id_col],
            stop_times_df[departure_time_col],
            stop_times_df[stop_sequence_col],
        )
        for raw_trip, raw_stop, raw_time, raw_sequence in stop_time_rows:
            trip_id = str(raw_trip).strip()
            if trip_id not in all_trip_ids:
                continue

            time_in_seconds = to_seconds(str(raw_time).strip())
            if time_in_seconds is None:
                # Unparseable times are kept and counted as 0 seconds.
                time_in_seconds = "0"

            stop_sequence = str(raw_sequence).strip()
            trips_with_stops[trip_id].append((
                int(stop_sequence) if stop_sequence.isdigit() else 0,
                str(raw_stop).strip(),
                time_in_seconds,
            ))

        # 4. DEPARTURE_TRIPS_INFO: per trip, (stop_id, seconds) pairs ordered
        # by stop_sequence (stable sort keeps file order on ties).
        departure_trips_info = {}
        for trip_id, stops_list in trips_with_stops.items():
            ordered_stops = sorted(stops_list, key=lambda stop: stop[0])
            departure_trips_info[trip_id] = [
                (stop_id, seconds) for _, stop_id, seconds in ordered_stops
            ]

        # 5. Departure stop of each trip = its stop_sequence '1' row (case 2:
        # again filtered only by existence in trips.txt). Rows with an
        # unparseable time are ignored; when a trip has several such rows the
        # last one wins.
        is_first_stop = stop_times_df[stop_sequence_col].astype(str).str.strip() == '1'
        is_known_trip = stop_times_df[trip_id_st_col].isin(all_trip_ids)
        departure_rows = stop_times_df[is_first_stop & is_known_trip]

        first_stop_by_trip = {}
        first_stop_rows = zip(
            departure_rows[trip_id_st_col],
            departure_rows[stop_id_col],
            departure_rows[departure_time_col],
        )
        for raw_trip, raw_stop, raw_time in first_stop_rows:
            trip_id = str(raw_trip).strip()
            if trip_id not in departure_trips_info:
                continue
            seconds = to_seconds(str(raw_time).strip())
            if seconds is not None:
                first_stop_by_trip[trip_id] = (str(raw_stop).strip(), seconds)

        # 6. STOP_TO_TRIP_IDS with de-duplication: two trips with the same
        # departure stop, departure time and ordered stop list count once.
        stop_to_trip_ids = defaultdict(list)
        seen_trip_signatures = set()

        for trip_id, stop_pairs in departure_trips_info.items():
            first_stop_id, departure_time = first_stop_by_trip.get(trip_id, (None, None))

            # No usable stop_sequence '1' row: use the lowest-sequence stop.
            if not first_stop_id:
                first_stop_id, departure_time = stop_pairs[0]

            stop_sequence_str = ";".join([pair[0] for pair in stop_pairs])
            signature = f"{first_stop_id}_{departure_time}_{stop_sequence_str};"

            if signature not in seen_trip_signatures:
                seen_trip_signatures.add(signature)
                stop_to_trip_ids[first_stop_id].append(trip_id)

        # 7. FINAL COUNTS (case 2)
        # Trips that survived de-duplication
        trips_in_stop_to_trip = set()
        for trip_list in stop_to_trip_ids.values():
            trips_in_stop_to_trip.update(trip_list)

        # Every distinct stop_id of stop_times.txt
        nombre_arrets_crees = len(set(stop_times_df[stop_id_col]))

        # Distinct departure stops
        nombre_stops_depart = len(stop_to_trip_ids)

        # Distinct stops served by the retained trips
        unique_stops_in_departure = {
            stop_id
            for stop_pairs in departure_trips_info.values()
            for stop_id, _ in stop_pairs
        }
        arrets_uniques_departure = len(unique_stops_in_departure)

        # Output: exactly these four lines, in the GAML format
        print(f"Nombre total de trips créés: {len(trips_in_stop_to_trip)}")
        print(f"Nombre d'arrêts créés: {nombre_arrets_crees}")
        print(f"Nombre de stops de départ (departureStopsInfo non null): {nombre_stops_depart}")
        print(f"Nombre d'arrêts uniques dans departureStopsInfo: {arrets_uniques_departure}")

        return {
            'total_trips_crees': len(trips_in_stop_to_trip),
            'nombre_arrets_crees': nombre_arrets_crees,
            'nombre_stops_depart': nombre_stops_depart,
            'arrets_uniques_departure': arrets_uniques_departure
        }

    except Exception as e:
        # Top-level boundary of this notebook helper: report and return None.
        print(f"❌ Erreur: {e}")
        import traceback
        traceback.print_exc()
        return None


# The name starts with "test_" but the function needs an explicit argument;
# opt out of pytest collection so it is not mistaken for a test case.
test_cas3_tous_trips.__test__ = False

def main():
    """Run the "all trips" scenario (case 2) on the locally configured GTFS feed."""
    # 🔧 Edit the GTFS folder here
    gtfs_dir = r"C:\Users\tiend\Downloads\Toulouse_GTFS"

    header_lines = (
        "🚀 TEST 2 - TOUS LES TRIPS UTILISÉS",
        "📍 GTFS Toulouse avec calendar_dates.txt",
        "Simulation de: starting_date NON DÉFINI dans GAML",
        "-" * 60,
    )
    for line in header_lines:
        print(line)

    test_cas3_tous_trips(gtfs_dir)

    footer_lines = (
        f"\n✅ Test CAS 2 terminé!",
        "💡 Comparez avec votre modèle GAML sans starting_date défini",
    )
    for line in footer_lines:
        print(line)


if __name__ == "__main__":
    main()

🚀 TEST 2 - TOUS LES TRIPS UTILISÉS
📍 GTFS Toulouse avec calendar_dates.txt
Simulation de: starting_date NON DÉFINI dans GAML
------------------------------------------------------------
Nombre total de trips créés: 25607
Nombre d'arrêts créés: 3759
Nombre de stops de départ (departureStopsInfo non null): 233
Nombre d'arrêts uniques dans departureStopsInfo: 3759

✅ Test CAS 2 terminé!
💡 Comparez avec votre modèle GAML sans starting_date défini
