In [464]:
import warnings
warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output

In [465]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import time
import requests
from IPython.display import FileLink
from ydata_profiling import ProfileReport

#### Data Collection

In [466]:
def sncf_dataset(sncf_dataset_id, file_name, delimiter=";", 
                      list_separator=",", quote_all="false", with_bom="true"):
    """
    Downloads a CSV from a specified SNCF dataset and returns a clickable link 
    for download in a Jupyter Notebook.
    
    Args:
        id_sncf_dataset (str): The identifier of the SNCF dataset to download.
        filename (str): Local filename to save the CSV.
        delimiter (str, optional): Field delimiter in the CSV. Default is ";".
        list_separator (str, optional): Separator for list values. Default is ",".
        quote_all (str, optional): Whether to quote all fields. Default is "false".
        with_bom (str, optional): Whether to include a BOM in the CSV. Default is "true".
        
    Returns:
        FileLink: A clickable link to download the CSV file in the notebook.
    """
    
    url = f"https://data.sncf.com/api/explore/v2.1/catalog/datasets/{sncf_dataset_id}/exports/csv"
    params = {
        "delimiter": delimiter,
        "list_separator": list_separator,
        "quote_all": quote_all,
        "with_bom": with_bom,
    }

    response = requests.get(url, params=params)
    response.raise_for_status()

    with open(file_name, "wb") as f:
        f.write(response.content)

    return FileLink(file_name)

In [467]:
# --- SNCF Dataset Downloads ---

# Regularity dataset
dataset_id_regularity = "regularite-mensuelle-tgv-aqst"
filename_regularity = "tgv-monthly-regularity.csv"
sncf_dataset(dataset_id_regularity, filename_regularity)

In [468]:
# TGV/OUIGO fares dataset
dataset_id_fares = "tarifs-tgv-inoui-ouigo"
filename_fares = "tgv-inoui-ouigo-fares.csv"
sncf_dataset(dataset_id_fares, filename_fares)

In [469]:
def load_and_display_data(file_path, separator=";"):
    """
    Loads a CSV file and displays its shape, with robust checks for errors.

    Args:
        file_path (str): The path to the CSV file.
        separator (str): The column separator to use.
    """
    # Check if the file exists
    if not os.path.exists(file_path):
        print(f"Error: The specified file was not found at the path: {file_path}")
        return

    try:
        # Attempt to read the file using the specified separator
        df_raw = pd.read_csv(file_path, sep=separator)

        # Check if the DataFrame is empty after loading
        if df_raw.empty:
            print(f"Warning: File {file_path} was loaded but is empty.")
            return

        print(f"Successful loading of file: {file_path}")
        print("---")
        
        # Display the shape (number of rows, number of columns) of the DataFrame
        print("DataFrame shape (rows, columns):")
        print(df_raw.shape)

    except pd.errors.ParserError as e:
        # Handle parsing errors (e.g., wrong separator, malformed file)
        print(f"Parsing Error while reading the file: {e}")
        print(f"Suggestion: Check if the separator (sep='{separator}') and encoding are correct.")
        
    except Exception as e:
        # Handle any other unexpected error
        print(f"An unexpected error occurred: {e}")

    return df_raw

In [470]:
# ==== Load df_reg ====
print("\n=== Loading dataset: tgv-monthly-regularity.csv ===")
file_name = "tgv-monthly-regularity.csv"
df_reg = load_and_display_data(file_name, separator=";")

if df_reg is not None:
    print("✔ DataFrame 'df_reg' is ready for processing.\n")
    display(df_reg.head())
else:
    print("✘ Failed to load 'df_reg'. Processing stopped.\n")


=== Loading dataset: tgv-monthly-regularity.csv ===
Successful loading of file: tgv-monthly-regularity.csv
---
DataFrame shape (rows, columns):
(10687, 26)
✔ DataFrame 'df_reg' is ready for processing.



Unnamed: 0,date,service,gare_depart,gare_arrivee,duree_moyenne,nb_train_prevu,nb_annulation,commentaire_annulation,nb_train_depart_retard,retard_moyen_depart,...,nb_train_retard_sup_15,retard_moyen_trains_retard_sup15,nb_train_retard_sup_30,nb_train_retard_sup_60,prct_cause_externe,prct_cause_infra,prct_cause_gestion_trafic,prct_cause_materiel_roulant,prct_cause_gestion_gare,prct_cause_prise_en_charge_voyageurs
0,2018-01,National,GRENOBLE,PARIS LYON,183,245,0,,37,8.027027,...,25,6.123741,13,6,17.647059,52.941176,0.0,23.529412,5.882353,0.0
1,2018-01,International,PARIS LYON,ITALIE,394,94,0,,27,11.261728,...,22,11.601064,15,6,33.333333,19.047619,23.809524,14.285714,9.52381,0.0
2,2018-01,National,MARSEILLE ST CHARLES,LYON PART DIEU,106,557,7,,133,6.978195,...,40,5.195333,19,5,23.076923,23.076923,19.230769,23.076923,3.846154,7.692308
3,2018-01,National,PARIS NORD,DUNKERQUE,116,271,3,,46,11.236594,...,18,3.738806,9,4,35.714286,28.571429,7.142857,25.0,3.571429,0.0
4,2018-01,National,ANNECY,PARIS LYON,224,198,0,,12,8.070833,...,38,8.552525,14,5,23.809524,42.857143,9.52381,14.285714,4.761905,4.761905


In [471]:
# ==== Load df_far ====
print("\n=== Loading dataset: tgv-inoui-ouigo-fares.csv ===")
file_name = "tgv-inoui-ouigo-fares.csv"
df_far = load_and_display_data(file_name, separator=";")

if df_far is not None:
    print("✔ DataFrame 'df_far' is ready for processing.\n")
    display(df_far.head())
else:
    print("✘ Failed to load 'df_far'. Processing stopped.\n")


=== Loading dataset: tgv-inoui-ouigo-fares.csv ===
Successful loading of file: tgv-inoui-ouigo-fares.csv
---
DataFrame shape (rows, columns):
(34135, 9)
✔ DataFrame 'df_far' is ready for processing.



Unnamed: 0,transporteur,gare_origine,gare_origine_code_uic,gare_destination,gare_destination_code_uic,classe,profil_tarifaire,prix_minimum,prix_maximum
0,TGV INOUI,AVIGNON TGV,87318964,PERPIGNAN,87784009,1,Tarif Réglementé,67.0,67.0
1,TGV INOUI,CHAMBERY CHALLES LES EAUX,87741009,LEPIN LE LAC LA BAUCHE,87741439,1,Tarif Réglementé,7.1,7.1
2,TGV INOUI,BAR LE DUC,87175042,CHALONS EN CHAMPAGNE,87174003,1,Tarif Réglementé,28.4,28.4
3,TGV INOUI,BREST,87474007,NANTES,87481002,2,Tarif Réglementé,60.9,60.9
4,TGV INOUI,BAYONNE,87673004,CHATELLERAULT,87575142,1,Tarif Elève - Etudiant - Apprenti,15.0,15.0


In [472]:
def print_dataframe_columns(df, df_name="DataFrame"):
    """
    Print all column names of a DataFrame with a clean, numbered format.
    
    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame whose columns should be displayed.
    df_name : str, optional
        A label used for printing, by default "DataFrame".
    """
    
    print(f"\n## Columns in {df_name}\n")
    
    print("Column names:")
    for i, col in enumerate(df.columns.tolist(), 1):
        print(f"  {i}. {col}")

    print(f"\nTotal Columns: {len(df.columns)}")

In [473]:
# Print columns of df_reg
print_dataframe_columns(df_reg, df_name="df_reg")


## Columns in df_reg

Column names:
  1. date
  2. service
  3. gare_depart
  4. gare_arrivee
  5. duree_moyenne
  6. nb_train_prevu
  7. nb_annulation
  8. commentaire_annulation
  9. nb_train_depart_retard
  10. retard_moyen_depart
  11. retard_moyen_tous_trains_depart
  12. commentaire_retards_depart
  13. nb_train_retard_arrivee
  14. retard_moyen_arrivee
  15. retard_moyen_tous_trains_arrivee
  16. commentaires_retard_arrivee
  17. nb_train_retard_sup_15
  18. retard_moyen_trains_retard_sup15
  19. nb_train_retard_sup_30
  20. nb_train_retard_sup_60
  21. prct_cause_externe
  22. prct_cause_infra
  23. prct_cause_gestion_trafic
  24. prct_cause_materiel_roulant
  25. prct_cause_gestion_gare
  26. prct_cause_prise_en_charge_voyageurs

Total Columns: 26


In [474]:
# Print columns of df_far
print_dataframe_columns(df_far, df_name="df_far")


## Columns in df_far

Column names:
  1. transporteur
  2. gare_origine
  3. gare_origine_code_uic
  4. gare_destination
  5. gare_destination_code_uic
  6. classe
  7. profil_tarifaire
  8. prix_minimum
  9. prix_maximum

Total Columns: 9


In [475]:
def rename_dataframe_columns(df, rename_dict):
    """
    Rename the columns of a DataFrame based on a dictionary mapping.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame whose columns should be renamed.
    rename_dict : dict
        Dictionary mapping old column names to new column names.

    Returns
    -------
    pandas.DataFrame
        The DataFrame with renamed columns.
    """
    df = df.rename(columns=rename_dict)
    return df

In [476]:
rename_dict_reg = {
    "date": "date",
    "service": "service",
    "gare_depart": "departure_station",
    "gare_arrivee": "arrival_station",
    "duree_moyenne": "avg_trip_duration",
    "nb_train_prevu": "scheduled_trains",
    "nb_annulation": "canceled_trains",
    "commentaire_annulation": "cancellation_comments",
    "nb_train_depart_retard": "trains_delayed_departure",
    "retard_moyen_depart": "avg_delay_delayed_trains_departure",
    "retard_moyen_tous_trains_depart": "avg_delay_all_trains_departure",
    "commentaire_retards_depart": "departure_delay_comments",
    "nb_train_retard_arrivee": "trains_delayed_arrival",
    "retard_moyen_arrivee": "avg_delay_delayed_trains_arrival",
    "retard_moyen_tous_trains_arrivee": "avg_delay_all_trains_arrival",
    "commentaires_retard_arrivee": "arrival_delay_comments",
    "nb_train_retard_sup_15": "trains_delayed_over_15min",
    "retard_moyen_trains_retard_sup15": "avg_delay_over_15min",
    "nb_train_retard_sup_30": "trains_delayed_over_30min",
    "nb_train_retard_sup_60": "trains_delayed_over_60min",
    "prct_cause_externe": "pct_delay_external_causes",
    "prct_cause_infra": "pct_delay_infrastructure",
    "prct_cause_gestion_trafic": "pct_delay_traffic_management",
    "prct_cause_materiel_roulant": "pct_delay_rolling_stock",
    "prct_cause_gestion_gare": "pct_delay_station_operations",
    "prct_cause_prise_en_charge_voyageurs": "pct_delay_passenger_handling"
}

df_reg = rename_dataframe_columns(df_reg, rename_dict_reg)
display(df_reg.head())

Unnamed: 0,date,service,departure_station,arrival_station,avg_trip_duration,scheduled_trains,canceled_trains,cancellation_comments,trains_delayed_departure,avg_delay_delayed_trains_departure,...,trains_delayed_over_15min,avg_delay_over_15min,trains_delayed_over_30min,trains_delayed_over_60min,pct_delay_external_causes,pct_delay_infrastructure,pct_delay_traffic_management,pct_delay_rolling_stock,pct_delay_station_operations,pct_delay_passenger_handling
0,2018-01,National,GRENOBLE,PARIS LYON,183,245,0,,37,8.027027,...,25,6.123741,13,6,17.647059,52.941176,0.0,23.529412,5.882353,0.0
1,2018-01,International,PARIS LYON,ITALIE,394,94,0,,27,11.261728,...,22,11.601064,15,6,33.333333,19.047619,23.809524,14.285714,9.52381,0.0
2,2018-01,National,MARSEILLE ST CHARLES,LYON PART DIEU,106,557,7,,133,6.978195,...,40,5.195333,19,5,23.076923,23.076923,19.230769,23.076923,3.846154,7.692308
3,2018-01,National,PARIS NORD,DUNKERQUE,116,271,3,,46,11.236594,...,18,3.738806,9,4,35.714286,28.571429,7.142857,25.0,3.571429,0.0
4,2018-01,National,ANNECY,PARIS LYON,224,198,0,,12,8.070833,...,38,8.552525,14,5,23.809524,42.857143,9.52381,14.285714,4.761905,4.761905


In [477]:
rename_dict_far = {
    "transporteur": "tgv_types",
    "gare_origine": "departure_station",
    "gare_origine_code_uic": "departure_station_uic",
    "gare_destination": "arrival_station",
    "gare_destination_code_uic": "arrival_station_uic",
    "classe": "train_class",
    "profil_tarifaire": "fare_profile",
    "prix_minimum": "min_price",
    "prix_maximum": "max_price"
}


df_far = rename_dataframe_columns(df_far, rename_dict_far)
display(df_far.head())

Unnamed: 0,tgv_types,departure_station,departure_station_uic,arrival_station,arrival_station_uic,train_class,fare_profile,min_price,max_price
0,TGV INOUI,AVIGNON TGV,87318964,PERPIGNAN,87784009,1,Tarif Réglementé,67.0,67.0
1,TGV INOUI,CHAMBERY CHALLES LES EAUX,87741009,LEPIN LE LAC LA BAUCHE,87741439,1,Tarif Réglementé,7.1,7.1
2,TGV INOUI,BAR LE DUC,87175042,CHALONS EN CHAMPAGNE,87174003,1,Tarif Réglementé,28.4,28.4
3,TGV INOUI,BREST,87474007,NANTES,87481002,2,Tarif Réglementé,60.9,60.9
4,TGV INOUI,BAYONNE,87673004,CHATELLERAULT,87575142,1,Tarif Elève - Etudiant - Apprenti,15.0,15.0


In [478]:
# ===== DataFrame Summary: df_reg =====
print("\n" + "="*60)
print("## DataFrame Information: df_reg")
print("="*60 + "\n")

df_reg.info(memory_usage='deep')
print("\n" + "="*60 + "\n")


## DataFrame Information: df_reg

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10687 entries, 0 to 10686
Data columns (total 26 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   date                                10687 non-null  object 
 1   service                             10687 non-null  object 
 2   departure_station                   10687 non-null  object 
 3   arrival_station                     10687 non-null  object 
 4   avg_trip_duration                   10687 non-null  int64  
 5   scheduled_trains                    10687 non-null  int64  
 6   canceled_trains                     10687 non-null  int64  
 7   cancellation_comments               0 non-null      float64
 8   trains_delayed_departure            10687 non-null  int64  
 9   avg_delay_delayed_trains_departure  10687 non-null  float64
 10  avg_delay_all_trains_departure      10687 non-null  float64
 11  depart

In [479]:
# ===== DataFrame Summary: df_far =====
print("\n" + "="*60)
print("## DataFrame Information: df_far")
print("="*60 + "\n")

df_far.info(memory_usage='deep')
print("\n" + "="*60 + "\n")


## DataFrame Information: df_far

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34135 entries, 0 to 34134
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   tgv_types              34135 non-null  object 
 1   departure_station      34135 non-null  object 
 2   departure_station_uic  34135 non-null  int64  
 3   arrival_station        34135 non-null  object 
 4   arrival_station_uic    34135 non-null  int64  
 5   train_class            34135 non-null  int64  
 6   fare_profile           34135 non-null  object 
 7   min_price              34135 non-null  float64
 8   max_price              34135 non-null  float64
dtypes: float64(2), int64(3), object(4)
memory usage: 9.7 MB




#### Data Cleaning

In [480]:
def generate_profiling_report(df, output_file="profiling_report.html", title="Data Profiling Report"):
    """
    Generate an HTML profiling report for a DataFrame using ydata_profiling.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to profile.
    output_file : str, optional
        The name of the output HTML file (default is "profiling_report.html").
    title : str, optional
        The title of the profiling report (default is "Data Profiling Report").
    """
    try:
        print(f"\n## Generating Profiling Report: {title} ...")
        
        profile = ProfileReport(
            df,
            title=title,
            sort=None,        # Use 'None' for original order, 'alphabetical' for sorting
            explorative=True
        )
        
        profile.to_file(output_file)
        print(f"✔ Profiling report successfully exported to: {output_file}")

    except ImportError:
        print("Error: The 'ydata-profiling' library is required.")
        print("Install it using: `pip install ydata-profiling`")

    except Exception as e:
        print(f"An error occurred during report generation: {e}")

In [481]:
# Generate profiling report for df_reg
generate_profiling_report(df_reg, output_file="tgv_regularity_report.html", title="TGV Regularity Profiling")


## Generating Profiling Report: TGV Regularity Profiling ...


100%|██████████| 26/26 [00:00<00:00, 109.99it/s]<00:00, 11.91it/s, Describe variable: pct_delay_passenger_handling]
Summarize dataset: 100%|██████████| 396/396 [00:52<00:00,  7.54it/s, Completed]                                                                     
Generate report structure: 100%|██████████| 1/1 [00:08<00:00,  8.77s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.84s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00,  8.81it/s]

✔ Profiling report successfully exported to: tgv_regularity_report.html





In [482]:
# Generate profiling report for df_far
generate_profiling_report(df_far, output_file="tgv_fares_report.html", title="TGV Fares Profiling")


## Generating Profiling Report: TGV Fares Profiling ...


100%|██████████| 9/9 [00:00<00:00, 201.07it/s]0<00:00, 12.97it/s, Describe variable: max_price]  
Summarize dataset: 100%|██████████| 34/34 [00:03<00:00, 10.67it/s, Completed]                                           
Generate report structure: 100%|██████████| 1/1 [00:03<00:00,  3.46s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  6.07it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 80.87it/s]

✔ Profiling report successfully exported to: tgv_fares_report.html





In [483]:
def missing_values_summary(df, df_name="DataFrame"):
    """
    Calculate and display missing values (count and percentage) for a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to analyze.
    df_name : str, optional
        Name of the DataFrame for display purposes (default is "DataFrame").
    """
    missing = pd.DataFrame({
        'Missing Count': df.isna().sum(),
        'Missing Percent': (df.isna().sum() / len(df) * 100).round(2)
    }).sort_values('Missing Percent', ascending=False)
    
    print(f"\n## Missing Value Summary for {df_name}\n")
    
    # Filter for columns with at least one missing value
    missing_data_summary = missing[missing['Missing Count'] > 0]
    
    if not missing_data_summary.empty:
        print(f"{len(missing_data_summary)} columns have missing values.")
        print("---")
        display(missing_data_summary)
    else:
        print(f"Great! No missing values found in {df_name}.")

In [484]:
# Generate missing values summary for df_far
missing_values_summary(df_reg, "df_reg")


## Missing Value Summary for df_reg

3 columns have missing values.
---


Unnamed: 0,Missing Count,Missing Percent
cancellation_comments,10687,100.0
departure_delay_comments,10687,100.0
arrival_delay_comments,9989,93.47


In [485]:
# Generate missing values summary for df_far
missing_values_summary(df_far, "df_far")


## Missing Value Summary for df_far

Great! No missing values found in df_far.


In [486]:
def drop_columns(df, columns_to_drop, df_name="DataFrame"):
    """
    Drop specified columns from a DataFrame, ignoring errors if a column does not exist,
    and print a summary.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame from which columns will be dropped.
    columns_to_drop : list of str
        List of column names to drop.
    df_name : str, optional
        Name of the DataFrame for display purposes (default is "DataFrame").

    Returns
    -------
    pandas.DataFrame
        The DataFrame with specified columns dropped.
    """
    # Drop columns with errors ignored
    df = df.drop(columns=columns_to_drop, errors='ignore')
    
    print(f"\n## Column Deletion Summary for {df_name}\n")
    print("Successfully dropped the following columns (if they existed):")
    for col in columns_to_drop:
        print(f"- {col}")
    
    print("\n---")
    print(f"New DataFrame shape: {df.shape}")

    print("\n---")
    display(df.head())
    
    return df

In [487]:
columns_to_drop = [
    'cancellation_comments',
    'departure_delay_comments',
    'arrival_delay_comments'
]

df_reg = drop_columns(df_reg, columns_to_drop, df_name="df_reg")


## Column Deletion Summary for df_reg

Successfully dropped the following columns (if they existed):
- cancellation_comments
- departure_delay_comments
- arrival_delay_comments

---
New DataFrame shape: (10687, 23)

---


Unnamed: 0,date,service,departure_station,arrival_station,avg_trip_duration,scheduled_trains,canceled_trains,trains_delayed_departure,avg_delay_delayed_trains_departure,avg_delay_all_trains_departure,...,trains_delayed_over_15min,avg_delay_over_15min,trains_delayed_over_30min,trains_delayed_over_60min,pct_delay_external_causes,pct_delay_infrastructure,pct_delay_traffic_management,pct_delay_rolling_stock,pct_delay_station_operations,pct_delay_passenger_handling
0,2018-01,National,GRENOBLE,PARIS LYON,183,245,0,37,8.027027,1.212245,...,25,6.123741,13,6,17.647059,52.941176,0.0,23.529412,5.882353,0.0
1,2018-01,International,PARIS LYON,ITALIE,394,94,0,27,11.261728,2.997695,...,22,11.601064,15,6,33.333333,19.047619,23.809524,14.285714,9.52381,0.0
2,2018-01,National,MARSEILLE ST CHARLES,LYON PART DIEU,106,557,7,133,6.978195,1.706333,...,40,5.195333,19,5,23.076923,23.076923,19.230769,23.076923,3.846154,7.692308
3,2018-01,National,PARIS NORD,DUNKERQUE,116,271,3,46,11.236594,1.797637,...,18,3.738806,9,4,35.714286,28.571429,7.142857,25.0,3.571429,0.0
4,2018-01,National,ANNECY,PARIS LYON,224,198,0,12,8.070833,0.489141,...,38,8.552525,14,5,23.809524,42.857143,9.52381,14.285714,4.761905,4.761905


In [488]:
columns_to_drop = [
    'departure_station_uic',
    'arrival_station_uic'
]

df_far = drop_columns(df_far, columns_to_drop, df_name="df_far")


## Column Deletion Summary for df_far

Successfully dropped the following columns (if they existed):
- departure_station_uic
- arrival_station_uic

---
New DataFrame shape: (34135, 7)

---


Unnamed: 0,tgv_types,departure_station,arrival_station,train_class,fare_profile,min_price,max_price
0,TGV INOUI,AVIGNON TGV,PERPIGNAN,1,Tarif Réglementé,67.0,67.0
1,TGV INOUI,CHAMBERY CHALLES LES EAUX,LEPIN LE LAC LA BAUCHE,1,Tarif Réglementé,7.1,7.1
2,TGV INOUI,BAR LE DUC,CHALONS EN CHAMPAGNE,1,Tarif Réglementé,28.4,28.4
3,TGV INOUI,BREST,NANTES,2,Tarif Réglementé,60.9,60.9
4,TGV INOUI,BAYONNE,CHATELLERAULT,1,Tarif Elève - Etudiant - Apprenti,15.0,15.0


In [489]:
def check_duplicate_rows(df, df_name="DataFrame", display_count=5):
    """
    Check for full-row duplicates in a DataFrame and display a summary.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to check for duplicates.
    df_name : str, optional
        Name of the DataFrame for display purposes (default is "DataFrame").
    display_count : int, optional
        Number of duplicate rows to display for inspection (default is 5).
    """
    num_duplicates = df.duplicated().sum()
    
    print(f"\n## Duplicate Row Check for {df_name}\n")
    
    if num_duplicates > 0:
        print(f"⚠ Warning: Found {num_duplicates} duplicate row(s) in {df_name}.")
        percent_duplicates = (num_duplicates / len(df) * 100).round(2)
        print(f"This represents {percent_duplicates}% of the total data.\n")
        
        print(f"First {display_count} duplicate entries (excluding the first occurrence):")
        display(df[df.duplicated(keep='first')].head(display_count))
    else:
        print(f"✔ Great! No full-row duplicates were found in {df_name}.")

In [490]:
# ...
check_duplicate_rows(df_reg, "df_reg")


## Duplicate Row Check for df_reg

✔ Great! No full-row duplicates were found in df_reg.


In [491]:
# ...
check_duplicate_rows(df_far, "df_far")


## Duplicate Row Check for df_far

✔ Great! No full-row duplicates were found in df_far.


In [492]:
def trim_whitespace_in_object_columns(df, df_name="DataFrame"):
    """
    Trim leading and trailing whitespace in all object (string) columns of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame whose string columns will be trimmed.
    df_name : str, optional
        Name of the DataFrame for display purposes (default is "DataFrame").
    """
    # Identify object columns
    object_cols = df.select_dtypes(include=['object']).columns
    
    print(f"\n## Trimming Whitespace in String Columns for {df_name}\n")
    
    if len(object_cols) > 0:
        for col in object_cols:
            # Use .str.strip() which safely handles NaN values
            df.loc[:, col] = df[col].str.strip()
        
        print(f"✔ Whitespace trimmed in {len(object_cols)} object columns.")
        print("Columns processed:")
        for col in object_cols:
            print(f"- {col}")
    else:
        print(f"⚠ Warning: No columns of type 'object' found in {df_name} to trim whitespace.")

In [493]:
# ...
trim_whitespace_in_object_columns(df_reg, "df_reg")


## Trimming Whitespace in String Columns for df_reg

✔ Whitespace trimmed in 4 object columns.
Columns processed:
- date
- service
- departure_station
- arrival_station


In [494]:
# ...
trim_whitespace_in_object_columns(df_far, "df_far")


## Trimming Whitespace in String Columns for df_far

✔ Whitespace trimmed in 4 object columns.
Columns processed:
- tgv_types
- departure_station
- arrival_station
- fare_profile


In [495]:
def convert_date_and_split(df, date_column="date", date_format="%Y-%m"):
    """
    Convert a column to datetime and create 'year' and 'month' columns
    immediately after the date column.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame containing the date column.
    date_column : str, optional
        Name of the column to convert (default is "date").
    date_format : str, optional
        Format of the date for conversion (default is "%Y-%m").
    """
    print(f"\n## Date Column Conversion ({date_column})\n")
    
    if date_column in df.columns:
        original_dtype = df[date_column].dtype
        
        try:
            # Convert to datetime
            df[date_column] = pd.to_datetime(df[date_column], format=date_format, errors='coerce')
            new_dtype = df[date_column].dtype
            
            print("Conversion successful!")
            print(f"Original data type: {original_dtype}")
            print(f"New data type: {new_dtype}")
            
            # Check for NaT values
            nat_count = df[date_column].isna().sum()
            if nat_count > 0:
                print(f"Warning: {nat_count} value(s) could not be parsed and were converted to NaT.")
            
            # Create year and month columns
            year_col = df[date_column].dt.year
            month_col = df[date_column].dt.month
            
            # Insert the new columns immediately after the date column
            date_idx = df.columns.get_loc(date_column)
            df.insert(date_idx + 1, "year", year_col)
            df.insert(date_idx + 2, "month", month_col)
            
            print(f"\n'year' and 'month' columns successfully created after '{date_column}'.")
            print("\nFirst 5 rows with new date format and split columns:")
            display(df.head())
            
        except Exception as e:
            print(f"Error during date conversion: {e}")
            print("Suggestion: Verify the data format and column content.")
    else:
        print(f"Warning: Column '{date_column}' not found in the DataFrame. Conversion skipped.")

In [496]:
# ...
convert_date_and_split(df_reg, date_column="date", date_format="%Y-%m")


## Date Column Conversion (date)

Conversion successful!
Original data type: object
New data type: datetime64[ns]

'year' and 'month' columns successfully created after 'date'.

First 5 rows with new date format and split columns:


Unnamed: 0,date,year,month,service,departure_station,arrival_station,avg_trip_duration,scheduled_trains,canceled_trains,trains_delayed_departure,...,trains_delayed_over_15min,avg_delay_over_15min,trains_delayed_over_30min,trains_delayed_over_60min,pct_delay_external_causes,pct_delay_infrastructure,pct_delay_traffic_management,pct_delay_rolling_stock,pct_delay_station_operations,pct_delay_passenger_handling
0,2018-01-01,2018,1,National,GRENOBLE,PARIS LYON,183,245,0,37,...,25,6.123741,13,6,17.647059,52.941176,0.0,23.529412,5.882353,0.0
1,2018-01-01,2018,1,International,PARIS LYON,ITALIE,394,94,0,27,...,22,11.601064,15,6,33.333333,19.047619,23.809524,14.285714,9.52381,0.0
2,2018-01-01,2018,1,National,MARSEILLE ST CHARLES,LYON PART DIEU,106,557,7,133,...,40,5.195333,19,5,23.076923,23.076923,19.230769,23.076923,3.846154,7.692308
3,2018-01-01,2018,1,National,PARIS NORD,DUNKERQUE,116,271,3,46,...,18,3.738806,9,4,35.714286,28.571429,7.142857,25.0,3.571429,0.0
4,2018-01-01,2018,1,National,ANNECY,PARIS LYON,224,198,0,12,...,38,8.552525,14,5,23.809524,42.857143,9.52381,14.285714,4.761905,4.761905


In [497]:
columns_to_drop = ['date']

df_reg = drop_columns(df_reg, columns_to_drop, df_name="df_reg")


## Column Deletion Summary for df_reg

Successfully dropped the following columns (if they existed):
- date

---
New DataFrame shape: (10687, 24)

---


Unnamed: 0,year,month,service,departure_station,arrival_station,avg_trip_duration,scheduled_trains,canceled_trains,trains_delayed_departure,avg_delay_delayed_trains_departure,...,trains_delayed_over_15min,avg_delay_over_15min,trains_delayed_over_30min,trains_delayed_over_60min,pct_delay_external_causes,pct_delay_infrastructure,pct_delay_traffic_management,pct_delay_rolling_stock,pct_delay_station_operations,pct_delay_passenger_handling
0,2018,1,National,GRENOBLE,PARIS LYON,183,245,0,37,8.027027,...,25,6.123741,13,6,17.647059,52.941176,0.0,23.529412,5.882353,0.0
1,2018,1,International,PARIS LYON,ITALIE,394,94,0,27,11.261728,...,22,11.601064,15,6,33.333333,19.047619,23.809524,14.285714,9.52381,0.0
2,2018,1,National,MARSEILLE ST CHARLES,LYON PART DIEU,106,557,7,133,6.978195,...,40,5.195333,19,5,23.076923,23.076923,19.230769,23.076923,3.846154,7.692308
3,2018,1,National,PARIS NORD,DUNKERQUE,116,271,3,46,11.236594,...,18,3.738806,9,4,35.714286,28.571429,7.142857,25.0,3.571429,0.0
4,2018,1,National,ANNECY,PARIS LYON,224,198,0,12,8.070833,...,38,8.552525,14,5,23.809524,42.857143,9.52381,14.285714,4.761905,4.761905


In [498]:
def filter_rows(df, condition, df_name="DataFrame"):
    """
    Filter rows in a DataFrame based on a boolean condition.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to filter.
    condition : pandas.Series of bool
        A boolean Series indicating which rows to keep (True = keep, False = drop).
    df_name : str, optional
        Name of the DataFrame for display purposes (default is "DataFrame").

    Returns
    -------
    pandas.DataFrame
        A filtered DataFrame containing only the rows that satisfy the condition.
    """
    if len(condition) != len(df):
        raise ValueError("The condition length must match the number of rows in the DataFrame.")
    
    filtered_df = df[condition].copy()
    
    print(f"\n## Row Filtering Summary for {df_name}")
    print(f"Original number of rows: {len(df)}")
    print(f"Number of rows after filtering: {len(filtered_df)}")
    print(f"Rows removed: {len(df) - len(filtered_df)}")

    print("\n---")
    display(filtered_df.head())
    
    return filtered_df

In [499]:
# Keep only rows where 'service' == "National"
condition = df_reg['service'] == "National"
df_reg = filter_rows(df_reg, condition, df_name="df_reg")


## Row Filtering Summary for df_reg
Original number of rows: 10687
Number of rows after filtering: 9389
Rows removed: 1298

---


Unnamed: 0,year,month,service,departure_station,arrival_station,avg_trip_duration,scheduled_trains,canceled_trains,trains_delayed_departure,avg_delay_delayed_trains_departure,...,trains_delayed_over_15min,avg_delay_over_15min,trains_delayed_over_30min,trains_delayed_over_60min,pct_delay_external_causes,pct_delay_infrastructure,pct_delay_traffic_management,pct_delay_rolling_stock,pct_delay_station_operations,pct_delay_passenger_handling
0,2018,1,National,GRENOBLE,PARIS LYON,183,245,0,37,8.027027,...,25,6.123741,13,6,17.647059,52.941176,0.0,23.529412,5.882353,0.0
2,2018,1,National,MARSEILLE ST CHARLES,LYON PART DIEU,106,557,7,133,6.978195,...,40,5.195333,19,5,23.076923,23.076923,19.230769,23.076923,3.846154,7.692308
3,2018,1,National,PARIS NORD,DUNKERQUE,116,271,3,46,11.236594,...,18,3.738806,9,4,35.714286,28.571429,7.142857,25.0,3.571429,0.0
4,2018,1,National,ANNECY,PARIS LYON,224,198,0,12,8.070833,...,38,8.552525,14,5,23.809524,42.857143,9.52381,14.285714,4.761905,4.761905
5,2018,1,National,TOULOUSE MATABIAU,PARIS MONTPARNASSE,257,184,0,23,11.221739,...,26,7.510507,12,3,59.090909,22.727273,4.545455,9.090909,4.545455,0.0


In [500]:
# Keep only rows where 'tgv_types' == "TGV INOUI"
condition = df_far['tgv_types'] == "TGV INOUI"
df_far = filter_rows(df_far, condition, df_name="df_far")


## Row Filtering Summary for df_far
Original number of rows: 34135
Number of rows after filtering: 33418
Rows removed: 717

---


Unnamed: 0,tgv_types,departure_station,arrival_station,train_class,fare_profile,min_price,max_price
0,TGV INOUI,AVIGNON TGV,PERPIGNAN,1,Tarif Réglementé,67.0,67.0
1,TGV INOUI,CHAMBERY CHALLES LES EAUX,LEPIN LE LAC LA BAUCHE,1,Tarif Réglementé,7.1,7.1
2,TGV INOUI,BAR LE DUC,CHALONS EN CHAMPAGNE,1,Tarif Réglementé,28.4,28.4
3,TGV INOUI,BREST,NANTES,2,Tarif Réglementé,60.9,60.9
4,TGV INOUI,BAYONNE,CHATELLERAULT,1,Tarif Elève - Etudiant - Apprenti,15.0,15.0


In [501]:
# ---- Check for short trips where avg_trip_duration < 30 minutes ----
threshold = 30
short_trips = df_reg[df_reg['avg_trip_duration'] < threshold]
num_short_trips = len(short_trips)

print("\n## Data Consistency Check: Average Trip Duration\n")
print(f"⚠ Number of trips with avg_trip_duration < {threshold} minutes: {num_short_trips}")

if num_short_trips > 0:
    print("\nExample rows with inconsistency:")
    display(short_trips.head())
else:
    print("✔ No trips found with avg_trip_duration < 30 minutes.")


## Data Consistency Check: Average Trip Duration

⚠ Number of trips with avg_trip_duration < 30 minutes: 46

Example rows with inconsistency:


Unnamed: 0,year,month,service,departure_station,arrival_station,avg_trip_duration,scheduled_trains,canceled_trains,trains_delayed_departure,avg_delay_delayed_trains_departure,...,trains_delayed_over_15min,avg_delay_over_15min,trains_delayed_over_30min,trains_delayed_over_60min,pct_delay_external_causes,pct_delay_infrastructure,pct_delay_traffic_management,pct_delay_rolling_stock,pct_delay_station_operations,pct_delay_passenger_handling
3405,2020,4,National,PARIS NORD,DOUAI,0,0,12,0,0.0,...,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3407,2020,4,National,NANTES,STRASBOURG,0,0,2,0,0.0,...,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3411,2020,4,National,LYON PART DIEU,MARNE LA VALLEE,0,0,19,0,0.0,...,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3412,2020,4,National,PARIS LYON,AIX EN PROVENCE TGV,0,0,17,0,0.0,...,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3416,2020,4,National,GRENOBLE,PARIS LYON,0,0,9,0,0.0,...,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [502]:
# ---- Correct avg_trip_duration < 30 minutes ----
threshold = 30
duration_col = 'avg_trip_duration'
departure_col = 'departure_station'
arrival_col = 'arrival_station'

# Identifier les lignes avec une durée trop courte
short_mask = df_reg[duration_col] < threshold
num_short = short_mask.sum()
print(f"\n## Correcting short trip durations (< {threshold} minutes)")
print(f"Number of trips to correct: {num_short}")

if num_short > 0:
    # Calculer la médiane par couple departure-arrival
    median_durations = df_reg.groupby([departure_col, arrival_col])[duration_col].transform('median')
    
    # Remplacer les durées trop courtes par la médiane
    df_reg.loc[short_mask, duration_col] = median_durations[short_mask]
    
    print(f"✔ Successfully corrected {num_short} trips with too short duration.")
else:
    print("✔ No trips needed correction.")

# Afficher les 5 premières lignes corrigées pour vérification
if num_short > 0:
    print("\nExample of corrected trips:")
    display(df_reg.loc[short_mask, [departure_col, arrival_col, duration_col]].head())


## Correcting short trip durations (< 30 minutes)
Number of trips to correct: 46
✔ Successfully corrected 46 trips with too short duration.

Example of corrected trips:


Unnamed: 0,departure_station,arrival_station,avg_trip_duration
3405,PARIS NORD,DOUAI,70.0
3407,NANTES,STRASBOURG,313.0
3411,LYON PART DIEU,MARNE LA VALLEE,108.0
3412,PARIS LYON,AIX EN PROVENCE TGV,184.0
3416,GRENOBLE,PARIS LYON,182.0


In [503]:
# ---- Check consistency between scheduled and cancelled trains ----
scheduled_col = 'scheduled_trains'
cancelled_col = 'canceled_trains'

# Rows where scheduled trains < cancelled trains
invalid_rows = df_reg[df_reg[scheduled_col] < df_reg[cancelled_col]]
num_invalid = len(invalid_rows)
print("\n## Data Consistency Check: Scheduled vs Cancelled Trains\n")
print(f"⚠ Number of rows where scheduled trains < cancelled trains: {num_invalid}")

if num_invalid > 0:
    print("\nExample rows with inconsistency:")
    display(invalid_rows.head())
else:
    print("✔ No rows found with scheduled trains < cancelled trains.")

# Rows where scheduled trains == cancelled trains
equal_rows = df_reg[df_reg[scheduled_col] == df_reg[cancelled_col]]
num_equal = len(equal_rows)
print(f"\nNumber of rows where scheduled trains == cancelled trains: {num_equal}")

if num_equal > 0:
    print("Example rows where scheduled trains equal cancelled trains:")
    display(equal_rows.head())
else:
    print("✔ No rows found where scheduled trains equal cancelled trains.")


## Data Consistency Check: Scheduled vs Cancelled Trains

⚠ Number of rows where scheduled trains < cancelled trains: 46

Example rows with inconsistency:


Unnamed: 0,year,month,service,departure_station,arrival_station,avg_trip_duration,scheduled_trains,canceled_trains,trains_delayed_departure,avg_delay_delayed_trains_departure,...,trains_delayed_over_15min,avg_delay_over_15min,trains_delayed_over_30min,trains_delayed_over_60min,pct_delay_external_causes,pct_delay_infrastructure,pct_delay_traffic_management,pct_delay_rolling_stock,pct_delay_station_operations,pct_delay_passenger_handling
3405,2020,4,National,PARIS NORD,DOUAI,70.0,0,12,0,0.0,...,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3407,2020,4,National,NANTES,STRASBOURG,313.0,0,2,0,0.0,...,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3411,2020,4,National,LYON PART DIEU,MARNE LA VALLEE,108.0,0,19,0,0.0,...,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3412,2020,4,National,PARIS LYON,AIX EN PROVENCE TGV,184.0,0,17,0,0.0,...,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3416,2020,4,National,GRENOBLE,PARIS LYON,182.0,0,9,0,0.0,...,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0



Number of rows where scheduled trains == cancelled trains: 0
✔ No rows found where scheduled trains equal cancelled trains.


In [504]:
# ---- Impute invalid rows where scheduled_trains < canceled_trains ----
scheduled_col = 'scheduled_trains'
cancelled_col = 'canceled_trains'
departure_col = 'departure_station'
arrival_col = 'arrival_station'

# Identify invalid rows
invalid_mask = df_reg[scheduled_col] < df_reg[cancelled_col]
num_invalid = invalid_mask.sum()
print(f"\n## Imputation of invalid rows ({num_invalid} rows)")

if num_invalid > 0:
    # Compute mean scheduled_trains per departure-arrival pair
    group_means = df_reg.groupby([departure_col, arrival_col])[scheduled_col].transform('mean')
    
    # Impute only for invalid rows
    df_reg.loc[invalid_mask, scheduled_col] = group_means[invalid_mask]
    
    # Ensure that scheduled_trains >= canceled_trains after imputation
    df_reg.loc[invalid_mask & (df_reg[scheduled_col] < df_reg[cancelled_col]), scheduled_col] = df_reg.loc[
        invalid_mask & (df_reg[scheduled_col] < df_reg[cancelled_col]), cancelled_col
    ]
    
    # Convert scheduled_trains to integer
    df_reg[scheduled_col] = df_reg[scheduled_col].round(0).astype(int)
    
    # Verify that there are no more invalid rows
    invalid_rows_after = df_reg[df_reg[scheduled_col] < df_reg[cancelled_col]]
    num_remaining = len(invalid_rows_after)
    print(f"Number of invalid rows after imputation: {num_remaining}")
    
    if num_remaining > 0:
        print("⚠ Warning: Some rows still remain invalid after imputation.")
        display(invalid_rows_after[[departure_col, arrival_col, scheduled_col, cancelled_col]].head())
    else:
        print("✔ All invalid rows successfully imputed and consistent with canceled trains.")

    # ---- Display only the column that was imputed for the corrected rows ----
    print("\n## Imputed values (scheduled_trains) after correction (first 5 rows):")
    display(df_reg.loc[invalid_mask, [departure_col, arrival_col, scheduled_col]].head())


else:
    print("✔ No invalid rows to impute.")


## Imputation of invalid rows (46 rows)
Number of invalid rows after imputation: 0
✔ All invalid rows successfully imputed and consistent with canceled trains.

## Imputed values (scheduled_trains) after correction (first 5 rows):


Unnamed: 0,departure_station,arrival_station,scheduled_trains
3405,PARIS NORD,DOUAI,141
3407,NANTES,STRASBOURG,49
3411,LYON PART DIEU,MARNE LA VALLEE,302
3412,PARIS LYON,AIX EN PROVENCE TGV,412
3416,GRENOBLE,PARIS LYON,184


In [505]:
# ---- Drop rows where scheduled_trains is 0 and canceled_trains > 0 ----
scheduled_col = 'scheduled_trains'
cancelled_col = 'canceled_trains'

drop_mask = (df_reg[scheduled_col] == 0) & (df_reg[cancelled_col] > 0)
num_to_drop = drop_mask.sum()
print(f"\n## Dropping inconsistent rows ({num_to_drop} rows)")

if num_to_drop > 0:
    df_reg.drop(df_reg[drop_mask].index, inplace=True)
    print(f"✔ {num_to_drop} rows successfully dropped.")
else:
    print("✔ No rows met the drop condition.")

# ---- Verify that there are no more invalid rows ----
invalid_rows_final = df_reg[df_reg[scheduled_col] < df_reg[cancelled_col]]
num_invalid_final = len(invalid_rows_final)
print(f"\nNumber of invalid rows after dropping inconsistent data: {num_invalid_final}")

if num_invalid_final > 0:
    print("⚠ Warning: Some rows still have scheduled_trains < canceled_trains.")
    display(invalid_rows_final.head())
else:
    print("✔ All remaining rows are consistent.")


## Dropping inconsistent rows (0 rows)
✔ No rows met the drop condition.

Number of invalid rows after dropping inconsistent data: 0
✔ All remaining rows are consistent.


In [506]:
# ---- Check and remove unusually early trains (avg_delay_all_trains_departure < -30) ----
threshold_early = -30
delay_col = 'avg_delay_all_trains_departure'

# Identifier les lignes inconsistantes
early_mask = df_reg[delay_col] < threshold_early
num_early = early_mask.sum()

print(f"\n## Removing unusually early trains (avg_delay_all_trains_departure < {threshold_early} minutes)")
print(f"Number of rows to remove: {num_early}")

if num_early > 0:
    # Afficher quelques exemples avant suppression
    print("\nExample rows to be removed (first 5 rows):")
    display(df_reg.loc[early_mask, [delay_col]].head())
    
    # Supprimer les lignes inconsistantes
    df_reg.drop(df_reg[early_mask].index, inplace=True)
    
    print(f"✔ {num_early} unusually early departures successfully removed.")
else:
    print("✔ No unusually early departures detected, nothing to remove.")


## Removing unusually early trains (avg_delay_all_trains_departure < -30 minutes)
Number of rows to remove: 4

Example rows to be removed (first 5 rows):


Unnamed: 0,avg_delay_all_trains_departure
2753,-92.768307
2757,-67.95374
2822,-69.838288
2877,-112.262016


✔ 4 unusually early departures successfully removed.


In [507]:
# ---- Check and remove unusually early trains at arrival (avg_delay_all_trains_arrival < -30) ----
threshold_early = -30
delay_col_arrival = 'avg_delay_all_trains_arrival'

# Identifier les lignes inconsistantes
early_mask_arrival = df_reg[delay_col_arrival] < threshold_early
num_early_arrival = early_mask_arrival.sum()

print(f"\n## Removing unusually early trains at arrival (avg_delay_all_trains_arrival < {threshold_early} minutes)")
print(f"Number of rows to remove: {num_early_arrival}")

if num_early_arrival > 0:
    # Afficher quelques exemples avant suppression
    print("\nExample rows to be removed (first 5 rows):")
    display(df_reg.loc[early_mask_arrival, [delay_col_arrival]].head())
    
    # Supprimer les lignes inconsistantes
    df_reg.drop(df_reg[early_mask_arrival].index, inplace=True)
    
    print(f"✔ {num_early_arrival} unusually early arrivals successfully removed.")
else:
    print("✔ No unusually early arrivals detected, nothing to remove.")


## Removing unusually early trains at arrival (avg_delay_all_trains_arrival < -30 minutes)
Number of rows to remove: 4

Example rows to be removed (first 5 rows):


Unnamed: 0,avg_delay_all_trains_arrival
2780,-80.855113
2825,-150.562114
2986,-173.07697
2993,-472.638889


✔ 4 unusually early arrivals successfully removed.


In [508]:
def save_dataframe_to_csv(df, file_name, sep=",", index=False, encoding="utf-8"):
    """
    Save a DataFrame to a CSV file with customizable options.

    Args:
        df (pd.DataFrame): The DataFrame to save.
        file_name (str): The output CSV file name (with .csv extension).
        sep (str): Column separator in the CSV file (default is comma).
        index (bool): Whether to include the index in the CSV (default is False).
        encoding (str): File encoding (default is 'utf-8').

    Returns:
        None
    """
    try:
        df.to_csv(file_name, sep=sep, index=index, encoding=encoding)
        print(f"✔ DataFrame successfully saved to '{file_name}'")
    except Exception as e:
        print(f"✘ Failed to save DataFrame to '{file_name}'. Error: {e}")

In [604]:
# Save cleaned df_reg to CSV
save_dataframe_to_csv(df_reg, "tgv-monthly-regularity_cleaned.csv", sep=";")

✔ DataFrame successfully saved to 'tgv-monthly-regularity_cleaned.csv'


In [605]:
# Save cleaned df_far to CSV
save_dataframe_to_csv(df_far, "tgv-inoui-ouigo-fares_cleaned.csv", sep=";")

✔ DataFrame successfully saved to 'tgv-inoui-ouigo-fares_cleaned.csv'


#### Merge Dataset

In [631]:
# ==== Load df_far_cleaned ====
print("\n=== Loading dataset: tgv-monthly-regularity_cleaned.csv ===")
file_name = "tgv-monthly-regularity_cleaned.csv"
df_reg_cleaned = load_and_display_data(file_name, separator=";")

if df_reg_cleaned is not None:
    print("✔ DataFrame 'df_reg_cleaned' is ready for processing.\n")
    display(df_reg.head())
else:
    print("✘ Failed to load 'df_reg_cleaned'. Processing stopped.\n")


=== Loading dataset: tgv-monthly-regularity_cleaned.csv ===
Successful loading of file: tgv-monthly-regularity_cleaned.csv
---
DataFrame shape (rows, columns):
(9381, 24)
✔ DataFrame 'df_reg_cleaned' is ready for processing.



Unnamed: 0,year,month,service,departure_station,arrival_station,avg_trip_duration,scheduled_trains,canceled_trains,trains_delayed_departure,avg_delay_delayed_trains_departure,...,trains_delayed_over_15min,avg_delay_over_15min,trains_delayed_over_30min,trains_delayed_over_60min,pct_delay_external_causes,pct_delay_infrastructure,pct_delay_traffic_management,pct_delay_rolling_stock,pct_delay_station_operations,pct_delay_passenger_handling
0,2018,1,National,GRENOBLE,PARIS LYON,183.0,245,0,37,8.027027,...,25,6.123741,13,6,17.647059,52.941176,0.0,23.529412,5.882353,0.0
2,2018,1,National,MARSEILLE ST CHARLES,LYON PART DIEU,106.0,557,7,133,6.978195,...,40,5.195333,19,5,23.076923,23.076923,19.230769,23.076923,3.846154,7.692308
3,2018,1,National,PARIS NORD,DUNKERQUE,116.0,271,3,46,11.236594,...,18,3.738806,9,4,35.714286,28.571429,7.142857,25.0,3.571429,0.0
4,2018,1,National,ANNECY,PARIS LYON,224.0,198,0,12,8.070833,...,38,8.552525,14,5,23.809524,42.857143,9.52381,14.285714,4.761905,4.761905
5,2018,1,National,TOULOUSE MATABIAU,PARIS MONTPARNASSE,257.0,184,0,23,11.221739,...,26,7.510507,12,3,59.090909,22.727273,4.545455,9.090909,4.545455,0.0


In [632]:
# ==== Load df_far ====
print("\n=== Loading dataset: tgv-inoui-ouigo-fares_cleaned.csv ===")
file_name = "tgv-inoui-ouigo-fares_cleaned.csv"
df_far_cleaned = load_and_display_data(file_name, separator=";")

if df_far_cleaned is not None:
    print("✔ DataFrame 'df_far_cleaned' is ready for processing.\n")
    display(df_far.head())
else:
    print("✘ Failed to load 'df_far_cleaned'. Processing stopped.\n")


=== Loading dataset: tgv-inoui-ouigo-fares_cleaned.csv ===
Successful loading of file: tgv-inoui-ouigo-fares_cleaned.csv
---
DataFrame shape (rows, columns):
(33418, 7)
✔ DataFrame 'df_far_cleaned' is ready for processing.



Unnamed: 0,tgv_types,departure_station,arrival_station,train_class,fare_profile,min_price,max_price
0,TGV INOUI,AVIGNON TGV,PERPIGNAN,1,Tarif Réglementé,67.0,67.0
1,TGV INOUI,CHAMBERY CHALLES LES EAUX,LEPIN LE LAC LA BAUCHE,1,Tarif Réglementé,7.1,7.1
2,TGV INOUI,BAR LE DUC,CHALONS EN CHAMPAGNE,1,Tarif Réglementé,28.4,28.4
3,TGV INOUI,BREST,NANTES,2,Tarif Réglementé,60.9,60.9
4,TGV INOUI,BAYONNE,CHATELLERAULT,1,Tarif Elève - Etudiant - Apprenti,15.0,15.0


In [633]:
# ---- Create 'mean_price' as the average of min_price and max_price ----
price_cols = ['min_price', 'max_price']

df_far_cleaned['mean_price'] = df_far_cleaned[['min_price', 'max_price']].mean(axis=1)
print("✔ 'mean_price' column successfully created as the average of min_price and max_price.")
    
# Drop the original price columns
df_far_cleaned.drop(columns=price_cols, inplace=True)
print(f"✔ Dropped columns: {price_cols}")
    
# Vérification rapide
display(df_far_cleaned[['mean_price']].head())

✔ 'mean_price' column successfully created as the average of min_price and max_price.
✔ Dropped columns: ['min_price', 'max_price']


Unnamed: 0,mean_price
0,67.0
1,7.1
2,28.4
3,60.9
4,15.0


In [634]:
# ---- Weighted mean price by class for each route ----

# Définir les pondérations
weight_class_1 = 0.36  # Classe 1 = 36%
weight_other = 1 - weight_class_1  # L'autre classe = le reste

# Grouper par route et fare_profiling, compter le nombre de classes
route_class_counts = df_far_cleaned.groupby(['departure_station', 'arrival_station', 'fare_profile'])['train_class'].nunique().reset_index()
route_class_counts.rename(columns={'train_class': 'num_classes'}, inplace=True)

# Merge pour ajouter num_classes à df_far_cleaned
df_far_cleaned = df_far_cleaned.merge(route_class_counts, on=['departure_station', 'arrival_station', 'fare_profile'], how='left')

# Créer la colonne mean_weighted_price
df_far_cleaned['mean_weighted_price'] = df_far_cleaned['mean_price']  # Valeur par défaut

# Filtrer uniquement les routes avec plus d'une classe
multi_class_mask = df_far_cleaned['num_classes'] > 1

if multi_class_mask.any():
    # Appliquer la pondération pour ces routes
    df_far_cleaned.loc[multi_class_mask, 'mean_weighted_price'] = df_far_cleaned.apply(
        lambda row: row['mean_price'] * weight_class_1 if row['train_class'] == 1 else row['mean_price'] * weight_other,
        axis=1
    )

# Garder uniquement les colonnes finales
df_far_cleaned_final = df_far_cleaned[['departure_station', 'arrival_station', 'fare_profile', 'mean_weighted_price']].copy()

# Vérification rapide
display(df_far_cleaned_final.head())

Unnamed: 0,departure_station,arrival_station,fare_profile,mean_weighted_price
0,AVIGNON TGV,PERPIGNAN,Tarif Réglementé,24.12
1,CHAMBERY CHALLES LES EAUX,LEPIN LE LAC LA BAUCHE,Tarif Réglementé,2.556
2,BAR LE DUC,CHALONS EN CHAMPAGNE,Tarif Réglementé,10.224
3,BREST,NANTES,Tarif Réglementé,38.976
4,BAYONNE,CHATELLERAULT,Tarif Elève - Etudiant - Apprenti,5.4


In [635]:
# ---- Calculate mean price per route, ignoring fare_profile ----


# Grouper par route et calculer la moyenne des prix
df_far_cleaned = df_far_cleaned.groupby(['departure_station', 'arrival_station'], as_index=False)['mean_weighted_price'].mean()
df_far_cleaned.rename(columns={'mean_weighted_price': 'mean_price_per_route'}, inplace=True)

# Vérification rapide
print("Example of mean price per route:")
display(df_far_cleaned.head())

Example of mean price per route:


Unnamed: 0,departure_station,arrival_station,mean_price_per_route
0,AEROPORT CDG2 TGV ROISSY,AGDE,39.71575
1,AEROPORT CDG2 TGV ROISSY,AIME LA PLAGNE,39.06725
2,AEROPORT CDG2 TGV ROISSY,AIX LES BAINS - LE REVARD,34.07725
3,AEROPORT CDG2 TGV ROISSY,ALBERTVILLE,38.06525
4,AEROPORT CDG2 TGV ROISSY,ANNECY,33.33775


In [636]:
# ---- Check uniqueness of (departure_station, arrival_station) ----
key_cols = ['departure_station', 'arrival_station']

# Compter les doublons sur la clé
duplicate_mask = df_far_cleaned.duplicated(subset=key_cols, keep=False)
num_duplicates = duplicate_mask.sum()

print("## Uniqueness check for (departure_station, arrival_station) keys")
if num_duplicates == 0:
    print("✔ All routes are unique.")
else:
    print(f"⚠ Found {num_duplicates} duplicate route(s).")
    print("Example duplicate routes:")
    display(df_far_cleaned.loc[duplicate_mask, key_cols].drop_duplicates())

## Uniqueness check for (departure_station, arrival_station) keys
✔ All routes are unique.


In [637]:
# Colonnes de clé pour le merge
key_cols = ['departure_station', 'arrival_station']

# Colonnes à conserver après le merge
cols_to_keep = key_cols + ['mean_price_per_route', 'avg_trip_duration']

# Merge des deux datasets
df_merged = df_far_cleaned.merge(
    df_reg[key_cols + ['avg_trip_duration']],  # On ne prend que la colonne avg_trip_duration de df_reg
    on=key_cols,
    how='inner'  # On garde uniquement les routes présentes dans les deux datasets
)

# Garder uniquement les colonnes désirées
df_merged = df_merged[['mean_price_per_route', 'avg_trip_duration']].copy()

# Vérification
print(f"Merged DataFrame shape: {df_merged.shape}")
display(df_merged.head())

Merged DataFrame shape: (782, 2)


Unnamed: 0,mean_price_per_route,avg_trip_duration
0,21.94525,116.0
1,21.94525,116.0
2,21.94525,116.0
3,21.94525,116.0
4,21.94525,117.0


In [638]:
# Save the merged DataFrame to CSV
save_dataframe_to_csv(df_merged, "price-vs-trip_duration.csv", sep=";")

✔ DataFrame successfully saved to 'price-vs-trip_duration.csv'


In [639]:
def csv_to_parquet_optimized(csv_file_path, parquet_file_path, index_col=None, 
                             compression='snappy', chunk_size=None, separator=';'):
    """
    Converts a CSV file to Parquet format with optimizations using PyArrow.

    Args:
        csv_file_path (str): Path to the input CSV file.
        parquet_file_path (str): Path where the output Parquet file will be saved.
        index_col (str, optional): Name of the column to use as index (None by default).
        compression (str, optional): Compression algorithm to use ('snappy', 'gzip', 'brotli', 'zstd').
        chunk_size (int, optional): Number of rows to read at a time for large files (None reads all at once).
    
    Returns:
        bool: True if conversion succeeded, False otherwise.
    """
    start_time = time.time()
    
    print(f"Starting CSV file reading: {csv_file_path}")
    
    try:
        # Read CSV with Pandas
        # Using low_memory=False to prevent dtype warnings on large files
        df = pd.read_csv(csv_file_path, index_col=index_col, low_memory=False, chunksize=chunk_size, sep=separator)
    except FileNotFoundError:
        print(f"Error: CSV file not found at specified location: {csv_file_path}")
        return False
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return False

    read_time = time.time()
    print(f"CSV reading completed in {read_time - start_time:.2f} seconds.")
    print(f"Rows read: {len(df):,}")
    print(f"Columns: {len(df.columns)}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Schema Optimization
    # Convert DataFrame to PyArrow Table
    # This step infers PyArrow schema from Pandas types
    print("Converting to PyArrow table...")
    try:
        table = pa.Table.from_pandas(df, preserve_index=False)
    except Exception as e:
        print(f"Error converting to PyArrow table: {e}")
        return False
    
    # Add Custom Metadata
    # Metadata is stored in the Parquet file footer
    metadata = {
        'creation_tool': 'csv_to_parquet_optimized.py',
        'conversion_timestamp': pd.Timestamp.now().isoformat(),
        'source_file': csv_file_path,
        'original_row_count': str(len(df)),
        'original_column_count': str(len(df.columns)),
        'compression_algorithm': compression
    }
    
    # Integrate metadata into schema
    # PyArrow stores metadata at the schema level
    existing_metadata = table.schema.metadata or {}
    existing_metadata[b'custom_metadata'] = str(metadata).encode('utf8')
    table = table.replace_schema_metadata(existing_metadata)
    
    print(f"Writing Parquet file with '{compression}' compression...")

    # Write Parquet file
    # PyArrow provides efficient Parquet writing
    try:
        pq.write_table(
            table, 
            parquet_file_path, 
            compression=compression,
            use_dictionary=True,       # Efficient for categorical columns
            write_statistics=True,     # Enable statistics for better query performance
            row_group_size=100000,     # Optimize row group size for balance between memory and I/O
            version='2.6'              # Use newer Parquet format version for better features
        )
    except Exception as e:
        print(f"Error writing Parquet file: {e}")
        return False

    end_time = time.time()
    
    # Display results
    import os
    csv_size = os.path.getsize(csv_file_path) / 1024**2
    parquet_size = os.path.getsize(parquet_file_path) / 1024**2
    compression_ratio = (1 - parquet_size / csv_size) * 100
    
    print(f"\nConversion successful!")
    print(f"Parquet file saved to: {parquet_file_path}")
    print(f"Original CSV size: {csv_size:.2f} MB")
    print(f"Parquet file size: {parquet_size:.2f} MB")
    print(f"Compression ratio: {compression_ratio:.1f}%")
    print(f"Total duration: {end_time - start_time:.2f} seconds")
    
    return True

In [640]:
def verify_parquet_file(parquet_file_path, num_rows_preview=5):
    """
    Verifies and displays information about a Parquet file.
    
    Args:
        parquet_file_path (str): Path to the Parquet file to verify.
        num_rows_preview (int): Number of rows to preview (default: 5).
    """
    print(f"\nVerifying Parquet file: {parquet_file_path}")
    
    try:
        parquet_file = pq.ParquetFile(parquet_file_path)
        
        print(f"\nParquet Schema:")
        print(parquet_file.schema)
        
        print(f"\nFile metadata:")
        print(f"Number of row groups: {parquet_file.num_row_groups}")
        print(f"Total rows: {parquet_file.metadata.num_rows:,}")
        
        # Read custom metadata
        metadata_bytes = parquet_file.metadata.metadata.get(b'custom_metadata')
        if metadata_bytes:
            print(f"\nCustom metadata:")
            print(metadata_bytes.decode('utf8'))
        
        # Sample first few rows using pandas read_parquet
        print(f"\nFirst {num_rows_preview} rows preview:")
        df_sample = pd.read_parquet(parquet_file_path, engine='pyarrow').head(num_rows_preview)
        display(df_sample)
        
    except Exception as e:
        print(f"Error verifying Parquet file: {e}")

In [641]:
# Replace with your actual file paths
input_csv = 'tgv-monthly-regularity_cleaned.csv'
output_parquet = 'tgv-monthly-regularity_cleaned.parquet'
    
 # Convert CSV to Parquet
success = csv_to_parquet_optimized(
    input_csv, 
    output_parquet, 
    compression='snappy', # Options: 'snappy', 'gzip', 'brotli', 'zstd'
    separator=";"
)
    
# Verify the conversion if successful
if success:
    verify_parquet_file(output_parquet)

Starting CSV file reading: tgv-monthly-regularity_cleaned.csv
CSV reading completed in 0.06 seconds.
Rows read: 9,381
Columns: 24
Memory usage: 3.11 MB
Converting to PyArrow table...
Writing Parquet file with 'snappy' compression...

Conversion successful!
Parquet file saved to: tgv-monthly-regularity_cleaned.parquet
Original CSV size: 1.71 MB
Parquet file size: 0.67 MB
Compression ratio: 60.9%
Total duration: 0.42 seconds

Verifying Parquet file: tgv-monthly-regularity_cleaned.parquet

Parquet Schema:
<pyarrow._parquet.ParquetSchema object at 0x000001C4DB02E040>
required group field_id=-1 schema {
  optional int64 field_id=-1 year;
  optional int64 field_id=-1 month;
  optional binary field_id=-1 service (String);
  optional binary field_id=-1 departure_station (String);
  optional binary field_id=-1 arrival_station (String);
  optional double field_id=-1 avg_trip_duration;
  optional int64 field_id=-1 scheduled_trains;
  optional int64 field_id=-1 canceled_trains;
  optional int64 fi

Unnamed: 0,year,month,service,departure_station,arrival_station,avg_trip_duration,scheduled_trains,canceled_trains,trains_delayed_departure,avg_delay_delayed_trains_departure,...,trains_delayed_over_15min,avg_delay_over_15min,trains_delayed_over_30min,trains_delayed_over_60min,pct_delay_external_causes,pct_delay_infrastructure,pct_delay_traffic_management,pct_delay_rolling_stock,pct_delay_station_operations,pct_delay_passenger_handling
0,2018,1,National,GRENOBLE,PARIS LYON,183.0,245,0,37,8.027027,...,25,6.123741,13,6,17.647059,52.941176,0.0,23.529412,5.882353,0.0
1,2018,1,National,MARSEILLE ST CHARLES,LYON PART DIEU,106.0,557,7,133,6.978195,...,40,5.195333,19,5,23.076923,23.076923,19.230769,23.076923,3.846154,7.692308
2,2018,1,National,PARIS NORD,DUNKERQUE,116.0,271,3,46,11.236594,...,18,3.738806,9,4,35.714286,28.571429,7.142857,25.0,3.571429,0.0
3,2018,1,National,ANNECY,PARIS LYON,224.0,198,0,12,8.070833,...,38,8.552525,14,5,23.809524,42.857143,9.52381,14.285714,4.761905,4.761905
4,2018,1,National,TOULOUSE MATABIAU,PARIS MONTPARNASSE,257.0,184,0,23,11.221739,...,26,7.510507,12,3,59.090909,22.727273,4.545455,9.090909,4.545455,0.0


In [642]:
# Replace with your actual file paths
input_csv = 'tgv-inoui-ouigo-fares_cleaned.csv'
output_parquet = 'tgv-inoui-ouigo-fares_cleaned.parquet'
    
 # Convert CSV to Parquet
success = csv_to_parquet_optimized(
    input_csv, 
    output_parquet, 
    compression='snappy', # Options: 'snappy', 'gzip', 'brotli', 'zstd'
    separator=";"
)
    
# Verify the conversion if successful
if success:
    verify_parquet_file(output_parquet)

Starting CSV file reading: tgv-inoui-ouigo-fares_cleaned.csv
CSV reading completed in 0.09 seconds.
Rows read: 33,418
Columns: 7
Memory usage: 9.00 MB
Converting to PyArrow table...
Writing Parquet file with 'snappy' compression...

Conversion successful!
Parquet file saved to: tgv-inoui-ouigo-fares_cleaned.parquet
Original CSV size: 2.31 MB
Parquet file size: 0.17 MB
Compression ratio: 92.8%
Total duration: 0.25 seconds

Verifying Parquet file: tgv-inoui-ouigo-fares_cleaned.parquet

Parquet Schema:
<pyarrow._parquet.ParquetSchema object at 0x000001C4DDBC3100>
required group field_id=-1 schema {
  optional binary field_id=-1 tgv_types (String);
  optional binary field_id=-1 departure_station (String);
  optional binary field_id=-1 arrival_station (String);
  optional int64 field_id=-1 train_class;
  optional binary field_id=-1 fare_profile (String);
  optional double field_id=-1 min_price;
  optional double field_id=-1 max_price;
}


File metadata:
Number of row groups: 1
Total rows: 3

Unnamed: 0,tgv_types,departure_station,arrival_station,train_class,fare_profile,min_price,max_price
0,TGV INOUI,AVIGNON TGV,PERPIGNAN,1,Tarif Réglementé,67.0,67.0
1,TGV INOUI,CHAMBERY CHALLES LES EAUX,LEPIN LE LAC LA BAUCHE,1,Tarif Réglementé,7.1,7.1
2,TGV INOUI,BAR LE DUC,CHALONS EN CHAMPAGNE,1,Tarif Réglementé,28.4,28.4
3,TGV INOUI,BREST,NANTES,2,Tarif Réglementé,60.9,60.9
4,TGV INOUI,BAYONNE,CHATELLERAULT,1,Tarif Elève - Etudiant - Apprenti,15.0,15.0


In [643]:
# Replace with your actual file paths
input_csv = 'price-vs-trip_duration.csv'
output_parquet = 'price-vs-trip_duration.parquet'
    
 # Convert CSV to Parquet
success = csv_to_parquet_optimized(
    input_csv, 
    output_parquet, 
    compression='snappy', # Options: 'snappy', 'gzip', 'brotli', 'zstd'
    separator=";"
)
    
# Verify the conversion if successful
if success:
    verify_parquet_file(output_parquet)

Starting CSV file reading: price-vs-trip_duration.csv
CSV reading completed in 0.01 seconds.
Rows read: 782
Columns: 2
Memory usage: 0.01 MB
Converting to PyArrow table...
Writing Parquet file with 'snappy' compression...

Conversion successful!
Parquet file saved to: price-vs-trip_duration.parquet
Original CSV size: 0.01 MB
Parquet file size: 0.00 MB
Compression ratio: 72.1%
Total duration: 0.03 seconds

Verifying Parquet file: price-vs-trip_duration.parquet

Parquet Schema:
<pyarrow._parquet.ParquetSchema object at 0x000001C4DB2CF1C0>
required group field_id=-1 schema {
  optional double field_id=-1 mean_price_per_route;
  optional double field_id=-1 avg_trip_duration;
}


File metadata:
Number of row groups: 1
Total rows: 782

Custom metadata:
{'creation_tool': 'csv_to_parquet_optimized.py', 'conversion_timestamp': '2025-12-03T23:00:28.895714', 'source_file': 'price-vs-trip_duration.csv', 'original_row_count': '782', 'original_column_count': '2', 'compression_algorithm': 'snappy'}



Unnamed: 0,mean_price_per_route,avg_trip_duration
0,21.94525,116.0
1,21.94525,116.0
2,21.94525,116.0
3,21.94525,116.0
4,21.94525,117.0


In [518]:
import duckdb

dbms_columnar = duckdb.connect("database.duckdb")

dbms_columnar.execute("""
    CREATE TABLE IF NOT EXISTS TGV_table AS
    SELECT * FROM 'data.parquet';
""")

print(dbms_columnar.execute("SELECT COUNT(*) FROM TGV_table").fetchall())

[(10687,)]
