In [83]:
import warnings
warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output

In [84]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import time
import requests
from IPython.display import FileLink
from ydata_profiling import ProfileReport

#### Data Collection

In [57]:
def sncf_dataset(sncf_dataset_id, file_name, delimiter=";", 
                      list_separator=",", quote_all="false", with_bom="true"):
    """
    Downloads a CSV from a specified SNCF dataset and returns a clickable link 
    for download in a Jupyter Notebook.
    
    Args:
        id_sncf_dataset (str): The identifier of the SNCF dataset to download.
        filename (str): Local filename to save the CSV.
        delimiter (str, optional): Field delimiter in the CSV. Default is ";".
        list_separator (str, optional): Separator for list values. Default is ",".
        quote_all (str, optional): Whether to quote all fields. Default is "false".
        with_bom (str, optional): Whether to include a BOM in the CSV. Default is "true".
        
    Returns:
        FileLink: A clickable link to download the CSV file in the notebook.
    """
    
    url = f"https://data.sncf.com/api/explore/v2.1/catalog/datasets/{sncf_dataset_id}/exports/csv"
    params = {
        "delimiter": delimiter,
        "list_separator": list_separator,
        "quote_all": quote_all,
        "with_bom": with_bom,
    }

    response = requests.get(url, params=params)
    response.raise_for_status()

    with open(file_name, "wb") as f:
        f.write(response.content)

    return FileLink(file_name)

In [58]:
# --- SNCF Dataset Downloads ---

# Regularity dataset
dataset_id_regularity = "regularite-mensuelle-tgv-aqst"
filename_regularity = "tgv-monthly-regularity.csv"
sncf_dataset(dataset_id_regularity, filename_regularity)

In [59]:
# TGV/OUIGO fares dataset
dataset_id_fares = "tarifs-tgv-inoui-ouigo"
filename_fares = "tgv-inoui-ouigo-fares.csv"
sncf_dataset(dataset_id_fares, filename_fares)

In [60]:
def load_and_display_data(file_path, separator=";"):
    """
    Loads a CSV file and displays its shape, with robust checks for errors.

    Args:
        file_path (str): The path to the CSV file.
        separator (str): The column separator to use.
    """
    # Check if the file exists
    if not os.path.exists(file_path):
        print(f"Error: The specified file was not found at the path: {file_path}")
        return

    try:
        # Attempt to read the file using the specified separator
        df_raw = pd.read_csv(file_path, sep=separator)

        # Check if the DataFrame is empty after loading
        if df_raw.empty:
            print(f"Warning: File {file_path} was loaded but is empty.")
            return

        print(f"Successful loading of file: {file_path}")
        print("---")
        
        # Display the shape (number of rows, number of columns) of the DataFrame
        print("DataFrame shape (rows, columns):")
        print(df_raw.shape)

    except pd.errors.ParserError as e:
        # Handle parsing errors (e.g., wrong separator, malformed file)
        print(f"Parsing Error while reading the file: {e}")
        print(f"Suggestion: Check if the separator (sep='{separator}') and encoding are correct.")
        
    except Exception as e:
        # Handle any other unexpected error
        print(f"An unexpected error occurred: {e}")

    return df_raw

In [61]:
# ==== Load df_reg ====
print("\n=== Loading dataset: tgv-monthly-regularity.csv ===")
file_name = "tgv-monthly-regularity.csv"
df_reg = load_and_display_data(file_name, separator=";")

if df_reg is not None:
    print("✔ DataFrame 'df_reg' is ready for processing.\n")
    display(df_reg.head())
else:
    print("✘ Failed to load 'df_reg'. Processing stopped.\n")


=== Loading dataset: tgv-monthly-regularity.csv ===
Successful loading of file: tgv-monthly-regularity.csv
---
DataFrame shape (rows, columns):
(10687, 26)
✔ DataFrame 'df_reg' is ready for processing.



Unnamed: 0,date,service,gare_depart,gare_arrivee,duree_moyenne,nb_train_prevu,nb_annulation,commentaire_annulation,nb_train_depart_retard,retard_moyen_depart,...,nb_train_retard_sup_15,retard_moyen_trains_retard_sup15,nb_train_retard_sup_30,nb_train_retard_sup_60,prct_cause_externe,prct_cause_infra,prct_cause_gestion_trafic,prct_cause_materiel_roulant,prct_cause_gestion_gare,prct_cause_prise_en_charge_voyageurs
0,2018-01,National,GRENOBLE,PARIS LYON,183,245,0,,37,8.027027,...,25,6.123741,13,6,17.647059,52.941176,0.0,23.529412,5.882353,0.0
1,2018-01,International,PARIS LYON,ITALIE,394,94,0,,27,11.261728,...,22,11.601064,15,6,33.333333,19.047619,23.809524,14.285714,9.52381,0.0
2,2018-01,National,MARSEILLE ST CHARLES,LYON PART DIEU,106,557,7,,133,6.978195,...,40,5.195333,19,5,23.076923,23.076923,19.230769,23.076923,3.846154,7.692308
3,2018-01,National,PARIS NORD,DUNKERQUE,116,271,3,,46,11.236594,...,18,3.738806,9,4,35.714286,28.571429,7.142857,25.0,3.571429,0.0
4,2018-01,National,ANNECY,PARIS LYON,224,198,0,,12,8.070833,...,38,8.552525,14,5,23.809524,42.857143,9.52381,14.285714,4.761905,4.761905


In [62]:
# ==== Load df_far ====
print("\n=== Loading dataset: tgv-inoui-ouigo-fares.csv ===")
file_name = "tgv-inoui-ouigo-fares.csv"
df_far = load_and_display_data(file_name, separator=";")

if df_far is not None:
    print("✔ DataFrame 'df_far' is ready for processing.\n")
    display(df_far.head())
else:
    print("✘ Failed to load 'df_far'. Processing stopped.\n")


=== Loading dataset: tgv-inoui-ouigo-fares.csv ===
Successful loading of file: tgv-inoui-ouigo-fares.csv
---
DataFrame shape (rows, columns):
(34135, 9)
✔ DataFrame 'df_far' is ready for processing.



Unnamed: 0,transporteur,gare_origine,gare_origine_code_uic,gare_destination,gare_destination_code_uic,classe,profil_tarifaire,prix_minimum,prix_maximum
0,TGV INOUI,AVIGNON TGV,87318964,PERPIGNAN,87784009,1,Tarif Réglementé,67.0,67.0
1,TGV INOUI,CHAMBERY CHALLES LES EAUX,87741009,LEPIN LE LAC LA BAUCHE,87741439,1,Tarif Réglementé,7.1,7.1
2,TGV INOUI,BAR LE DUC,87175042,CHALONS EN CHAMPAGNE,87174003,1,Tarif Réglementé,28.4,28.4
3,TGV INOUI,BREST,87474007,NANTES,87481002,2,Tarif Réglementé,60.9,60.9
4,TGV INOUI,BAYONNE,87673004,CHATELLERAULT,87575142,1,Tarif Elève - Etudiant - Apprenti,15.0,15.0


In [63]:
def print_dataframe_columns(df, df_name="DataFrame"):
    """
    Print all column names of a DataFrame with a clean, numbered format.
    
    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame whose columns should be displayed.
    df_name : str, optional
        A label used for printing, by default "DataFrame".
    """
    
    print(f"\n## Columns in {df_name}\n")
    
    print("Column names:")
    for i, col in enumerate(df.columns.tolist(), 1):
        print(f"  {i}. {col}")

    print(f"\nTotal Columns: {len(df.columns)}")

In [64]:
# Print columns of df_reg
print_dataframe_columns(df_reg, df_name="df_reg")


## Columns in df_reg

Column names:
  1. date
  2. service
  3. gare_depart
  4. gare_arrivee
  5. duree_moyenne
  6. nb_train_prevu
  7. nb_annulation
  8. commentaire_annulation
  9. nb_train_depart_retard
  10. retard_moyen_depart
  11. retard_moyen_tous_trains_depart
  12. commentaire_retards_depart
  13. nb_train_retard_arrivee
  14. retard_moyen_arrivee
  15. retard_moyen_tous_trains_arrivee
  16. commentaires_retard_arrivee
  17. nb_train_retard_sup_15
  18. retard_moyen_trains_retard_sup15
  19. nb_train_retard_sup_30
  20. nb_train_retard_sup_60
  21. prct_cause_externe
  22. prct_cause_infra
  23. prct_cause_gestion_trafic
  24. prct_cause_materiel_roulant
  25. prct_cause_gestion_gare
  26. prct_cause_prise_en_charge_voyageurs

Total Columns: 26


In [65]:
# Print columns of df_far
print_dataframe_columns(df_far, df_name="df_far")


## Columns in df_far

Column names:
  1. transporteur
  2. gare_origine
  3. gare_origine_code_uic
  4. gare_destination
  5. gare_destination_code_uic
  6. classe
  7. profil_tarifaire
  8. prix_minimum
  9. prix_maximum

Total Columns: 9


In [66]:
def rename_dataframe_columns(df, rename_dict):
    """
    Rename the columns of a DataFrame based on a dictionary mapping.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame whose columns should be renamed.
    rename_dict : dict
        Dictionary mapping old column names to new column names.

    Returns
    -------
    pandas.DataFrame
        The DataFrame with renamed columns.
    """
    df = df.rename(columns=rename_dict)
    return df

In [67]:
rename_dict_reg = {
    "date": "date",
    "service": "service",
    "gare_depart": "departure_station",
    "gare_arrivee": "arrival_station",
    "duree_moyenne": "avg_trip_duration",
    "nb_train_prevu": "scheduled_trains",
    "nb_annulation": "canceled_trains",
    "commentaire_annulation": "cancellation_comments",
    "nb_train_depart_retard": "trains_delayed_departure",
    "retard_moyen_depart": "avg_delay_delayed_trains_departure",
    "retard_moyen_tous_trains_depart": "avg_delay_all_trains_departure",
    "commentaire_retards_depart": "departure_delay_comments",
    "nb_train_retard_arrivee": "trains_delayed_arrival",
    "retard_moyen_arrivee": "avg_delay_delayed_trains_arrival",
    "retard_moyen_tous_trains_arrivee": "avg_delay_all_trains_arrival",
    "commentaires_retard_arrivee": "arrival_delay_comments",
    "nb_train_retard_sup_15": "trains_delayed_over_15min",
    "retard_moyen_trains_retard_sup15": "avg_delay_over_15min",
    "nb_train_retard_sup_30": "trains_delayed_over_30min",
    "nb_train_retard_sup_60": "trains_delayed_over_60min",
    "prct_cause_externe": "pct_delay_external_causes",
    "prct_cause_infra": "pct_delay_infrastructure",
    "prct_cause_gestion_trafic": "pct_delay_traffic_management",
    "prct_cause_materiel_roulant": "pct_delay_rolling_stock",
    "prct_cause_gestion_gare": "pct_delay_station_operations",
    "prct_cause_prise_en_charge_voyageurs": "pct_delay_passenger_handling"
}

df_reg = rename_dataframe_columns(df_reg, rename_dict_reg)
display(df_reg.head())

Unnamed: 0,date,service,departure_station,arrival_station,avg_trip_duration,scheduled_trains,canceled_trains,cancellation_comments,trains_delayed_departure,avg_delay_delayed_trains_departure,...,trains_delayed_over_15min,avg_delay_over_15min,trains_delayed_over_30min,trains_delayed_over_60min,pct_delay_external_causes,pct_delay_infrastructure,pct_delay_traffic_management,pct_delay_rolling_stock,pct_delay_station_operations,pct_delay_passenger_handling
0,2018-01,National,GRENOBLE,PARIS LYON,183,245,0,,37,8.027027,...,25,6.123741,13,6,17.647059,52.941176,0.0,23.529412,5.882353,0.0
1,2018-01,International,PARIS LYON,ITALIE,394,94,0,,27,11.261728,...,22,11.601064,15,6,33.333333,19.047619,23.809524,14.285714,9.52381,0.0
2,2018-01,National,MARSEILLE ST CHARLES,LYON PART DIEU,106,557,7,,133,6.978195,...,40,5.195333,19,5,23.076923,23.076923,19.230769,23.076923,3.846154,7.692308
3,2018-01,National,PARIS NORD,DUNKERQUE,116,271,3,,46,11.236594,...,18,3.738806,9,4,35.714286,28.571429,7.142857,25.0,3.571429,0.0
4,2018-01,National,ANNECY,PARIS LYON,224,198,0,,12,8.070833,...,38,8.552525,14,5,23.809524,42.857143,9.52381,14.285714,4.761905,4.761905


In [68]:
rename_dict_far = {
    "transporteur": "tgv_types",
    "gare_origine": "departure_station",
    "gare_origine_code_uic": "departure_station_uic",
    "gare_destination": "arrival_station",
    "gare_destination_code_uic": "arrival_station_uic",
    "classe	": "train_class",
    "profil_tarifaire": "fare_profile",
    "prix_minimum": "min_price",
    "prix_maximum": "max_price"
}


df_far = rename_dataframe_columns(df_far, rename_dict_far)
display(df_far.head())

Unnamed: 0,tgv_types,departure_station,departure_station_uic,arrival_station,arrival_station_uic,classe,fare_profile,min_price,max_price
0,TGV INOUI,AVIGNON TGV,87318964,PERPIGNAN,87784009,1,Tarif Réglementé,67.0,67.0
1,TGV INOUI,CHAMBERY CHALLES LES EAUX,87741009,LEPIN LE LAC LA BAUCHE,87741439,1,Tarif Réglementé,7.1,7.1
2,TGV INOUI,BAR LE DUC,87175042,CHALONS EN CHAMPAGNE,87174003,1,Tarif Réglementé,28.4,28.4
3,TGV INOUI,BREST,87474007,NANTES,87481002,2,Tarif Réglementé,60.9,60.9
4,TGV INOUI,BAYONNE,87673004,CHATELLERAULT,87575142,1,Tarif Elève - Etudiant - Apprenti,15.0,15.0


In [78]:
# ===== DataFrame Summary: df_reg =====
print("\n" + "="*60)
print("## DataFrame Information: df_reg")
print("="*60 + "\n")

df_reg.info(memory_usage='deep')
print("\n" + "="*60 + "\n")


## DataFrame Information: df_reg

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10687 entries, 0 to 10686
Data columns (total 26 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   date                                10687 non-null  object 
 1   service                             10687 non-null  object 
 2   departure_station                   10687 non-null  object 
 3   arrival_station                     10687 non-null  object 
 4   avg_trip_duration                   10687 non-null  int64  
 5   scheduled_trains                    10687 non-null  int64  
 6   canceled_trains                     10687 non-null  int64  
 7   cancellation_comments               0 non-null      float64
 8   trains_delayed_departure            10687 non-null  int64  
 9   avg_delay_delayed_trains_departure  10687 non-null  float64
 10  avg_delay_all_trains_departure      10687 non-null  float64
 11  depart

In [79]:
# ===== DataFrame Summary: df_far =====
print("\n" + "="*60)
print("## DataFrame Information: df_far")
print("="*60 + "\n")

df_far.info(memory_usage='deep')
print("\n" + "="*60 + "\n")


## DataFrame Information: df_far

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34135 entries, 0 to 34134
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   tgv_types              34135 non-null  object 
 1   departure_station      34135 non-null  object 
 2   departure_station_uic  34135 non-null  int64  
 3   arrival_station        34135 non-null  object 
 4   arrival_station_uic    34135 non-null  int64  
 5   classe                 34135 non-null  int64  
 6   fare_profile           34135 non-null  object 
 7   min_price              34135 non-null  float64
 8   max_price              34135 non-null  float64
dtypes: float64(2), int64(3), object(4)
memory usage: 9.7 MB




#### Data Cleaning

In [85]:
def generate_profiling_report(df, output_file="profiling_report.html", title="Data Profiling Report"):
    """
    Generate an HTML profiling report for a DataFrame using ydata_profiling.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to profile.
    output_file : str, optional
        The name of the output HTML file (default is "profiling_report.html").
    title : str, optional
        The title of the profiling report (default is "Data Profiling Report").
    """
    try:
        print(f"\n## Generating Profiling Report: {title} ...")
        
        profile = ProfileReport(
            df,
            title=title,
            sort=None,        # Use 'None' for original order, 'alphabetical' for sorting
            explorative=True
        )
        
        profile.to_file(output_file)
        print(f"✔ Profiling report successfully exported to: {output_file}")

    except ImportError:
        print("Error: The 'ydata-profiling' library is required.")
        print("Install it using: `pip install ydata-profiling`")

    except Exception as e:
        print(f"An error occurred during report generation: {e}")

In [86]:
# Generate profiling report for df_reg
generate_profiling_report(df_reg, output_file="tgv_regularity_report.html", title="TGV Regularity Profiling")


## Generating Profiling Report: TGV Regularity Profiling ...


100%|██████████| 26/26 [00:00<00:00, 306.03it/s]<00:00, 39.27it/s, Describe variable: pct_delay_passenger_handling]
Summarize dataset: 100%|██████████| 396/396 [00:27<00:00, 14.65it/s, Completed]                                                                     
Generate report structure: 100%|██████████| 1/1 [00:04<00:00,  4.83s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.79s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 12.64it/s]

✔ Profiling report successfully exported to: tgv_regularity_report.html





In [87]:
# Generate profiling report for df_far
generate_profiling_report(df_far, output_file="tgv_fares_report.html", title="TGV Fares Profiling")


## Generating Profiling Report: TGV Fares Profiling ...


100%|██████████| 9/9 [00:00<00:00, 319.75it/s]0<00:01,  5.63it/s, Describe variable: max_price]
Summarize dataset: 100%|██████████| 34/34 [00:03<00:00, 11.23it/s, Completed]                                           
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.52s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  3.80it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 58.91it/s]

✔ Profiling report successfully exported to: tgv_fares_report.html





In [88]:
def missing_values_summary(df, df_name="DataFrame"):
    """
    Calculate and display missing values (count and percentage) for a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to analyze.
    df_name : str, optional
        Name of the DataFrame for display purposes (default is "DataFrame").
    """
    missing = pd.DataFrame({
        'Missing Count': df.isna().sum(),
        'Missing Percent': (df.isna().sum() / len(df) * 100).round(2)
    }).sort_values('Missing Percent', ascending=False)
    
    print(f"\n## Missing Value Summary for {df_name}\n")
    
    # Filter for columns with at least one missing value
    missing_data_summary = missing[missing['Missing Count'] > 0]
    
    if not missing_data_summary.empty:
        print(f"{len(missing_data_summary)} columns have missing values.")
        print("---")
        display(missing_data_summary)
    else:
        print(f"Great! No missing values found in {df_name}.")

In [89]:
# Generate missing values summary for df_far
missing_values_summary(df_reg, "df_reg")


## Missing Value Summary for df_reg

3 columns have missing values.
---


Unnamed: 0,Missing Count,Missing Percent
cancellation_comments,10687,100.0
departure_delay_comments,10687,100.0
arrival_delay_comments,9989,93.47


In [90]:
# Generate missing values summary for df_far
missing_values_summary(df_far, "df_far")


## Missing Value Summary for df_far

Great! No missing values found in df_far.


In [73]:
# Define the list of columns to be dropped
# These columns often contain qualitative/text data or a high percentage of missing values.
columns_to_drop = [
    'Commentaire annulations',
    'Commentaire retards au départ'
]

# Use a single efficient .drop() call
# The 'errors="ignore"' argument is a key improvement: it prevents the code from
# crashing if one of the columns listed above was already dropped or doesn't exist.
df_raw = df_raw.drop(columns=columns_to_drop, errors='ignore')
    
print("\n## Column Deletion Summary\n")
print(f"Successfully dropped the following columns (if they existed):")

for col in columns_to_drop:
    print(f"- {col}")
        
print("\n---")
print(f"New DataFrame shape: {df_raw.shape}")


## Column Deletion Summary

Successfully dropped the following columns (if they existed):
- Commentaire annulations
- Commentaire retards au départ

---
New DataFrame shape: (10687, 26)


In [74]:
# Calculate the total number of duplicate rows (where the entire row is identical)
num_duplicates = df_raw.duplicated().sum()
    
print("\n## Duplicate Row Check\n")
    
if num_duplicates > 0:
    print(f"Warning: Found {num_duplicates} duplicate row(s) in the DataFrame.")
        
    # Optional: Print the percentage of duplicate rows for better context
    percent_duplicates = (num_duplicates / len(df_raw) * 100).round(2)
    print(f"This represents {percent_duplicates}% of the total data.")
        
    # Optional: Display the first few duplicate rows for inspection
    # Keep='first' means the second, third, etc., occurrences are marked True
    print("\nFirst 5 duplicate entries (excluding the first occurrence):")
    display(df_raw[df_raw.duplicated(keep='first')].head())
        
else:
    print("Great! No full-row duplicates were found in the DataFrame.")


## Duplicate Row Check

Great! No full-row duplicates were found in the DataFrame.


In [75]:
# Identify object (string) columns using select_dtypes
# This is often more explicit and faster than iterating over all columns and checking dtype
object_cols = df_raw.select_dtypes(include=['object']).columns
    
print("\n## Trimming Whitespace in String Columns\n")
    
if len(object_cols) > 0:
        
# Apply the strip function using .loc for explicit assignment and better performance
    for col in object_cols:
        # We explicitly handle potential NaN values by converting them to string
        # and then applying str.strip(). This ensures all cells are processed, 
        # though str.strip() on a true NaN string ('nan') will result in 'nan'.
        # A more robust approach, applied here, is to use .str.strip() which 
        # correctly handles NaN values (keeping them as NaN).
            
        # The .str accessor safely applies the string method, returning NaN for NaN inputs.
        df_raw.loc[:, col] = df_raw[col].str.strip() 

    print(f"Whitespace trimmed in {len(object_cols)} object columns.")
    print("Columns processed:")

    for col in object_cols:
        print(f"- {col}")
else:
    print("Warning: No columns of type 'object' found to trim whitespace.")


## Trimming Whitespace in String Columns

Whitespace trimmed in 5 object columns.
Columns processed:
- date
- service
- gare_depart
- gare_arrivee
- commentaires_retard_arrivee


In [76]:
date_column = "Date"
    
print(f"\n## Date Column Conversion ({date_column})\n")

# Check if the 'Date' column exists
if date_column in df_raw.columns:
        
    # Store the original dtype for comparison
    original_dtype = df_raw[date_column].dtype
        
    try:
        # Use errors='coerce' to handle values that don't match the specified format.
        # These values will be converted to NaT (Not a Time), which is preferable 
        # to crashing the script.
        df_raw[date_column] = pd.to_datetime(
            df_raw[date_column], 
            format="%Y-%m",
            errors='coerce' # Key improvement for error handling
        )

        new_dtype = df_raw[date_column].dtype
        print(f"Conversion successful!")
        print(f"Original data type: {original_dtype}")
        print(f"New data type: {new_dtype}")

        # Check for NaT values introduced during coercion
        nat_count = df_raw[date_column].isna().sum()
        if nat_count > 0:
            print(f"Warning: {nat_count} value(s) could not be parsed and were converted to NaT.")
                
        print("\nFirst 5 rows with new date format:")
        display(df_raw.head())

    except Exception as e:
        print(f"Error during date conversion: {e}")
        print("Suggestion: Verify the data format and column content.")
            
else:
    print(f"Warning: Column '{date_column}' not found in the DataFrame. Conversion skipped.")


## Date Column Conversion (Date)



In [77]:
# checking that 'Nombre de circulations prévues' >= 'Nombre de trains annulés'

invalid_rows = df_raw[df_raw['Nombre de circulations prévues'] < df_raw['Nombre de trains annulés']]
print(f"Number of rows where the number of scheduled trains is less than the number of cancelled trains: {len(invalid_rows)}")

print("Rows where the number of trains scheduled trains equals the number of cancelled trains:")
df_raw[df_raw['Nombre de circulations prévues'] == df_raw['Nombre de trains annulés']]

KeyError: 'Nombre de circulations prévues'

In [None]:
# Impute rows where 'Nombre de circulations prévues' < 'Nombre de trains annulés' with the mean of 'Nombre de circulations prévues' for trains with same 'Gare de départ' and 'Gare d'arrivée' 
for index, row in invalid_rows.iterrows():
    mask = (df_raw['Gare de départ'] == row['Gare de départ']) & (df_raw['Gare d\'arrivée'] == row['Gare d\'arrivée']) & (df_raw.index != index)
    mean_value = df_raw.loc[mask, 'Nombre de circulations prévues'].mean()
    df_raw.at[index, 'Nombre de circulations prévues'] = mean_value

# Verify that there are no more invalid rows
invalid_rows_after = df_raw[df_raw['Nombre de circulations prévues'] < df_raw['Nombre de trains annulés']]
print(f"Number of invalid rows after imputation: {len(invalid_rows_after)}")

invalid_rows_after.head()

In [None]:
# checking why there are still 10 rows where nb of scheduled trains is inferior to nb of cancelled trains

# check trains with 'Gare de départ' NANTES and 'Gare d'arrivée' STRASBOURG
df_raw[(df_raw['Gare de départ'] == 'NANTES') & (df_raw['Gare d\'arrivée'] == 'STRASBOURG')].head()

# check trains with 'Gare de départ' MARSEILLE ST CHARLES and 'Gare d'arrivée' TOURCOING
df_raw[(df_raw['Gare de départ'] == 'MARSEILLE ST CHARLES') & (df_raw['Gare d\'arrivée'] == 'TOURCOING')].head()

# check trains with 'Gare de départ' BORDEAUX ST JEAN and 'Gare d'arrivée' TOURCOING	
df_raw[(df_raw['Gare de départ'] == 'BORDEAUX ST JEAN') & (df_raw['Gare d\'arrivée'] == 'BORDEAUX ST JEAN')].head()

# check trains with 'Gare de départ' TOURCOING and 'Gare d'arrivée' BORDEAUX ST JEAN	
df_raw[(df_raw['Gare de départ'] == 'TOURCOING') & (df_raw['Gare d\'arrivée'] == 'BORDEAUX ST JEAN')].head()

# check trains with 'Gare de départ' MADRID and 'Gare d'arrivée' MARSEILLE ST CHARLES
df_raw[(df_raw['Gare de départ'] == 'MADRID') & (df_raw['Gare d\'arrivée'] == 'MARSEILLE ST CHARLES')].head()	

In [None]:
# drop rows where 'Nombre de circulations prévues' is 0 and 'Nombre de trains annulés' > 0

df_raw.drop(df_raw[(df_raw['Nombre de circulations prévues'] == 0) & (df_raw['Nombre de trains annulés'] > 0)].index, inplace=True)

# checking now in the raw dataset if there are still rows where 'Nombre de circulations prévues' < 'Nombre de trains annulés'
invalid_rows_final = df_raw[df_raw['Nombre de circulations prévues'] < df_raw['Nombre de trains annulés']]
print(f"Number of invalid rows after dropping inconsistent data: {len(invalid_rows_final)}")

In [None]:
df_raw[df_raw['Nombre de trains en retard au départ'] > df_raw["Nombre de circulations prévues"]].count

In [None]:
# Etude Outliers

In [None]:
# checking the rows where 'Retard moyen de tous les trains à l'arrivée' < -30, meaning the train has more than 
# 30 minutes of advance on the schedule, which might seem a little strange.

df_raw[df_raw['Retard moyen de tous les trains à l\'arrivée'] < -30].head()

In [None]:
# save clean

In [None]:
def csv_to_parquet_optimized(csv_file_path, parquet_file_path, index_col=None, 
                             compression='snappy', chunk_size=None, separator=','):
    """
    Converts a CSV file to Parquet format with optimizations using PyArrow.

    Args:
        csv_file_path (str): Path to the input CSV file.
        parquet_file_path (str): Path where the output Parquet file will be saved.
        index_col (str, optional): Name of the column to use as index (None by default).
        compression (str, optional): Compression algorithm to use ('snappy', 'gzip', 'brotli', 'zstd').
        chunk_size (int, optional): Number of rows to read at a time for large files (None reads all at once).
    
    Returns:
        bool: True if conversion succeeded, False otherwise.
    """
    start_time = time.time()
    
    print(f"Starting CSV file reading: {csv_file_path}")
    
    try:
        # Read CSV with Pandas
        # Using low_memory=False to prevent dtype warnings on large files
        df = pd.read_csv(csv_file_path, index_col=index_col, low_memory=False, chunksize=chunk_size, sep=separator)
    except FileNotFoundError:
        print(f"Error: CSV file not found at specified location: {csv_file_path}")
        return False
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return False

    read_time = time.time()
    print(f"CSV reading completed in {read_time - start_time:.2f} seconds.")
    print(f"Rows read: {len(df):,}")
    print(f"Columns: {len(df.columns)}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Schema Optimization
    # Convert DataFrame to PyArrow Table
    # This step infers PyArrow schema from Pandas types
    print("Converting to PyArrow table...")
    try:
        table = pa.Table.from_pandas(df, preserve_index=False)
    except Exception as e:
        print(f"Error converting to PyArrow table: {e}")
        return False
    
    # Add Custom Metadata
    # Metadata is stored in the Parquet file footer
    metadata = {
        'creation_tool': 'csv_to_parquet_optimized.py',
        'conversion_timestamp': pd.Timestamp.now().isoformat(),
        'source_file': csv_file_path,
        'original_row_count': str(len(df)),
        'original_column_count': str(len(df.columns)),
        'compression_algorithm': compression
    }
    
    # Integrate metadata into schema
    # PyArrow stores metadata at the schema level
    existing_metadata = table.schema.metadata or {}
    existing_metadata[b'custom_metadata'] = str(metadata).encode('utf8')
    table = table.replace_schema_metadata(existing_metadata)
    
    print(f"Writing Parquet file with '{compression}' compression...")

    # Write Parquet file
    # PyArrow provides efficient Parquet writing
    try:
        pq.write_table(
            table, 
            parquet_file_path, 
            compression=compression,
            use_dictionary=True,       # Efficient for categorical columns
            write_statistics=True,     # Enable statistics for better query performance
            row_group_size=100000,     # Optimize row group size for balance between memory and I/O
            version='2.6'              # Use newer Parquet format version for better features
        )
    except Exception as e:
        print(f"Error writing Parquet file: {e}")
        return False

    end_time = time.time()
    
    # Display results
    import os
    csv_size = os.path.getsize(csv_file_path) / 1024**2
    parquet_size = os.path.getsize(parquet_file_path) / 1024**2
    compression_ratio = (1 - parquet_size / csv_size) * 100
    
    print(f"\nConversion successful!")
    print(f"Parquet file saved to: {parquet_file_path}")
    print(f"Original CSV size: {csv_size:.2f} MB")
    print(f"Parquet file size: {parquet_size:.2f} MB")
    print(f"Compression ratio: {compression_ratio:.1f}%")
    print(f"Total duration: {end_time - start_time:.2f} seconds")
    
    return True

In [None]:
def verify_parquet_file(parquet_file_path, num_rows_preview=5):
    """
    Verifies and displays information about a Parquet file.
    
    Args:
        parquet_file_path (str): Path to the Parquet file to verify.
        num_rows_preview (int): Number of rows to preview (default: 5).
    """
    print(f"\nVerifying Parquet file: {parquet_file_path}")
    
    try:
        parquet_file = pq.ParquetFile(parquet_file_path)
        
        print(f"\nParquet Schema:")
        print(parquet_file.schema)
        
        print(f"\nFile metadata:")
        print(f"Number of row groups: {parquet_file.num_row_groups}")
        print(f"Total rows: {parquet_file.metadata.num_rows:,}")
        
        # Read custom metadata
        metadata_bytes = parquet_file.metadata.metadata.get(b'custom_metadata')
        if metadata_bytes:
            print(f"\nCustom metadata:")
            print(metadata_bytes.decode('utf8'))
        
        # Sample first few rows using pandas read_parquet
        print(f"\nFirst {num_rows_preview} rows preview:")
        df_sample = pd.read_parquet(parquet_file_path, engine='pyarrow').head(num_rows_preview)
        display(df_sample)
        
    except Exception as e:
        print(f"Error verifying Parquet file: {e}")

In [None]:
# Replace with your actual file paths
input_csv = 'data_clean.csv'
output_parquet = 'data.parquet'
    
 # Convert CSV to Parquet
success = csv_to_parquet_optimized(
    input_csv, 
    output_parquet, 
    compression='snappy', # Options: 'snappy', 'gzip', 'brotli', 'zstd'
    separator=";"
)
    
# Verify the conversion if successful
if success:
    verify_parquet_file(output_parquet)

In [None]:
import duckdb

dbms_columnar = duckdb.connect("database.duckdb")

dbms_columnar.execute("""
    CREATE TABLE IF NOT EXISTS TGV_table AS
    SELECT * FROM 'data.parquet';
""")

print(dbms_columnar.execute("SELECT COUNT(*) FROM TGV_table").fetchall())

[(10687,)]
