In [None]:
from __future__ import annotations

import os
from pathlib import Path
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import time

In [None]:
def load_and_display_data(file_path, separator=";"):
    """
    Load a CSV file into a DataFrame and print its shape.

    Args:
        file_path (str): The path to the CSV file.
        separator (str): The column separator to use.

    Returns:
        pandas.DataFrame or None: The loaded DataFrame, or None when the file
        does not exist, cannot be read/parsed, or contains no rows. Problems
        are reported with print() rather than raised.
    """
    # Fail fast with a clear message instead of letting read_csv raise.
    if not os.path.exists(file_path):
        print(f"Error: The specified file was not found at the path: {file_path}")
        return None

    try:
        df_raw = pd.read_csv(file_path, sep=separator)
    except pd.errors.ParserError as e:
        # Typical causes: wrong separator or a malformed file.
        print(f"Parsing Error while reading the file: {e}")
        print(f"Suggestion: Check if the separator (sep='{separator}') and encoding are correct.")
        return None
    except Exception as e:
        # Every failure path returns explicitly: falling through to a shared
        # `return df_raw` would raise UnboundLocalError, because df_raw is
        # never bound when read_csv raises.
        print(f"An unexpected error occurred: {e}")
        return None

    # A file with a header but no rows is treated as "nothing to process".
    if df_raw.empty:
        print(f"Warning: File {file_path} was loaded but is empty.")
        return None

    print(f"Successful loading of file: {file_path}")
    print("---")
    print("DataFrame shape (rows, columns):")
    print(df_raw.shape)

    return df_raw

In [27]:
file_name = "regularite-mensuelle-tgv-aqst.csv"

# Load the semicolon-separated export; the loader prints its own diagnostics.
df_raw = load_and_display_data(file_name, separator=";")

# Report whether the following cells have a DataFrame to work with.
load_status = (
    "\nProcessing stopped as the DataFrame could not be loaded."
    if df_raw is None
    else "\nDataFrame 'df_raw' is ready for processing."
)
print(load_status)

Successful loading of file: regularite-mensuelle-tgv-aqst.csv
---
DataFrame shape (rows, columns):
(10687, 26)

DataFrame 'df_raw' is ready for processing.


In [28]:
# Show the first rows, but only when the load step produced a DataFrame.
preview = None if df_raw is None else df_raw.head()
if preview is not None:
    display(preview)

Unnamed: 0,Date,Service,Gare de départ,Gare d'arrivée,Durée moyenne du trajet,Nombre de circulations prévues,Nombre de trains annulés,Commentaire annulations,Nombre de trains en retard au départ,Retard moyen des trains en retard au départ,...,Nombre trains en retard > 15min,Retard moyen trains en retard > 15 (si liaison concurrencée par vol),Nombre trains en retard > 30min,Nombre trains en retard > 60min,Prct retard pour causes externes,Prct retard pour cause infrastructure,Prct retard pour cause gestion trafic,Prct retard pour cause matériel roulant,Prct retard pour cause gestion en gare et réutilisation de matériel,"Prct retard pour cause prise en compte voyageurs (affluence, gestions PSH, correspondances)"
0,2018-01,National,GRENOBLE,PARIS LYON,183,245,0,,37,8.027027,...,25,6.123741,13,6,17.647059,52.941176,0.0,23.529412,5.882353,0.0
1,2018-01,International,PARIS LYON,ITALIE,394,94,0,,27,11.261728,...,22,11.601064,15,6,33.333333,19.047619,23.809524,14.285714,9.52381,0.0
2,2018-01,National,MARSEILLE ST CHARLES,LYON PART DIEU,106,557,7,,133,6.978195,...,40,5.195333,19,5,23.076923,23.076923,19.230769,23.076923,3.846154,7.692308
3,2018-01,National,PARIS NORD,DUNKERQUE,116,271,3,,46,11.236594,...,18,3.738806,9,4,35.714286,28.571429,7.142857,25.0,3.571429,0.0
4,2018-01,National,ANNECY,PARIS LYON,224,198,0,,12,8.070833,...,38,8.552525,14,5,23.809524,42.857143,9.52381,14.285714,4.761905,4.761905


In [None]:
# Guard against three situations: df_raw is undefined (cell run out of order),
# df_raw is not a DataFrame (e.g. None after a failed load, where `.empty`
# would raise AttributeError), or the DataFrame has no rows.
if 'df_raw' in locals() and isinstance(df_raw, pd.DataFrame) and not df_raw.empty:
    print("\n## DataFrame Columns\n")

    # One numbered column name per line is easier to scan than the Index repr.
    print("Column names (original):")
    for i, col in enumerate(df_raw.columns, 1):
        print(f"  {i}. {col}")

    print(f"\nTotal Columns: {len(df_raw.columns)}")

else:
    print("Warning: The DataFrame 'df_raw' is either not defined or is empty. Cannot display column names.")


## DataFrame Columns

Column names (original):
  1. Date
  2. Service
  3. Gare de départ
  4. Gare d'arrivée
  5. Durée moyenne du trajet
  6. Nombre de circulations prévues
  7. Nombre de trains annulés
  8. Commentaire annulations
  9. Nombre de trains en retard au départ
  10. Retard moyen des trains en retard au départ
  11. Retard moyen de tous les trains au départ
  12. Commentaire retards au départ
  13. Nombre de trains en retard à l'arrivée
  14. Retard moyen des trains en retard à l'arrivée
  15. Retard moyen de tous les trains à l'arrivée
  16. Commentaire retards à l'arrivée
  17. Nombre trains en retard > 15min
  18. Retard moyen trains en retard > 15 (si liaison concurrencée par vol)
  19. Nombre trains en retard > 30min
  20. Nombre trains en retard > 60min
  21. Prct retard pour causes externes
  22. Prct retard pour cause infrastructure
  23. Prct retard pour cause gestion trafic
  24. Prct retard pour cause matériel roulant
  25. Prct retard pour cause gestion en ga

In [None]:
# df_raw must exist, be a DataFrame and contain rows before it is summarised.
df_raw_usable = 'df_raw' in locals() and isinstance(df_raw, pd.DataFrame) and not df_raw.empty

if not df_raw_usable:
    print("Warning: The DataFrame 'df_raw' is either not defined, not a DataFrame, or is empty. Cannot display info.")
else:
    print("\n## DataFrame Information Summary\n")

    # info() lists the index, column dtypes and non-null counts;
    # memory_usage='deep' also measures the contents of object columns.
    df_raw.info(memory_usage='deep')


## DataFrame Information Summary

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10687 entries, 0 to 10686
Data columns (total 26 columns):
 #   Column                                                                                       Non-Null Count  Dtype  
---  ------                                                                                       --------------  -----  
 0   Date                                                                                         10687 non-null  object 
 1   Service                                                                                      10687 non-null  object 
 2   Gare de départ                                                                               10687 non-null  object 
 3   Gare d'arrivée                                                                               10687 non-null  object 
 4   Durée moyenne du trajet                                                                      10687 non-null  int64  
 5

In [40]:
# Select only numeric columns (integers and floats)
df_numeric = df_raw.select_dtypes(include='number')
    
print("\n## Descriptive Statistics for Numeric Columns\n")
    
# Check if any numeric columns were successfully filtered
if df_numeric.shape[1] > 0:
        
    # Generate statistics and transpose the result for better screen readability
    description_df = df_numeric.describe().transpose()
    print(f"Analyzed {df_numeric.shape[1]} numeric columns out of {df_raw.shape[1]} total columns.")
        
    print("\nDescriptive statistics (Transposed: one row per feature):")
    display(description_df)
    
else:
    print("Warning: No numeric columns were found in the DataFrame to generate descriptive statistics.")


## Descriptive Statistics for Numeric Columns

Analyzed 21 numeric columns out of 26 total columns.

Descriptive statistics (Transposed: one row per feature):


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Durée moyenne du trajet,10687.0,170.401516,87.802397,0.0,100.0,163.0,222.0,786.0
Nombre de circulations prévues,10687.0,269.288294,181.478061,0.0,149.0,229.0,357.0,1100.0
Nombre de trains annulés,10687.0,9.285393,23.652677,0.0,0.0,2.0,8.0,297.0
Commentaire annulations,0.0,,,,,,,
Nombre de trains en retard au départ,10687.0,86.747263,89.030121,0.0,21.0,53.0,127.0,596.0
Retard moyen des trains en retard au départ,10687.0,11.951464,11.726851,0.0,5.903146,10.009368,15.235088,316.188095
Retard moyen de tous les trains au départ,10687.0,3.046646,4.904085,-229.269444,1.172792,2.266286,3.876104,84.516667
Commentaire retards au départ,0.0,,,,,,,
Nombre de trains en retard à l'arrivée,10687.0,36.628988,30.678908,0.0,15.0,29.0,50.0,376.0
Retard moyen des trains en retard à l'arrivée,10687.0,34.665571,15.642794,-40.109259,25.328252,32.987198,41.91072,299.6


In [None]:
# Calculate missing values: count and percentage
missing = pd.DataFrame({
    # Use .isna() (alias for .isnull()) and .sum() to get the count
    'Missing Count': df_raw.isna().sum(),
        
    # Calculate percentage and apply rounding for cleaner output
    'Missing Percent': (df_raw.isna().sum() / len(df_raw) * 100).round(2) 
    }).sort_values('Missing Percent', ascending=False)

print("\n## Missing Value Summary\n")
    
# Filter for columns with at least one missing value
missing_data_summary = missing[missing['Missing Count'] > 0]
    
# Check if any missing values were found
if not missing_data_summary.empty:
    print(f"{len(missing_data_summary)} columns have missing values.")
    print("---")
    # Display the result
    display(missing_data_summary)

else:
    print("Great! No missing values found in the DataFrame.")


## Missing Value Summary

3 columns have missing values.
---


Unnamed: 0,Missing Count,Missing Percent
Commentaire annulations,10687,100.0
Commentaire retards au départ,10687,100.0
Commentaire retards à l'arrivée,9989,93.47


In [None]:
try:
    # Imported inside the try block so that a missing optional dependency is
    # reported by the ImportError handler below instead of aborting the cell.
    from ydata_profiling import ProfileReport
        
    # Define the output file name in a variable for clarity
    output_file = "profiling_report.html" 
        
    print("\n## Generating Profiling Report...")
        
    # Build the profiling report for df_raw.
    # sort=None is passed so the report is not re-sorted (it is intended to keep
    # the original column order, not an alphabetical one).
    profile = ProfileReport(
        df_raw, 
        title="Data Profiling Report: TGV Regularity",
        sort=None,  # NOTE(review): check the ydata-profiling docs for the accepted non-None sort values
        explorative=True
    )
        
    # Export the report to an HTML file
    profile.to_file(output_file) 
        
    print(f"Profiling report successfully exported to: {output_file}")

except ImportError:
    # The profiling library is not installed: tell the reader how to get it.
    print("Error: The 'ydata-profiling' (or 'pandas-profiling') library is required.")
    print("Please install it using: `pip install ydata-profiling`")

except Exception as e:
    # Any other failure while building or writing the report is printed, not raised.
    print(f"An error occurred during report generation: {e}")

In [43]:
# Define the list of columns to be dropped
# These columns often contain qualitative/text data or a high percentage of missing values.
# Free-text comment columns that are removed before further processing.
columns_to_drop = ['Commentaire annulations', 'Commentaire retards au départ']

# errors='ignore' keeps the cell re-runnable: labels that are already gone
# (or never existed) are skipped instead of raising KeyError.
df_raw = df_raw.drop(labels=columns_to_drop, axis="columns", errors='ignore')

print("\n## Column Deletion Summary\n")
print("Successfully dropped the following columns (if they existed):")
print("\n".join(f"- {col}" for col in columns_to_drop))

print("\n---")
print(f"New DataFrame shape: {df_raw.shape}")


## Column Deletion Summary

Successfully dropped the following columns (if they existed):
- Commentaire annulations
- Commentaire retards au départ

---
New DataFrame shape: (10687, 24)


In [None]:
#rename columns
df_raw.rename(columns={...

In [42]:
# Boolean mask of full-row duplicates; with the default keep='first' the first
# occurrence is False and every later identical row is True.
duplicate_mask = df_raw.duplicated()
num_duplicates = duplicate_mask.sum()

print("\n## Duplicate Row Check\n")

if num_duplicates == 0:
    print("Great! No full-row duplicates were found in the DataFrame.")
else:
    print(f"Warning: Found {num_duplicates} duplicate row(s) in the DataFrame.")

    # Share of the data set that the duplicates represent.
    percent_duplicates = (num_duplicates / len(df_raw) * 100).round(2)
    print(f"This represents {percent_duplicates}% of the total data.")

    # Show a few of the repeated rows for manual inspection.
    print("\nFirst 5 duplicate entries (excluding the first occurrence):")
    display(df_raw[duplicate_mask].head())


## Duplicate Row Check

Great! No full-row duplicates were found in the DataFrame.


In [45]:
# Identify object (string) columns using select_dtypes
# This is often more explicit and faster than iterating over all columns and checking dtype
object_cols = df_raw.select_dtypes(include=['object']).columns
    
print("\n## Trimming Whitespace in String Columns\n")
    
if len(object_cols) > 0:
        
# Apply the strip function using .loc for explicit assignment and better performance
    for col in object_cols:
        # We explicitly handle potential NaN values by converting them to string
        # and then applying str.strip(). This ensures all cells are processed, 
        # though str.strip() on a true NaN string ('nan') will result in 'nan'.
        # A more robust approach, applied here, is to use .str.strip() which 
        # correctly handles NaN values (keeping them as NaN).
            
        # The .str accessor safely applies the string method, returning NaN for NaN inputs.
        df_raw.loc[:, col] = df_raw[col].str.strip() 

    print(f"Whitespace trimmed in {len(object_cols)} object columns.")
    print("Columns processed:")

    for col in object_cols:
        print(f"- {col}")
else:
    print("Warning: No columns of type 'object' found to trim whitespace.")


## Trimming Whitespace in String Columns

Whitespace trimmed in 5 object columns.
Columns processed:
- Date
- Service
- Gare de départ
- Gare d'arrivée
- Commentaire retards à l'arrivée


In [46]:
date_column = "Date"

print(f"\n## Date Column Conversion ({date_column})\n")

if date_column not in df_raw.columns:
    print(f"Warning: Column '{date_column}' not found in the DataFrame. Conversion skipped.")
else:
    # Remember the dtype before conversion so both can be reported.
    original_dtype = df_raw[date_column].dtype

    try:
        # errors='coerce' turns values that do not match "%Y-%m" into NaT
        # instead of raising, so one bad cell cannot abort the conversion.
        parsed_dates = pd.to_datetime(df_raw[date_column], format="%Y-%m", errors='coerce')
        df_raw[date_column] = parsed_dates

        new_dtype = df_raw[date_column].dtype
        print("Conversion successful!")
        print(f"Original data type: {original_dtype}")
        print(f"New data type: {new_dtype}")

        # Count the missing dates left in the column after coercion.
        nat_count = df_raw[date_column].isna().sum()
        if nat_count > 0:
            print(f"Warning: {nat_count} value(s) could not be parsed and were converted to NaT.")

        print("\nFirst 5 rows with new date format:")
        display(df_raw.head())

    except Exception as e:
        print(f"Error during date conversion: {e}")
        print("Suggestion: Verify the data format and column content.")


## Date Column Conversion (Date)

Conversion successful!
Original data type: object
New data type: datetime64[ns]

First 5 rows with new date format:


Unnamed: 0,Date,Service,Gare de départ,Gare d'arrivée,Durée moyenne du trajet,Nombre de circulations prévues,Nombre de trains annulés,Nombre de trains en retard au départ,Retard moyen des trains en retard au départ,Retard moyen de tous les trains au départ,...,Nombre trains en retard > 15min,Retard moyen trains en retard > 15 (si liaison concurrencée par vol),Nombre trains en retard > 30min,Nombre trains en retard > 60min,Prct retard pour causes externes,Prct retard pour cause infrastructure,Prct retard pour cause gestion trafic,Prct retard pour cause matériel roulant,Prct retard pour cause gestion en gare et réutilisation de matériel,"Prct retard pour cause prise en compte voyageurs (affluence, gestions PSH, correspondances)"
0,2018-01-01,National,GRENOBLE,PARIS LYON,183,245,0,37,8.027027,1.212245,...,25,6.123741,13,6,17.647059,52.941176,0.0,23.529412,5.882353,0.0
1,2018-01-01,International,PARIS LYON,ITALIE,394,94,0,27,11.261728,2.997695,...,22,11.601064,15,6,33.333333,19.047619,23.809524,14.285714,9.52381,0.0
2,2018-01-01,National,MARSEILLE ST CHARLES,LYON PART DIEU,106,557,7,133,6.978195,1.706333,...,40,5.195333,19,5,23.076923,23.076923,19.230769,23.076923,3.846154,7.692308
3,2018-01-01,National,PARIS NORD,DUNKERQUE,116,271,3,46,11.236594,1.797637,...,18,3.738806,9,4,35.714286,28.571429,7.142857,25.0,3.571429,0.0
4,2018-01-01,National,ANNECY,PARIS LYON,224,198,0,12,8.070833,0.489141,...,38,8.552525,14,5,23.809524,42.857143,9.52381,14.285714,4.761905,4.761905


In [None]:
# Sanity check: 'Nombre de circulations prévues' (scheduled trains) should be
# >= 'Nombre de trains annulés' (cancelled trains) on every row.

# Rows that violate the rule; the imputation cell below iterates over them.
invalid_rows = df_raw[df_raw['Nombre de circulations prévues'] < df_raw['Nombre de trains annulés']]
print(f"Number of rows where the number of scheduled trains is less than the number of cancelled trains: {len(invalid_rows)}")

# Boundary case (every scheduled train cancelled), displayed for inspection.
# The message previously read "number of trains scheduled trains".
print("Rows where the number of scheduled trains equals the number of cancelled trains:")
df_raw[df_raw['Nombre de circulations prévues'] == df_raw['Nombre de trains annulés']]

In [None]:
# Impute rows where 'Nombre de circulations prévues' < 'Nombre de trains annulés'
# with the mean of 'Nombre de circulations prévues' over the other rows that share
# the same 'Gare de départ' and 'Gare d'arrivée'.
scheduled_col = 'Nombre de circulations prévues'
cancelled_col = 'Nombre de trains annulés'

# Route means are generally fractional. Convert the column to float explicitly
# instead of relying on an implicit int -> float upcast during item assignment
# (NOTE(review): recent pandas versions warn about that upcast — confirm against
# the installed version).
if len(invalid_rows) > 0:
    df_raw[scheduled_col] = df_raw[scheduled_col].astype('float64')

for index, row in invalid_rows.iterrows():
    same_route_other_rows = (
        (df_raw['Gare de départ'] == row['Gare de départ'])
        & (df_raw["Gare d'arrivée"] == row["Gare d'arrivée"])
        & (df_raw.index != index)
    )
    mean_value = df_raw.loc[same_route_other_rows, scheduled_col].mean()

    # When the route has no other row the mean is NaN. Writing NaN would make
    # the row pass the `<` check below (comparisons with NaN are False) while
    # holding no usable value, so the row is left untouched and stays flagged.
    if pd.notna(mean_value):
        df_raw.at[index, scheduled_col] = mean_value

# Verify that there are no more invalid rows
invalid_rows_after = df_raw[df_raw[scheduled_col] < df_raw[cancelled_col]]
print(f"Number of invalid rows after imputation: {len(invalid_rows_after)}")

invalid_rows_after.head()

In [None]:
# checking why there are still 10 rows where nb of scheduled trains is inferior to nb of cancelled trains

# (departure station, arrival station) pairs to inspect.
routes_to_check = [
    ('NANTES', 'STRASBOURG'),
    ('MARSEILLE ST CHARLES', 'TOURCOING'),
    # Previously filtered on arrival == 'BORDEAUX ST JEAN' (same as the
    # departure), which contradicted the stated BORDEAUX ST JEAN -> TOURCOING route.
    ('BORDEAUX ST JEAN', 'TOURCOING'),
    ('TOURCOING', 'BORDEAUX ST JEAN'),
    ('MADRID', 'MARSEILLE ST CHARLES'),
]

# A cell only renders its last bare expression, so each preview is passed to
# display() explicitly; otherwise every `.head()` but the final one is discarded.
for departure, arrival in routes_to_check:
    print(f"{departure} -> {arrival}")
    display(
        df_raw[(df_raw['Gare de départ'] == departure) & (df_raw["Gare d'arrivée"] == arrival)].head()
    )

In [None]:
# Remove rows that report cancellations although no train was scheduled
# ('Nombre de circulations prévues' == 0 and 'Nombre de trains annulés' > 0).
no_schedule_but_cancelled = (
    (df_raw['Nombre de circulations prévues'] == 0)
    & (df_raw['Nombre de trains annulés'] > 0)
)
labels_to_remove = df_raw.loc[no_schedule_but_cancelled].index
df_raw.drop(index=labels_to_remove, inplace=True)

# Re-run the scheduled < cancelled check on what is left.
invalid_rows_final = df_raw.loc[df_raw['Nombre de circulations prévues'] < df_raw['Nombre de trains annulés']]
print(f"Number of invalid rows after dropping inconsistent data: {len(invalid_rows_final)}")

In [None]:
# Rows reporting more late departures than scheduled trains.
# `.count` has to be called: without the parentheses the cell only shows the
# bound-method repr instead of the per-column non-null counts of the matches.
df_raw[df_raw['Nombre de trains en retard au départ'] > df_raw["Nombre de circulations prévues"]].count()

In [None]:
# Outlier analysis

In [None]:
# Look at rows whose 'Retard moyen de tous les trains à l'arrivée' is below -30,
# i.e. trains arriving on average more than 30 minutes ahead of schedule,
# which looks suspicious.
arrival_delay_col = "Retard moyen de tous les trains à l'arrivée"
df_raw.loc[df_raw[arrival_delay_col] < -30].head()

In [None]:
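# Save the cleaned DataFrame.
# TODO: this cell is empty. No visible cell writes 'data_clean.csv', which the
# CSV-to-Parquet conversion cell reads as its input; confirm where that file comes from.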

In [None]:
def csv_to_parquet_optimized(csv_file_path, parquet_file_path, index_col=None, 
                             compression='snappy', chunk_size=None, separator=','):
    """
    Convert a CSV file to Parquet with PyArrow and print a size/time report.

    Args:
        csv_file_path (str): Path to the input CSV file.
        parquet_file_path (str): Path where the output Parquet file will be saved.
        index_col (str, optional): Column to use as the DataFrame index (None by default).
            When given, the index is written to the Parquet file so the column is not lost.
        compression (str, optional): Compression algorithm to use ('snappy', 'gzip', 'brotli', 'zstd').
        chunk_size (int, optional): Number of rows per chunk while reading the CSV
            (None reads the file in one call). The chunks are concatenated before
            writing, so the whole table is still held in memory at that point.
        separator (str, optional): Column separator of the CSV file (',' by default).

    Returns:
        bool: True if conversion succeeded, False if reading, converting or writing failed.
    """
    start_time = time.perf_counter()

    print(f"Starting CSV file reading: {csv_file_path}")

    try:
        if chunk_size is not None:
            # With chunksize set, read_csv returns an iterator of DataFrames
            # rather than a DataFrame, so the chunks must be assembled before
            # len(), .columns or the Arrow conversion can be used.
            with pd.read_csv(csv_file_path, index_col=index_col, low_memory=False,
                             chunksize=chunk_size, sep=separator) as reader:
                df = pd.concat(reader)
        else:
            # low_memory=False avoids mixed-dtype warnings on large files.
            df = pd.read_csv(csv_file_path, index_col=index_col, low_memory=False, sep=separator)
    except FileNotFoundError:
        print(f"Error: CSV file not found at specified location: {csv_file_path}")
        return False
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return False

    read_time = time.perf_counter()
    print(f"CSV reading completed in {read_time - start_time:.2f} seconds.")
    print(f"Rows read: {len(df):,}")
    print(f"Columns: {len(df.columns)}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Convert the DataFrame to a PyArrow table (the schema is inferred from the
    # pandas dtypes). The index is only kept when the caller asked for one via
    # index_col; dropping it in that case would silently lose that column.
    keep_index = index_col is not None and index_col is not False
    print("Converting to PyArrow table...")
    try:
        table = pa.Table.from_pandas(df, preserve_index=keep_index)
    except Exception as e:
        print(f"Error converting to PyArrow table: {e}")
        return False

    # Custom metadata, stored with the schema in the Parquet file footer.
    metadata = {
        'creation_tool': 'csv_to_parquet_optimized.py',
        'conversion_timestamp': pd.Timestamp.now().isoformat(),
        'source_file': csv_file_path,
        'original_row_count': str(len(df)),
        'original_column_count': str(len(df.columns)),
        'compression_algorithm': compression
    }

    # Keep whatever metadata the table already carries and add ours under a
    # single key. The value is the Python repr of the dict, not JSON.
    existing_metadata = table.schema.metadata or {}
    existing_metadata[b'custom_metadata'] = str(metadata).encode('utf8')
    table = table.replace_schema_metadata(existing_metadata)

    print(f"Writing Parquet file with '{compression}' compression...")

    try:
        pq.write_table(
            table, 
            parquet_file_path, 
            compression=compression,
            use_dictionary=True,      # Efficient for categorical columns
            write_statistics=True,     # Enable statistics for better query performance
            row_group_size=100000,     # Optimize row group size for balance between memory and I/O
            version='2.6'              # Use newer Parquet format version for better features
        )
    except Exception as e:
        print(f"Error writing Parquet file: {e}")
        return False

    end_time = time.perf_counter()

    # Size report (MB). The ratio is guarded so a zero-byte source cannot
    # raise ZeroDivisionError after a successful write.
    csv_size = os.path.getsize(csv_file_path) / 1024**2
    parquet_size = os.path.getsize(parquet_file_path) / 1024**2
    compression_ratio = (1 - parquet_size / csv_size) * 100 if csv_size else 0.0

    print(f"\nConversion successful!")
    print(f"Parquet file saved to: {parquet_file_path}")
    print(f"Original CSV size: {csv_size:.2f} MB")
    print(f"Parquet file size: {parquet_size:.2f} MB")
    print(f"Compression ratio: {compression_ratio:.1f}%")
    print(f"Total duration: {end_time - start_time:.2f} seconds")

    return True

In [None]:
def verify_parquet_file(parquet_file_path, num_rows_preview=5):
    """
    Print the schema, row-group/row counts and custom metadata of a Parquet
    file, then display its first rows.

    Args:
        parquet_file_path (str): Path to the Parquet file to verify.
        num_rows_preview (int): Number of rows to preview (default: 5).

    Returns:
        None. Any error raised while inspecting the file is printed, not raised.
    """
    print(f"\nVerifying Parquet file: {parquet_file_path}")

    try:
        parquet_file = pq.ParquetFile(parquet_file_path)

        print("\nParquet Schema:")
        print(parquet_file.schema)

        print("\nFile metadata:")
        print(f"Number of row groups: {parquet_file.num_row_groups}")
        print(f"Total rows: {parquet_file.metadata.num_rows:,}")

        # The key/value metadata is None for files written without any, so
        # fall back to an empty dict instead of failing on `.get` (which
        # would also skip the preview below).
        key_value_metadata = parquet_file.metadata.metadata or {}
        metadata_bytes = key_value_metadata.get(b'custom_metadata')
        if metadata_bytes:
            print("\nCustom metadata:")
            print(metadata_bytes.decode('utf8'))

        # Preview the first rows through pandas (this reads the whole file).
        print(f"\nFirst {num_rows_preview} rows preview:")
        df_sample = pd.read_parquet(parquet_file_path, engine='pyarrow').head(num_rows_preview)
        display(df_sample)

    except Exception as e:
        print(f"Error verifying Parquet file: {e}")

In [None]:
# Input CSV and output Parquet locations (adjust to your own files).
input_csv = 'data_clean.csv'
output_parquet = 'data.parquet'

# Run the conversion; compression may be 'snappy', 'gzip', 'brotli' or 'zstd'.
success = csv_to_parquet_optimized(
    csv_file_path=input_csv,
    parquet_file_path=output_parquet,
    compression='snappy',
    separator=";",
)

# Inspect the result only when the conversion reported success.
if success:
    verify_parquet_file(parquet_file_path=output_parquet)

In [None]:
import duckdb

# Open (or create) the on-disk DuckDB database file.
dbms_columnar = duckdb.connect("database.duckdb")

# CREATE OR REPLACE (rather than CREATE ... IF NOT EXISTS) so that re-running
# the notebook reloads the table from the current 'data.parquet' instead of
# silently keeping the rows imported by an earlier run.
dbms_columnar.execute("""
    CREATE OR REPLACE TABLE TGV_table AS
    SELECT * FROM 'data.parquet';
""")

# Row count of the loaded table, as a quick check of the import.
print(dbms_columnar.execute("SELECT COUNT(*) FROM TGV_table").fetchall())

[(10687,)]
