In [1]:
import pandas as pd
import os

In [2]:
# Adjust pandas display options to ensure all columns are shown
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
pd.set_option('display.max_rows', None)

In [3]:
csv_files = []
# Walk the directory tree
for root, dirs, files in os.walk('../jrcz_datafest_2024_datasets'):
    for file in files:
        # Check if the file ends with '.csv'
        if file.endswith('.csv'):
            # Get the full path to the file
            full_path = os.path.join(root, file)
            csv_files.append(full_path)
csv_files

['../jrcz_datafest_2024_datasets\\data-bridge-openings-2023\\bridge-openings-2023-Processed-KSB-SB.csv',
 '../jrcz_datafest_2024_datasets\\data-bridge-openings-2023\\zeeland_passages_2023.csv',
 '../jrcz_datafest_2024_datasets\\data-weather\\by-day\\weer-daggegevens-2023-vlis-dates-parsed.csv',
 '../jrcz_datafest_2024_datasets\\data-weather\\by-day\\weer-daggegevens-2023-vlis.csv',
 '../jrcz_datafest_2024_datasets\\data-weather\\by-day\\weer-uurgegevens-2023-vlis-datetimes-parsed.csv',
 '../jrcz_datafest_2024_datasets\\data-weather\\by-hour\\weer-uurgegevens-processed-2023.csv',
 '../jrcz_datafest_2024_datasets\\holidays\\national_holidays_NLBEDEFR_2023.csv',
 '../jrcz_datafest_2024_datasets\\holidays\\school_holidays_NLBEDEFR_2023.csv',
 '../jrcz_datafest_2024_datasets\\intensity-speed-export-a58-east\\intensity-speed-export-2022.csv',
 '../jrcz_datafest_2024_datasets\\intensity-speed-export-a58-east\\intensity-speed-export-2023.csv',
 '../jrcz_datafest_2024_datasets\\intensity-speed-

In [4]:
new_csv_files = [os.path.basename(file_path) for file_path in csv_files]
new_csv_files = [os.path.join('../preprocessed', filename) for filename in new_csv_files]
new_csv_files

['../preprocessed\\bridge-openings-2023-Processed-KSB-SB.csv',
 '../preprocessed\\zeeland_passages_2023.csv',
 '../preprocessed\\weer-daggegevens-2023-vlis-dates-parsed.csv',
 '../preprocessed\\weer-daggegevens-2023-vlis.csv',
 '../preprocessed\\weer-uurgegevens-2023-vlis-datetimes-parsed.csv',
 '../preprocessed\\weer-uurgegevens-processed-2023.csv',
 '../preprocessed\\national_holidays_NLBEDEFR_2023.csv',
 '../preprocessed\\school_holidays_NLBEDEFR_2023.csv',
 '../preprocessed\\intensity-speed-export-2022.csv',
 '../preprocessed\\intensity-speed-export-2023.csv',
 '../preprocessed\\intensity-speed-export-2022.csv',
 '../preprocessed\\intensity-speed-export-2023.csv',
 '../preprocessed\\vri_data_2023.csv',
 '../preprocessed\\vri_names.csv']

In [5]:
df_file_path = (csv_files[0])
df = pd.read_csv(df_file_path, delimiter=';')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5322 entries, 0 to 5321
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        5322 non-null   int64 
 1   opening_duration  5322 non-null   object
 2   bridge            5322 non-null   object
 3   start_datetime    5322 non-null   object
 4   end_datetime      5322 non-null   object
dtypes: int64(1), object(4)
memory usage: 208.0+ KB


In [6]:
df_file_path = (csv_files[1])
df = pd.read_csv(df_file_path, delimiter=';')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 15 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   BeginJaarEvenement                392 non-null    int64  
 1   BeginMaandEvenement               392 non-null    int64  
 2   TelpuntNaam                       392 non-null    object 
 3   Kolknaam                          228 non-null    object 
 4   ScheepstypeCategorieOmschrijving  392 non-null    object 
 5   SeinvoeringOmschrijving           312 non-null    object 
 6   AantalSchepen                     392 non-null    int64  
 7   AantalSchepenGeladen              139 non-null    float64
 8   AantalSchepenLeeg                 350 non-null    float64
 9   AantalContainers                  263 non-null    float64
 10  AantalTEU                         263 non-null    float64
 11  LaadvermogenTotaal                392 non-null    int64  
 12  Laadverm

In [7]:
# Renaming columns
column_mapping = {
"BeginJaarEvenement":"StartYearEvent",
"BeginMaandEvenement":"StartMonthEvent",
"TelpuntNaam":"Bridge Name",
"Kolknaam":"ChamberName",
"ScheepstypeCategorieOmschrijving":"ShipCategoryDescription",
"SeinvoeringOmschrijving":"SignalingDescription",
"AantalSchepen":"NumberShips",
"AantalSchepenGeladen":"NumberShipsLoaded",
"AantalSchepenLeeg":"NumberShipsEmpty",
"AantalContainers":"NumberContainers",
"AantalTEU":"NumberTEU Loading",
"LaadvermogenTotaal":"CapacityTotal Loading",
"LaadvermogenTotaalGeladen":"CapacityTotal Loaded",
"LaadvermogenTotaalLeeg":"CapacityTotal Empty",
"VervoerdGewichtTon":"TransportedWeightTons",
}
df = df.rename(columns=column_mapping)
df.info()
# Export the DataFrame to a new CSV file
df.to_csv(new_csv_files[1], index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   StartYearEvent           392 non-null    int64  
 1   StartMonthEvent          392 non-null    int64  
 2   Bridge Name              392 non-null    object 
 3   ChamberName              228 non-null    object 
 4   ShipCategoryDescription  392 non-null    object 
 5   SignalingDescription     312 non-null    object 
 6   NumberShips              392 non-null    int64  
 7   NumberShipsLoaded        139 non-null    float64
 8   NumberShipsEmpty         350 non-null    float64
 9   NumberContainers         263 non-null    float64
 10  NumberTEU Loading        263 non-null    float64
 11  CapacityTotal Loading    392 non-null    int64  
 12  CapacityTotal Loaded     139 non-null    float64
 13  CapacityTotal Empty      350 non-null    float64
 14  TransportedWeightTons    1