In [None]:
import yaml
import pandas as pd
 
# Load the project configuration, then read the cleaned dataset it points to.
config_path = "../../config.yaml"

try:
    with open(config_path, "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError as err:
    # Fail loudly: later cells use both `config` and `df`, so carrying on after
    # a missing config would only resurface as a confusing NameError.
    raise FileNotFoundError(
        f"Yaml configuration file not found: {config_path}"
    ) from err

# Read the CSV outside the try block so that a bad config key or a missing /
# malformed data file is reported with its own traceback instead of being
# mislabelled as a missing YAML file.
df = pd.read_csv(config['data']['clean_data']['full_clean'], sep=";")

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

In [None]:
def _build_lookup(values, id_col, name_col):
    """Return a lookup table with one row per distinct value in ``values``.

    Parameters
    ----------
    values : pd.Series
        Column to deduplicate; first-occurrence order is kept.
    id_col : str
        Name of the generated 1-based id column.
    name_col : str
        Name of the column holding the distinct values.

    Returns
    -------
    pd.DataFrame
        Two columns, ``id_col`` followed by ``name_col``.
    """
    unique_values = values.drop_duplicates().reset_index(drop=True)
    # Size the ids from the deduplicated values themselves: `nunique()` skips
    # NaN by default while `drop_duplicates()` keeps it, so mixing the two
    # raises a length-mismatch error as soon as the column has a missing value.
    return pd.DataFrame({
        id_col: range(1, len(unique_values) + 1),
        name_col: unique_values,
    })


# Get unique origins
origin_df = _build_lookup(df["from"], "origin_id", "origin")

# Get unique destinations
destination_df = _build_lookup(df["to"], "destination_id", "destination")

In [None]:
# Preview the origin lookup table (origin_id, origin).
origin_df.head()

In [None]:
# Preview the destination lookup table (destination_id, destination).
destination_df.head()

In [None]:
# Build the itinerary table in one pass:
#   1. look up origin_id by matching df["from"] against origin_df["origin"]
#   2. look up destination_id by matching df["to"] against destination_df["destination"]
#   3. keep only the ids and names, one row per distinct combination
#   4. number the rows from 1 to serve as the primary key, placed first
itinerary_df = (
    df.merge(origin_df, how="left", left_on="from", right_on="origin")
    .merge(destination_df, how="left", left_on="to", right_on="destination")
    .loc[:, ["origin_id", "origin", "destination_id", "destination"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(itinerary_id=lambda pairs: range(1, len(pairs) + 1))
    .loc[:, ["itinerary_id", "origin_id", "origin", "destination_id", "destination"]]
)

In [None]:
# Display the finished itinerary table.
itinerary_df

In [None]:
# Write the itinerary table to the CSV path given in the config
# (data -> sql -> itinerary) so it can be exported to SQL.
# The DataFrame index is not written; the file is UTF-8 encoded.
itinerary_path = config["data"]["sql"]["itinerary"]
itinerary_df.to_csv(path_or_buf=itinerary_path, index=False, encoding="utf-8")