In [1]:
import yaml
import pandas as pd
 
# Load the project configuration; only a missing config file is reported here.
# Any other failure (YAML syntax error, missing key, unreadable CSV) propagates
# with its own traceback instead of being mislabelled as "file not found".
try:
    with open("../../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    print("Yaml configuration file not found!")
    # Re-raise so the notebook stops here rather than continuing with
    # `config` and `df` undefined.
    raise

# Read the cleaned dataset from the path given in the config
# (semicolon-separated CSV).
df = pd.read_csv(config['data']['clean_data']['full_clean'], sep=";")

In [2]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,airline,flight,from,to,duration,stops,class,lead_time_days,price
0,SpiceJet,SG-8709,Delhi,Mumbai,2h 17m,0,Economy,1,5953
1,SpiceJet,SG-8157,Delhi,Mumbai,2h 33m,0,Economy,1,5953
2,AirAsia,I5-764,Delhi,Mumbai,2h 17m,0,Economy,1,5956
3,Vistara,UK-995,Delhi,Mumbai,2h 25m,0,Economy,1,5955
4,Vistara,UK-963,Delhi,Mumbai,2h 33m,0,Economy,1,5955


In [3]:
# Build the origin and destination lookup tables.
#
# Each table's id range is sized from the same deduplicated series that supplies
# the names, so both columns always have equal length. (Sizing with `nunique()`
# ignores NaN while `drop_duplicates()` keeps it, which makes the constructor
# fail as soon as a city value is missing.) Missing cities are dropped and
# therefore receive no id.
#
# NOTE(review): ids are assigned independently per table, in order of first
# appearance, so the same city can have a different origin_id and
# destination_id.

# Get unique origins
origin_names = df["from"].dropna().drop_duplicates().reset_index(drop=True)
origin_df = pd.DataFrame({"origin_id": range(1, len(origin_names) + 1),
                          "origin": origin_names})

# Get unique destinations
destination_names = df["to"].dropna().drop_duplicates().reset_index(drop=True)
destination_df = pd.DataFrame({"destination_id": range(1, len(destination_names) + 1),
                               "destination": destination_names})

In [4]:
# Columns that describe one route: the two ids plus the city names.
route_columns = ["origin_id", "origin", "destination_id", "destination"]

# Attach origin_id (matching "from" to "origin") and destination_id
# (matching "to" to "destination") to every flight row.
routes_with_ids = (
    df
    .merge(origin_df, how="left", left_on="from", right_on="origin")
    .merge(destination_df, how="left", left_on="to", right_on="destination")
)

# Reduce to one row per distinct origin/destination pair.
unique_routes = (
    routes_with_ids[route_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Add a 1-based itinerary_id as primary key and place it first.
unique_routes = unique_routes.assign(itinerary_id=range(1, len(unique_routes) + 1))
itinerary_df = unique_routes[["itinerary_id", *route_columns]]

In [5]:
# Display the itinerary table (one row per distinct origin/destination pair).
itinerary_df

Unnamed: 0,itinerary_id,origin_id,origin,destination_id,destination
0,1,1,Delhi,1,Mumbai
1,2,1,Delhi,2,Bangalore
2,3,1,Delhi,3,Kolkata
3,4,1,Delhi,4,Hyderabad
4,5,1,Delhi,5,Chennai
5,6,2,Mumbai,6,Delhi
6,7,2,Mumbai,2,Bangalore
7,8,2,Mumbai,3,Kolkata
8,9,2,Mumbai,4,Hyderabad
9,10,2,Mumbai,5,Chennai


In [6]:
# Export the itinerary table as a UTF-8 CSV (without the DataFrame index)
# to the path configured under data -> sql -> itinerary, ready for SQL import.
itinerary_path = config["data"]["sql"]["itinerary"]
itinerary_df.to_csv(
    itinerary_path,
    encoding="utf-8",
    index=False,
)