In [None]:
import yaml
import pandas as pd

# Load the project configuration and the CSV inputs used by the rest of the
# notebook. Paths come from config.yaml; the "clean_data" files are
# semicolon-separated while the "sql" exports are comma-separated.
try:
    with open("../../config.yaml", "r") as file:
        config = yaml.safe_load(file)

    df = pd.read_csv(config['data']['clean_data']['full_clean'], sep=";")
    ticket_df = pd.read_csv(config['data']['sql']['ticket'], sep=",")
    schedule_df = pd.read_csv(config['data']['sql']['schedules'], sep=",")
    itinerary_df = pd.read_csv(config['data']['sql']['itinerary'], sep=",")
    flight_number_df = pd.read_csv(config['data']['sql']['number'], sep=",")
    df_bus = pd.read_csv(config['data']['clean_data']['business_clean'], sep=";")
    df_eco = pd.read_csv(config['data']['clean_data']['economy_clean'], sep=";")

except FileNotFoundError:
    print("One or more CSV or YAML files not found!")
    # Re-raise instead of swallowing: every later cell needs `config` and the
    # frames above, so continuing would only surface as a confusing NameError
    # further down. The traceback also names the file that is missing.
    raise


In [None]:
# Read the comma-separated lookup tables whose paths live under
# config['data']['sql']; the tuple order matches the names being unpacked.
sql_paths = config['data']['sql']
origin_df, destination_df, airline_df, stops_df, class_df = (
    pd.read_csv(sql_paths[table_key], sep=",")
    for table_key in ('origin', 'destination', 'airline', 'stops', 'class')
)

In [None]:
# Preview the first rows of the 'full_clean' dataset loaded above.
df.head()

In [None]:
# (rows, columns) of the 'full_clean' dataset before any id columns are added.
df.shape

In [None]:
# Add a 1-based sequential `flight_id` and make it the first column.
# The reorder is done by column name rather than by position so that
# re-running this cell is a no-op; a positional "move the last column to the
# front" would shuffle a real data column forward on the second run.
df['flight_id'] = range(1, len(df) + 1)
ordered_columns = ['flight_id'] + [column for column in df.columns if column != 'flight_id']
df = df[ordered_columns]
df.head()

In [None]:
# Expose the class label under the name 'desc'; a later cell reads it as
# `class_df.desc` (presumably because `class` is a Python keyword and cannot
# be used with attribute access).
class_df = class_df.rename({'class': 'desc'}, axis='columns')

In [None]:
# One label -> id dictionary per lookup table. Each pairs the label column
# (keys) with the id column (values); if a label repeats, the last row wins.
origin_replacements = dict(zip(origin_df.origin, origin_df.origin_id))
destination_replacements = dict(zip(destination_df.destination, destination_df.destination_id))
airline_replacements = dict(zip(airline_df.airline_name, airline_df.airline_name_id))
stops_replacements = dict(zip(stops_df.stops, stops_df.stops_id))
flight_number_replacements = dict(zip(flight_number_df.flight_number, flight_number_df.flight_number_id))
class_replacements = dict(zip(class_df.desc, class_df.class_id))

In [None]:
# Derive each id column from its label column using the dictionaries built
# above. Each entry is (new id column, source label column, label -> id map).
# Note that `replace` leaves any label missing from the map unchanged.
id_column_plan = [
    ('origin_id', 'from', origin_replacements),
    ('destination_id', 'to', destination_replacements),
    ('airline_id', 'airline', airline_replacements),
    ('stops_id', 'stops', stops_replacements),
    ('flight_number_id', 'flight', flight_number_replacements),
    ('class_id', 'class', class_replacements),
]
for id_column, label_column, label_to_id in id_column_plan:
    df[id_column] = df[label_column].replace(label_to_id)

In [None]:
# Rename the itinerary columns positionally so they line up with the
# `origin_id` / `destination_id` columns on `df`.
# NOTE(review): assumes the CSV has exactly three columns in this order
# (itinerary, origin, destination) — confirm against the export.
itinerary_df.columns = ['itinerary_id', 'origin_id', 'destination_id']

In [None]:
# Look up each flight's itinerary from its (origin_id, destination_id) pair.
# A single many-to-one left merge replaces the previous row-by-row nested
# loop, which performed rows x itineraries scalar `.loc` lookups and relied on
# both frames having a default 0..n-1 index.
route_keys = ['origin_id', 'destination_id']

# Keep only the first itinerary listed for each route, so the merge cannot
# duplicate flights and the earlier "first match wins" behaviour is preserved.
itinerary_lookup = itinerary_df.drop_duplicates(subset=route_keys, keep='first')[
    route_keys + ['itinerary_id']
]

matched_routes = df[route_keys].merge(
    itinerary_lookup,
    how='left',
    on=route_keys,
    validate='many_to_one',
)

# Flights whose route is absent from the itinerary table keep the previous
# default of itinerary 1. `to_numpy()` makes the assignment positional, so it
# does not depend on the index labels of `df`.
df['itinerary_id'] = (
    matched_routes['itinerary_id']
    .fillna(1)
    .astype(itinerary_df['itinerary_id'].dtype)
    .to_numpy()
)

df.head()

In [None]:
# Preview the schedules table loaded in the first cell; it is only inspected
# here and is not referenced by the other cells visible in this notebook.
schedule_df.head()

In [None]:
# Left-join the ticket table onto the flights so every flight row is kept.
# The airline key is named `airline_id` on `df` but `airline_name_id` on
# `ticket_df`, hence the separate left/right key lists (both columns remain
# in the result).
flight_keys = ['class_id', 'price', 'flight_number_id', 'airline_id']
ticket_keys = ['class_id', 'price', 'flight_number_id', 'airline_name_id']
flight_df = df.merge(ticket_df, how='left', left_on=flight_keys, right_on=ticket_keys)


In [None]:
# Remove the text label columns now that each has an id counterpart.
label_columns = ['airline', 'flight', 'from', 'to', 'stops', 'class']
flight_df = flight_df.drop(columns=label_columns)
flight_df.head()

In [None]:
# Write the flight table to the path configured under data.sql.flight,
# without the DataFrame index.
flight_output_path = config['data']['sql']['flight']
flight_df.to_csv(flight_output_path, index=False, encoding="utf-8")