In [None]:
import yaml
import pandas as pd
import re
import sys
import os
from datetime import datetime, timedelta

# Make the repository root (two directory levels above the notebook's working
# directory) importable so the project-local `functions` module can be found.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Register the root only once so re-running this cell does not grow sys.path.
if not any(entry == project_root for entry in sys.path):
    sys.path.append(project_root)

from functions import convert_to_hhmmss, convert_to_hhmmss_v2, calculate_duration
 
# Load the project configuration, then the three cleaned datasets whose paths
# are listed under config['data']['clean_data'].
try:
    with open("../../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    # Only a missing config file matches this message. Re-raise so the
    # notebook stops here instead of failing later with a NameError on
    # `config` or the data frames.
    print("Yaml configuration file not found!")
    raise

# Kept outside the try block on purpose: a missing CSV, a wrong config key or
# a parse error should surface with its own traceback rather than being
# reported as a missing YAML file.
df_business_clean = pd.read_csv(config['data']['clean_data']['business_clean'], sep=";")
df_economy_clean = pd.read_csv(config['data']['clean_data']['economy_clean'], sep=";")
df_full_clean = pd.read_csv(config['data']['clean_data']['full_clean'], sep=";")

In [None]:
# Preview the first five rows of the combined dataset as loaded.
df_full_clean.head(n=5)

In [None]:
# Run every 'duration' value through the project helper convert_to_hhmmss.
# NOTE(review): presumably this normalises durations to an HH:MM:SS form
# (per the helper's name) — confirm against the `functions` module.
df_full_clean = df_full_clean.assign(
    duration=df_full_clean['duration'].apply(convert_to_hhmmss)
)

In [None]:
# Rename the city columns to 'from'/'to' and recode the textual stop counts
# ('zero', 'one', 'two_or_more') to the short labels '0', '1', '2+'.
df_full_clean = (
    df_full_clean
    .rename(columns={'source_city': 'from', 'destination_city': 'to'})
    .assign(
        stops=lambda frame: frame['stops'].replace(
            {'zero': '0', 'one': '1', 'two_or_more': '2+'}
        )
    )
)

In [None]:
# Remove the departure/arrival time columns from the combined dataset.
df_full_clean = df_full_clean.drop(
    ["departure_time", "arrival_time"],
    axis="columns",
)

In [None]:
# Keep only the listed columns of the combined dataset, in this order.
df_full_clean = df_full_clean.loc[
    :,
    [
        "airline",
        "flight",
        "from",
        "to",
        "duration",
        "stops",
        "class",
        "lead_time_days",
        "price",
    ],
]

In [None]:
# Keep only the listed columns of the business dataset, in this order.
df_business_clean = df_business_clean.loc[
    :,
    [
        "airline",
        "flight",
        "from",
        "to",
        "duration",
        "stop",
        "dep_time",
        "arr_time",
        "price",
    ],
]

In [None]:
# Pass the duration and the departure/arrival times through the project
# conversion helpers (convert_to_hhmmss for 'duration', convert_to_hhmmss_v2
# for the two clock-time columns).
# NOTE(review): 'duration' is assigned again from dep_time/arr_time in a later
# cell, so this conversion looks like it only affects the intermediate
# preview — confirm whether it is still needed.
df_business_clean = df_business_clean.assign(
    duration=df_business_clean['duration'].apply(convert_to_hhmmss),
    dep_time=df_business_clean['dep_time'].apply(convert_to_hhmmss_v2),
    arr_time=df_business_clean['arr_time'].apply(convert_to_hhmmss_v2),
)

In [None]:
# Preview the first five rows of the business dataset after the conversions.
df_business_clean.head(n=5)

In [None]:
# Overwrite 'duration' row by row with calculate_duration(dep_time, arr_time).
# NOTE(review): how overnight flights (arrival earlier than departure) are
# handled depends on the helper — verify in the `functions` module.
df_business_clean['duration'] = df_business_clean.apply(
    lambda flight: calculate_duration(flight['dep_time'], flight['arr_time']),
    axis="columns",
)

In [None]:
# Rename 'stop' to 'stops' and recode its labels ('non-stop', '1-stop',
# '2+-stop') to the short forms '0', '1', '2+'.
df_business_clean = (
    df_business_clean
    .rename(columns={'stop': 'stops'})
    .assign(
        stops=lambda frame: frame['stops'].replace(
            {'non-stop': '0', '1-stop': '1', '2+-stop': '2+'}
        )
    )
)

In [None]:
# Keep only the listed columns of the economy dataset, in this order.
df_economy_clean = df_economy_clean.loc[
    :,
    [
        "airline",
        "flight",
        "from",
        "to",
        "duration",
        "stop",
        "dep_time",
        "arr_time",
        "price",
    ],
]

In [None]:
# Pass the duration and the departure/arrival times through the project
# conversion helpers (convert_to_hhmmss for 'duration', convert_to_hhmmss_v2
# for the two clock-time columns).
# NOTE(review): 'duration' is assigned again from dep_time/arr_time in a later
# cell, so this conversion looks like it is overwritten — confirm whether it
# is still needed.
df_economy_clean = df_economy_clean.assign(
    duration=df_economy_clean['duration'].apply(convert_to_hhmmss),
    dep_time=df_economy_clean['dep_time'].apply(convert_to_hhmmss_v2),
    arr_time=df_economy_clean['arr_time'].apply(convert_to_hhmmss_v2),
)

In [None]:
# Overwrite 'duration' row by row with calculate_duration(dep_time, arr_time).
# NOTE(review): how overnight flights (arrival earlier than departure) are
# handled depends on the helper — verify in the `functions` module.
df_economy_clean['duration'] = df_economy_clean.apply(
    lambda flight: calculate_duration(flight['dep_time'], flight['arr_time']),
    axis="columns",
)

In [None]:
# Rename 'stop' to 'stops' and recode its labels ('non-stop', '1-stop',
# '2+-stop') to the short forms '0', '1', '2+'.
df_economy_clean = (
    df_economy_clean
    .rename(columns={'stop': 'stops'})
    .assign(
        stops=lambda frame: frame['stops'].replace(
            {'non-stop': '0', '1-stop': '1', '2+-stop': '2+'}
        )
    )
)

In [None]:
# Persist the processed business dataset as a ';'-separated UTF-8 CSV without
# the index column.
# NOTE(review): the target is the same config entry the frame was read from in
# the first cell, so the input file is overwritten and a second run of the
# notebook would load the already-transformed file — consider a separate
# output path.
df_business_clean.to_csv(
    path_or_buf=config['data']['clean_data']['business_clean'],
    sep=";",
    index=False,
    encoding="utf-8",
)

In [None]:
# Persist the processed economy dataset as a ';'-separated UTF-8 CSV without
# the index column.
# NOTE(review): the target is the same config entry the frame was read from in
# the first cell, so the input file is overwritten and a second run of the
# notebook would load the already-transformed file — consider a separate
# output path.
df_economy_clean.to_csv(
    path_or_buf=config['data']['clean_data']['economy_clean'],
    sep=";",
    index=False,
    encoding="utf-8",
)

In [None]:
# Persist the processed combined dataset as a ';'-separated UTF-8 CSV without
# the index column.
# NOTE(review): the target is the same config entry the frame was read from in
# the first cell, so the input file is overwritten and a second run of the
# notebook would load the already-transformed file — consider a separate
# output path.
df_full_clean.to_csv(
    path_or_buf=config['data']['clean_data']['full_clean'],
    sep=";",
    index=False,
    encoding="utf-8",
)

In [None]:
# Show the processed business frame as the cell output for a final visual check.
df_business_clean