In [1]:
import yaml
import pandas as pd
import re
import sys
import os
from datetime import datetime, timedelta

# Resolve the parent of the current working directory and put it on sys.path
# (presumably so the project-local `functions` import that follows can be found).
project_root = os.path.normpath(os.path.join(os.getcwd(), os.pardir))

# Guarded append: re-running this cell must not add the same entry twice.
if project_root not in sys.path:
    sys.path.append(project_root)

from functions import convert_to_hhmmss, convert_to_hhmmss_v2, calculate_duration
 
# Load the YAML configuration, then the three CSV files it points to.
#
# Only a missing config file is reported with the message below. Any other
# failure (invalid YAML, a missing config key, a missing or unreadable CSV)
# surfaces with its own traceback instead of being mislabeled as a config
# problem by a catch-all handler.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    print("Yaml configuration file not found!")
    # Re-raise: the cells below need `config` and the data frames, so carrying
    # on would only fail later with a less helpful NameError.
    raise

# All three files are read as semicolon-separated CSV.
df_business_clean = pd.read_csv(config['data']['clean_data']['business_clean'], sep=";")
df_economy_clean = pd.read_csv(config['data']['clean_data']['economy_clean'], sep=";")
df_full_clean = pd.read_csv(config['data']['clean_data']['full_clean'], sep=";")

In [2]:
# Rename the city columns to 'from'/'to' and turn the spelled-out stop counts
# into short labels, using one mapping instead of three separate replacements.
df_full_clean = (
    df_full_clean
    .rename(columns={'source_city': 'from', 'destination_city': 'to'})
    .assign(
        stops=lambda frame: frame['stops'].replace(
            {'zero': '0', 'one': '1', 'two_or_more': '2+'}
        )
    )
)

In [3]:
# Remove the departure/arrival time columns from the full data set.
df_full_clean = df_full_clean.drop(
    ["departure_time", "arrival_time"],
    axis="columns",
)

In [4]:
# Keep exactly these columns, in this order; a missing column raises KeyError.
df_full_clean = df_full_clean.loc[
    :,
    [
        "airline",
        "flight",
        "from",
        "to",
        "duration",
        "stops",
        "class",
        "lead_time_days",
        "price",
    ],
]

In [5]:
# Keep only the columns used by the following cells, in this order.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in the later cells do not trigger pandas' SettingWithCopyWarning
# (a column subset is otherwise flagged as a possible copy of the original).
df_business_clean = df_business_clean[
    ["airline", "flight", "from", "to", "duration", "stop", "dep_time", "arr_time", "price"]
].copy()

In [6]:
# Run each time-like column through its project helper, element by element.
# The order (duration, dep_time, arr_time) is the same as before.
for _column, _converter in (
    ('duration', convert_to_hhmmss),
    ('dep_time', convert_to_hhmmss_v2),
    ('arr_time', convert_to_hhmmss_v2),
):
    df_business_clean[_column] = df_business_clean[_column].apply(_converter)

In [7]:
# Overwrite `duration` with whatever calculate_duration returns for each row's
# dep_time / arr_time pair (row-wise apply, axis=1).
# NOTE(review): the business frame displayed at the end of this notebook shows
# AI-531 with one stop, dep 20:00:00, arr 20:45:00 and duration 0:45:00, which
# looks implausible. A computation based only on dep/arr clock times may lose
# next-day arrivals -- confirm how calculate_duration handles this and whether
# the previous `duration` values should be kept instead.
df_business_clean['duration'] = df_business_clean.apply(lambda row: calculate_duration(row['dep_time'], row['arr_time']), axis=1)

In [8]:
# Rename 'stop' to 'stops' and shorten its labels with a single mapping.
df_business_clean = (
    df_business_clean
    .rename(columns={'stop': 'stops'})
    .assign(
        stops=lambda frame: frame['stops'].replace(
            {'non-stop': '0', '1-stop': '1', '2+-stop': '2+'}
        )
    )
)

In [9]:
# Keep only the columns used by the following cells, in this order.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in the later cells do not trigger pandas' SettingWithCopyWarning
# (a column subset is otherwise flagged as a possible copy of the original).
df_economy_clean = df_economy_clean[
    ["airline", "flight", "from", "to", "duration", "stop", "dep_time", "arr_time", "price"]
].copy()

In [10]:
# Run each time-like column through its project helper, element by element.
# The order (duration, dep_time, arr_time) is the same as before.
for _column, _converter in (
    ('duration', convert_to_hhmmss),
    ('dep_time', convert_to_hhmmss_v2),
    ('arr_time', convert_to_hhmmss_v2),
):
    df_economy_clean[_column] = df_economy_clean[_column].apply(_converter)

In [11]:
# Overwrite `duration` with whatever calculate_duration returns for each row's
# dep_time / arr_time pair (row-wise apply, axis=1).
# NOTE(review): if dep_time / arr_time are clock times without a date, a flight
# that lands on a later day may come out too short -- confirm how
# calculate_duration handles this before relying on the recomputed values.
df_economy_clean['duration'] = df_economy_clean.apply(lambda row: calculate_duration(row['dep_time'], row['arr_time']), axis=1)

In [12]:
# Rename 'stop' to 'stops' and shorten its labels with a single mapping.
df_economy_clean = (
    df_economy_clean
    .rename(columns={'stop': 'stops'})
    .assign(
        stops=lambda frame: frame['stops'].replace(
            {'non-stop': '0', '1-stop': '1', '2+-stop': '2+'}
        )
    )
)

In [13]:
# Write the business frame to the path configured under
# data.clean_data.business_clean (the same key it was read from).
df_business_clean.to_csv(
    path_or_buf=config['data']['clean_data']['business_clean'],
    sep=";",
    index=False,
    encoding="utf-8",
)

In [14]:
# Write the economy frame to the path configured under
# data.clean_data.economy_clean (the same key it was read from).
df_economy_clean.to_csv(
    path_or_buf=config['data']['clean_data']['economy_clean'],
    sep=";",
    index=False,
    encoding="utf-8",
)

In [16]:
# Write the full frame to the path configured under
# data.clean_data.full_clean (the same key it was read from).
df_full_clean.to_csv(
    path_or_buf=config['data']['clean_data']['full_clean'],
    sep=";",
    index=False,
    encoding="utf-8",
)

In [17]:
# Show the processed business frame as the cell output (the rendering below is
# truncated to the first and last rows).
df_business_clean

Unnamed: 0,airline,flight,from,to,duration,stops,dep_time,arr_time,price
0,Air India,AI-868,Delhi,Mumbai,2:00:00,0,18:00:00,20:00:00,25612
1,Air India,AI-624,Delhi,Mumbai,2:15:00,0,19:00:00,21:15:00,25612
2,Air India,AI-531,Delhi,Mumbai,0:45:00,1,20:00:00,20:45:00,42220
3,Air India,AI-839,Delhi,Mumbai,2:30:00,1,21:25:00,23:55:00,44450
4,Air India,AI-544,Delhi,Mumbai,6:40:00,1,17:15:00,23:55:00,46690
...,...,...,...,...,...,...,...,...,...
93482,Vistara,UK-822,Chennai,Hyderabad,10:05:00,1,09:45:00,19:50:00,69265
93483,Vistara,UK-826,Chennai,Hyderabad,10:25:00,1,12:30:00,22:55:00,77105
93484,Vistara,UK-832,Chennai,Hyderabad,13:50:00,1,07:05:00,20:55:00,79099
93485,Vistara,UK-828,Chennai,Hyderabad,10:00:00,1,07:00:00,17:00:00,81585
