## Initial Setup and Libraries

In [166]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

## Preoprocessing and Data Cleaning

In [178]:
flight_data = pd.read_csv("flight_data.zip")

In [179]:
flight_data.drop(columns=['legId', 'searchDate', 'fareBasisCode', 'elapsedDays',
                          'segmentsDepartureTimeEpochSeconds', 'segmentsDepartureTimeRaw',
                          'segmentsArrivalTimeEpochSeconds', 'segmentsArrivalTimeRaw',
                          'segmentsArrivalAirportCode', 'segmentsDepartureAirportCode', 
                          'segmentsAirlineCode'], inplace=True, errors='ignore')

flight_data.isnull().sum()

flightDate                          0
startingAirport                     0
destinationAirport                  0
travelDuration                      0
isBasicEconomy                      0
isRefundable                        0
isNonStop                           0
baseFare                            0
totalFare                           0
seatsRemaining                      0
totalTravelDistance             61860
segmentsAirlineName                 0
segmentsEquipmentDescription    18754
segmentsDurationInSeconds           0
segmentsDistance                 7724
segmentsCabinCode                   0
dtype: int64

In [180]:
flight_data['segmentsDistance'] = flight_data['segmentsDistance'].str.split(r'\|\|').str[0]
flight_data['segmentsDistance'].replace(['', 'None'], np.nan, inplace=True)
flight_data['segmentsDistance'] = pd.to_numeric(flight_data['segmentsDistance'], errors='coerce')
flight_data['segmentsDistance'].fillna(flight_data['segmentsDistance'].median(), inplace=True)

flight_data['segmentsAirlineName'] = flight_data['segmentsAirlineName'].str.split(r'\|\|').str[0]

flight_data = flight_data[flight_data['segmentsEquipmentDescription'].notnull()].reset_index(drop=True)

flight_data['totalTravelDistance'] = pd.to_numeric(flight_data['totalTravelDistance'], errors='coerce')
flight_data['totalTravelDistance'].fillna(flight_data['totalTravelDistance'].median(), inplace=True)


flight_data.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  flight_data['segmentsDistance'].replace(['', 'None'], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  flight_data['segmentsDistance'].fillna(flight_data['segmentsDistance'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method w

flightDate                      0
startingAirport                 0
destinationAirport              0
travelDuration                  0
isBasicEconomy                  0
isRefundable                    0
isNonStop                       0
baseFare                        0
totalFare                       0
seatsRemaining                  0
totalTravelDistance             0
segmentsAirlineName             0
segmentsEquipmentDescription    0
segmentsDurationInSeconds       0
segmentsDistance                0
segmentsCabinCode               0
dtype: int64

In [181]:
flight_data = flight_data[flight_data['travelDuration'].notna()]
flight_data = flight_data[flight_data['travelDuration'].str.startswith('PT')]

flight_data = flight_data[flight_data['segmentsDurationInSeconds'].notna()]

In [182]:
def conversion(duration):
    try:
        if pd.isna(duration): return None
        match = re.match(r'PT(?:(\d+)H)?(?:(\d+)M)?', duration)
        hours = int(match.group(1)) if match.group(1) else 0
        minutes = int(match.group(2)) if match.group(2) else 0
        return hours * 60 + minutes
    except:
        return None

flight_data['travelDuration'] = flight_data['travelDuration'].apply(conversion)


def summation(value):
    try:
        if pd.isna(value): return None
        return sum(int(i) for i in value.split('||') if i.isdigit())
    except:
        return None

flight_data['segmentsDurationInSeconds'] = flight_data['segmentsDurationInSeconds'].apply(summation)
flight_data['segmentsDurationInSeconds'] = pd.to_numeric(flight_data['segmentsDurationInSeconds'], errors='coerce')



In [184]:
flight_data = flight_data[~flight_data['segmentsEquipmentDescription'].str.fullmatch(r'\|+', na=False)]

flight_data['segmentsEquipmentDescription'] = (flight_data['segmentsEquipmentDescription']
    .str.replace(r'^\|+', '', regex=True)  
    .str.replace(r'\|+$', '', regex=True)  
    .str.strip() 
)


In [185]:
flight_data['segmentsCabinCode'] = (flight_data['segmentsCabinCode'].str.split(r'\|\|').str[0].str.strip())

In [None]:
flight_data.to_csv('flight_data_preprocessed.csv.zip', index=False, compression='zip')

In [None]:
flight_data_cleaned = pd.read_csv("flight_data_preprocessed.csv.zip")

## Exploratory Data Analysis

## Multiple Linear Regression

## Random Forest

## Gradient Boosting (XGBoosting)

## Web Application Implementation