In [1]:
import pyarrow.parquet as pq
import numpy as np
import pandas as pd
import pyarrow as pa

# PyArrow installation guide: https://www.delftstack.com/howto/python/read-and-write-parquet-files-in-python/

In [2]:
# Load each raw flight table, clean it, export it to CSV, and finally combine
# all cleaned tables into one DataFrame named `df`.

# Table numbers used in the parquet/CSV file names.
TABLE_NUMBERS = range(1, 6)

# Flag columns that should hold 1/0 instead of True/False.
BOOLEAN_COLUMNS = ["isBasicEconomy", "isRefundable", "isNonStop"]

# The flags may arrive either as real booleans or as the strings
# "True"/"False". Replacing only the strings is a silent no-op on boolean
# columns, so both spellings are mapped here.
BOOLEAN_TO_INT = {True: 1, False: 0, "True": 1, "False": 0}

# Columns removed from every table before export.
COLUMNS_TO_DROP = [
    'legId', 'fareBasisCode', 'elapsedDays', 'baseFare', 'seatsRemaining',
    'segmentsDepartureTimeEpochSeconds', 'segmentsDepartureTimeRaw',
    'segmentsArrivalTimeEpochSeconds', 'segmentsArrivalTimeRaw', 'segmentsArrivalAirportCode',
    'segmentsDepartureAirportCode', 'segmentsAirlineName', 'segmentsAirlineCode',
    'segmentsEquipmentDescription', 'segmentsDurationInSeconds',
    'segmentsDistance', 'segmentsCabinCode',
]

# Cleaned tables, keyed by table number.
data = {}
for tableNumber in TABLE_NUMBERS:
    # Path of the raw parquet file for this table.
    table_import_path = f'Resources/flight_data_{tableNumber}.parquet'
    df = pd.read_parquet(table_import_path)

    # Make boolean data numeric (1/0). Missing or unrecognised values
    # become NaN rather than being left as-is.
    for flag_column in BOOLEAN_COLUMNS:
        df[flag_column] = df[flag_column].map(BOOLEAN_TO_INT)

    # Parse the date columns as datetimes and the duration column as a timedelta.
    df['searchDate'] = pd.to_datetime(df['searchDate'])
    df['flightDate'] = pd.to_datetime(df['flightDate'])
    df['travelDuration'] = pd.to_timedelta(df['travelDuration'])

    # How long before the flight the search took place (a timedelta).
    # Both columns are already datetimes at this point, so no re-parsing is needed.
    df["searchDaysBeforeFlight"] = df["flightDate"] - df["searchDate"]

    # Remove the unneeded columns; errors='ignore' tolerates tables that lack
    # some of them.
    df = df.drop(columns=COLUMNS_TO_DROP, errors='ignore')

    # Export the cleaned table to CSV.
    table_export_path = f'Resources/flight_data_clean_{tableNumber}.csv'
    df.to_csv(table_export_path)

    # Keep the cleaned table for the final concatenation.
    data[tableNumber] = df

# Join the tables in table-number order (dicts preserve insertion order) and
# renumber the rows 0..n-1 in the same step.
df = pd.concat(list(data.values()), axis=0, ignore_index=True)

In [3]:
# Assign the path to a variable to use when accessing the file.
table_export_path = f'Resources/flight_data_clean_all.csv'
# Export table to csv.
df.to_csv(table_export_path)

In [None]:
# Re-wrap the combined table as a DataFrame, then preview its first five rows.
df = pd.DataFrame(data=df)
df.head(n=5)

# Clean Data

Essential data cleaning methods to consider when first interacting with a new dataset:

**Check for missing values**: Check the dataset for any missing values and decide on a strategy to handle them, such as imputing the missing values or removing the affected rows/columns.

**Check for duplicates**: Check the dataset for any duplicate records and decide on a strategy to handle them, such as removing the duplicates or aggregating them.

**Check for data types**: Check the data types of each column in the dataset to ensure they are appropriate for the data they represent. For example, dates should be represented as dates and not as strings.

**Check for outliers**: Check for any outliers or extreme values in the dataset that may skew your analysis, and decide on a strategy to handle them, such as removing them or replacing them with more appropriate values.

**Check for inconsistencies**: Check for inconsistencies or errors in the data, such as typos or formatting issues, and decide on a strategy to handle them, such as cleaning the data or removing the affected records.

**Normalize or scale the data**: If you have numerical data that varies widely in magnitude, consider normalizing or scaling the data so that the values are comparable.

**Check for data quality**: Check the overall quality of the data, such as the accuracy and completeness of the information, to ensure that the data is suitable for analysis.

(Thanks ChatGPT)

In [None]:
# Print a concise summary of the combined frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# (number of rows, number of columns) of the combined frame.
df.shape

In [None]:
# Descriptive statistics for the columns pandas selects by default.
df.describe()

In [None]:
# Number of missing values per column.
# `.sum()` adds up the True entries of the null mask; `.count()` would only
# report the number of rows for every column, regardless of missing data.
df.isna().sum()

In [None]:
# Show the distinct values found in every column, one line per column.
for column_name, column_values in df.items():
    print(f"{column_name} column: {column_values.unique()}")

# Converting the Duration Column to a Timedelta

In [None]:
# Coerce the travelDuration column to a timedelta (a duration, not a
# point-in-time datetime), then display the frame.
df['travelDuration'] = df['travelDuration'].pipe(pd.to_timedelta)
df