# Data analysis for Airline Models


In [None]:
import pandas as pd

# Location of the airline ticket CSV in the datathon's GitHub repository.
url = "https://github.com/Speeb04/SDSS-Datathon/raw/refs/heads/main/Resources/Cases/Airline%20Tickets/airline_ticket_dataset.csv"

# Read the CSV, then keep only the columns whose header does not contain "Unnamed".
df = pd.read_csv(url)
unnamed_mask = df.columns.str.contains("Unnamed")
df = df.loc[:, ~unnamed_mask]

# Summary statistics of the loaded table (rendered as the cell output).
df.describe()


In [None]:
# Display the loaded DataFrame as the cell output.
df

In [None]:
# Strip the literal " (Metropolitan Area)" suffix from both city columns.
# regex=False is passed explicitly: pandas < 2.0 defaulted Series.str.replace to
# regex=True, where the parentheses are read as a group and the suffix never
# matches, so the outcome would otherwise depend on the installed pandas version.
df["city1"] = df["city1"].str.replace(" (Metropolitan Area)", "", regex=False)
df["city2"] = df["city2"].str.replace(" (Metropolitan Area)", "", regex=False)

# Every distinct city name that appears on either end of a route.
cities = set(df["city1"].unique()) | set(df["city2"].unique())
cities

In [None]:
# List the distinct values found in the carrier_lg column.
df["carrier_lg"].unique()

In [None]:
# List the distinct values found in the carrier_low column.
df["carrier_low"].unique()

In [None]:
# Carrier codes this analysis classifies as full-service.
full_service = [
    'DL', 'AS', 'UA',
    'AA', 'B6', 'WN',
]

# Carrier codes this analysis classifies as low-cost.
low_cost = [
    'NK', 'F9', 'MX', 'XP',
    'G4', 'SY', 'HA', '3M',
]

# City names this analysis treats as hubs, written in the "City, ST" form
# that the city columns are compared against.
hubs = [
    'Atlanta, GA',
    'Minneapolis/St. Paul, MN',
    'Detroit, MI',
    'Salt Lake City, UT',
    'Seattle, WA',
    'New York City, NY',
    'Boston, MA',
    'Portland, OR',
    'San Diego, CA',
    'San Francisco, CA',
    'Chicago, IL',
    'Denver, CO',
    'Houston, TX',
    'Los Angeles, CA',
    'Washington, DC',
    'Dallas/Fort Worth, TX',
    'Charlotte, NC',
    'Miami, FL',
    'Philadelphia, PA',
    'Phoenix, AZ',
    'Orlando, FL',
]

In [None]:
# Each entry is (new boolean column, column to test, list of accepted values).
_flag_specs = (
    ('city1_is_hub', 'city1', hubs),
    ('city2_is_hub', 'city2', hubs),
    ('carrier_lg_is_full_service', 'carrier_lg', full_service),
    ('carrier_low_is_low_cost', 'carrier_low', low_cost),
)

# Add the flags one by one, in the order listed above.
for _flag_col, _source_col, _accepted in _flag_specs:
    df[_flag_col] = df[_source_col].isin(_accepted)

In [None]:
# Display the DataFrame as the cell output.
df

In [None]:
# Show which distinct values occur in the carrier_lg_is_full_service column.
df['carrier_lg_is_full_service'].unique()

In [None]:
# Show which distinct values occur in the carrier_low_is_low_cost column.
df['carrier_low_is_low_cost'].unique()

In [None]:
def addPopulation(main_df, population_data_df):
    """
    Left-join population data onto the main DataFrame, once per route endpoint.

    The join matches main_df['city1'] (then main_df['city2']) against
    population_data_df['Geographic Area']. After each join, a column named
    'population' is renamed to 'Population_city1' / 'Population_city2'.
    The join-key columns from population_data_df are kept in the result; any
    column name that clashes on the second join receives the '_city2_pop' suffix.

    Args:
        main_df (pd.DataFrame): Frame with 'city1' and 'city2' columns.
        population_data_df (pd.DataFrame): Frame with a 'Geographic Area' column
            and the population column(s) to attach.

    Returns:
        pd.DataFrame: A new DataFrame; main_df itself is not modified.
    """
    merged = main_df
    for endpoint in ('city1', 'city2'):
        merged = merged.merge(
            population_data_df,
            left_on=endpoint,
            right_on='Geographic Area',
            how='left',
            suffixes=('', f'_{endpoint}_pop'),
        )
        merged = merged.rename(columns={'population': f'Population_{endpoint}'})
    return merged

In [None]:
# Full state name -> two-letter abbreviation (the 50 states plus District of Columbia).
state_dict = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "District of Columbia": "DC",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}


def truncate_state(df: pd.DataFrame, col_idx: str) -> None:
    """Rewrite "Place, State Name" values in df[col_idx] as "Place, ST", in place.

    Only string values containing ", " are touched. The text before the first
    ", " is kept, the text between the first and second ", " is looked up in
    state_dict (and left unchanged when it is not a key), and anything after a
    second ", " is dropped. All other values pass through untouched.

    Args:
        df: Frame to modify.
        col_idx: Name of the column holding the place names.
    """
    def shorten(value):
        # Non-strings (e.g. missing values) and names without a state part stay as they are.
        if not isinstance(value, str) or ', ' not in value:
            return value
        pieces = value.split(', ')
        place, state = pieces[0], pieces[1]
        return f"{place}, {state_dict.get(state, state)}"

    df[col_idx] = df[col_idx].apply(shorten)

In [None]:
# Separator characters ignored when place names from different datasets are compared.
_NAME_SEPARATORS = str.maketrans("", "", " -/,")


def _normalize_name(name):
    """Return *name* lower-cased with spaces, hyphens, slashes and commas removed."""
    return name.translate(_NAME_SEPARATORS).lower()


def _normalized_candidates(candidates):
    """Return (normalized, original) pairs for every usable candidate name.

    Non-string entries (e.g. NaN) and names that normalize to an empty string
    are skipped: the former cannot be normalized, the latter would match anything.
    """
    pairs = []
    for candidate in candidates:
        if not isinstance(candidate, str):
            continue
        key = _normalize_name(candidate)
        if key:
            pairs.append((key, candidate))
    return pairs


def is_hidden_subsequence(small, large):
    """Return True when the items of *small* occur in *large* in order.

    The items need not be adjacent. A single iterator over *large* is shared by
    all membership tests, so each test resumes where the previous one stopped.
    """
    it = iter(large)
    return all(char in it for char in small)


def find_city(GeoName: str, candidates=None) -> str:
    """Map a (long) area name onto the dataset city name hidden inside it.

    A city matches when its normalized name is a subsequence of the normalized
    *GeoName*. If no whole name matches, names containing '/' are retried piece
    by piece, and a match on any piece returns the full city name.

    Candidates are ranked before matching (longest normalized name first, ties
    broken alphabetically) so the most specific match wins and the result does
    not depend on set iteration order, which for strings can change from one
    interpreter run to the next.

    Args:
        GeoName: Area name to resolve.
        candidates: Iterable of city names to match against; defaults to the
            module-level ``cities`` set.

    Returns:
        The matching city name, or 'N/A' when nothing matches or *GeoName* is
        not a string.
    """
    if not isinstance(GeoName, str):
        return 'N/A'

    pool = cities if candidates is None else candidates
    target = _normalize_name(GeoName)
    ranked = sorted(_normalized_candidates(pool), key=lambda pair: (-len(pair[0]), pair[1]))

    # Pass 1: the whole city name is hidden inside the area name.
    for key, named_city in ranked:
        if is_hidden_subsequence(key, target):
            return named_city

    # Pass 2: for "A/B" style names, accept a match on either part.
    for _, named_city in ranked:
        if '/' not in named_city:
            continue
        for piece in named_city.split('/'):
            piece_key = _normalize_name(piece)
            if piece_key and is_hidden_subsequence(piece_key, target):
                return named_city

    return 'N/A'


def pop_find_city(GeoName: str, candidates=None) -> str:
    """Map a (short) place name onto the dataset city name that contains it.

    The direction is the reverse of find_city: a city matches when the
    normalized *GeoName* is a subsequence of the city's normalized name.

    Candidates are ranked before matching (shortest normalized name first, ties
    broken alphabetically) so the tightest match wins and the result does not
    depend on set iteration order.

    Args:
        GeoName: Place name to resolve.
        candidates: Iterable of city names to match against; defaults to the
            module-level ``cities`` set.

    Returns:
        The matching city name, or 'N/A' when nothing matches, *GeoName* is not
        a string, or it normalizes to an empty string (which would otherwise
        match every city).
    """
    if not isinstance(GeoName, str):
        return 'N/A'

    query = _normalize_name(GeoName)
    if not query:
        return 'N/A'

    pool = cities if candidates is None else candidates
    ranked = sorted(_normalized_candidates(pool), key=lambda pair: (len(pair[0]), pair[1]))

    for key, named_city in ranked:
        if is_hidden_subsequence(query, key):
            return named_city

    return 'N/A'

In [None]:
population_df = pd.read_csv("https://raw.githubusercontent.com/Speeb04/SDSS-Datathon/refs/heads/main/Resources/Cases/Airline%20Tickets/city_town_population_2024.csv")

# Keep only rows that have a Rank, then renumber the index from 0. Without the
# reset, dropna leaves gaps in the index and the label len(population_df) used
# below could already exist, so an "append" would overwrite a real row.
population_df = population_df.dropna(subset=["Rank"]).reset_index(drop=True)

# Places added by hand with no population figures.
# NOTE(review): presumably these appear in the airline data but not in the
# population table — confirm.
mysterious_cities = ["Valparaiso city, Florida",
                     "Eagle city, Colorado",
                     "Aspen city, Colorado",
                     "Jackson city, Wyoming",
                     "Nantucket city, Massachusetts",
                     "Martha's Vineyard city, Massachusetts",
                     "Latrobe, Pennsylvania"]

# Append one placeholder row per place (taken from the end of the list, which is
# emptied in the process). "6969" is a sentinel rank marking these rows.
while mysterious_cities:
    new_row = {"Rank": "6969", "Geographic Area": mysterious_cities.pop(), "Estimate Base": None, "2020": None, "2021": None, "2022": None, "2023": None, "2024": None}
    population_df.loc[len(population_df)] = new_row


# Remove place-type qualifiers, in this order. regex=False makes every pattern a
# literal on all pandas versions; with the pre-2.0 regex default, the
# parentheses in " (balance)" would be read as a group and never match.
for qualifier in (" city", " town", " metropolitan", " metro", " government", " (balance)", " City"):
    population_df['Geographic Area'] = population_df['Geographic Area'].str.replace(qualifier, "", regex=False)

# Consolidated names that need an explicit short form.
population_df['Geographic Area'] = population_df['Geographic Area'].str.replace("Nashville-Davidson", "Nashville", regex=False)
population_df['Geographic Area'] = population_df['Geographic Area'].str.replace("Louisville/Jefferson County", "Louisville", regex=False)

# "Place, State Name" -> "Place, ST".
truncate_state(population_df, 'Geographic Area')

# Replace each place name with the value pop_find_city returns for it.
population_df['Geographic Area'] = population_df['Geographic Area'].apply(pop_find_city)

population_df

In [None]:
# Lookup table: 'Geographic Area' value -> {'POP2020': ..., ..., 'POP2024': ...},
# read from the year-named columns of population_df. When several rows share the
# same 'Geographic Area' value, the last such row supplies the figures.
_pop_year_columns = [str(year) for year in range(2020, 2025)]

pop_lut = {
    record['Geographic Area']: {f'POP{column}': record[column] for column in _pop_year_columns}
    for _, record in population_df.iterrows()
}

pop_lut

In [None]:
def addPOP(main_df, pop_lut):
    """Add yearly population columns for both route endpoints to main_df, in place.

    For every year 2020-2024 the columns 'city1_pop_<year>' and
    'city2_pop_<year>' are created from pop_lut[city]['POP<year>']. A city that
    is missing from pop_lut (or lacks that year) yields None; a missing city
    value is passed through unchanged.

    Args:
        main_df: Frame with 'city1' and 'city2' columns; modified in place.
        pop_lut: Mapping of city name -> {'POP<year>': value}.
    """
    def population_of(city, key):
        if pd.isna(city):
            return city
        return pop_lut.get(city, {}).get(key)

    for year in range(2020, 2025):
        key = f"POP{year}"
        for endpoint in ("city1", "city2"):
            main_df[f"{endpoint}_pop_{year}"] = main_df[endpoint].map(
                lambda city, key=key: population_of(city, key)
            )

In [None]:
# Add the per-year population columns to df (addPOP modifies it in place), then display it.
addPOP(df, pop_lut)
df

# ~~WARNING: The CSV file has NOT been uploaded onto GitHub yet, as of <u>2:15 AM</u>.~~
## ~~Ensure that the file has been uploaded, or else the code *cannot be interpreted*.~~

<u>*Edit: As of 4:07 AM,*</u> the code below does NOT work in this environment, and I have not yet identified why. It runs fine on my local machine, so I exported the CSV produced there and added the population data to it.

I have <u>*no clue*</u> why the code below behaves differently here than it does locally.

In [None]:
# Metro-area GDP table from the datathon's GitHub repository.
_gdp_csv_url = "https://raw.githubusercontent.com/Speeb04/SDSS-Datathon/refs/heads/main/Resources/Cases/Airline%20Tickets/gdp_metro_area.csv"
gdp_df = pd.read_csv(_gdp_csv_url)

# Row 0 is the United States total; remove it from the dataset.
gdp_df = gdp_df.drop(index=0)
gdp_df

In [None]:
# Strip the literal " (Metropolitan Statistical Area)" suffix from every GeoName.
# regex=False is passed explicitly: pandas < 2.0 defaulted Series.str.replace to
# regex=True, where the parentheses are read as a group and the suffix never
# matches, so the outcome would otherwise depend on the installed pandas version.
gdp_df['GeoName'] = gdp_df['GeoName'].str.replace(" (Metropolitan Statistical Area)", "", regex=False)
gdp_df

In [None]:
def rename_gdp_df(gdp_df: pd.DataFrame) -> None:
    """Replace every 'GeoName' value with the result of find_city for it.

    The frame is mutated in place; nothing is returned.
    """
    matched_names = gdp_df['GeoName'].apply(find_city)
    gdp_df['GeoName'] = matched_names

In [None]:
# Rewrite the GeoName column in place via find_city, then display the result.
rename_gdp_df(gdp_df)
gdp_df

In [None]:
# Lookup table: 'GeoName' value -> {'GDP2001': ..., ..., 'GDP2018': ...}, read
# from the year-named columns of gdp_df. When several rows share the same
# 'GeoName' value, the last such row supplies the figures.
_gdp_year_columns = [str(year) for year in range(2001, 2019)]

gdp_lut = {
    record['GeoName']: {f'GDP{column}': record[column] for column in _gdp_year_columns}
    for _, record in gdp_df.iterrows()
}

gdp_lut

In [None]:
def addGDP(main_df, gdp_lut):
    """Add yearly GDP columns for both route endpoints to main_df, in place.

    For every year 2001-2018 the columns 'city1_gdp_<year>' and
    'city2_gdp_<year>' are created from gdp_lut[city]['GDP<year>'].

    Lookups are tolerant, matching addPOP: a city that is missing from gdp_lut
    (or lacks that year) yields None instead of raising KeyError, and a missing
    city value is passed through unchanged.

    Args:
        main_df: Frame with 'city1' and 'city2' columns; modified in place.
        gdp_lut: Mapping of city name -> {'GDP<year>': value}.
    """
    for year in range(2001, 2019):
        key = f"GDP{year}"
        for endpoint in ("city1", "city2"):
            # key is bound as a default so the lambda never sees a later loop value.
            main_df[f"{endpoint}_gdp_{year}"] = main_df[endpoint].map(
                lambda city, key=key: gdp_lut.get(city, {}).get(key)
                if pd.notna(city) else city
            )

In [None]:
# Add the per-year GDP columns to df (addGDP modifies it in place), then display it.
addGDP(df, gdp_lut)
df