In [336]:
import random
import pandas as pd
import numpy as np
import time
import re

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from pypdf import PdfReader

### Scrape United Lounges

In [None]:
# Scrape lounge data from United website
url = 'https://www.united.com/en/us/fly/travel/airport/united-club-and-lounge-locations.html'

driver = webdriver.Chrome()
driver.get(url)

wait = WebDriverWait(driver, 10)

dropdown = Select(wait.until(EC.presence_of_element_located((By.ID, "lookup-airport-united-lounges"))))
rows_all = []

for option in dropdown.options[1:]:
    value = option.get_attribute('value')
    print('Selecting:', value.upper())

    dropdown.select_by_value(value)

    time.sleep(2)
    
    tables = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//table")))
    for i, table in enumerate(tables):
        rows = table.find_elements(By.TAG_NAME, "tr")
        for row in rows:
            cols = row.find_elements(By.TAG_NAME, "td")
            rows_all.append([value.upper(), i+1]+[col.text for col in cols])

Selecting: ATL
--- Table 1 ---
Selecting: AUS
--- Table 1 ---
Selecting: bos
--- Table 1 ---
Selecting: ORD
--- Table 1 ---
--- Table 2 ---
Selecting: CLE
--- Table 1 ---
Selecting: DFW
--- Table 1 ---
Selecting: DEN
--- Table 1 ---
--- Table 2 ---
Selecting: FLL
--- Table 1 ---
Selecting: GUM
--- Table 1 ---
Selecting: HKG
--- Table 1 ---
Selecting: HNL
--- Table 1 ---
Selecting: IAH
--- Table 1 ---
--- Table 2 ---
--- Table 3 ---
Selecting: LAS
--- Table 1 ---
Selecting: LAX
--- Table 1 ---
--- Table 2 ---
Selecting: LHR
--- Table 1 ---
Selecting: MEX
--- Table 1 ---
Selecting: MSP
--- Table 1 ---
Selecting: MSY
--- Table 1 ---
Selecting: EWR
--- Table 1 ---
--- Table 2 ---
Selecting: LGA
--- Table 1 ---
Selecting: SNA
--- Table 1 ---
Selecting: MCO
--- Table 1 ---
Selecting: PDX
--- Table 1 ---
Selecting: PHL
--- Table 1 ---
Selecting: PHX
--- Table 1 ---
Selecting: RDU
--- Table 1 ---
Selecting: SAT
--- Table 1 ---
Selecting: SAN
--- Table 1 ---
Selecting: SFO
--- Table 1 ---
--- T

In [353]:
# Combine lounge data into df
cols = ['airport', 'type', 'location', 'hours', 'amenities']

df_united = pd.DataFrame(rows_all, columns=cols)
df_united = df_united[df_united['amenities'].notna()]

In [None]:
# Clean lounge hours
df_united['hours'] = df_united['hours'].str.split('\n')
df_united = df_united.explode('hours')

### Scrape Priority Pass Lounges

In [495]:
# Load CSVs
df1 = pd.read_csv('data/Priority Pass Lounges.csv')
df2 = pd.read_csv('data/Priority Pass Lounges 2.csv')
df3 = pd.read_csv('data/Priority Pass Lounges 3.csv')
df4 = pd.read_csv('data/Priority Pass Lounges 4.csv')

df_pp = pd.concat([df1, df2, df3, df4], axis=0).iloc[:, [0, 3]].reset_index(drop=True)
df_pp.columns = ['name', 'location']

df_pp['airport'] = df_pp['name'].str.extract(r'\((.*?)\)')
df_pp['hours'] = pd.Series([random.choice(df_united['hours'].unique()) for _ in range(len(df_pp))]).str.split('\n')
df_pp = df_pp.explode('hours')
df_pp['amenities'] = pd.Series([random.choice(df_united['amenities'].unique()) for _ in range(len(df_pp))])
df_pp['type'] = 3

df_pp = df_pp[['airport', 'type', 'location', 'hours', 'amenities']]

In [496]:
df_pp.head()

Unnamed: 0,airport,type,location,hours,amenities
0,ATL,3,Near Gate B16,Temporarily closed,Full-service bar\nFull buffet\nWi-Fi
1,ATL,3,Concourse F (the furthest concourse),5:30 a.m. – 9:00 p.m. daily,Light snacks\nSelf-service bar\nShowers\nWi-Fi
2,BWI,3,Near Gate D10; Upper Level,"5:00 a.m. – 7:30 p.m. Tuesday, Thursday and Sa...",Light snacks\nSelf-service bar\nShowers\nWi-Fi
3,BOS,3,Near Gate C19,7:30 a.m. – 7:00 p.m.,Full buffet\nFull-service bar\nShowers\nWi-Fi
4,BOS,3,Near Gate E4,5:15 a.m. – 9:45 p.m. daily,Light snacks\nSelf-service bar\nWi-Fi


### Combine and Clean Priority Pass and United Lounges

In [563]:
df_all = pd.concat([df_pp, df_united])

In [575]:
type_map = {
    1:'United Club',
    2:'United Polaris Lounge',
    3:'Priority Pass Lounge'
}

df_all['type'] = df_all['type'].map(type_map)

allowed_punctuation = r'.,!?;:\'"()\[\]\-'

# Create a regex pattern that matches everything except alphanumerics and allowed punctuation
pattern = rf'[^a-zA-Z0-9{re.escape(allowed_punctuation)} ]'

# Apply the regex replacement
df_all['location'] = df_all['location'].str.replace(pattern, '', regex=True)

In [577]:
df_all['days'] = df_all['hours'].str.replace(r"\d|:|-|–|a\.m\.|p\.m\.|p\.m", "", regex=True).str.lower().str.strip()

days_map = {'daily':'0, 1, 2, 3, 4, 5, 6',
    'monday, wednesday, friday and sunday':'0, 2, 4, 6',
    'tuesday, thursday and saturday':'1, 3, 5',
    'temporarily closed':'',
    'sun  fri':'4, 5, 6',
    'saturday':'5'}

df_all['days_list'] = df_all['days'].map(days_map)

df_days = df_all['days_list'].str.get_dummies(sep=', ')
df_days.columns = 'day_' + df_days.columns

In [578]:
df_hours = df_all['hours'].str.lstrip('0').str.replace(r"\.", "", regex=True).str.findall(r"\d|:|-|–|am|pm").str.join('').str.split('[-–]', expand=True)[[0, 1]]
df_hours.columns = ['open', 'close']

In [579]:
# Clean amenities
df_amenities = df_all['amenities'].str.replace('\n', '|').str.replace('[\s+-]', '_', regex=True).str.lower().str.get_dummies(sep='|')

  df_amenities = df_all['amenities'].str.replace('\n', '|').str.replace('[\s+-]', '_', regex=True).str.lower().str.get_dummies(sep='|')


In [580]:
# Combine all
df_clean = pd.concat([df_all[['airport', 'type', 'location']], df_hours, df_days, df_amenities], axis=1)

In [581]:
df_clean.head()

Unnamed: 0,airport,type,location,open,close,day_0,day_1,day_2,day_3,day_4,...,full_bar,full_buffet,full_service_bar,grab_and_go_snacks,light_snacks,pre_flight_dining,quiet_suites,self_service_bar,showers,wi_fi
0,ATL,,Near Gate B16,,,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,1
1,ATL,,Concourse F (the furthest concourse),5:30am,9:00pm,1,1,1,1,1,...,0,0,0,0,1,0,0,1,1,1
2,BWI,,Near Gate D10; Upper Level,5:00am,7:30pm,0,1,0,1,0,...,0,0,0,0,1,0,0,1,1,1
3,BOS,,Near Gate C19,7:30am,7:00pm,0,0,0,0,0,...,0,1,1,0,0,0,0,0,1,1
4,BOS,,Near Gate E4,5:15am,9:45pm,1,1,1,1,1,...,0,0,0,0,1,0,0,1,0,1


### Output

In [None]:
df_clean.to_csv('../data/lounges.csv')